Description of soskstest

0001 % - object for testing normality using a Kolmogorov-Smirnov test
0002 % copyright 2009-2012 Blair Armstrong, Christine Watson, David Plaut
0003 %
0004 %    This file is part of SOS
0005 %
0006 %    SOS is free software: you can redistribute it and/or modify
0007 %    it for academic and non-commercial purposes
0008 %    under the terms of the GNU General Public License as published by
0009 %    the Free Software Foundation, either version 3 of the License, or
0010 %    (at your option) any later version.  For commercial or for-profit
0011 %    uses, please contact the authors (sos@cnbc.cmu.edu).
0012 %
0013 %    SOS is distributed in the hope that it will be useful,
0014 %    but WITHOUT ANY WARRANTY; without even the implied warranty of
0015 %    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0016 %    GNU General Public License for more details.
0017 
0018 %    You should have received a copy of the GNU General Public License
0019 %    along with SOS (see COPYING.txt).
0020 %    If not, see <http://www.gnu.org/licenses/>.
0021 
0022 
0023 classdef soskstest < handle
0024     % runs user-specified Kolmogorov-Smirnov tests to evaluate whether the
0025     % distribution of values on a given dimension is uniform.
0026     %   Note: test may be under-powered for sample sizes < 10.  Requires at
0027     %   least 2 bins in the histogram.
0028     %   Requiring a p > 0.001 to infer that a distribution is uniform is
0029     %   probably more than sufficiently in most cases.  Considerably smaller
0030     %   values may still yeild quite acceptable results.
0031 
0032     %
0033     %PROPERTIES
0034     %     sosObj % sos object the test is associated with
0035     %     name % string label to associate with the test
0036     %     s1 % sample1
0037     %     s1ColName % column name of data in sample1
0038     %     s1Col   % column index in sample1
0039     %     type    % type of ztest (currently only 'matchUniform')
0040     %     runSpecificTest % handle to specific test method
0041     %     desiredpvalCondition % indicates the desired outcome of the test, either NaN, <= or >=
0042     %     desiredpvalConditionHandle  % handle to the function that will evaluate whether the desiredpValCondition was met
0043     %     desiredpval % desired p-val to test against
0044     %     tail % tail of test. % Does nothing currently; must be modified for 'upper and 'lower
0045     %     lastp % last p-value that was calcualted
0046     %
0047     %METHODS:
0048     %   sosCorrelTest(sosObj,varargin)  %constructs an soskstest object of the specified type.
0049     %   constructMatchCorrelztest(obj,sosObj,varargin)  %initialize object
0050     %   [userHypothesis, prob, label] = runTest(obj,varargin) %runs the test
0051     %   [userHypothesis, prob, label] =  runkstest(obj,varargin) % runs a%   kstest
0052     %
0053     %METHODS (Static)
0054     %   userHypothesis = returnNaN(~,~)  % returns NaN
0055     %   flag = validTestType(str) % returns 1 if the name of type of test is 'matchUniform'; error otherwise.
0056 
0057     
0058     %% PROPERTIES
0059     properties
0060         sosObj % sos object the test is associated with
0061         name % string label to associate with the test
0062         s1 % sample1
0063         s1ColName % column name of data in sample1
0064         s1Col   % column index in sample1
0065         type    % type of test('matchUniform')
0066         runSpecificTest % handle to specific test method
0067         desiredpvalCondition % indicates the desired outcome of the test, either NaN, <= or >=
0068         desiredpvalConditionHandle  % handle to the function that will evaluate whether the desiredpValCondition was met
0069         desiredpval % desired p-val to test against
0070         tail % tail of test.  %currently does nothing, but must be recoded as 'upper' and 'lower'
0071         label % label denoting what is being tested
0072         pdSpread % 'sample' or 'allItems', as in the entropy constraint
0073         nbin % number of bins in the pdSpread
0074         lastp % last p-value reported by the test
0075     end % properties
0076 
0077    %% Properties (Constant)
0078     properties (Constant)
0079         smallVal = 0.00000001 % small value added to avoid case where a value is exactly equal the min or max range of the distribution
0080     end
0081     
0082     
0083     methods
0084         
0085         %% sosCorrelTest CONSTRUCTOR
0086         function obj = soskstest(varargin)
0087             %constructs an soskstest object of the specified type.
0088             %
0089             %PARAMETERS:
0090             % sosObj - sosObject test will be associated with
0091             % 'type'/string, type of test - ('matchUniform' only currently)
0092             % 'sample1'/sample - a sample associated with sosObj.
0093             % ... Other parameters as required by the constructor for the
0094             % specific type of t-test requested.  See those constructors
0095             % for additional info.
0096             %
0097             % Returns a kstest object.
0098             
0099 
0100             % perform basic checks for universally required components of
0101             % the kstest
0102             p = inputParser;
0103             p.addParamValue('sosObj', 'null',...
0104                 @(sosObj)strcmp(class(sosObj),'sos'));
0105             p.addParamValue('type','null', ...
0106                 @(type)soskstest.validTestType(type));
0107             p.addParamValue('sample1','null', ...
0108                             @(sample1)strcmp(class(sample1),'sample'));            
0109             p.addParamValue('name','noname',@(name)ischar(name));
0110             
0111             %keep all of the parameters for specific kstests
0112             p.KeepUnmatched = true;
0113             p.parse(varargin{:});
0114 
0115             
0116             
0117             %basic checks are passed, construct the specific type of t-test
0118             %that has been requested
0119 
0120            sosObj = p.Results.sosObj; %#ok<PROP>
0121            
0122             if(strcmp(p.Results.type,'matchUniform'))
0123                 obj.constructkstest(sosObj,varargin{:}) %#ok<PROP>
0124             else 
0125                error(['Specified <type>: ',p.Results.type, ...
0126                         ' is not supported']);  
0127             end
0128 
0129             obj.lastp = NaN;
0130             obj.sosObj = p.Results.sosObj;
0131             
0132             % add the test's name.
0133             if any(strcmp(p.UsingDefaults,'name'))
0134                  
0135                  numTests = length(obj.sosObj.sosstattests);
0136                 
0137              obj.name = ['kstest_',num2str(numTests+1)];  
0138              else
0139                  obj.name = p.Results.name;  
0140              end
0141             
0142         end % Constructor
0143         
0144 
0145         %% [userHypothesis, prob, label] = runTest(obj,varargin) METHOD
0146         function [userHypothesis, prob, label] = runTest(obj,varargin)
0147             %runs the kstest
0148             %
0149             % PARAMETERS:
0150             % 'reportStyle'/'short'|'full'/'none' - style of report to be
0151             %           printed.  Either none, short or full
0152             %
0153             % RETURNS:
0154             %   userHypothesis - whether the user's hypothesis has been met or not. NaN if no user hypothesis for the test.
0155             %   prob - p-value from the kstest
0156             %   label - string label denoting what was tested.
0157             
0158             [userHypothesis, prob, label] = obj.runSpecificTest(varargin);
0159             obj.lastp = prob;
0160         end
0161         
0162         
0163         %% [userHypothesis, prob, label] =  runkstest(varargin) METHOD
0164         function [userHypothesis, prob, label] = ...
0165                 runkstest(obj,varargin)
0166             %runs a kstest test
0167             %
0168             % PARAMETERS:
0169             % 'reportStyle'/'short'|'full' - style of report to be printed.  Either short or long
0170             %
0171             % RETURNS:
0172             %   userHypothesis - whether the user's hypothesis has been met or not. NaN if no user hypothesis for the test.
0173             %   prob - p-value from the ttest
0174             %   label - string label denoting what was tested.
0175                      
0176             
0177             varargin = varargin{1};
0178                       
0179             p = inputParser;
0180             
0181             p.addParamValue('reportStyle','short', ...
0182                     @(reportStyle)any([strcmp(reportStyle,'short') ...
0183                                     strcmp(reportStyle,'full') ...
0184                                     strcmp(reportStyle,'none')]));
0185             p.parse(varargin{:});
0186 
0187             
0188             reportStyle = p.Results.reportStyle;
0189 
0190 
0191             
0192             % calculates the initial entropy value for the scores in
0193             % sample1<sample1Col>
0194             
0195             scores = (obj.s1.zdata{obj.s1Col})';
0196  
0197             rawScores = (obj.s1.data{obj.s1Col})';
0198             
0199             % get min and max values for the probability distribution:
0200             %this procedure is different depending on whether we are using
0201             %the sample as the range or allItems
0202             
0203             if strcmp(obj.pdSpread,'sample')
0204                 minVal = min(scores);
0205                 maxVal = max(scores);    
0206                 minRaw = min(rawScores);
0207                 maxRaw = max(rawScores);
0208             elseif strcmp(obj.pdSpread,'allItems')
0209                 %check in the population
0210                 minVal = min([obj.s1.population.zdata{obj.s1Col}]);
0211                 maxVal = max([obj.s1.population.zdata{obj.s1Col}]);
0212                 minRaw = min([obj.s1.population.data{obj.s1Col}]);
0213                 maxRaw = max([obj.s1.population.data{obj.s1Col}]);
0214                 
0215                 %check in all samples associated with the population (this
0216                 %includes the original one
0217                 for i=1:length(obj.s1.population.samples)
0218                     minVal = min([minVal; ...
0219                         obj.s1.population.samples(i).zdata{obj.s1Col}]);
0220                     maxVal = max([maxVal; ...
0221                         obj.s1.population.samples(i).zdata{obj.s1Col}]);  
0222                     minRaw = min([minRaw; ...
0223                         obj.s1.population.samples(i).data{obj.s1Col}]);
0224                     maxRaw = max([maxRaw; ...
0225                         obj.s1.population.samples(i).data{obj.s1Col}]);                      
0226                 end
0227             else
0228                 error('Unsupported pdSpread in kstest.  pdSpread must be "sample" or "allItems"');
0229             end
0230                 
0231             % add a smallVal to avoid boundary cases
0232             minVal = minVal - obj.smallVal;
0233             maxVal = maxVal + obj.smallVal;
0234             
0235             %now have min and max values, determine size of each bin:
0236             
0237             spread  = maxVal - minVal;            
0238             binSize = spread/obj.nbin;
0239             
0240             %smallVal added so that the last score is included (simulating
0241             %<= rather than <
0242             bins = (minVal+0.5*binSize):binSize:(maxVal-0.5*binSize)+obj.smallVal;
0243               
0244             % scale values to 0-1
0245             pd = hist(scores,bins)/length(scores);
0246                           
0247             pdScores = [];            
0248             for i=1:length(pd)
0249                for j=1:pd(i)*length(scores) 
0250                    pdScores = [pdScores; bins(i)]; %#ok<AGROW>
0251                end
0252             end
0253            
0254             
0255             yScores = cdf('Uniform',bins,min(bins),max(bins));
0256             
0257             
0258            [h,prob,stats,cutoff] = kstest(pdScores, [bins; yScores]');     %#ok<ASGLU,NASGU>
0259 
0260     %       confirms that a uniform distribution fails to reject the null
0261     %       [h,prob,ksstat,cutoff] = kstest(bins, [bins; yScores]');
0262             
0263             
0264             % calculate entropy
0265             ent = pd*log((pd+1)')/length(pd);     
0266             ent = -1* (ent - 1/length(pd)*log(1/length(pd)+1)) / ...
0267                 (1*log(2)/length(pd) - 1/length(pd)*log(1/length(pd)+1));
0268       
0269              
0270             userHypothesis = obj.desiredpvalConditionHandle(...
0271                                   prob,obj.desiredpval);
0272              
0273              label = [obj.s1.name, '{',obj.s1ColName, '}-', ...
0274                     '{Uniform}'];
0275              
0276              if (isnan(userHypothesis))
0277                  printHyp = 'N/A';
0278              elseif userHypothesis == 1
0279                  printHyp = 'PASS';
0280              elseif userHypothesis == 0
0281                  printHyp = 'FAIL';
0282              end
0283                  
0284              
0285              if (strcmp(reportStyle,'short'))
0286                 verbosePrint([' UserHyp: ', printHyp, '; ', label, ': ', ...
0287                      'ks[',obj.type,'](',num2str(length(obj.s1.zdata{obj.s1Col})),') = ', ...
0288                      num2str(stats), ', p = ', num2str(prob), ...
0289                      ' p-des: ',num2str(obj.desiredpval)], ...
0290                      'soskstest_runMatchUniformkstest');
0291              elseif (strcmp(reportStyle,'full'))
0292                  verbosePrint([' UserHyp: ', printHyp , ...
0293                      '; ', label, ': ', ...
0294                      'ks[',obj.type,'](',num2str(length(obj.s1.zdata{obj.s1Col})),') = ', ...
0295                      num2str(stats), ', p = ', num2str(prob), ...  
0296                      ' p-des: ',num2str(obj.desiredpval), ...
0297                      ' ent = ', num2str(ent),', targmin = ',num2str(minRaw), ...
0298                      ', targmax = ',num2str(maxRaw)], ...
0299                      'soskstest_runMatchUniformkstest');
0300              end                            
0301         end 
0302 
0303         
0304  
0305         
0306         %% constructkstest(sosObj,varargin) METHOD
0307         function constructkstest(obj,sosObj,varargin)
0308             %initialize a kstest object.
0309             %
0310             % PARAMETERS:
0311             % Required:
0312             %   sosObj - sos Object test is to be associated with
0313             %   sample1 - a sample object
0314             %   s1ColName - name of data column in sample1
0315             %
0316             % Optional:
0317             %   desiredpvalCondition/string - desired condition for the ttest
0318             %       pval. Either it should exceed (=>) some value, be less
0319             %       than '<=' some condition, or be 'N/A' if there is no
0320             %       desired condition.  Default is N/A.  Note that the
0321             %       ordering of '=' and '<' is important, so though '<=' is
0322             %       valid, '=<' is not.
0323             %   desiredpval - desired p-value to evaluate the condition
0324             %       against.  Defaults to 0.05
0325             %   tail - tail of test - left/right/both
0326             %   pdSpread -'sample' or 'allItems', as in the entropy
0327             %   constraint
0328             %   nbins - number of bins (min2, defaults to #items)
0329             %
0330             % RETURNS:
0331             %   Configured sosCorrelTest object.
0332             
0333 
0334             p = inputParser;
0335 
0336             p.addRequired('sosObj', ...
0337                 @(sosObj)strcmp(class(sosObj),'sos'));
0338             p.addParamValue('type','null', ...
0339                 @(type)soskstest.validTestType(type));
0340             p.addParamValue('sample1','null', ...
0341                         @(sample1)strcmp(class(sample1),'sample'));
0342             p.addParamValue('s1ColName',NaN, ...
0343                 @(s1ColName)ischar(s1ColName));
0344             p.addParamValue('desiredpvalCondition','N/A', ...
0345                 @(desiredpvalCondition)any([strcmp(desiredpvalCondition,'<='), ...
0346                             strcmp(desiredpvalCondition,'=>'), ...
0347                             strcmp(desiredpvalCondition,'N/A')]));
0348             p.addParamValue('desiredpval', 0.05, ...
0349                 @(desiredpval)validateattributes(desiredpval, ...
0350                     {'numeric'}, ...
0351                     {'scalar', 'positive', '>=', 0, '<=', 1}));
0352             p.addParamValue('tail','both', ...
0353                 @(tail)any([strcmp(tail,'both'), strcmp(tail,'left'),strcmp(tail,'right')]));
0354             p.addParamValue('pdSpread','null', ...
0355                 @(pdSpread)any([strcmp(pdSpread,'sample'), strcmp(pdSpread,'allItems')]));            
0356             p.addParamValue('name','noname',@(name)ischar(name)); % dealt with in the main constructor
0357             p.addParamValue('nbin',2,@(nbin)validateattributes(nbin, {'numeric'}, ...
0358                 {'scalar', 'integer', 'positive', '>', 0}));
0359             p.parse(sosObj,varargin{:});
0360 
0361            
0362             
0363 
0364             sample1 = p.Results.sample1;
0365             s1ColName = p.Results.s1ColName; %#ok<PROP>
0366             obj.desiredpvalCondition = p.Results.desiredpvalCondition;
0367             obj.pdSpread = p.Results.pdSpread;
0368             
0369             
0370             if any(strcmp(p.UsingDefaults,'nbin'))
0371                 obj.nbin = p.Results.sample1.n;
0372             else
0373                 if p.Results.nbin <= p.Results.sample1.n
0374                     obj.nbin = p.Results.nbin;
0375                 else
0376                     error('Number of bins must be <= number of observations in the sample');
0377                 end
0378             end
0379             
0380             if obj.nbin < 2
0381                 error ('There must be at least 2 bins in the entropy calculation');
0382             end
0383             
0384             
0385             if strcmp(obj.desiredpvalCondition,'N/A')
0386                 obj.desiredpvalConditionHandle = @sosksest.returnNaN;
0387             elseif strcmp(obj.desiredpvalCondition,'<=')
0388                 obj.desiredpvalConditionHandle = @le;
0389             elseif strcmp(obj.desiredpvalCondition,'=>')
0390                 obj.desiredpvalConditionHandle = @ge;
0391             end
0392 
0393             % can't have a desiredpval without a desiredpvalcondition
0394             if strcmp(obj.desiredpvalCondition,'N/A')
0395                 obj.desiredpval = NaN;
0396             else
0397                 obj.desiredpval = p.Results.desiredpval;
0398             end
0399 
0400 
0401             % perform additional checks on the objects
0402             
0403             present1 = sosObj.containsSample(sample1);
0404             if (present1 == 0 )
0405                 error('sos sosObject does not contain sample1');
0406             end
0407 
0408             col1 = sample1.colName2colNum(s1ColName); %#ok<PROP>
0409             if(col1 == -1)
0410                 error('<s1ColName> not a column of data in <sample1>');
0411             end
0412             
0413             if isempty(sample1.data)
0414                 error('sample 1 does not contain items - did you fill it yet?');
0415             end
0416  
0417 
0418             
0419             if sample1.n < 2
0420                 % see note in the calculation for why the following is the
0421                 % case
0422                 error('The ks requires at least 2 items in the sample');
0423             end
0424             
0425             
0426             obj.s1 = sample1;
0427             obj.s1ColName = s1ColName; %#ok<PROP>
0428             obj.s1Col = col1;
0429             obj.type = p.Results.type;
0430             obj.tail = p.Results.tail;
0431 
0432             obj.runSpecificTest = @obj.runkstest;
0433             
0434             obj.label = [obj.s1.name, '{',obj.s1ColName, '}-{Uniform}', ...
0435                      ...
0436                     ':ks[',obj.type,']'];            
0437           
0438                 
0439         end
0440  
0441  
0442     end
0443     
0444      
0445     methods (Static)
0446         
0447         %% userHypothesis = returnNaN(~,~) STATIC FUNCTION
0448         function userHypothesis = returnNaN(~,~)
0449             % returns NaN
0450             userHypothesis = NaN;
0451         end
0452 
0453         %%  flag = validTestType(str) STATIC FUNCTION
0454         function flag = validTestType(str)
0455             % returns 1 if the name of type of test is 'matchUniform'; error otherwise.
0456             
0457             flag = 0; %#ok<NASGU>
0458             
0459             if(ischar(str) == false)
0460                 error('<Type> must be "matchUniform"');
0461             end
0462             
0463             if (strcmp(str,'matchUniform'))
0464                 flag = 1;   
0465             else
0466                 error('<Type> must be "matchUniform"');
0467             end
0468             
0469         end
0470     end
0471     
0472 end
0473
soskstest

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE