Description of dataFrame

0001 % - dataframe object
0002 %
0003 % copyright 2009-2012 Blair Armstrong, Christine Watson, David Plaut
0004 %
0005 %    This file is part of SOS
0006 %
0007 %    SOS is free software: you can redistribute it and/or modify
0008 %    it for academic and non-commercial purposes
0009 %    under the terms of the GNU General Public License as published by
0010 %    the Free Software Foundation, either version 3 of the License, or
0011 %    (at your option) any later version.  For commercial or for-profit
0012 %    uses, please contact the authors (sos@cnbc.cmu.edu).
0013 %
0014 %    SOS is distributed in the hope that it will be useful,
0015 %    but WITHOUT ANY WARRANTY; without even the implied warranty of
0016 %    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0017 %    GNU General Public License for more details.
0018 
0019 %    You should have received a copy of the GNU General Public License
0020 %    along with SOS (see COPYING.txt).
0021 %    If not, see <http://www.gnu.org/licenses/>.
0022 
0023 
0024 classdef dataFrame <  handle
0025     %% Creates a dataframe object.  Parent of population and sample.
0026     %
0027     % NOTE: Objects of this class are generally not generated directly; rather,
0028     % children such as population and sample generate objects which inherit
0029     % the characteristics of dataFrame objects.
0030     %
0031     % dataframe objects are used to store and manipulate sample and
0032     % population data, and other general information about samples and
0033     % populations such as their source files, where to save the dataframe,
0034     % and so on.
0035     %
0036     % To allow for reference- as opposed to value- based passing of
0037     % dataFrame objects, this class inherits from handle.
0038     %
0039     %PROPERTIES
0040     %    fileName - the name of the source file to be read in (if applicable)
0041     %    data - the actual data (a cell array of cell arrays/arrays)
0042     %    isHeader - logical indicating whether src has a header line (recommended!)
0043     %    header - the header info, each col stored in a cell array
0044     %    isFormatting - logical indicating whether src has formatting information in the header (recommended!)
0045     %    format - the format info, each col stored in a cell array
0046     %    sosObj - the SOS object currently associated with the dataframe
0047     %    zdata - the normalized data
0048     %    outFile - target output file name
0049     %
0050     %PROPERTIES (Constant)
0051     %   supportedFormats - cell array of supported data formats
0052     %
0053     %METHODS
0054     %   dataFrame(varargin) - Constructor - see its doc for args
0055     %   item = popItem(obj,itemIndex) - pops and returns the item from {itemIndex}
0056     %   item = appendItem(obj,item) - appends {item} to dataFrame
0057     %   colNum = colName2colNum(colName) - finds the column number (index) of the named column
0058     %   writeData() % writes the header and data from the dataframe to obj.outFile
0059     %
0060     %METHODS (STATIC)
0061     %   a = aContainsb(a,b)  - ensures that dataframe obj 'a' contains every column (header/format entry) that 'b' has.
0062     %   [a,b] = mergeHeaders(a,b) -  merges the headers from dataFrames a and b
0063     %   [a,b] = aContainsbData(a,b) - ensures that data from dataframe obj 'a' contains the same data as 'b'.
0064     %   p = dataFrameInputParser() - returns an input parser for the dataFrame constructor args
0065     %   [data,header,format] = readDataFrameData(fileName,isHeader,isFormatting) - reads the data for a dataframe from a file
0066     %   percent = overlap(df1,df2) % calculates the percent of overlapping items out of the total number of items in df1+df2.
0067     %
0068     %METHODS (STATIC,Access = private)
0069     %   p = parseConstructorArgs(varargin) - parses the dataFrame constructor arguments
0070         
0071     %% PROPERTIES
0072     properties   
0073         fileName % name of the source file to be read in; the string 'null' means no file is read
0074         data % the actual data: a cell array with one entry (cell array or numeric array) per column
0075         isHeader % logical indicating whether the source file has a header line (recommended!)
0076         header % the column names, each col stored in a cell array
0077         isFormatting % logical indicating whether the header line carries 'name|format' information (recommended!)
0078         format % the per-column format ('%s' or '%f'), each col stored in a cell array
0079         sosObj % the SOS object currently associated with the dataframe
0080         zdata % the normalized data
0081         outFile % name of the file that writeData() writes to
0082     end %properties
0083     
0084     %%Properties (Constant)
0085     properties (Constant)
0086         supportedFormats = {'s' 'f'}; % format codes accepted after the '|' of a header field: s = string, f = float
0087     end
0088         
0089         
0090     
0091     methods
0092         
0093         %% dataframe CONSTRUCTOR
0094         function obj = dataFrame(varargin)
0095             % Constructor - creates a dataFrame, optionally loading a file
0096             %
0097             % CALL:
0098             % dataFrame(['fileName',<string>, 'isHeader',<logical>, ...
0099             %            'isFormatting',<logical>, 'outFile',<string>])
0100             %
0101             % SYNOPSIS:
0102             % Stores the parsed param/value pairs on the new object and,
0103             % unless fileName is the string 'null', reads the data, header
0104             % and format from that file via readDataFrameData.
0105             %
0106             % PARAMETERS (all optional param/value pairs):
0107             %   fileName/string - name of the file where data for the
0108             %       dataFrame is stored; 'null' means no file is read.
0109             %   isHeader/logical - whether the source file has a header.
0110             %   isFormatting/logical - whether the source file's header
0111             %       carries formatting information.
0112             %   outFile/string - name (inc. path, if other than pwd is
0113             %       desired) of the file writeData() saves to.  Not
0114             %       validated until write.
0115             %
0116             % NOTE(review): defaults are set by parseConstructorArgs (not
0117             % shown here); dataFrameInputParser() uses 'null', false,
0118             % false and NaN respectively - confirm both agree.
0119             %
0120             % EXAMPLE:
0121             %   d = dataFrame(); % creates an empty dataframe
0122
0123             parsed = dataFrame.parseConstructorArgs(varargin);
0124             opts = parsed.Results;
0125             obj.header = {}; % column metadata starts out empty
0126             obj.format = {};
0127             obj.outFile = opts.outFile;
0128             obj.isFormatting = opts.isFormatting;
0129             obj.isHeader = opts.isHeader;
0130             obj.fileName = opts.fileName;
0131
0132             % the string 'null' is the "no input file" sentinel
0133             if ~strcmp(obj.fileName,'null')
0134                 [obj.data, obj.header, obj.format] = dataFrame.readDataFrameData( ...
0135                     obj.fileName,obj.isHeader,obj.isFormatting);
0136             end
0137         end % dataFrame
0138 
0139     
0140         %% item = popItem(itemIndex) METHOD
0141         function item = popItem(obj,itemIndex)
0142             % Removes row {itemIndex} from every column and returns it
0143             %
0144             %CALL:
0145             % item =  <dataFrameObj>.popItem(itemIndex)
0146             %
0147             %SYNOPSIS:
0148             % Copies element {itemIndex} of each column in obj.data into a
0149             % 1-by-nCols cell array (the return value) and deletes it from
0150             % that column.  The class derives from handle, so obj itself
0151             % is modified.
0152             %
0153             %PARAMETERS:
0154             %   itemIndex - the row index of the to-be-popped item
0155
0156             parser = inputParser;
0157             parser.addRequired('obj');
0158             parser.addRequired('itemIndex', ...
0159                 @(idx)validateattributes(idx, {'numeric'}, ...
0160                 {'scalar', 'integer', 'positive','>' 0}));
0161             parser.parse(obj,itemIndex);
0162
0163             % reject an empty frame or an index past the first column's end
0164             if isempty(obj.data)
0165                 error('No data in sample object - cannot pop item');
0166             elseif itemIndex > length(obj.data{1})
0167                 error('{itemIndex} exceeds row range of data array');
0168             end
0169             nCols = length(obj.data);
0170             item = cell(1,nCols);
0171             for col = 1:nCols
0172                 item{col} = obj.data{col}(itemIndex);
0173                 obj.data{col}(itemIndex) = [];
0174             end
0175         end % popItem
0176         
0177         %% item = appendItem(obj,item) METHOD
0178         function item = appendItem(obj,item)
0179             % appends {item} as a new last row of the dataFrame
0180             %
0181             % Warning!!!  Method does not confirm that the column order of
0182             % {item} matches that of the stored data.  SOS only inserts
0183             % items that belong to the corresponding population, which
0184             % should be in sync; manual callers have no such guarantee.
0185             %
0186             %CALL:
0187             % <dataFrameObj>.appendItem(item)
0188             %
0189             %PARAMETERS:
0190             %   item - cell array with one entry per column (e.g. the value
0191             %       returned by popItem); entry i is added to column i
0192             %
0193             %RETURNS:
0194             %   item - the input, unchanged
0195
0196             if isempty(obj.data)
0197                 obj.data = item; % first row: the item becomes the column set
0198                 return;
0199             end
0200             % Loop over the data columns, not the header: the header may
0201             % be empty or shorter than the data, and every column has to
0202             % grow by one row for the data to stay rectangular.
0203             for i=1:length(obj.data)
0204                 obj.data{i} = vertcat(obj.data{i},item{i}(1));
0205             end
0206         end    
0207   
0208         %% colName2colNum
0209         function colNum = colName2colNum(obj, colName)
0210             % Looks up the index of the column whose header equals colName
0211             %
0212             %CALL:
0213             % colNum = <dataFrameObj>.colName2colNum(colName);
0214             %
0215             %PARAMETERS:
0216             % colName - name of column
0217             %RETURNS:
0218             % index of the first matching header entry, -1 if not found
0219             colNum = -1;
0220             col = 1;
0221             while col <= length(obj.header)
0222                 if strcmp(obj.header{col},colName)
0223                     colNum = col;
0224                     return;
0225                 end
0226                 col = col + 1;
0227             end
0228         end
0229         
0230         
0231         %% writeData() METHOD
0232         function writeData(obj)
0233             %writes the header and data from the dataframe to obj.outFile
0234             %
0235             %SYNOPSIS:
0236             % The first line holds one tab-terminated 'name|format' field
0237             % per column (format without its '%').  Each following line is
0238             % one row: '%s' columns are written as text, '%f' columns as
0239             % integers when the value is whole and via num2str otherwise.
0240             % Lines end in \r\n.
0241             %
0242             %ERRORS:
0243             % 'Could not open file: ...' when obj.outFile cannot be opened;
0244             % 'Error while writing to file: ...' when anything fails while
0245             % writing, e.g. a column format other than '%s' / '%f'.
0246
0247             % fopen returns -1 when the file cannot be opened; it only
0248             % throws when outFile is not a usable name at all (e.g. NaN)
0249             try
0250                 fid = fopen(obj.outFile,'w');
0251             catch %#ok<CTCH>
0252                 fid = -1;
0253             end
0254             if fid == -1
0255                 error('Could not open file: %s', obj.outFile);
0256             end
0257
0258             try
0259                 %write the header
0260                 for i=1:length(obj.header)
0261                     fprintf(fid,'%s|%s\t',char(obj.header{i}), ...
0262                            regexprep(char(obj.format{i}),'%',''));
0263                 end
0264                 fprintf(fid,'\r\n');
0265                 %write the rows; the first column sets the row count
0266                 if ~isempty(obj.data) && ~isempty(obj.data{1})
0267                     for i=1:length(obj.data{1})
0268                         for j=1:length(obj.data)
0269                             if strcmp(obj.format{j},'%s')
0270                                 fprintf(fid,'%s\t',char(obj.data{j}(i)));
0271                             elseif strcmp(obj.format{j},'%f')
0272                                 value = obj.data{j}(i);
0273                                 if int32(value) == value
0274                                     fprintf(fid,'%d\t',value); % whole number
0275                                 else
0276                                     fprintf(fid,'%s\t',num2str(value));
0277                                 end
0278                             else
0279                                 error('Unrecognized column format');
0280                             end
0281                         end
0282                         fprintf(fid,'\r\n');
0283                     end
0284                 end
0285             catch exception
0286                 fclose(fid); % do not leak the file on failure
0287                 disp(exception);
0288                 error(['Error while writing to file: ', obj.outFile]);
0289             end
0290             fclose(fid);
0291             verbosePrint(['Data written to file: ', obj.outFile], ...
0292                 'dataFrame_writeData_done');
0293         end %writeData()
0294     end
0295     
0296     
0297     
0298     methods (Static)
0299         
0300         %% a = aContainsb(a,b) STATIC METHOD
0301         function a = aContainsb(a,b)
0302             %ensures that dataframe 'a' has every column that 'b' has.
0303             %
0304             %CALL:
0305             % dataFrame.aContainsb(a,b) % where a and b are dataframe objects
0306             %
0307             %SYNOPSIS:
0308             %Every header entry of 'b' that has no equal entry in 'a' is
0309             %appended to a.header together with its format.  If 'a'
0310             %already holds data, a filler column with as many rows as
0311             %a.data{1} is appended too: nullArray(...) for '%s' columns,
0312             %NaNArray(...) for '%f' columns.  Returns the updated a.
0313             %
0314             %PARAMETERS:
0315             %   a - a dataframe object (receives the missing columns)
0316             %   b - a dataframe object (only read)
0317             %
0318             %EXAMPLE:
0319             %   dataFrame.aContainsb(a,b) % where a and b are dataframe objects
0320
0321             for bCol = 1:length(b.header)
0322                 % look for b's column name among the headers of a
0323                 found = false;
0324                 aCol = 1;
0325                 while ~found && aCol <= length(a.header)
0326                     if strcmp(b.header{bCol},a.header{aCol})
0327                         found = true;
0328                     end
0329                     aCol = aCol + 1;
0330                 end
0331                 if found
0332                     continue; % nothing to add for this column
0333                 end
0334
0335                 % new column: metadata first, filler data afterwards
0336                 a.header = [a.header b.header(bCol)];
0337                 a.format = [a.format b.format(bCol)];
0338                 if isempty(a.data)
0339                     continue; % no rows yet, so no filler is needed
0340                 end
0341                 nRows = length(a.data{1});
0342                 newFormat = a.format{length(a.format)};
0343                 if strcmp(newFormat,'%s')
0344                     filler = nullArray(nRows);
0345                 elseif strcmp(newFormat,'%f')
0346                     filler = NaNArray(nRows);
0347                 else
0348                     error('Unable to fill in empty column because format is invalid');
0349                 end
0350                 a.data = [a.data {filler}];
0351             end
0352         end
0353                
0354                 
0355         %% [a,b] = mergeHeaders(a,b) STATIC METHOD
0356         function [a,b] = mergeHeaders(a,b)
0357             % gives dataFrames a and b the union of their columns
0358             %
0359             % CALL:
0360             %   [a, b] = dataFrame.mergeHeaders(a,b)
0361             %
0362             %PARAMETERS:
0363             %   a - a dataFrame object
0364             %   b - a dataFrame object
0365             %
0366             %RETURNS:
0367             % [a, b], each extended (via aContainsb) by the columns it lacked
0368             %
0369             %Example:
0370             %   [sample,population] = ...
0371             %       dataFrame.mergeHeaders(sample,population);
0372 
0373             %add to a every column that only b has
0374             a = dataFrame.aContainsb(a,b);
0375             %add to b every column that only a has
0376             b = dataFrame.aContainsb(b,a);    
0377 
0378         end        
0379         
0380         
0381         %% [a,b] = aContainsbData(a,b) STATIC METHOD
0382         function [a,b] = aContainsbData(a,b)
0383             % appends the data of dataframe 'b' to the columns of 'a'
0384             %
0385             %CALL:
0386             % dataFrame.aContainsbData(a,b) % where a and b are dataframe objects
0387             %
0388             %SYNOPSIS:
0389             % For every column named in a.header, the rows of the column
0390             % of 'b' with the same header are appended to a's column.
0391             % Columns of 'a' that 'b' lacks receive as many filler rows as
0392             % b has rows ('%s' -> nullArray, '%f' -> NaNArray), so every
0393             % column of 'a' grows by the same amount.  Columns that exist
0394             % only in 'b' are ignored.  Nothing happens when 'b' holds no
0395             % data.  Currently used as part of the normalization function
0396             % to create a dataframe with all of the data in the SOS
0397             % object.
0398             %
0399             %PARAMETERS:
0400             %   a - a dataframe object (its data is extended)
0401             %   b - a dataframe object (only read)
0402             %
0403             %RETURNS:
0404             %   [a,b] - the same two objects
0405             %
0406             %NOTE(review): assumes nullArray(n) / NaNArray(n) return an
0407             % n-row column, as their use in aContainsb suggests - confirm.
0408             %
0409             %EXAMPLE:
0410             %   dataFrame.aContainsbData(a,b) % where a and b are dataframe
0411             %   objects
0412
0413             if isempty(a.data)
0414                 a.data = {};
0415             end
0416
0417             if isempty(b.data)
0418                 return; % only need to merge if b contains data
0419             end
0420
0421             nRows = length(b.data{1});
0422             for i=1:length(a.header)
0423                 % locate the column of b that carries the same name
0424                 index = -1;
0425                 for j=1:length(b.header)
0426                     if strcmp(b.header{j},a.header{i})
0427                         index = j;
0428                         break;
0429                     end
0430                 end
0431
0432                 if index == -1
0433                     % b knows nothing about this column: build one
0434                     % filler row per row of b (nRows, not length(nRows))
0435                     if strcmp(a.format{i},'%s')
0436                         newRows = nullArray(nRows);
0437                     elseif strcmp(a.format{i},'%f')
0438                         newRows = NaNArray(nRows);
0439                     else
0440                         error('Merging only supported for data types %s and %f');
0441                     end
0442                 else
0443                     newRows = b.data{index};
0444                 end
0445
0446                 % Append to an existing column, or start the column.  The
0447                 % rows are stored directly (not wrapped in another cell)
0448                 % so the column has the same layout as any other column.
0449                 if length(a.data) >= i
0450                     a.data{i} = vertcat(a.data{i},newRows);
0451                 else
0452                     a.data{i} = newRows;
0453                 end
0454             end
0455         end        
0456 
0457         
0458         
0459         %% p = dataFrameInputParser() STATIC METHOD
0460         function p = dataFrameInputParser()
0461             % builds the inputParser used for the constructor arguments
0462             %
0463             %CALL:
0464             % p = dataFrame.dataFrameInputParser()
0465             %
0466             %SYNOPSIS:
0467             % Returns an inputParser with four optional param/value pairs:
0468             %   'fileName'     - default 'null' (the "no file" marker),
0469             %                    checked by validFileNameOrNull
0470             %   'isHeader'     - default false, checked by validLogical
0471             %   'isFormatting' - default false, checked by validLogical
0472             %   'outFile'      - default NaN (matlab doesn't support a
0473             %                    standard NULL), not validated here
0474             %
0475             %EXAMPLE:
0476             % p = dataFrame.dataFrameInputParser()
0477             checkName = @(value)validFileNameOrNull(value);
0478             checkFlag = @(value)validLogical(value);
0479             p = inputParser;
0480             p.addParamValue('fileName','null',checkName);
0481             p.addParamValue('isHeader',false,checkFlag);
0482             p.addParamValue('isFormatting',false,checkFlag);
0483             p.addParamValue('outFile',NaN);
0484         end
0485             
0486  
0487         %% [data,header,format] = readDataFrameData(fileName,isHeader,isFormatting)  STATIC METHOD
0488         function [data,header,format] = ...
0489                         readDataFrameData(fileName,isHeader,isFormatting)
0490             % reads the data for a dataframe from a tab-delimited file
0491             %
0492             %CALL:
0493             %   [data,header,format] = ...
0494             %       dataFrame.readDataFrameData(fileName,isHeader,isFormatting)
0495             %
0496             %SYNOPSIS:
0497             %Reads the data for a dataframe from a file.  Column names and
0498             %formats come from the first line when the caller says they
0499             %are there (whitespace-separated 'name|format' fields).
0500             %Otherwise they are generated: columns are named v1, v2, ...
0501             %and a column is '%f' if its value on the line following the
0502             %first line converts to a number, '%s' if not.  This guessing
0503             %is quite basic, so it is strongly recommended that the file
0504             %contain header and formatting information.
0505             %
0506             %PARAMETERS:
0507             %   fileName - string containing the location of the file to be read in
0508             %   isHeader - logical indicating if the file has header info
0509             %   isFormatting - logical indicating if the file has formatting info
0510             %
0511             %RETURNS:
0512             %   data - cell array with one entry per column (from textscan)
0513             %   header - header, either from file or automatically generated
0514             %   format - format, either from file or automatically generated
0515             %
0516             %ERRORS (MException identifiers):
0517             %   IOError:InvalidFile - file cannot be opened or is empty
0518             %   HeaderError:MissingFormat - a header field lacks '|format'
0519             %   HeaderError:InvalidFormat - format not in supportedFormats
0520             %   FormatError:NoData - formats must be guessed but there is
0521             %       no data row to guess from
0522             %   FormatError:IncorrectFormat - textscan failed on the data
0523             %   A plain error is raised for isFormatting without isHeader.
0524             %
0525             %EXAMPLE:
0526             %[data,header,format] = dataFrame.readDataFrameData('p1.txt',true,true)
0527
0528             if isHeader == false && isFormatting == true
0529                 error('Formatting information cannot be included without header information');
0530             end
0531
0532             verbosePrint(['Reading data from file: ',fileName], ...
0533                 'dataFrame_readDataFrameData_reading'); 
0534
0535             % fopen reports most failures by returning -1 instead of
0536             % throwing, so the returned id has to be checked explicitly
0537             try
0538                 fid = fopen(fileName,'r');
0539             catch %#ok<CTCH>
0540                 fid = -1;
0541             end
0542             if fid == -1
0543                 throw(MException('IOError:InvalidFile','%s', ...
0544                     ['dataFrame: Error when opening: ', fileName]));
0545             end
0546             % close the file however this function is left (normal
0547             % return or any of the errors below)
0548             closeFile = onCleanup(@() fclose(fid)); %#ok<NASGU>
0549
0550             header = {};
0551             format = {};
0552
0553             % the first line is the header or, failing that, tells how
0554             % many columns there are
0555             firstLine = fgetl(fid);
0556             if ~ischar(firstLine) % fgetl returns -1 at end of file
0557                 throw(MException('IOError:InvalidFile','%s', ...
0558                     ['dataFrame: File is empty: ', fileName]));
0559             end
0560             firstFields = textscan(firstLine,'%s');
0561             firstFields = firstFields{1};
0562
0563             if(isHeader)
0564                 verbosePrint('  Reading user-specified header', ...
0565                         'dataFrame_readDataFrameData_HeaderPresent');
0566                 if(isFormatting)
0567                     verbosePrint('  Reading user-specified formatting', ...
0568                         'dataFrame_readDataFrameData_FormattingPresent');
0569                     for i=1:length(firstFields)
0570                         % split 'name|format' at the first '|'
0571                         parts = regexp(firstFields(i),'\|', 'split','once');
0572                         parts = parts{1};
0573                         if length(parts) < 2
0574                             throw(MException('HeaderError:MissingFormat', ...
0575                                 ['dataFrame: Format not specified for a ', ...
0576                                 'variable. This can also happen if you have ', ...
0577                                 'whitespace in your header / data']));
0578                         end
0579                         % name and format are kept as 1x1 cells
0580                         header{i} = parts(1); %#ok<AGROW>
0581                         format{i} = strcat('%',parts(2)); %#ok<AGROW>
0582                         % the code must equal a supported one exactly
0583                         if ~any(strcmp(char(parts(2)),dataFrame.supportedFormats))
0584                             throw(MException('HeaderError:InvalidFormat', ...
0585                                 'dataFrame: Variable format invalid'));
0586                         end
0587                     end
0588                 else %user-specified header, but no format information
0589                     for i=1:length(firstFields)
0590                         header{i} = firstFields(i); %#ok<AGROW>
0591                     end
0592                 end
0593             else % no header or formatting information specified
0594                 verbosePrint('  Automatically generating header', ...
0595                     'dataFrame_readDataFrameData_HeaderAbsent'); 
0596                 for i = 1:length(firstFields)
0597                     header{i} = strcat('v',num2str(i)); %#ok<AGROW>
0598                 end
0599             end % if there was a header
0600
0601             %if no formatting information supplied, must try to derive it
0602             if(isFormatting == false)
0603                verbosePrint('  Automatically generating format', ...
0604                    'dataFrame_readDataFrameData_FormattingAbsent');   
0605
0606                sampleLine = fgetl(fid);
0607                if ischar(sampleLine)
0608                    sampleFields = textscan(sampleLine,'%s');
0609                    sampleFields = sampleFields{1};
0610                elseif isHeader == false
0611                    % one-line file without header: that line is the
0612                    % only data row, so guess from it
0613                    sampleFields = firstFields;
0614                else
0615                    throw(MException('FormatError:NoData', ...
0616                        'dataFrame: Cannot derive column formats without a data row'));
0617                end
0618
0619                for i=1:length(sampleFields)
0620                    if isnan(str2double(sampleFields{i}))
0621                        format{i} = '%s'; %#ok<AGROW>
0622                    else
0623                        format{i} = '%f'; %#ok<AGROW>
0624                    end
0625                end
0626             end % isFormatting == false
0627
0628             %at last, we can read in the data.
0629             %we reset to the first line in the file; go to the second line
0630             %if there was header information.
0631             fseek(fid,0,'bof');
0632             if isHeader == true
0633                 fgetl(fid);
0634             end
0635
0636             % concatenate the per-column formats into one textscan format
0637             formatStr = '';
0638             for i=1:length(format)
0639                 formatStr = [formatStr char(format{i})]; %#ok<AGROW>
0640             end
0641
0642             try
0643                 %read in the data.  Note that if there is a header present
0644                 %but the user said that there was not, this next line may
0645                 %not work properly!
0646                 data = textscan(fid,formatStr,'Delimiter','\t');
0647             catch %#ok<CTCH>
0648                 throw(MException('FormatError:IncorrectFormat', ...
0649                     'dataFrame: Data does not conform to column format'));
0650             end
0651
0652             %A final row that stops early leaves the later columns one
0653             %entry short.  Pad them with the missing-value token of their
0654             %type: 'null' for string columns, NaN for numeric ones.
0655             nrow = length(data{1});
0656             for i=2:length(data)
0657                 if length(data{i}) < nrow
0658                     if iscell(data{i})
0659                         data{i} = vertcat(data{i},{'null'});
0660                     else
0661                         data{i} = vertcat(data{i},NaN);
0662                     end
0663                 end
0664             end
0665             verbosePrint('Done reading in data', ...
0666                     'dataFrame_readDataFrameData_DoneReadingData');  
0667         end  % readDataFrameData
0668         
0669         
0670         
0671         %% function percent = overlap(df1,df2) STATIC METHOD
0672         function percent = overlap(df1,df2)
0673             % calculates the percent of overlapping items out of the total
0674             % number of items in df1+df2.  The method will generate errors
0675             % if df1 and df2 do not contain items and if their
0676             % header/formatting information is non-identical (it should be
0677             % if they are both samples from the same optimization script).
0678             % This algorithm also assumes that every item within each
0679             % dataframe is unique.  It may produce incorrect results if
0680             % there are multiple copies of the same item within a
0681             % particular dataframe.
0682             %
0683             % PARAMETERS:
0684             %   df1 -first dataframe
0685             %   df2 -second dataframe
0686             %
0687             % RETURNS:
0688             %   percent - percent of shared items as a function of the
0689             %               total number of items
0690             
0691             %validate variables
0692             if any(strcmp(superclasses(df1),'dataFrame')) == 0
0693                 error('argument 1 is not a dataFrame');
0694             end
0695             
0696             if any(strcmp(superclasses(df2),'dataFrame')) == 0
0697                 error('argument 2 is not a dataFrame');
0698             end            
0699             
0700             
0701             % make sure that both dataFrames have at least one column
0702             
0703             if isempty(df1.header)
0704                 error([df1.name, ' does not contain any header information']);
0705             end
0706             
0707             if isempty(df2.header)
0708                 error([df2.name, ' does not contain any header information']);
0709             end            
0710             
0711             
0712             % make sure that each dataframe contains some data
0713             if isempty(df1.data)
0714                 error([df1.name, ' does not contain any data']);
0715             end
0716 
0717            if isempty(df2.data)
0718                 error([df2.name, ' does not contain any data']);
0719            end
0720            
0721            if isempty(df1.data{1})
0722                error([df1.name, ' does not contain any data']);
0723            end
0724            
0725            if isempty(df2.data{1})
0726                error([df2.name, ' does not contain any data']);
0727            end
0728            
0729            % make sure that both dataFrames have the same header
0730            % information
0731            
0732            if length(df1.header) ~= length(df2.header)
0733                error('dataFrames must contain the same number of column headers');
0734            end
0735            
0736            for i=1:length(df1.header)
0737                if(strcmp(df1.header(i),df2.header(i)))
0738                    error(['dataFrame header column: ', num2str(i), ' do not match']);
0739                end
0740                %check formatting as well
0741                if(strcmp(df1.format(i),df2.format(i)))
0742                    error(['dataFrame format for column: ', num2str(i), ' do not match']);
0743                end
0744            end
0745 
0746            %both dataFrames have at least some data in them and match in
0747            %all other respect.  Now they can
0748            %be compared to see how much overlap exists between them.
0749            
0750            total = length(df1.data{1}) + length(df2.data{1});
0751            overlap = 0;
0752            
0753            for i=1:length(df1.data{1})
0754                itemMatch = false;
0755                
0756                for j=1:length(df2.data{1})
0757                    rowMatch = true;
0758                    
0759                    for k=1:length(df1.header)
0760 
0761                        if strcmp(df1.format{k},'%s')
0762                            if strcmp(df1.data{k}(i),df2.data{k}(j)) == 0
0763                                rowMatch = false;
0764                            end
0765                        elseif strcmp(df1.format{k},'%f')
0766                            if df1.data{k}(i) ~= df2.data{k}(j)
0767                                rowMatch = false;
0768                            end
0769                        else
0770                            error(['Unrecognized column format: ',df1.format{k}{1}]);
0771                        end
0772              
0773                    end
0774                    
0775                    % if rowMatch is still true at this point, then we have
0776                    % an identical entry in both samples.  Cound that
0777                    
0778                    if rowMatch == true
0779                        itemMatch = true;
0780                        break;
0781                    end 
0782                end
0783                
0784                if itemMatch == true
0785                    overlap = overlap + 2;
0786                end
0787            end
0788            
0789            percent = overlap/total*100;
0790            
0791            verbosePrint(['Overlap between ',df1.name,' and ', df2.name, ': ',...
0792                         num2str(percent),'%'],'dataFrame_overlap_percent');
0793            
0794             
0795         end
0796         
0797         
0798    end
0799     
0800     methods (Static, Access = private)
0801 
0802         %% parseConstructorArgs PRIVATE STATIC METHOD
0803         function p = parseConstructorArgs(varargin)
0804             %parses the dataFrame constructor arguments
0805             %
0806             % CALL:
0807             % p = dataFrame.parseConstructorArgs(varargin);
0808             %
0809             %parses the arguments from the dataFrame constructor.  Default
0810             %values are substituted where appropriate.  Returns a struct
0811             %with the parsed args
0812             %
0813             %PARAMETERS:
0814             % SAME as dataFrame CONSTRUCTOR
0815             %
0816             %RETURNS:
0817             %    p - parsed constructor input
0818             
0819              varargin = varargin{1};
0820 
0821              p = dataFrame.dataFrameInputParser();
0822              p.parse(varargin{:});
0823         end % parseConstructorArgs
0824     end
0825     
0826 end %dataFrame
0827 
0828 
0829 
0830
dataFrame

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SUBFUNCTIONS

SOURCE CODE