0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024 classdef dataFrame < handle
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
properties
    fileName       % input file name from the constructor; the string 'null' means no file is read
    data           % cell array with one entry per column (filled by readDataFrameData via textscan)
    isHeader       % constructor flag: true when the first line of the file holds column names
    header         % cell array of column names, one entry per column
    isFormatting   % constructor flag: true when header tokens carry a '|<format>' suffix
    format         % cell array of per-column textscan formats ('%s' or '%f')
    sosObj         % NOTE(review): not referenced by any method visible in this class - confirm usage
    zdata          % NOTE(review): not referenced by any method visible in this class - confirm usage
    outFile        % output file name used by writeData (constructor default is NaN)
end
0083
0084
properties (Constant)
    % Format letters accepted after the '|' separator in a header token;
    % readDataFrameData prefixes them with '%' to build textscan formats.
    supportedFormats = {'s' 'f'};
end
0088
0089
0090
0091 methods
0092
0093
function obj = dataFrame(varargin)
    % DATAFRAME Construct a dataFrame, optionally loading it from a file.
    %
    %   The arguments are name/value pairs ('fileName', 'isHeader',
    %   'isFormatting', 'outFile') that are validated by
    %   dataFrame.parseConstructorArgs.
    %
    %   When fileName is anything other than the string 'null', the file
    %   is read right away through dataFrame.readDataFrameData and the
    %   data, header and format properties are taken from its contents.
    %   Otherwise header and format start out as empty cell arrays and
    %   data is left unset.

    parsed = dataFrame.parseConstructorArgs(varargin);
    opts = parsed.Results;

    obj.fileName     = opts.fileName;
    obj.isHeader     = opts.isHeader;
    obj.isFormatting = opts.isFormatting;
    obj.outFile      = opts.outFile;

    % start with no column metadata; a file read below overwrites both
    obj.header = {};
    obj.format = {};

    if ~strcmp(obj.fileName,'null')
        [obj.data, obj.header, obj.format] = dataFrame.readDataFrameData( ...
            obj.fileName, obj.isHeader, obj.isFormatting);
    end
end
0138
0139
0140
function item = popItem(obj,itemIndex)
    % POPITEM Remove one row from the data and return it.
    %
    %   item = popItem(obj,itemIndex)
    %
    %   itemIndex - positive integer scalar, the row to remove.
    %   item      - 1-by-nColumns cell array; item{k} is the element that
    %               was stored at row itemIndex of column k (a 1x1 cell
    %               for cell-array columns, a scalar for numeric columns).
    %
    %   The row is deleted from every column of obj.data.
    %
    %   Errors when obj.data is empty or when itemIndex is larger than the
    %   number of rows in the first column.

    p = inputParser;

    p.addRequired('obj');
    % 'positive' already rejects zero and negative values
    p.addRequired('itemIndex',@(itemIndex)validateattributes(itemIndex, {'numeric'}, ...
        {'scalar', 'integer', 'positive'}));
    p.parse(obj,itemIndex);

    if isempty(obj.data)
        error('No data in sample object - cannot pop item');
    end

    nRows = length(obj.data{1});
    if itemIndex > nRows
        % pass the values as arguments so they are actually interpolated
        error('itemIndex (%d) exceeds row range of data array (%d rows)', ...
            itemIndex, nRows);
    end

    nCols = length(obj.data);
    item = cell(1,nCols);

    for i=1:nCols
        % () indexing keeps the column's own container type
        item{i} = obj.data{i}(itemIndex);
        obj.data{i}(itemIndex) = [];
    end
end
0176
0177
function item = appendItem(obj,item)
    % APPENDITEM Append one row to the data.
    %
    %   item = appendItem(obj,item)
    %
    %   item - cell array with one entry per column, in the layout
    %          returned by popItem. Only the first element of each entry
    %          is appended.
    %
    %   When obj.data is empty, item becomes the data as-is. Otherwise
    %   item{k}(1) is concatenated to the bottom of column k.
    %   The input item is returned unchanged.
    %
    %   Errors when item has fewer entries than there are data columns.

    if isempty(obj.data)
        obj.data = item;
        return;
    end

    % Iterate over the data columns (as popItem does), not over the header:
    % a dataFrame built without a file has an empty header, which used to
    % make every append after the first a silent no-op.
    nCols = length(obj.data);

    if length(item) < nCols
        error('item has %d columns but the data array has %d', ...
            length(item), nCols);
    end

    for i=1:nCols
        obj.data{i} = vertcat(obj.data{i},item{i}(1));
    end
end
0207
0208
function colNum = colName2colNum(obj, colName)
    % COLNAME2COLNUM Look up the position of a column by name.
    %
    %   colNum = colName2colNum(obj, colName)
    %
    %   Returns the index of the first entry of obj.header that strcmp
    %   reports equal to colName, or -1 when no entry matches.

    colNum = -1;
    nCols = length(obj.header);

    k = 1;
    while k <= nCols
        if strcmp(obj.header{k},colName)
            colNum = k;
            return;
        end
        k = k + 1;
    end
end
0229
0230
0231
function writeData(obj)
    % WRITEDATA Write the header and data to obj.outFile as tab-delimited text.
    %
    %   Line 1 holds one 'name|format' token per column (the '%' is
    %   stripped from the format), each followed by a tab. Every data row
    %   follows on its own line, each value followed by a tab. Lines end
    %   with CR+LF.
    %
    %   '%s' columns are written as text. '%f' values are written with
    %   '%d' when they survive a round trip through int32, otherwise
    %   through num2str.
    %   NOTE(review): num2str with no precision argument rounds to a few
    %   significant digits, so non-integer values may lose precision on
    %   disk - confirm whether this is intended.
    %
    %   Errors when the file cannot be opened, when a column has a format
    %   other than '%s' / '%f', or when writing fails.

    if ~ischar(obj.outFile) || isempty(obj.outFile)
        % the constructor default for outFile is NaN
        error('Could not open file: outFile is not set to a file name');
    end

    % fopen signals failure by returning -1; it does not throw
    [fid, openMsg] = fopen(obj.outFile,'w');
    if fid == -1
        error('Could not open file: %s (%s)', obj.outFile, openMsg);
    end

    try
        % header line
        for i=1:length(obj.header)
            fprintf(fid,'%s|%s\t',char(obj.header{i}), ...
                regexprep(char(obj.format{i}),'%',''));
        end

        fprintf(fid,'\r\n');

        % data rows
        if ~isempty(obj.data) && ~isempty(obj.data{1})
            for i=1:length(obj.data{1})
                for j=1:length(obj.data)
                    if strcmp(obj.format{j},'%s')
                        fprintf(fid,'%s\t',char(obj.data{j}(i)));
                    elseif strcmp(obj.format{j},'%f')
                        value = obj.data{j}(i);
                        if int32(value) == value
                            % integral value: avoid a trailing '.000000'
                            fprintf(fid,'%d\t',value);
                        else
                            fprintf(fid,'%s\t',num2str(value));
                        end
                    else
                        error('Unrecognized column format');
                    end
                end
                fprintf(fid,'\r\n');
            end
        end

    catch exception
        % release the handle before reporting the failure
        fclose(fid);

        disp(exception);
        error(['Error while writing to file: ', obj.outFile]);
    end

    fclose(fid);

    verbosePrint(['Data written to file: ', obj.outFile], ...
        'dataFrame_writeData_done');
end
0294 end
0295
0296
0297
0298 methods (Static)
0299
0300
function a = aContainsb(a,b)
    % ACONTAINSB Extend a so that it has every column that b has.
    %
    %   a = aContainsb(a,b)
    %
    %   Each header of b that strcmp does not find among the headers of a
    %   is appended to a.header, together with its format. If a already
    %   holds data, a filler column as long as a.data{1} is appended too:
    %   nullArray(...) for '%s' columns and NaNArray(...) for '%f'
    %   columns (both helpers are defined outside this class).
    %
    %   Errors when a filler is needed for any other format.

    for bIdx = 1:length(b.header)
        found = false;

        for aIdx = 1:length(a.header)
            if strcmp(b.header{bIdx},a.header{aIdx})
                found = true;
                break;
            end
        end

        if found
            continue;
        end

        % column is new to a: register its name and format
        a.header = [a.header b.header(bIdx)];
        a.format = [a.format b.format(bIdx)];

        if isempty(a.data)
            % nothing to pad yet
            continue;
        end

        newFormat = a.format{end};

        if strcmp(newFormat,'%s')
            filler = nullArray(length(a.data{1}));
        elseif strcmp(newFormat,'%f')
            filler = NaNArray(length(a.data{1}));
        else
            error('Unable to fill in empty column because format is invalid');
        end

        a.data = [a.data {filler}];
    end
end
0353
0354
0355
function [a,b] = mergeHeaders(a,b)
    % MERGEHEADERS Give two dataFrames the same set of columns.
    %
    %   [a,b] = mergeHeaders(a,b)
    %
    %   Runs dataFrame.aContainsb in both directions: first a receives
    %   the columns it lacks from b, then b receives the columns it lacks
    %   from the (already extended) a. New columns are appended at the
    %   end, so the two objects may list the columns in different orders.

    a = dataFrame.aContainsb(a, b);   % a gains the b-only columns
    b = dataFrame.aContainsb(b, a);   % b gains the a-only columns
end
0379
0380
0381
function [a,b] = aContainsbData(a,b)
    % ACONTAINSBDATA Append the rows of b to a, matching columns by name.
    %
    %   [a,b] = aContainsbData(a,b)
    %
    %   For every column of a (in a.header order):
    %     - if b has a column with the same header, b's column is
    %       concatenated below a's column;
    %     - otherwise a filler with as many rows as b.data{1} is
    %       concatenated instead: nullArray(n) for '%s' columns and
    %       NaNArray(n) for '%f' columns (helpers defined outside this
    %       class; assumed to return n-by-1 columns - TODO confirm).
    %
    %   Nothing is appended when b has no data. b is returned unchanged.
    %
    %   Errors when a filler is needed for a format other than '%s'/'%f'.

    if isempty(a.data)
        a.data = {};
    end

    if isempty(b.data)
        return;
    end

    % number of rows contributed by b
    nNewRows = length(b.data{1});

    for i=1:length(a.header)
        % locate a's column i inside b
        index = -1;
        for j=1:length(b.header)
            if strcmp(b.header{j},a.header{i})
                index = j;
                break;
            end
        end

        if index == -1
            % b lacks this column: pad with one filler value per row of b
            % (the row count itself, not length() of the row count)
            if strcmp(a.format{i},'%s')
                newRows = nullArray(nNewRows);
            elseif strcmp(a.format{i},'%f')
                newRows = NaNArray(nNewRows);
            else
                error('Merging only supported for data types %s and %f');
            end
        else
            newRows = b.data{index};
        end

        if length(a.data) >= i
            a.data{i} = vertcat(a.data{i},newRows);
        else
            % first rows for this column; store the column itself rather
            % than a cell wrapped around it
            a.data{i} = newRows;
        end
    end
end
0456
0457
0458
0459
function p = dataFrameInputParser()
    % DATAFRAMEINPUTPARSER Build the inputParser for constructor arguments.
    %
    %   p = dataFrameInputParser()
    %
    %   The returned parser knows four name/value parameters:
    %     'fileName'     default 'null', checked by validFileNameOrNull
    %     'isHeader'     default false,  checked by validLogical
    %     'isFormatting' default false,  checked by validLogical
    %     'outFile'      default NaN,    not validated
    %   The two validators are defined outside this class.

    p = inputParser;

    addParamValue(p,'fileName','null',@(fileName)validFileNameOrNull(fileName));
    addParamValue(p,'isHeader',false,@(isHeader)validLogical(isHeader));
    addParamValue(p,'isFormatting',false,@(isFormatting)validLogical(isFormatting));
    addParamValue(p,'outFile',NaN);
end
0485
0486
0487
function [data,header,format] = ...
        readDataFrameData(fileName,isHeader,isFormatting)
    % READDATAFRAMEDATA Read a tab-delimited file into column cell arrays.
    %
    %   [data,header,format] = readDataFrameData(fileName,isHeader,isFormatting)
    %
    %   fileName     - file to read.
    %   isHeader     - true when line 1 holds the column names.
    %   isFormatting - true when each header token is 'name|f' or
    %                  'name|s'; requires isHeader.
    %
    %   data   - cell array returned by textscan, one entry per column.
    %   header - 1-by-n cell array of column names. Names read from the
    %            file are stored as 1x1 cells; generated names
    %            ('v1','v2',...) are char arrays.
    %   format - 1-by-n cell array of textscan formats ('%s' or '%f').
    %
    %   Header tokens are split on whitespace. Without formatting
    %   information the format of each column is inferred from the first
    %   data line: tokens that str2double can convert become '%f', all
    %   others '%s'. The first data line is line 2 when a header is
    %   present and line 1 otherwise.
    %
    %   Columns shorter than the first column are padded with one NaN.
    %
    %   Throws IOError:InvalidFile, IOError:EmptyFile,
    %   HeaderError:MissingFormat, HeaderError:InvalidFormat,
    %   FormatError:NoData or FormatError:IncorrectFormat.

    if isHeader == false && isFormatting == true
        error('Formatting information cannot be included without header information');
    end

    verbosePrint(['Reading data from file: ',fileName], ...
        'dataFrame_readDataFrameData_reading');

    % fopen signals failure by returning -1; it does not throw
    [fid, openMsg] = fopen(fileName,'r');
    if fid == -1
        throw(MException('IOError:InvalidFile', ...
            'dataFrame: Error when opening: %s (%s)', fileName, openMsg));
    end
    % closes the file on every exit path, including thrown exceptions
    closeFile = onCleanup(@() fclose(fid)); %#ok<NASGU>

    header = {};
    format = {};

    firstLine = fgetl(fid);
    if ~ischar(firstLine) || isempty(strtrim(firstLine))
        throw(MException('IOError:EmptyFile', ...
            'dataFrame: First line is missing or blank in: %s', fileName));
    end
    firstTokens = textscan(firstLine,'%s');
    firstTokens = firstTokens{1};

    % ---- header (and optional user-specified formats) ----
    if isHeader
        verbosePrint(' Reading user-specified header', ...
            'dataFrame_readDataFrameData_HeaderPresent');

        if isFormatting
            verbosePrint(' Reading user-specified formatting', ...
                'dataFrame_readDataFrameData_FormattingPresent');

            for i=1:length(firstTokens)
                % split 'name|fmt' at the first '|'
                parseFormat = regexp(firstTokens{i},'\|','split','once');

                if length(parseFormat) < 2
                    throw(MException('HeaderError:MissingFormat', ...
                        ['dataFrame: Format not specified for a ', ...
                        'variable. This can also happen if you have ', ...
                        'whitespace in your header / data']));
                end

                % string comparison; == on char arrays of different
                % lengths would raise a dimension error instead
                if ~any(strcmp(parseFormat{2},dataFrame.supportedFormats))
                    throw(MException('HeaderError:InvalidFormat', ...
                        'dataFrame: Variable format invalid'));
                end

                % kept as 1x1 cells, as other methods expect
                header{i} = parseFormat(1); %#ok<AGROW>
                format{i} = strcat('%',parseFormat(2)); %#ok<AGROW>
            end
        else
            for i=1:length(firstTokens)
                header{i} = firstTokens(i); %#ok<AGROW>
            end
        end
    else
        verbosePrint(' Automatically generating header', ...
            'dataFrame_readDataFrameData_HeaderAbsent');

        for i = 1:length(firstTokens)
            header{i} = strcat('v',num2str(i)); %#ok<AGROW>
        end
    end

    % ---- infer formats when the file does not state them ----
    if isFormatting == false
        verbosePrint(' Automatically generating format', ...
            'dataFrame_readDataFrameData_FormattingAbsent');

        if isHeader
            sampleLine = fgetl(fid);
            if ~ischar(sampleLine)
                throw(MException('FormatError:NoData', ...
                    'dataFrame: Cannot infer column formats, no data rows in: %s', ...
                    fileName));
            end
            sampleTokens = textscan(sampleLine,'%s');
            sampleTokens = sampleTokens{1};
        else
            % without a header, line 1 already is the first data line
            sampleTokens = firstTokens;
        end

        for i=1:length(sampleTokens)
            if isnan(str2double(sampleTokens{i}))
                format{i} = '%s'; %#ok<AGROW>
            else
                format{i} = '%f'; %#ok<AGROW>
            end
        end
    end

    % ---- read the data from the top of the file ----
    fseek(fid,0,'bof');

    if isHeader == true
        fgetl(fid);   % skip the header line
    end

    formatStr = '';
    for i=1:length(format)
        formatStr = [formatStr, char(format{i})]; %#ok<AGROW>
    end

    try
        data = textscan(fid,formatStr,'Delimiter','\t');
    catch exception %#ok<NASGU>
        throw(MException('FormatError:IncorrectFormat', ...
            'dataFrame: Data does not conform to column format'));
    end

    % an incomplete last row can leave later columns one element short
    nrow = length(data{1});

    for i=2:length(data)
        if length(data{i}) < nrow
            data{i} = vertcat(data{i},NaN);
        end
    end

    verbosePrint('Done reading in data', ...
        'dataFrame_readDataFrameData_DoneReadingData');
end
0668
0669
0670
0671
function percent = overlap(df1,df2)
    % OVERLAP Percentage of rows shared by two dataFrame-derived objects.
    %
    %   percent = overlap(df1,df2)
    %
    %   Both arguments must list 'dataFrame' among their superclasses and
    %   expose a 'name' property, which is used in messages.
    %   NOTE(review): superclasses() does not include the object's own
    %   class, so a plain dataFrame instance is rejected - confirm that
    %   this is intended.
    %
    %   Both objects need non-empty headers and data, the same number of
    %   columns, and identical header names and formats column by column.
    %
    %   A row of df1 counts as shared when some row of df2 is equal in
    %   every column (strcmp for '%s' columns, numeric equality for '%f'
    %   columns, so NaN never matches). Each shared row adds 2 to the
    %   tally and the result is tally / (rows(df1) + rows(df2)) * 100.
    %
    %   The percentage is also reported through verbosePrint.

    if ~any(strcmp(superclasses(df1),'dataFrame'))
        error('argument 1 is not a dataFrame');
    end

    if ~any(strcmp(superclasses(df2),'dataFrame'))
        error('argument 2 is not a dataFrame');
    end

    % ---- both objects must carry header information ----
    if isempty(df1.header)
        error([df1.name, ' does not contain any header information']);
    end

    if isempty(df2.header)
        error([df2.name, ' does not contain any header information']);
    end

    % ---- both objects must carry data ----
    if isempty(df1.data) || isempty(df1.data{1})
        error([df1.name, ' does not contain any data']);
    end

    if isempty(df2.data) || isempty(df2.data{1})
        error([df2.name, ' does not contain any data']);
    end

    % ---- column layout must be identical ----
    nCols = length(df1.header);

    if nCols ~= length(df2.header)
        error('dataFrames must contain the same number of column headers');
    end

    for i=1:nCols
        % char() unwraps entries stored as 1x1 cells; comparing the cells
        % directly never reports a match. The error fires on a MISMATCH.
        if ~strcmp(char(df1.header{i}),char(df2.header{i}))
            error(['dataFrame header column: ', num2str(i), ' do not match']);
        end

        if ~strcmp(char(df1.format{i}),char(df2.format{i}))
            error(['dataFrame format for column: ', num2str(i), ' do not match']);
        end
    end

    % classify the columns once, so a bad format is reported regardless
    % of the data and the row loop can stop at the first mismatch
    isText = false(1,nCols);
    for k=1:nCols
        if strcmp(df1.format{k},'%s')
            isText(k) = true;
        elseif ~strcmp(df1.format{k},'%f')
            error(['Unrecognized column format: ',char(df1.format{k})]);
        end
    end

    % ---- count the rows of df1 that also occur in df2 ----
    nRows1 = length(df1.data{1});
    nRows2 = length(df2.data{1});
    total = nRows1 + nRows2;
    matched = 0;

    for i=1:nRows1
        for j=1:nRows2
            rowMatch = true;

            for k=1:nCols
                if isText(k)
                    same = strcmp(df1.data{k}(i),df2.data{k}(j));
                else
                    same = df1.data{k}(i) == df2.data{k}(j);
                end

                if ~same
                    rowMatch = false;
                    break;
                end
            end

            if rowMatch
                % counted once for each of the two objects
                matched = matched + 2;
                break;
            end
        end
    end

    percent = matched/total*100;

    verbosePrint(['Overlap between ',df1.name,' and ', df2.name, ': ',...
        num2str(percent),'%'],'dataFrame_overlap_percent');
end
0796
0797
0798 end
0799
0800 methods (Static, Access = private)
0801
0802
function p = parseConstructorArgs(varargin)
    % PARSECONSTRUCTORARGS Validate the constructor's name/value pairs.
    %
    %   p = parseConstructorArgs(args)
    %
    %   args is the constructor's own varargin, passed along as a single
    %   cell array; it is unpacked here and fed to the parser created by
    %   dataFrame.dataFrameInputParser. The parsed parser is returned so
    %   the caller can read p.Results.

    ctorArgs = varargin{1};

    p = dataFrame.dataFrameInputParser();
    parse(p, ctorArgs{:});
end
0824 end
0825
0826 end
0827
0828
0829
0830