%% Step 1 % The University of Texas at Austin, CS 395T, Spring 2008, Prof. William H. Press load yeastarray_t2.txt; size(yeastarray_t2) %ans = 500 300 yclip = prctile(yeastarray_t2(:),[1,99]) %yclip = -204 244 data = max(yclip(1),min(yclip(2),yeastarray_t2)); dmean= mean(data,1); dstd = std(data,1); data = (data - repmat(dmean,[size(data,1),1]))./repmat(dstd,[size(data,1),1]); genecolormap = [min(1,(1:64)/32); 1-abs(1-(1:64)/32); min(1,(64-(1:64))/32)]'; figure(1),clf,colormap(genecolormap); image(20*data+32) %% Step 2 % (XV)T (XV) = (US)T (US) = ST (UTU)S = S2 [U S V] = svd(data, 0); pcacoords = U*S; plot(pcacoords(:,1),pcacoords(:,2),'r.') axis equal %% Step 3 % The squares of the SV?s are proportional to the portion of the total % variance (L2 norm of X) that each accounts for. ssq = diag(S).^2; figure(1), clf,plot(ssq) figure(2),clf,semilogy(ssq,'.b'), hold %% Step 4 fakedata = randn(500,300); [Uf Sf Vf] = svd(fakedata,0); sfsq = diag(Sf).^2; semilogy(sfsq,'.r') %% Step 5 % For the data in this example, a sensible use of PCA (i.e., SVD) would be to % project the data into the subspace of the first ~20 SVs, where we can be % sure that it is not noise. % Recall the original data set figure(1),clf, colormap(genecolormap); image(20*data+32) % Truncate the first 20 singular values/vectors strunc = diag(S); strunc(21:end) = 0; filtdata20 = U*diag(strunc)*V'; figure(2),clf,colormap(genecolormap); image(20*filtdata20+32) % Truncate the first 5 singular values/vectors strunc(6:end) = 0; filtdata5 = U*diag(strunc)*V'; figure(3),clf,colormap(genecolormap); image(20*filtdata5+32) %% Step 6 % How to interpret the singular vectors? % The first three vectors u are ?eigengenes?, the linear % combination of genes that explain the most data. figure(1),clf,plot(U(:,1:3)) % The first three vectors v are ?eigenarrays?, the linear % combination of experiments that explain the most data. figure(2),clf,plot(V(:,1:3)) %% Step 7 % Toy example pdata = randn(500,300); pdata(101:200,51:100) = pdata(101:200,51:100) + 1; pdata(301:400,201:250) = pdata(301:400,201:250) - 1; pmean = mean(pdata,1); pstd = std(pdata,1); pdata = (pdata - repmat(pmean,[size(pdata,1),1]))./repmat(pstd,[size(pdata,1),1]); colormap(genecolormap) image(20*pdata+32) [Up Sp Vp] = svd(pdata,0); spsq = diag(Sp).^2; figure(3), clf,semilogy(spsq(1:50),'.b') % So should we expect the eigengenes/eigenarrays to show the separate main effects? figure(1),clf,plot(Up(:,1:2)),figure(2),clf,plot(Up(:,1:3)) figure(3),clf,plot(Vp(:,1:2)),figure(4),plot(Vp(:,1:3))