%% Step 1 
% The University of Texas at Austin, CS 395T, Spring 2008, Prof. William H. Press
load yeastarray_t2.txt;
size(yeastarray_t2)
%ans = 500 300
yclip = prctile(yeastarray_t2(:),[1,99])
%yclip = -204 244
data = max(yclip(1),min(yclip(2),yeastarray_t2));
dmean= mean(data,1);
dstd = std(data,1);
data = (data - repmat(dmean,[size(data,1),1]))./repmat(dstd,[size(data,1),1]);
genecolormap = [min(1,(1:64)/32); 1-abs(1-(1:64)/32); min(1,(64-(1:64))/32)]';
figure(1),clf,colormap(genecolormap);
image(20*data+32)

%% Step 2
% (XV)T (XV) = (US)T (US) = ST (UTU)S = S2
[U S V] = svd(data, 0);
pcacoords = U*S;
plot(pcacoords(:,1),pcacoords(:,2),'r.')
axis equal


%% Step 3
% The squares of the SV?s are proportional to the portion of the total
% variance (L2 norm of X) that each accounts for.

ssq = diag(S).^2;
figure(1), clf,plot(ssq)
figure(2),clf,semilogy(ssq,'.b'), hold

%% Step 4 
fakedata = randn(500,300);
[Uf Sf Vf] = svd(fakedata,0);
sfsq = diag(Sf).^2;
semilogy(sfsq,'.r')

%% Step 5
% For the data in this example, a sensible use of PCA (i.e., SVD) would be to
% project the data into the subspace of the first ~20 SVs, where we can be
% sure that it is not noise.
% Recall the original data set
figure(1),clf, colormap(genecolormap);
image(20*data+32)

% Truncate the first 20 singular values/vectors
strunc = diag(S);
strunc(21:end) = 0;
filtdata20 = U*diag(strunc)*V';
figure(2),clf,colormap(genecolormap);
image(20*filtdata20+32)

% Truncate the first 5 singular values/vectors
strunc(6:end) = 0;
filtdata5 = U*diag(strunc)*V';
figure(3),clf,colormap(genecolormap);
image(20*filtdata5+32)

%% Step 6
% How to interpret the singular vectors?
% The first three vectors u are ?eigengenes?, the linear
% combination of genes that explain the most data.

figure(1),clf,plot(U(:,1:3))

% The first three vectors v are ?eigenarrays?, the linear
% combination of experiments that explain the most data.
figure(2),clf,plot(V(:,1:3))

%% Step 7
% Toy example
pdata = randn(500,300);
pdata(101:200,51:100) = pdata(101:200,51:100) + 1;
pdata(301:400,201:250) = pdata(301:400,201:250) - 1;
pmean = mean(pdata,1);
pstd = std(pdata,1);
pdata = (pdata - repmat(pmean,[size(pdata,1),1]))./repmat(pstd,[size(pdata,1),1]);
colormap(genecolormap)
image(20*pdata+32)

[Up Sp Vp] = svd(pdata,0);
spsq = diag(Sp).^2;
figure(3), clf,semilogy(spsq(1:50),'.b')
% So should we expect the eigengenes/eigenarrays to show the separate main effects?

figure(1),clf,plot(Up(:,1:2)),figure(2),clf,plot(Up(:,1:3))

figure(3),clf,plot(Vp(:,1:2)),figure(4),plot(Vp(:,1:3))