%% Description
% Data set: proteomics
% Method: multi-profile alignment with Gaussian process prior
%
% ----- data_matrix_proteomics.mat -----
% proData: binned matrix 20 x 1000 x 3200 (sample x RT points x mz bins)
% timeRes: registered time index
% nbSamp: # of samples (20)
% nbRT: # of RT points (1000)
% nbMZ: # of mz bins (3200)
% mzLow: mz lower bound for each bin
% mzHi: mz upper bound for each bin
%% Required utilities
% ----- attached functions -----
% coda() to calculate MCQ values
%
% ----- functions from elsewhere -----
% inv_posdef(), randnorm(), scale_rows(), ndsum() from Tom Minka's Lightspeed toolbox,
% downloaded at http://research.microsoft.com/en-us/um/people/minka/software/lightspeed/
% randraw() from File Exchange at MATLAB Central,
% downloaded at http://www.mathworks.com/matlabcentral/fileexchange/7309
% bsplinebasis() from Scott Gaffney's CCToolbox,
% downloaded at http://www.ics.uci.edu/~sgaffney/software/CCT/
% apcluster() by Frey Lab,
% downloaded at http://www.psi.toronto.edu/index.php?q=affinity%20propagation
% GPML toolbox (v3.1) by Carl Edward Rasmussen and Hannes Nickisch
% downloaded at http://www.gaussianprocess.org/gpml/code/matlab/doc/index.html
%% Load the proteomic data set
clear all; close all; clc;
load data_matrix_proteomics.mat
[nbSamp,nbRT,nbMZ] = size(proData);
%% 1st-phase screening based on MCQ
binICs = proData;
winQ = 3;
% considering the worst case
mcq = coda(squeeze(binICs(1,:,:)),winQ);
for i = 2:nbSamp
mcq = min([mcq; coda(squeeze(binICs(i,:,:)),winQ)],[],1);
end
idxQ = find(mcq>=0.9);
binQICs = binICs(:,:,idxQ);
mcqQ = mcq(idxQ);
nbQBin = numel(idxQ);
% scale the remaining chromatograms (profile is more important than abosolute amount)
for b = 1:nbQBin
for i = 1:nbSamp
binQICs(i,:,b) = binQICs(i,:,b)./ndsum(binQICs(i,:,b),2);
end
end
%% 2nd-phase screening based on reproducibility among samples (with xcorr)
xcMat = zeros(nbSamp,nbSamp,nbQBin);
xcAve = zeros(1,nbQBin);
rootE = zeros(1,nbSamp); % root of energy used for normalization
winX = 40;
for b = 1:nbQBin
for i = 1:nbSamp
rootE(i) = sqrt(sum((binQICs(i,:,b).*binQICs(i,:,b))));
xcMat(i,i,b) = 1;
end
for i = 1:nbSamp
for j = i+1:nbSamp
tmpXC = xcorr(binQICs(i,:,b),binQICs(j,:,b),winX);
idxXC = maxind(tmpXC);
if isempty(idxXC)
xcMat(i,j,b) = 0;
else
idxXC(find(tmpXC(idxXC)<0.5*max(tmpXC(idxXC)))) = [];
[~,idxx] = min(abs(idxXC-(winX+1)));
xcMat(i,j,b) = tmpXC(idxXC(idxx))/(rootE(i)*rootE(j));
end
xcMat(j,i,b) = xcMat(i,j,b);
end
end
xcAve(b) = (ndsum(xcMat(:,:,b),1:2)-nbSamp)/(nbSamp*nbSamp-nbSamp);
end
idxC = find(xcAve>=0.85);
binQCICs = binQICs(:,:,idxC);
mcqQC = mcqQ(idxC);
nbQCBin = numel(idxC);
xcAveQC = xcAve(idxC);
%% identify exemplars using affinity propagation (correlation coeff. as similarity)
ccQC = zeros(nbQCBin);
for b = 1:nbSamp
ccQC = ccQC + corr(squeeze(binQCICs(b,:,:)));
end
ccQC = ccQC/nbSamp;
sim = zeros(nbQCBin*nbQCBin-nbQCBin,3);
cnt = 1;
for b = 1:nbQCBin
for p = [1:b-1,b+1:nbQCBin]
sim(cnt,1)=b; sim(cnt,2)=p; sim(cnt,3)=ccQC(b,p);
cnt=cnt+1;
end
end
prefSim = mean(sim(:,3));
[idxExmp,~,~,~] = apcluster(sim,prefSim);
%% agglomerative clustering of the exemplars (based on overlapping level)
nbExmp = numel(unique(idxExmp));
exmpICs = binQCICs(:,:,unique(idxExmp));
tmpNum = nbExmp;
tmpICs = exmpICs;
tmpIdx = cell(1,tmpNum);
for b = 1:tmpNum
tmpIdx{b} = b;
end
for l = 1:nbExmp-1
tmpDis = zeros(tmpNum*(tmpNum-1)/2,3);
cnt = 1;
for b = 1:tmpNum-1
for p = b+1:tmpNum
tmpDis(cnt,1) = b;
tmpDis(cnt,2) = p;
for i = 1:nbSamp
tmpDis(cnt,3) = tmpDis(cnt,3) + sum(min(squeeze(tmpICs(i,:,[b,p])),[],2));
end
cnt = cnt+1;
end
end
[layer(l).ovp, idxPair] = min(tmpDis(:,3));
tmpICs(:,:,tmpDis(idxPair,1)) = tmpICs(:,:,tmpDis(idxPair,1))+tmpICs(:,:,tmpDis(idxPair,2));
tmpICs(:,:,tmpDis(idxPair,2)) = [];
tmpIdx{tmpDis(idxPair,1)} = [tmpIdx{tmpDis(idxPair,1)},tmpIdx{tmpDis(idxPair,2)}];
tmpIdx(tmpDis(idxPair,2)) = [];
tmpNum = tmpNum-1;
layer(l).num = tmpNum;
layer(l).idx = tmpIdx;
end
nbEIC = 4; % pre-defined value
EICs = zeros(nbSamp,nbRT,nbEIC);
for b=1:4
EICs(:,:,b) = ndsum(exmpICs(:,:,layer(numel(layer)-nbEIC+1).idx{b}),3);
end
EICs = EICs*10; % scale to a range of [0,10]
EICs = permute(EICs,[2 1 3]);
timeGrid = (1:nbRT)';
clear chrom proData agg aggPair bb binEdge binICs binQCICs binQICs ccQC cnt ...
exmpICs idxAgg idxC idxExmp idxQ idxXC idxx mcq mcqQ mcqQC ...
mzHi mzLow nbExmp nbMZ nbBin nbQBin nbQCBin p prefAgg prefSim rootE ...
sim tmpXC winQ winX xcAve xcAveQC xcMat xcMin xcMinQC i j
%% Compile internal standard reference
timeIS = textread('time_standard_proteomics.txt');
timeGrid = (1:nbRT)';
timeGridScaled = timeGrid./nbRT;
timeScaled = (timeIS-10)./105; % scale RT [10,115] to [0,1]
timeRef = nanmean(timeScaled,2); % reference time for the internal standard
meanMapTime = zeros(nbRT,nbSamp);
stdMapTime = zeros(nbRT,nbSamp);
%% GP hyperparameters set
covfunc = @covSEiso; hyp.cov = log([0.05; 0.1]);
likfunc = @likGauss; hyp.lik = log(0.05);
meanfunc = {@meanSum, {@meanLinear, @meanConst}}; hyp.mean = [1; 0];
for i = 1:nbSamp
idxTime = find(~isnan(timeScaled(:,i))); % ignore absent peaks
[meanMapTime(:,i), stdMapTime(:,i)] = gp(hyp, @infExact, meanfunc, covfunc, likfunc, ...
timeScaled(idxTime,i), timeRef(idxTime), timeGridScaled);
end
meanMapTime = meanMapTime.*nbRT;
stdMapTime = stdMapTime.*nbRT;
varMapTime = stdMapTime.^2;
%% B-spline
order = 3;
timeExt = timeGrid;
denKnotReg = 0.5; % density of knots for prototype function (0.25--0.75)
denKnotMap = 0.025; % density of knots for mapping function (<= 0.1)
ptStart = timeExt(1);
ptEnd = timeExt(end);
lenBS = length(timeExt);
nbKnotReg = ceil(lenBS*denKnotReg);
knotsReg = unique(linspace(ptStart,ptEnd,nbKnotReg));
knotsReg = [knotsReg(1)*ones(1,order) knotsReg(2:(end-1)) ...
knotsReg(end)*ones(1,order)];
nbReg = length(knotsReg)-order;
BSReg = bsplinebasis(knotsReg,order,timeExt);
muReg = zeros(nbReg,1);
nbMap = ceil((timeGrid(end)-timeGrid(1))*denKnotMap);
varKnot = unique(round(linspace(timeGrid(1),timeGrid(end),nbMap)))';
meanMap = meanMapTime(varKnot,:); % mean of mapping function coeff by GP
meanMap(1,:) = varKnot(1); % fixed initial point
meanMap(end,:) = varKnot(end); % fixed ending point
for i = 1:nbSamp
map(i).coeff = meanMap(:,i);
map(i).acpt = zeros(4,nbRT-1);
end
%% Hyperparameters
hyMuScale = 1;
hyMuShift = 0;
hyTauScale = 1/0.5;
hyTauShift = 1/0.5;
hyShapeScale = 0.1;
hyRateScale = 1;
hyShapeShift = 0.1;
hyRateShift = 1;
hyShapePsi = 0.1;
hyRatePsi = 1;
hyShapeEpsilon = 0.1;
hyRateEpsilon = 0.2;
SigmaReg = diag([2*ones(1,nbReg-1) 1],0) + diag(-1*ones(1,nbReg-1),1) ...
+diag(-1*ones(1,nbReg-1),-1);
%% MCMC setting/initialization
nbMCMC = 15000;
% Space allocation MCMC runs
spMuScale = zeros(1,nbMCMC); % a0
spMuShift = zeros(1,nbMCMC); % c0
spScale = zeros(nbSamp,nbMCMC); % ai
spShift = zeros(nbSamp,nbMCMC); % ci
spTauScale = zeros(1,nbMCMC); % 1/var(ai)
spTauShift = zeros(1,nbMCMC); % 1/var(ci)
spTauEpsilon = zeros(1,nbMCMC); % 1/var(ei)
spTauPsi = zeros(1,nbMCMC); % 1/var for regression coeff
spReg = zeros(nbReg,nbEIC,nbMCMC); % regression coeff (prototype function)
spMap = zeros(nbMap,nbSamp,nbMCMC);% mapping function coeff
% Initial value assignment
spMuScale(1) = hyMuScale;
spMuShift(1) = hyMuShift;
spScale(:,1) = hyMuScale*ones(nbSamp,1);
spShift(:,1) = hyMuShift*ones(nbSamp,1);
spTauScale(1) = hyShapeScale/hyRateScale;
spTauShift(1) = hyShapeShift/hyRateShift;
spTauEpsilon(1) = hyShapeEpsilon/hyRateEpsilon;
spTauPsi(1) = hyShapePsi/hyRatePsi;
spMap(:,:,1) = meanMap; % mean of mapping function coeff by GP
% Metropolis step
stepMH1 = 3;
stepMH2 = 10;
%% Run MCMC
BSTilt = repmat(BSReg,nbSamp,1); % BS_i (space allocation)
vecScale = ones(nbSamp*nbRT,1);
vecShift = ones(nbSamp*nbRT,1);
vecEICs = zeros(nbSamp*nbRT,nbEIC);
for b = 1:nbEIC
vecEICs(:,b) = reshape(EICs(:,:,b),nbSamp*nbRT,1);
end
rng default % reset the random seed
tic % initialize timer
for mc = 2:nbMCMC
% Matrix manipulation
idxMat = interp1(varKnot, spMap(:,:,mc-1), timeGrid);
BSTilt = interp1(timeExt,BSReg,idxMat(:));
repScale = repmat(spScale(:,mc-1)',nbRT,1);
vecScale = repScale(:);
SBSTilt = scale_rows(BSTilt,vecScale); % a_i*BS_i from lightspeed
repShift = repmat(spShift(:,mc-1)',nbRT,1);
vecShift = repShift(:);
%% Gibbs sampling goes below
% regression coefficients of prototype function
invCovReg = SigmaReg*spTauPsi(mc-1);
tmpCov = inv_posdef(invCovReg + (SBSTilt'*SBSTilt)*spTauEpsilon(mc-1)); % from lightspeed
for b = 1:nbEIC
tmpMuVec = tmpCov*SBSTilt'*(vecEICs(:,b)-vecShift)*spTauEpsilon(mc-1);
spReg(:,b,mc) = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed
end
% a0
tmpVar = (hyTauScale + nbSamp*spTauScale(mc-1))^(-1);
tmpMu = tmpVar*(hyMuScale*hyTauScale + sum(spScale(:,mc-1))*spTauScale(mc-1));
spMuScale(mc) = tmpMu + sqrt(tmpVar)*randn(1);
% c0
tmpVar = (hyTauShift + nbSamp*spTauShift(mc-1))^(-1);
tmpMu = tmpVar*(hyMuShift*hyTauShift + sum(spShift(:,mc-1))*spTauShift(mc-1));
spMuShift(mc) = tmpMu + sqrt(tmpVar)*randn(1);
% (ai, ci)
for i = 1:nbSamp
tmpMat = [reshape(BSTilt((i-1)*nbRT+1:i*nbRT,:)*spReg(:,:,mc),nbRT*nbEIC,1), ...
ones(nbRT*nbEIC,1)];
tmpCov = inv_posdef(diag([spTauScale(mc-1) spTauShift(mc-1)]) + spTauEpsilon(mc-1)*tmpMat'*tmpMat); % from lightspeed
tmpMuVec = tmpCov*(diag([spTauScale(mc-1) spTauShift(mc-1)])*[spMuScale(mc);spMuShift(mc)]...
+ spTauEpsilon(mc-1)*tmpMat'*reshape(EICs(:,i,:),nbRT*nbEIC,1));
tmpSp = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed
spScale(i,mc) = tmpSp(1);
spShift(i,mc) = tmpSp(2);
end
% 1/var(ei)
tmpShape = hyShapeEpsilon + 0.5*nbRT*nbSamp*nbEIC;
repScale = repmat(spScale(:,mc)',nbRT,1);
vecScale = repScale(:);
SBSTilt = scale_rows(BSTilt,vecScale); % from lightspeed
repShift = repmat(spShift(:,mc)',nbRT,1);
vecShift = repShift(:);
vecTICHat = reshape(SBSTilt*spReg(:,:,mc),nbRT*nbSamp*nbEIC,1) + repmat(vecShift,nbEIC,1);
tmpRate = hyRateEpsilon + 0.5*(vecEICs(:)-vecTICHat)'*(vecEICs(:)-vecTICHat);
spTauEpsilon(mc) = gamrnd(tmpShape,1/tmpRate);
% 1/var(ai)
tmpShape = hyShapeScale + 0.5*nbSamp;
tmpRate = hyRateScale + 0.5*sum((spScale(:,mc)-spMuScale(mc)).^2);
spTauScale(mc) = gamrnd(tmpShape,1/tmpRate);
% 1/var(ci)
tmpShape = hyShapeShift + 0.5*nbSamp;
tmpRate = hyRateShift + 0.5*sum((spShift(:,mc)-spMuShift(mc)).^2);
spTauShift(mc) = gamrnd(tmpShape,1/tmpRate);
% 1/var for the prototype function
tmpShape = hyShapePsi + 0.5*nbReg*nbEIC;
tmpMat1 = spReg(:,1,mc)*spReg(:,1,mc)';
for b=2:nbEIC
tmpMat1 = tmpMat1 + spReg(:,b,mc)*spReg(:,b,mc)';
end
tmpRate = hyRatePsi + 0.5*trace(tmpMat1*SigmaReg);
spTauPsi(mc) = gamrnd(tmpShape,1/tmpRate);
%% Metropolis-Hastings algo
for i = 1:nbSamp
tmpMap = map(i).coeff;
tmpIdx = interp1(varKnot,tmpMap,timeGrid);
tmpBSReg = interp1(timeExt,BSReg,tmpIdx);
for b = 1:nbEIC
tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);
end
tmpEvaln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);
% generating blocks
rInd = randi(3);
switch rInd
case 1
rBound = 1;
case 2
rBound = 0.5;
case 3
rBound = 0.25;
end
idxBound = find(rand(1,nbMap-3)<rBound)+2; % block ends at nbMap-1
blkMH = [2,idxBound; idxBound-1,nbMap-1];
nbBlock = size(blkMH,2);
rStep = randi(2);
switch rStep
case 1
stepMH = stepMH1;
case 2
stepMH = stepMH2;
end
for m = 1:nbBlock
tmpMapProp = tmpMap;
% identify moveable range
tmpLB = tmpMap(blkMH(1,m)-1)-tmpMap(blkMH(1,m));
tmpUB = tmpMap(blkMH(2,m)+1)-tmpMap(blkMH(2,m));
% uniform proposal reflective on the boundary
unBound = true;
stepProp = stepMH*(2*rand(1)-1);
while unBound
if stepProp > tmpUB
stepProp = 2*tmpUB-stepProp;
elseif stepProp < tmpLB
stepProp = 2*tmpLB-stepProp;
else
unBound = false;
end
end
tmpMapProp(blkMH(1,m):blkMH(2,m)) = tmpMapProp(blkMH(1,m):blkMH(2,m)) + stepProp;
tmpIdxProp = interp1(varKnot,tmpMapProp,timeGrid);
idxEval = varKnot(blkMH(1,m)-1):varKnot(blkMH(2,m)+1); % range of interest
pRatioln = -0.5*sum( ((tmpIdxProp(idxEval)-meanMapTime(idxEval,i)).^2 ...
- (tmpIdx(idxEval)-meanMapTime(idxEval,i)).^2) ...
./ varMapTime(idxEval,i) ); % log of prior odds
tmpBSReg = interp1(timeExt,BSReg,tmpIdxProp);
for b = 1:nbEIC
tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);
end
tmpEvaPropln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);
lRatioln = tmpEvaPropln-tmpEvaln;
switch rStep
case 1
map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
case 2
map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
end
if rand(1) < min([1, exp(lRatioln+pRatioln)])
tmpMap = tmpMapProp;
tmpIdx = tmpIdxProp;
tmpEvaln = tmpEvaPropln;
switch rStep
case 1
map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
case 2
map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
end
end
end
map(i).coeff = tmpMap;
spMap(:,i,mc) = tmpMap;
end
if rem(mc,500)==0
fprintf('Iteration %d, time %d \r', mc,toc/60);
end
end
%% retention time correction
postKnot = ndsum(spMap(:,:,5001:15000),3)/10000; % initial 5000 samples as burn-in
postMap = interp1(varKnot, postKnot, timeGrid);
crtRT = zeros(nbRT,nbSamp);
for i = 1:nbSamp
crtRT(:,i) = interp1(timeGrid,timeRes,postMap(:,i),'pchip');
end
for i = 1:nbSamp
in_text = ['sima\P1_' num2str(i) '.txt'];
out_text = ['gpmp4\P1_' num2str(i) '.txt'];
tmp = textread(in_text);
newRT = interp1(timeRes,crtRT(:,i),tmp(:,4),'pchip');
tmp(:,4) = round(newRT*100)/100;
dlmwrite(out_text, tmp, 'delimiter', '\t', 'precision', 10, 'newline', 'pc');
end