bioinfo_proteo

%% Description

% Data set: proteomics

% Method: multi-profile alignment with Gaussian process prior

% ----- data_matrix_proteomics.mat -----

% proData: binned matrix 20 x 1000 x 3200 (sample x RT points x mz bins)

% timeRes: registered time index

% nbSamp: # of samples (20)

% nbRT: # of RT points (1000)

% nbMZ: # of mz bins (3200)

% mzLow: mz lower bound for each bin

% mzHi: mz upper bound for each bin

%% Required utilities

% ----- attached functions -----

% coda() to calculate MCQ values

% ----- functions from elsewhere -----

% inv_posdef(), randnorm(), scale_rows(), ndsum() from Tom Minka's Lightspeed toolbox,

% downloaded at http://research.microsoft.com/en-us/um/people/minka/software/lightspeed/

% randraw() from File Exchange at MATLAB Central,

% downloaded at http://www.mathworks.com/matlabcentral/fileexchange/7309

% bsplinebasis() from Scott Gaffney's CCToolbox,

% downloaded at http://www.ics.uci.edu/~sgaffney/software/CCT/

% apcluster() by Frey Lab,

% downloaded at http://www.psi.toronto.edu/index.php?q=affinity%20propagation

% GPML toolbox (v3.1) by Carl Edward Rasmussen and Hannes Nickisch

% downloaded at http://www.gaussianprocess.org/gpml/code/matlab/doc/index.html

%% Load the proteomic data set

clear all; close all; clc;

load data_matrix_proteomics.mat

[nbSamp,nbRT,nbMZ] = size(proData);

%% 1st-phase screening based on MCQ

binICs = proData;

winQ = 3;

% considering the worst case

mcq = coda(squeeze(binICs(1,:,:)),winQ);

for i = 2:nbSamp

mcq = min([mcq; coda(squeeze(binICs(i,:,:)),winQ)],[],1);

end

idxQ = find(mcq>=0.9);

binQICs = binICs(:,:,idxQ);

mcqQ = mcq(idxQ);

nbQBin = numel(idxQ);

% scale the remaining chromatograms (profile is more important than abosolute amount)

for b = 1:nbQBin

for i = 1:nbSamp

binQICs(i,:,b) = binQICs(i,:,b)./ndsum(binQICs(i,:,b),2);

end

%% 2nd-phase screening based on reproducibility among samples (with xcorr)

xcMat = zeros(nbSamp,nbSamp,nbQBin);

xcAve = zeros(1,nbQBin);

rootE = zeros(1,nbSamp); % root of energy used for normalization

winX = 40;

for b = 1:nbQBin

for i = 1:nbSamp

rootE(i) = sqrt(sum((binQICs(i,:,b).*binQICs(i,:,b))));

xcMat(i,i,b) = 1;

end

for i = 1:nbSamp

for j = i+1:nbSamp

tmpXC = xcorr(binQICs(i,:,b),binQICs(j,:,b),winX);

idxXC = maxind(tmpXC);

if isempty(idxXC)

xcMat(i,j,b) = 0;

else

idxXC(find(tmpXC(idxXC)<0.5*max(tmpXC(idxXC)))) = [];

[~,idxx] = min(abs(idxXC-(winX+1)));

xcMat(i,j,b) = tmpXC(idxXC(idxx))/(rootE(i)*rootE(j));

end

xcMat(j,i,b) = xcMat(i,j,b);

end

xcAve(b) = (ndsum(xcMat(:,:,b),1:2)-nbSamp)/(nbSamp*nbSamp-nbSamp);

end

idxC = find(xcAve>=0.85);

binQCICs = binQICs(:,:,idxC);

mcqQC = mcqQ(idxC);

nbQCBin = numel(idxC);

xcAveQC = xcAve(idxC);

%% identify exemplars using affinity propagation (correlation coeff. as similarity)

ccQC = zeros(nbQCBin);

for b = 1:nbSamp

ccQC = ccQC + corr(squeeze(binQCICs(b,:,:)));

end

ccQC = ccQC/nbSamp;

sim = zeros(nbQCBin*nbQCBin-nbQCBin,3);

cnt = 1;

for b = 1:nbQCBin

for p = [1:b-1,b+1:nbQCBin]

sim(cnt,1)=b; sim(cnt,2)=p; sim(cnt,3)=ccQC(b,p);

cnt=cnt+1;

end

prefSim = mean(sim(:,3));

[idxExmp,~,~,~] = apcluster(sim,prefSim);

%% agglomerative clustering of the exemplars (based on overlapping level)

nbExmp = numel(unique(idxExmp));

exmpICs = binQCICs(:,:,unique(idxExmp));

tmpNum = nbExmp;

tmpICs = exmpICs;

tmpIdx = cell(1,tmpNum);

for b = 1:tmpNum

tmpIdx{b} = b;

end

for l = 1:nbExmp-1

tmpDis = zeros(tmpNum*(tmpNum-1)/2,3);

cnt = 1;

for b = 1:tmpNum-1

for p = b+1:tmpNum

tmpDis(cnt,1) = b;

tmpDis(cnt,2) = p;

for i = 1:nbSamp

tmpDis(cnt,3) = tmpDis(cnt,3) + sum(min(squeeze(tmpICs(i,:,[b,p])),[],2));

end

cnt = cnt+1;

end

[layer(l).ovp, idxPair] = min(tmpDis(:,3));

tmpICs(:,:,tmpDis(idxPair,1)) = tmpICs(:,:,tmpDis(idxPair,1))+tmpICs(:,:,tmpDis(idxPair,2));

tmpICs(:,:,tmpDis(idxPair,2)) = [];

tmpIdx{tmpDis(idxPair,1)} = [tmpIdx{tmpDis(idxPair,1)},tmpIdx{tmpDis(idxPair,2)}];

tmpIdx(tmpDis(idxPair,2)) = [];

tmpNum = tmpNum-1;

layer(l).num = tmpNum;

layer(l).idx = tmpIdx;

end

nbEIC = 4; % pre-defined value

EICs = zeros(nbSamp,nbRT,nbEIC);

for b=1:4

EICs(:,:,b) = ndsum(exmpICs(:,:,layer(numel(layer)-nbEIC+1).idx{b}),3);

end

EICs = EICs*10; % scale to a range of [0,10]

EICs = permute(EICs,[2 1 3]);

timeGrid = (1:nbRT)';

clear chrom proData agg aggPair bb binEdge binICs binQCICs binQICs ccQC cnt ...

exmpICs idxAgg idxC idxExmp idxQ idxXC idxx mcq mcqQ mcqQC ...

mzHi mzLow nbExmp nbMZ nbBin nbQBin nbQCBin p prefAgg prefSim rootE ...

sim tmpXC winQ winX xcAve xcAveQC xcMat xcMin xcMinQC i j

%% Compile internal standard reference

timeIS = textread('time_standard_proteomics.txt');

timeGrid = (1:nbRT)';

timeGridScaled = timeGrid./nbRT;

timeScaled = (timeIS-10)./105; % scale RT [10,115] to [0,1]

timeRef = nanmean(timeScaled,2); % reference time for the internal standard

meanMapTime = zeros(nbRT,nbSamp);

stdMapTime = zeros(nbRT,nbSamp);

%% GP hyperparameters set

covfunc = @covSEiso; hyp.cov = log([0.05; 0.1]);

likfunc = @likGauss; hyp.lik = log(0.05);

meanfunc = {@meanSum, {@meanLinear, @meanConst}}; hyp.mean = [1; 0];

for i = 1:nbSamp

idxTime = find(~isnan(timeScaled(:,i))); % ignore absent peaks

[meanMapTime(:,i), stdMapTime(:,i)] = gp(hyp, @infExact, meanfunc, covfunc, likfunc, ...

timeScaled(idxTime,i), timeRef(idxTime), timeGridScaled);

end

meanMapTime = meanMapTime.*nbRT;

stdMapTime = stdMapTime.*nbRT;

varMapTime = stdMapTime.^2;

%% B-spline

order = 3;

timeExt = timeGrid;

denKnotReg = 0.5; % density of knots for prototype function (0.25--0.75)

denKnotMap = 0.025; % density of knots for mapping function (<= 0.1)

ptStart = timeExt(1);

ptEnd = timeExt(end);

lenBS = length(timeExt);

nbKnotReg = ceil(lenBS*denKnotReg);

knotsReg = unique(linspace(ptStart,ptEnd,nbKnotReg));

knotsReg = [knotsReg(1)*ones(1,order) knotsReg(2:(end-1)) ...

knotsReg(end)*ones(1,order)];

nbReg = length(knotsReg)-order;

BSReg = bsplinebasis(knotsReg,order,timeExt);

muReg = zeros(nbReg,1);

nbMap = ceil((timeGrid(end)-timeGrid(1))*denKnotMap);

varKnot = unique(round(linspace(timeGrid(1),timeGrid(end),nbMap)))';

meanMap = meanMapTime(varKnot,:); % mean of mapping function coeff by GP

meanMap(1,:) = varKnot(1); % fixed initial point

meanMap(end,:) = varKnot(end); % fixed ending point

for i = 1:nbSamp

map(i).coeff = meanMap(:,i);

map(i).acpt = zeros(4,nbRT-1);

end

%% Hyperparameters

hyMuScale = 1;

hyMuShift = 0;

hyTauScale = 1/0.5;

hyTauShift = 1/0.5;

hyShapeScale = 0.1;

hyRateScale = 1;

hyShapeShift = 0.1;

hyRateShift = 1;

hyShapePsi = 0.1;

hyRatePsi = 1;

hyShapeEpsilon = 0.1;

hyRateEpsilon = 0.2;

SigmaReg = diag([2*ones(1,nbReg-1) 1],0) + diag(-1*ones(1,nbReg-1),1) ...

+diag(-1*ones(1,nbReg-1),-1);

%% MCMC setting/initialization

nbMCMC = 15000;

% Space allocation MCMC runs

spMuScale = zeros(1,nbMCMC); % a0

spMuShift = zeros(1,nbMCMC); % c0

spScale = zeros(nbSamp,nbMCMC); % ai

spShift = zeros(nbSamp,nbMCMC); % ci

spTauScale = zeros(1,nbMCMC); % 1/var(ai)

spTauShift = zeros(1,nbMCMC); % 1/var(ci)

spTauEpsilon = zeros(1,nbMCMC); % 1/var(ei)

spTauPsi = zeros(1,nbMCMC); % 1/var for regression coeff

spReg = zeros(nbReg,nbEIC,nbMCMC); % regression coeff (prototype function)

spMap = zeros(nbMap,nbSamp,nbMCMC);% mapping function coeff

% Initial value assignment

spMuScale(1) = hyMuScale;

spMuShift(1) = hyMuShift;

spScale(:,1) = hyMuScale*ones(nbSamp,1);

spShift(:,1) = hyMuShift*ones(nbSamp,1);

spTauScale(1) = hyShapeScale/hyRateScale;

spTauShift(1) = hyShapeShift/hyRateShift;

spTauEpsilon(1) = hyShapeEpsilon/hyRateEpsilon;

spTauPsi(1) = hyShapePsi/hyRatePsi;

spMap(:,:,1) = meanMap; % mean of mapping function coeff by GP

% Metropolis step

stepMH1 = 3;

stepMH2 = 10;

%% Run MCMC

BSTilt = repmat(BSReg,nbSamp,1); % BS_i (space allocation)

vecScale = ones(nbSamp*nbRT,1);

vecShift = ones(nbSamp*nbRT,1);

vecEICs = zeros(nbSamp*nbRT,nbEIC);

for b = 1:nbEIC

vecEICs(:,b) = reshape(EICs(:,:,b),nbSamp*nbRT,1);

end

rng default % reset the random seed

tic % initialize timer

for mc = 2:nbMCMC

% Matrix manipulation

idxMat = interp1(varKnot, spMap(:,:,mc-1), timeGrid);

BSTilt = interp1(timeExt,BSReg,idxMat(:));

repScale = repmat(spScale(:,mc-1)',nbRT,1);

vecScale = repScale(:);

SBSTilt = scale_rows(BSTilt,vecScale); % a_i*BS_i from lightspeed

repShift = repmat(spShift(:,mc-1)',nbRT,1);

vecShift = repShift(:);

%% Gibbs sampling goes below

% regression coefficients of prototype function

invCovReg = SigmaReg*spTauPsi(mc-1);

tmpCov = inv_posdef(invCovReg + (SBSTilt'*SBSTilt)*spTauEpsilon(mc-1)); % from lightspeed

for b = 1:nbEIC

tmpMuVec = tmpCov*SBSTilt'*(vecEICs(:,b)-vecShift)*spTauEpsilon(mc-1);

spReg(:,b,mc) = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed

end

% a0

tmpVar = (hyTauScale + nbSamp*spTauScale(mc-1))^(-1);

tmpMu = tmpVar*(hyMuScale*hyTauScale + sum(spScale(:,mc-1))*spTauScale(mc-1));

spMuScale(mc) = tmpMu + sqrt(tmpVar)*randn(1);

% c0

tmpVar = (hyTauShift + nbSamp*spTauShift(mc-1))^(-1);

tmpMu = tmpVar*(hyMuShift*hyTauShift + sum(spShift(:,mc-1))*spTauShift(mc-1));

spMuShift(mc) = tmpMu + sqrt(tmpVar)*randn(1);

% (ai, ci)

for i = 1:nbSamp

tmpMat = [reshape(BSTilt((i-1)*nbRT+1:i*nbRT,:)*spReg(:,:,mc),nbRT*nbEIC,1), ...

ones(nbRT*nbEIC,1)];

tmpCov = inv_posdef(diag([spTauScale(mc-1) spTauShift(mc-1)]) + spTauEpsilon(mc-1)*tmpMat'*tmpMat); % from lightspeed

tmpMuVec = tmpCov*(diag([spTauScale(mc-1) spTauShift(mc-1)])*[spMuScale(mc);spMuShift(mc)]...

+ spTauEpsilon(mc-1)*tmpMat'*reshape(EICs(:,i,:),nbRT*nbEIC,1));

tmpSp = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed

spScale(i,mc) = tmpSp(1);

spShift(i,mc) = tmpSp(2);

end

% 1/var(ei)

tmpShape = hyShapeEpsilon + 0.5*nbRT*nbSamp*nbEIC;

repScale = repmat(spScale(:,mc)',nbRT,1);

vecScale = repScale(:);

SBSTilt = scale_rows(BSTilt,vecScale); % from lightspeed

repShift = repmat(spShift(:,mc)',nbRT,1);

vecShift = repShift(:);

vecTICHat = reshape(SBSTilt*spReg(:,:,mc),nbRT*nbSamp*nbEIC,1) + repmat(vecShift,nbEIC,1);

tmpRate = hyRateEpsilon + 0.5*(vecEICs(:)-vecTICHat)'*(vecEICs(:)-vecTICHat);

spTauEpsilon(mc) = gamrnd(tmpShape,1/tmpRate);

% 1/var(ai)

tmpShape = hyShapeScale + 0.5*nbSamp;

tmpRate = hyRateScale + 0.5*sum((spScale(:,mc)-spMuScale(mc)).^2);

spTauScale(mc) = gamrnd(tmpShape,1/tmpRate);

% 1/var(ci)

tmpShape = hyShapeShift + 0.5*nbSamp;

tmpRate = hyRateShift + 0.5*sum((spShift(:,mc)-spMuShift(mc)).^2);

spTauShift(mc) = gamrnd(tmpShape,1/tmpRate);

% 1/var for the prototype function

tmpShape = hyShapePsi + 0.5*nbReg*nbEIC;

tmpMat1 = spReg(:,1,mc)*spReg(:,1,mc)';

for b=2:nbEIC

tmpMat1 = tmpMat1 + spReg(:,b,mc)*spReg(:,b,mc)';

end

tmpRate = hyRatePsi + 0.5*trace(tmpMat1*SigmaReg);

spTauPsi(mc) = gamrnd(tmpShape,1/tmpRate);

%% Metropolis-Hastings algo

for i = 1:nbSamp

tmpMap = map(i).coeff;

tmpIdx = interp1(varKnot,tmpMap,timeGrid);

tmpBSReg = interp1(timeExt,BSReg,tmpIdx);

for b = 1:nbEIC

tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);

end

tmpEvaln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);

% generating blocks

rInd = randi(3);

switch rInd

case 1

rBound = 1;

case 2

rBound = 0.5;

case 3

rBound = 0.25;

end

idxBound = find(rand(1,nbMap-3)<rBound)+2; % block ends at nbMap-1

blkMH = [2,idxBound; idxBound-1,nbMap-1];

nbBlock = size(blkMH,2);

rStep = randi(2);

switch rStep

case 1

stepMH = stepMH1;

case 2

stepMH = stepMH2;

end

for m = 1:nbBlock

tmpMapProp = tmpMap;

% identify moveable range

tmpLB = tmpMap(blkMH(1,m)-1)-tmpMap(blkMH(1,m));

tmpUB = tmpMap(blkMH(2,m)+1)-tmpMap(blkMH(2,m));

% uniform proposal reflective on the boundary

unBound = true;

stepProp = stepMH*(2*rand(1)-1);

while unBound

if stepProp > tmpUB

stepProp = 2*tmpUB-stepProp;

elseif stepProp < tmpLB

stepProp = 2*tmpLB-stepProp;

else

unBound = false;

end

tmpMapProp(blkMH(1,m):blkMH(2,m)) = tmpMapProp(blkMH(1,m):blkMH(2,m)) + stepProp;

tmpIdxProp = interp1(varKnot,tmpMapProp,timeGrid);

idxEval = varKnot(blkMH(1,m)-1):varKnot(blkMH(2,m)+1); % range of interest

pRatioln = -0.5*sum( ((tmpIdxProp(idxEval)-meanMapTime(idxEval,i)).^2 ...

- (tmpIdx(idxEval)-meanMapTime(idxEval,i)).^2) ...

./ varMapTime(idxEval,i) ); % log of prior odds

tmpBSReg = interp1(timeExt,BSReg,tmpIdxProp);

for b = 1:nbEIC

tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);

end

tmpEvaPropln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);

lRatioln = tmpEvaPropln-tmpEvaln;

switch rStep

case 1

map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

case 2

map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

end

if rand(1) < min([1, exp(lRatioln+pRatioln)])

tmpMap = tmpMapProp;

tmpIdx = tmpIdxProp;

tmpEvaln = tmpEvaPropln;

switch rStep

case 1

map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

case 2

map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

end

map(i).coeff = tmpMap;

spMap(:,i,mc) = tmpMap;

end

if rem(mc,500)==0

fprintf('Iteration %d, time %d \r', mc,toc/60);

end

%% retention time correction

postKnot = ndsum(spMap(:,:,5001:15000),3)/10000; % initial 5000 samples as burn-in

postMap = interp1(varKnot, postKnot, timeGrid);

crtRT = zeros(nbRT,nbSamp);

for i = 1:nbSamp

crtRT(:,i) = interp1(timeGrid,timeRes,postMap(:,i),'pchip');

end

for i = 1:nbSamp

in_text = ['sima\P1_' num2str(i) '.txt'];

out_text = ['gpmp4\P1_' num2str(i) '.txt'];

tmp = textread(in_text);

newRT = interp1(timeRes,crtRT(:,i),tmp(:,4),'pchip');

tmp(:,4) = round(newRT*100)/100;

dlmwrite(out_text, tmp, 'delimiter', '\t', 'precision', 10, 'newline', 'pc');

end