bioinfo_glyco

%% Description

% Data set: glycomics

% Method: multi-profile alignment with Gaussian process prior

% ----- data_matrix_glycomics.mat -----

% glyData: binned matrix 23 x 1000 x 3000 (sample x RT points x mz bins)

% timeRes: registered time index

% nbSamp: # of samples (23)

% nbRT: # of RT points (1000)

% nbMZ: # of mz bins (3000)

% mzLow: mz lower bound for each bin

% mzHi: mz upper bound for each bin

%% Required utilities

% ----- attached functions -----

% coda() to calculate MCQ values

% ----- functions from elsewhere -----

% inv_posdef(), randnorm(), scale_rows(), ndsum() from Tom Minka's Lightspeed toolbox,

% downloaded at http://research.microsoft.com/en-us/um/people/minka/software/lightspeed/

% randraw() from File Exchange at MATLAB Central,

% downloaded at http://www.mathworks.com/matlabcentral/fileexchange/7309

% bsplinebasis() from Scott Gaffney's CCToolbox,

% downloaded at http://www.ics.uci.edu/~sgaffney/software/CCT/

% apcluster() by Frey Lab,

% downloaded at http://www.psi.toronto.edu/index.php?q=affinity%20propagation

% GPML toolbox (v3.1) by Carl Edward Rasmussen and Hannes Nickisch

% downloaded at http://www.gaussianprocess.org/gpml/code/matlab/doc/index.html

%% Load the glycomic data set
% Reset the session (workspace, figures, command window) before loading.
clear all; close all; clc;

% Bring the variables stored in the .mat file into the workspace.
load('data_matrix_glycomics.mat');

% Dimensions of the binned data cube (sample x RT points x mz bins).
[nbSamp,nbRT,nbMZ] = size(glyData);

%% 1st-phase screening based on MCQ
% Keep only the m/z bins whose chromatograms score well in every sample,
% then rescale each retained chromatogram so that it sums to one.
binICs = glyData;
winQ = 3;   % second argument handed to coda()

% considering the worst case: each bin is scored by its minimum MCQ over
% all samples
mcq = coda(squeeze(binICs(1,:,:)),winQ);
for i = 2:nbSamp
    mcq = min([mcq; coda(squeeze(binICs(i,:,:)),winQ)],[],1);
end

idxQ = find(mcq>=0.9);          % bins passing the MCQ threshold
binQICs = binICs(:,:,idxQ);
mcqQ = mcq(idxQ);
nbQBin = numel(idxQ);

% scale the remaining chromatograms (profile is more important than
% absolute amount)
for b = 1:nbQBin
    for i = 1:nbSamp
        binQICs(i,:,b) = binQICs(i,:,b)./ndsum(binQICs(i,:,b),2);
    end
end

%% 2nd-phase screening based on reproducibility among samples (with xcorr)
% For every retained bin, compute a normalized cross-correlation score for
% each pair of samples and keep the bins whose average pairwise score is
% at least 0.85.
xcMat = zeros(nbSamp,nbSamp,nbQBin);
xcAve = zeros(1,nbQBin);
rootE = zeros(1,nbSamp); % root of energy used for normalization
winX = 100;              % maximum lag passed to xcorr()

for b = 1:nbQBin
    for i = 1:nbSamp
        rootE(i) = sqrt(sum((binQICs(i,:,b).*binQICs(i,:,b))));
        xcMat(i,i,b) = 1;
    end

    for i = 1:nbSamp
        for j = i+1:nbSamp
            tmpXC = xcorr(binQICs(i,:,b),binQICs(j,:,b),winX);
            % NOTE(review): maxind() is not among the utilities listed in
            % the file header; presumably it returns the indices of the
            % local maxima of tmpXC -- confirm.
            idxXC = maxind(tmpXC);
            if isempty(idxXC)
                xcMat(i,j,b) = 0;
            else
                % discard candidates below half of the strongest candidate
                idxXC(find(tmpXC(idxXC)<0.5*max(tmpXC(idxXC)))) = [];
                % of the survivors, take the one closest to index winX+1
                % (the zero-lag position of the xcorr output)
                [~,idxx] = min(abs(idxXC-(winX+1)));
                xcMat(i,j,b) = tmpXC(idxXC(idxx))/(rootE(i)*rootE(j));
            end
            xcMat(j,i,b) = xcMat(i,j,b);
        end
    end

    % mean of the off-diagonal entries (the diagonal contributes nbSamp)
    xcAve(b) = (ndsum(xcMat(:,:,b),1:2)-nbSamp)/(nbSamp*nbSamp-nbSamp);
end

idxC = find(xcAve>=0.85);
binQCICs = binQICs(:,:,idxC);
mcqQC = mcqQ(idxC);
nbQCBin = numel(idxC);
xcAveQC = xcAve(idxC);

%% Identify exemplars using affinity propagation (correlation coeff. as similarity)
% Average, over samples, the bin-by-bin correlation matrix of the screened
% chromatograms.
ccQC = zeros(nbQCBin);
for b = 1:nbSamp   % note: b indexes samples in this loop
    ccQC = ccQC + corr(squeeze(binQCICs(b,:,:)));
end
ccQC = ccQC/nbSamp;

% List every ordered pair of distinct bins as a row [b, p, similarity].
sim = zeros(nbQCBin*nbQCBin-nbQCBin,3);
cnt = 1;
for b = 1:nbQCBin
    for p = [1:b-1,b+1:nbQCBin]
        sim(cnt,1)=b; sim(cnt,2)=p; sim(cnt,3)=ccQC(b,p);
        cnt=cnt+1;
    end
end

% The mean similarity is passed to apcluster() as the preference value.
prefSim = mean(sim(:,3));
[idxExmp,~,~,~] = apcluster(sim,prefSim);
nbExmp = numel(unique(idxExmp));
exmpICs = binQCICs(:,:,unique(idxExmp));

%% Agglomerative clustering of the exemplars (based on overlapping level)
% Start with one group per exemplar. At every level, merge the two groups
% with the smallest overlap score and record the resulting grouping in
% layer(l).
tmpNum = nbExmp;
tmpICs = exmpICs;
tmpIdx = cell(1,tmpNum);
for b = 1:tmpNum
    tmpIdx{b} = b;
end

for l = 1:nbExmp-1
    % overlap score for each pair (b,p): the pointwise minimum of the two
    % profiles, summed over RT points and samples
    tmpDis = zeros(tmpNum*(tmpNum-1)/2,3);
    cnt = 1;
    for b = 1:tmpNum-1
        for p = b+1:tmpNum
            tmpDis(cnt,1) = b;
            tmpDis(cnt,2) = p;
            for i = 1:nbSamp
                tmpDis(cnt,3) = tmpDis(cnt,3) + sum(min(squeeze(tmpICs(i,:,[b,p])),[],2));
            end
            cnt = cnt+1;
        end
    end

    % merge the pair with the smallest score: add the second profile to the
    % first, then drop the second (its index is always the larger one)
    [layer(l).ovp, idxPair] = min(tmpDis(:,3));
    tmpICs(:,:,tmpDis(idxPair,1)) = tmpICs(:,:,tmpDis(idxPair,1))+tmpICs(:,:,tmpDis(idxPair,2));
    tmpICs(:,:,tmpDis(idxPair,2)) = [];
    tmpIdx{tmpDis(idxPair,1)} = [tmpIdx{tmpDis(idxPair,1)},tmpIdx{tmpDis(idxPair,2)}];
    tmpIdx(tmpDis(idxPair,2)) = [];
    tmpNum = tmpNum-1;

    layer(l).num = tmpNum;   % number of groups after this merge
    layer(l).idx = tmpIdx;   % exemplar indices belonging to each group
end

nbEIC = 4; % pre-defined value

% Pick the grouping that contains exactly nbEIC groups. The clustering
% loop only stores levels with fewer than nbExmp groups, so the boundary
% cases are handled explicitly instead of failing on an invalid index.
if nbExmp < nbEIC
    error('bioinfo_glyco:tooFewExemplars', ...
        'Only %d exemplar(s) found; at least nbEIC = %d are required.', ...
        nbExmp, nbEIC);
elseif nbExmp == nbEIC
    grpIdx = num2cell(1:nbExmp);                 % every exemplar is its own group
else
    grpIdx = layer(numel(layer)-nbEIC+1).idx;    % level holding nbEIC groups
end

% Sum the exemplar chromatograms within each group.
EICs = zeros(nbSamp,nbRT,nbEIC);
for b=1:nbEIC
    EICs(:,:,b) = ndsum(exmpICs(:,:,grpIdx{b}),3);
end

EICs = EICs*10; % scale to a range of [0,10]
EICs = permute(EICs,[2 1 3]);   % reorder to RT points x samples x EICs
timeGrid = (1:nbRT)';

% Drop intermediate variables that are no longer needed.
clear chrom glyData agg aggPair bb binEdge binICs binQCICs binQICs ccQC cnt ...
    idxAgg idxC idxExmp idxQ idxXC idxx mcq mcqQ mcqQC ...
    mzHi mzLow nbExmp nbMZ nbBin nbQBin nbQCBin p prefAgg prefSim rootE ...
    sim tmpXC winQ winX xcAve xcAveQC xcMat xcMin xcMinQC i j grpIdx

%% Compile internal standard reference
% Retention-time axis as a column vector, plus a copy divided by nbRT.
timeGrid = (1:nbRT)';
timeGridScaled = timeGrid./nbRT;

% Internal-standard retention times read from file; one column per sample.
timeIS = textread('time_standard_glycomics.txt');
timeScaled = (timeIS-10)./50; % scale RT [10,60] to [0,1]

% Row-wise mean ignoring NaN entries: reference time for the internal standard.
timeRef = nanmean(timeScaled,2);

% Space for the per-sample GP predictions evaluated on the RT grid.
[meanMapTime, stdMapTime] = deal(zeros(nbRT,nbSamp));

%% GP hyperparameters set
% Squared-exponential covariance, Gaussian likelihood, and a linear plus
% constant mean function (slope 1, offset 0).
covfunc = @covSEiso; hyp.cov = log([0.05; 0.1]);
likfunc = @likGauss; hyp.lik = log(0.05);
meanfunc = {@meanSum, {@meanLinear, @meanConst}}; hyp.mean = [1; 0];

% For each sample, regress the reference times on the observed
% internal-standard times and predict over the whole scaled RT grid.
for i = 1:nbSamp
    idxTime = find(~isnan(timeScaled(:,i))); % ignore absent peaks
    % In prediction mode gp() returns the predictive mean and the
    % predictive VARIANCE (ys2), not a standard deviation, so the square
    % root is taken before it is stored in stdMapTime.
    [meanMapTime(:,i), predVar] = gp(hyp, @infExact, meanfunc, covfunc, likfunc, ...
        timeScaled(idxTime,i), timeRef(idxTime), timeGridScaled);
    stdMapTime(:,i) = sqrt(predVar);
end

% Convert from the scaled [0,1] axis back to RT-grid units.
meanMapTime = meanMapTime.*nbRT;
stdMapTime = stdMapTime.*nbRT;
varMapTime = stdMapTime.^2;

%% B-spline
order = 3;
timeExt = timeGrid;
denKnotReg = 0.5; % density of knots for prototype function (0.25--0.75)
denKnotMap = 0.025; % density of knots for mapping function (<= 0.1)

% ----- basis for the prototype (regression) function -----
ptStart = timeExt(1);
ptEnd = timeExt(end);
lenBS = length(timeExt);
nbKnotReg = ceil(lenBS*denKnotReg);
knotsReg = unique(linspace(ptStart,ptEnd,nbKnotReg));
% repeat the first and last knot 'order' times
knotsReg = [knotsReg(1)*ones(1,order) knotsReg(2:(end-1)) ...
    knotsReg(end)*ones(1,order)];
nbReg = length(knotsReg)-order;
BSReg = bsplinebasis(knotsReg,order,timeExt);
muReg = zeros(nbReg,1);

% ----- knots of the piecewise-linear mapping function -----
nbMap = ceil((timeGrid(end)-timeGrid(1))*denKnotMap);
varKnot = unique(round(linspace(timeGrid(1),timeGrid(end),nbMap)))';
% rounding followed by unique() can remove duplicates, so keep nbMap equal
% to the number of knots actually in use
nbMap = numel(varKnot);

meanMap = meanMapTime(varKnot,:); % mean of mapping function coeff by GP
meanMap(1,:) = varKnot(1); % fixed initial point
meanMap(end,:) = varKnot(end); % fixed ending point

for i = 1:nbSamp
    map(i).coeff = meanMap(:,i);      % current mapping coefficients of sample i
    map(i).acpt = zeros(4,nbRT-1);    % accepted/proposed counters per step size
end

%% Hyperparameters
% Prior mean and precision for the scale (a0) and shift (c0) means.
hyMuScale = 1;
hyMuShift = 0;
hyTauScale = 1/0.5;
hyTauShift = 1/0.5;

% Shape/rate pairs of the gamma distributions used for the precisions.
hyShapeScale = 0.1;
hyRateScale = 1;
hyShapeShift = 0.1;
hyRateShift = 1;
hyShapePsi = 0.1;
hyRatePsi = 1;
hyShapeEpsilon = 0.1;
hyRateEpsilon = 0.2;

% Tridiagonal matrix: 2 on the diagonal (1 in the last entry) and -1 on
% both the super- and sub-diagonal. It is multiplied by the sampled
% precision to form the prior precision of the regression coefficients.
SigmaReg = diag([2*ones(1,nbReg-1) 1],0) ...
    + diag(-1*ones(1,nbReg-1),1) ...
    + diag(-1*ones(1,nbReg-1),-1);

%% MCMC setting/initialization
nbMCMC = 15000;

% Space allocation MCMC runs
% scalar chains, one value per iteration:
%   spMuScale    a0                 spMuShift   c0
%   spTauScale   1/var(ai)          spTauShift  1/var(ci)
%   spTauEpsilon 1/var(ei)          spTauPsi    1/var for regression coeff
[spMuScale, spMuShift, spTauScale, spTauShift, spTauEpsilon, spTauPsi] = ...
    deal(zeros(1,nbMCMC));
% per-sample chains: spScale holds ai, spShift holds ci
[spScale, spShift] = deal(zeros(nbSamp,nbMCMC));
spReg = zeros(nbReg,nbEIC,nbMCMC); % regression coeff (prototype function)
spMap = zeros(nbMap,nbSamp,nbMCMC);% mapping function coeff

% Initial value assignment
% means start at their hyperparameter values
spMuScale(1) = hyMuScale;
spMuShift(1) = hyMuShift;
spScale(:,1) = hyMuScale;
spShift(:,1) = hyMuShift;
% precisions start at shape/rate of their gamma hyperparameters
spTauScale(1) = hyShapeScale/hyRateScale;
spTauShift(1) = hyShapeShift/hyRateShift;
spTauEpsilon(1) = hyShapeEpsilon/hyRateEpsilon;
spTauPsi(1) = hyShapePsi/hyRatePsi;
spMap(:,:,1) = meanMap; % mean of mapping function coeff by GP

% Metropolis step
stepMH1 = 3;
stepMH2 = 10;

%% Run MCMC
% Work arrays whose rows run over all (sample, RT point) combinations.
BSTilt = repmat(BSReg,nbSamp,1); % BS_i (space allocation)
vecScale = ones(nbSamp*nbRT,1);
vecShift = ones(nbSamp*nbRT,1);

% Flatten each EIC slice into a single column.
vecEICs = zeros(nbSamp*nbRT,nbEIC);
for b = 1:nbEIC
    oneEIC = EICs(:,:,b);
    vecEICs(:,b) = oneEIC(:);
end
clear oneEIC

rng('default'); % reset the random seed
tic;            % initialize timer

% Main sampler: one Gibbs sweep over the regression coefficients, the
% scale/shift parameters and the precisions, followed by block-wise
% Metropolis-Hastings updates of each sample's mapping-function knots.
tmpEICs = zeros(nbRT,nbEIC);   % fitted profiles of one sample (work array)

for mc = 2:nbMCMC

    % Matrix manipulation
    % warp the basis with the previous mapping of every sample and stack
    idxMat = interp1(varKnot, spMap(:,:,mc-1), timeGrid);
    BSTilt = interp1(timeExt,BSReg,idxMat(:));
    repScale = repmat(spScale(:,mc-1)',nbRT,1);
    vecScale = repScale(:);
    SBSTilt = scale_rows(BSTilt,vecScale); % a_i*BS_i from lightspeed
    repShift = repmat(spShift(:,mc-1)',nbRT,1);
    vecShift = repShift(:);

    %% Gibbs sampling goes below

    % regression coefficients of prototype function
    invCovReg = SigmaReg*spTauPsi(mc-1);
    tmpCov = inv_posdef(invCovReg + (SBSTilt'*SBSTilt)*spTauEpsilon(mc-1)); % from lightspeed
    for b = 1:nbEIC
        tmpMuVec = tmpCov*SBSTilt'*(vecEICs(:,b)-vecShift)*spTauEpsilon(mc-1);
        spReg(:,b,mc) = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed
    end

    % a0
    tmpVar = (hyTauScale + nbSamp*spTauScale(mc-1))^(-1);
    tmpMu = tmpVar*(hyMuScale*hyTauScale + sum(spScale(:,mc-1))*spTauScale(mc-1));
    spMuScale(mc) = tmpMu + sqrt(tmpVar)*randn(1);

    % c0
    tmpVar = (hyTauShift + nbSamp*spTauShift(mc-1))^(-1);
    tmpMu = tmpVar*(hyMuShift*hyTauShift + sum(spShift(:,mc-1))*spTauShift(mc-1));
    spMuShift(mc) = tmpMu + sqrt(tmpVar)*randn(1);

    % (ai, ci)
    for i = 1:nbSamp
        % two-column design matrix: warped prototype values and ones
        tmpMat = [reshape(BSTilt((i-1)*nbRT+1:i*nbRT,:)*spReg(:,:,mc),nbRT*nbEIC,1), ...
            ones(nbRT*nbEIC,1)];
        tmpCov = inv_posdef(diag([spTauScale(mc-1) spTauShift(mc-1)]) + spTauEpsilon(mc-1)*tmpMat'*tmpMat); % from lightspeed
        tmpMuVec = tmpCov*(diag([spTauScale(mc-1) spTauShift(mc-1)])*[spMuScale(mc);spMuShift(mc)] ...
            + spTauEpsilon(mc-1)*tmpMat'*reshape(EICs(:,i,:),nbRT*nbEIC,1));
        tmpSp = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed
        spScale(i,mc) = tmpSp(1);
        spShift(i,mc) = tmpSp(2);
    end

    % 1/var(ei)
    tmpShape = hyShapeEpsilon + 0.5*nbRT*nbSamp*nbEIC;
    repScale = repmat(spScale(:,mc)',nbRT,1);
    vecScale = repScale(:);
    SBSTilt = scale_rows(BSTilt,vecScale); % from lightspeed
    repShift = repmat(spShift(:,mc)',nbRT,1);
    vecShift = repShift(:);
    vecTICHat = reshape(SBSTilt*spReg(:,:,mc),nbRT*nbSamp*nbEIC,1) + repmat(vecShift,nbEIC,1);
    tmpRate = hyRateEpsilon + 0.5*(vecEICs(:)-vecTICHat)'*(vecEICs(:)-vecTICHat);
    spTauEpsilon(mc) = gamrnd(tmpShape,1/tmpRate);

    % 1/var(ai)
    tmpShape = hyShapeScale + 0.5*nbSamp;
    tmpRate = hyRateScale + 0.5*sum((spScale(:,mc)-spMuScale(mc)).^2);
    spTauScale(mc) = gamrnd(tmpShape,1/tmpRate);

    % 1/var(ci)
    tmpShape = hyShapeShift + 0.5*nbSamp;
    tmpRate = hyRateShift + 0.5*sum((spShift(:,mc)-spMuShift(mc)).^2);
    spTauShift(mc) = gamrnd(tmpShape,1/tmpRate);

    % 1/var for the prototype function
    tmpShape = hyShapePsi + 0.5*nbReg*nbEIC;
    tmpMat1 = spReg(:,1,mc)*spReg(:,1,mc)';
    for b=2:nbEIC
        tmpMat1 = tmpMat1 + spReg(:,b,mc)*spReg(:,b,mc)';
    end
    tmpRate = hyRatePsi + 0.5*trace(tmpMat1*SigmaReg);
    spTauPsi(mc) = gamrnd(tmpShape,1/tmpRate);

    %% Metropolis-Hastings algo
    for i = 1:nbSamp

        % log-likelihood term of the current mapping of sample i
        tmpMap = map(i).coeff;
        tmpIdx = interp1(varKnot,tmpMap,timeGrid);
        tmpBSReg = interp1(timeExt,BSReg,tmpIdx);
        for b = 1:nbEIC
            tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);
        end
        tmpEvaln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);

        % generating blocks
        % rBound is the probability of starting a new block at each
        % interior knot
        rInd = randi(3);
        switch rInd
            case 1
                rBound = 1;
            case 2
                rBound = 0.5;
            case 3
                rBound = 0.25;
        end
        idxBound = find(rand(1,nbMap-3)<rBound)+2; % block ends at nbMap-1
        blkMH = [2,idxBound; idxBound-1,nbMap-1];  % row 1: block starts, row 2: block ends
        nbBlock = size(blkMH,2);

        % pick one of the two step sizes at random
        rStep = randi(2);
        switch rStep
            case 1
                stepMH = stepMH1;
            case 2
                stepMH = stepMH2;
        end
        if mc <= 200
            stepMH = 30; % propose big move in early MCMC iterations
        end

        for m = 1:nbBlock

            tmpMapProp = tmpMap;

            % identify moveable range
            tmpLB = tmpMap(blkMH(1,m)-1)-tmpMap(blkMH(1,m));
            tmpUB = tmpMap(blkMH(2,m)+1)-tmpMap(blkMH(2,m));

            % uniform proposal reflective on the boundary
            unBound = true;
            stepProp = stepMH*(2*rand(1)-1);
            while unBound
                if stepProp > tmpUB
                    stepProp = 2*tmpUB-stepProp;
                elseif stepProp < tmpLB
                    stepProp = 2*tmpLB-stepProp;
                else
                    unBound = false;
                end
            end

            % shift every knot of the block by the same amount
            tmpMapProp(blkMH(1,m):blkMH(2,m)) = tmpMapProp(blkMH(1,m):blkMH(2,m)) + stepProp;
            tmpIdxProp = interp1(varKnot,tmpMapProp,timeGrid);

            idxEval = varKnot(blkMH(1,m)-1):varKnot(blkMH(2,m)+1); % range of interest
            pRatioln = -0.5*sum( ((tmpIdxProp(idxEval)-meanMapTime(idxEval,i)).^2 ...
                - (tmpIdx(idxEval)-meanMapTime(idxEval,i)).^2) ...
                ./ varMapTime(idxEval,i) ); % log of prior odds

            tmpBSReg = interp1(timeExt,BSReg,tmpIdxProp);
            for b = 1:nbEIC
                tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);
            end
            tmpEvaPropln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);
            lRatioln = tmpEvaPropln-tmpEvaln;

            % count the proposal (row 2 for stepMH1 draws, row 4 for stepMH2)
            switch rStep
                case 1
                    map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
                case 2
                    map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
            end

            % accept/reject
            if rand(1) < min([1, exp(lRatioln+pRatioln)])
                tmpMap = tmpMapProp;
                tmpIdx = tmpIdxProp;
                tmpEvaln = tmpEvaPropln;
                % count the acceptance (row 1 for stepMH1 draws, row 3 for stepMH2)
                switch rStep
                    case 1
                        map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
                    case 2
                        map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;
                end
            end

        end

        % store the mapping of sample i for this iteration
        map(i).coeff = tmpMap;
        spMap(:,i,mc) = tmpMap;

    end

    if rem(mc,500)==0
        % toc/60 is fractional, so print it with a floating-point format
        fprintf('Iteration %d, time %.2f min \r', mc,toc/60);
    end

end

%% Retention time correction
% Posterior mean of the mapping knots after discarding the burn-in draws.
% The range follows nbMCMC, so changing the chain length needs no edit here.
nbBurn = 5000; % initial 5000 samples as burn-in
postKnot = ndsum(spMap(:,:,nbBurn+1:nbMCMC),3)/(nbMCMC-nbBurn);
postMap = interp1(varKnot, postKnot, timeGrid);

% Evaluate timeRes at the mapped grid positions for each sample.
crtRT = zeros(nbRT,nbSamp);
for i = 1:nbSamp
    crtRT(:,i) = interp1(timeGrid,timeRes,postMap(:,i),'pchip');
end

% Rewrite column 4 of each per-sample file with the corrected value.
% NOTE(review): column 4 is presumably the retention time -- confirm against
% the input file format.
for i = 1:nbSamp
    fileName = ['G1_' num2str(i) '.txt'];
    in_text = fullfile('sima', fileName);    % platform-independent separator
    out_text = fullfile('gpmp4', fileName);
    tmp = textread(in_text);
    newRT = interp1(timeRes,crtRT(:,i),tmp(:,4),'pchip');
    tmp(:,4) = round(newRT*100)/100;         % keep two decimals
    dlmwrite(out_text, tmp, 'delimiter', '\t', 'precision', 10, 'newline', 'pc');
end