%% Description

% Data set: proteomics

% Method: multi-profile alignment

%

% ----- data_matrix_proteomics.mat -----

% proData: binned matrix 20 x 1000 x 3200 (sample x RT points x mz bins)

% timeRes: registered time index

% nbSamp: # of samples (20)

% nbRT: # of RT points (1000)

% nbMZ: # of mz bins (3200)

% mzLow: mz lower bound for each bin

% mzHi: mz upper bound for each bin



%% Required utilities

% ----- attached functions -----

% coda() to calculate MCQ values

% priorRatioLn() to calculate the prior ratio (in log) when GP prior is not available

%

% ----- functions from elsewhere -----

% inv_posdef(), randnorm(), scale_rows(), ndsum() from Tom Minka's Lightspeed toolbox,

% downloaded at http://research.microsoft.com/en-us/um/people/minka/software/lightspeed/

% randraw() from File Exchange at MATLAB Central,

% downloaded at http://www.mathworks.com/matlabcentral/fileexchange/7309

% bsplinebasis() from Scott Gaffney's CCToolbox,

% downloaded at http://www.ics.uci.edu/~sgaffney/software/CCT/

% apcluster() by Frey Lab,

% downloaded at http://www.psi.toronto.edu/index.php?q=affinity%20propagation



%% Load the proteomic data set

clear all; close all; clc;


load data_matrix_proteomics.mat


[nbSamp,nbRT,nbMZ] = size(proData);



%% 1st-phase screening based on MCQ

binICs = proData;

winQ = 3;

% considering the worst case

mcq = coda(squeeze(binICs(1,:,:)),winQ);

for i = 2:nbSamp

mcq = min([mcq; coda(squeeze(binICs(i,:,:)),winQ)],[],1);

end


idxQ = find(mcq>=0.9);

binQICs = binICs(:,:,idxQ);

mcqQ = mcq(idxQ);

nbQBin = numel(idxQ);


% scale the remaining chromatograms (profile is more important than abosolute amount)

for b = 1:nbQBin

for i = 1:nbSamp

binQICs(i,:,b) = binQICs(i,:,b)./ndsum(binQICs(i,:,b),2);

end

end



%% 2nd-phase screening based on reproducibility among samples (with xcorr)

xcMat = zeros(nbSamp,nbSamp,nbQBin);

xcAve = zeros(1,nbQBin);

rootE = zeros(1,nbSamp); % root of energy used for normalization

winX = 40;


for b = 1:nbQBin

for i = 1:nbSamp

rootE(i) = sqrt(sum((binQICs(i,:,b).*binQICs(i,:,b))));

xcMat(i,i,b) = 1;

end

for i = 1:nbSamp

for j = i+1:nbSamp

tmpXC = xcorr(binQICs(i,:,b),binQICs(j,:,b),winX);

idxXC = maxind(tmpXC);

if isempty(idxXC)

xcMat(i,j,b) = 0;

else

idxXC(find(tmpXC(idxXC)<0.5*max(tmpXC(idxXC)))) = [];

[~,idxx] = min(abs(idxXC-(winX+1)));

xcMat(i,j,b) = tmpXC(idxXC(idxx))/(rootE(i)*rootE(j));

end

xcMat(j,i,b) = xcMat(i,j,b);

end

end

xcAve(b) = (ndsum(xcMat(:,:,b),1:2)-nbSamp)/(nbSamp*nbSamp-nbSamp);

end


idxC = find(xcAve>=0.85);

binQCICs = binQICs(:,:,idxC);

mcqQC = mcqQ(idxC);

nbQCBin = numel(idxC);

xcAveQC = xcAve(idxC);



%% identify exemplars using affinity propagation (correlation coeff. as similarity)

ccQC = zeros(nbQCBin);

for b = 1:nbSamp

ccQC = ccQC + corr(squeeze(binQCICs(b,:,:)));

end

ccQC = ccQC/nbSamp;


sim = zeros(nbQCBin*nbQCBin-nbQCBin,3);

cnt = 1;

for b = 1:nbQCBin

for p = [1:b-1,b+1:nbQCBin]

sim(cnt,1)=b; sim(cnt,2)=p; sim(cnt,3)=ccQC(b,p);

cnt=cnt+1;

end

end

prefSim = mean(sim(:,3));

[idxExmp,~,~,~] = apcluster(sim,prefSim);



%% agglomerative clustering of the exemplars (based on overlapping level)

nbExmp = numel(unique(idxExmp));

exmpICs = binQCICs(:,:,unique(idxExmp));


tmpNum = nbExmp;

tmpICs = exmpICs;

tmpIdx = cell(1,tmpNum);

for b = 1:tmpNum

tmpIdx{b} = b;

end


for l = 1:nbExmp-1

tmpDis = zeros(tmpNum*(tmpNum-1)/2,3);

cnt = 1;

for b = 1:tmpNum-1

for p = b+1:tmpNum

tmpDis(cnt,1) = b;

tmpDis(cnt,2) = p;

for i = 1:nbSamp

tmpDis(cnt,3) = tmpDis(cnt,3) + sum(min(squeeze(tmpICs(i,:,[b,p])),[],2));

end

cnt = cnt+1;

end

end

[layer(l).ovp, idxPair] = min(tmpDis(:,3));

tmpICs(:,:,tmpDis(idxPair,1)) = tmpICs(:,:,tmpDis(idxPair,1))+tmpICs(:,:,tmpDis(idxPair,2));

tmpICs(:,:,tmpDis(idxPair,2)) = [];

tmpIdx{tmpDis(idxPair,1)} = [tmpIdx{tmpDis(idxPair,1)},tmpIdx{tmpDis(idxPair,2)}];

tmpIdx(tmpDis(idxPair,2)) = [];

tmpNum = tmpNum-1;

layer(l).num = tmpNum;

layer(l).idx = tmpIdx;

end


nbEIC = 4; % pre-defined value

EICs = zeros(nbSamp,nbRT,nbEIC);

for b=1:4

EICs(:,:,b) = ndsum(exmpICs(:,:,layer(numel(layer)-nbEIC+1).idx{b}),3);

end

EICs = EICs*10; % scale to a range of [0,10]


EICs = permute(EICs,[2 1 3]);

timeGrid = (1:nbRT)';


clear chrom proData agg aggPair bb binEdge binICs binQCICs binQICs ccQC cnt ...

exmpICs idxAgg idxC idxExmp idxQ idxXC idxx mcq mcqQ mcqQC ...

mzHi mzLow nbExmp nbMZ nbBin nbQBin nbQCBin p prefAgg prefSim rootE ...

sim tmpXC winQ winX xcAve xcAveQC xcMat xcMin xcMinQC i j



%% B-spline

order = 3;

timeExt = timeGrid;


denKnotReg = 0.5; % density of knots for prototype function (0.25--0.75)

denKnotMap = 0.025; % density of knots for mapping function (<= 0.1)

ptStart = timeExt(1);

ptEnd = timeExt(end);

lenBS = length(timeExt);

nbKnotReg = ceil(lenBS*denKnotReg);

knotsReg = unique(linspace(ptStart,ptEnd,nbKnotReg));

knotsReg = [knotsReg(1)*ones(1,order) knotsReg(2:(end-1)) ...

knotsReg(end)*ones(1,order)];

nbReg = length(knotsReg)-order;

BSReg = bsplinebasis(knotsReg,order,timeExt);

muReg = zeros(nbReg,1);


nbMap = ceil((timeGrid(end)-timeGrid(1))*denKnotMap);

varKnot = unique(round(linspace(timeGrid(1),timeGrid(end),nbMap)))';


for i = 1:nbSamp

map(i).coeff = varKnot;

map(i).acpt = zeros(4,nbRT-1);

end



%% Hyperparameters

hyMuScale = 1;

hyMuShift = 0;

hyTauScale = 1/0.5;

hyTauShift = 1/0.5;

hyShapeScale = 0.1;

hyRateScale = 1;

hyShapeShift = 0.1;

hyRateShift = 1;

hyShapePsi = 0.1;

hyRatePsi = 1;

hyShapeEpsilon = 0.1;

hyRateEpsilon = 0.2;

hyStd = 0.1;

SigmaReg = diag([2*ones(1,nbReg-1) 1],0) + diag(-1*ones(1,nbReg-1),1) ...

+diag(-1*ones(1,nbReg-1),-1);



%% MCMC setting/initialization

nbMCMC = 15000;


% Space allocation MCMC runs

spMuScale = zeros(1,nbMCMC); % a0

spMuShift = zeros(1,nbMCMC); % c0

spScale = zeros(nbSamp,nbMCMC); % ai

spShift = zeros(nbSamp,nbMCMC); % ci

spTauScale = zeros(1,nbMCMC); % 1/var(ai)

spTauShift = zeros(1,nbMCMC); % 1/var(ci)

spTauEpsilon = zeros(1,nbMCMC); % 1/var(ei)

spTauPsi = zeros(1,nbMCMC); % 1/var for regression coeff

spReg = zeros(nbReg,nbEIC,nbMCMC); % regression coeff (prototype function)

spMap = zeros(nbMap,nbSamp,nbMCMC);% mapping function coeff


% Initial value assignment

spMuScale(1) = hyMuScale;

spMuShift(1) = hyMuShift;

spScale(:,1) = hyMuScale*ones(nbSamp,1);

spShift(:,1) = hyMuShift*ones(nbSamp,1);

spTauScale(1) = hyShapeScale/hyRateScale;

spTauShift(1) = hyShapeShift/hyRateShift;

spTauEpsilon(1) = hyShapeEpsilon/hyRateEpsilon;

spTauPsi(1) = hyShapePsi/hyRatePsi;

spMap(:,:,1) = repmat(varKnot,1,nbSamp);


% Metropolis step

stepMH1 = 3;

stepMH2 = 10;


%% Run MCMC

BSTilt = repmat(BSReg,nbSamp,1); % BS_i (space allocation)

vecScale = ones(nbSamp*nbRT,1);

vecShift = ones(nbSamp*nbRT,1);

vecEICs = zeros(nbSamp*nbRT,nbEIC);

for b = 1:nbEIC

vecEICs(:,b) = reshape(EICs(:,:,b),nbSamp*nbRT,1);

end


rng default % reset the random seed

tic % initialize timer


for mc = 2:nbMCMC

% Matrix manipulation

idxMat = interp1(varKnot, spMap(:,:,mc-1), timeGrid);

BSTilt = interp1(timeExt,BSReg,idxMat(:));

repScale = repmat(spScale(:,mc-1)',nbRT,1);

vecScale = repScale(:);

SBSTilt = scale_rows(BSTilt,vecScale); % a_i*BS_i from lightspeed

repShift = repmat(spShift(:,mc-1)',nbRT,1);

vecShift = repShift(:);

%% Gibbs sampling goes below

% regression coefficients of prototype function

invCovReg = SigmaReg*spTauPsi(mc-1);

tmpCov = inv_posdef(invCovReg + (SBSTilt'*SBSTilt)*spTauEpsilon(mc-1)); % from lightspeed

for b = 1:nbEIC

tmpMuVec = tmpCov*SBSTilt'*(vecEICs(:,b)-vecShift)*spTauEpsilon(mc-1);

spReg(:,b,mc) = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed

end

% a0

tmpVar = (hyTauScale + nbSamp*spTauScale(mc-1))^(-1);

tmpMu = tmpVar*(hyMuScale*hyTauScale + sum(spScale(:,mc-1))*spTauScale(mc-1));

spMuScale(mc) = tmpMu + sqrt(tmpVar)*randn(1);

% c0

tmpVar = (hyTauShift + nbSamp*spTauShift(mc-1))^(-1);

tmpMu = tmpVar*(hyMuShift*hyTauShift + sum(spShift(:,mc-1))*spTauShift(mc-1));

spMuShift(mc) = tmpMu + sqrt(tmpVar)*randn(1);

% (ai, ci)

for i = 1:nbSamp

tmpMat = [reshape(BSTilt((i-1)*nbRT+1:i*nbRT,:)*spReg(:,:,mc),nbRT*nbEIC,1), ...

ones(nbRT*nbEIC,1)];

tmpCov = inv_posdef(diag([spTauScale(mc-1) spTauShift(mc-1)]) + spTauEpsilon(mc-1)*tmpMat'*tmpMat); % from lightspeed

tmpMuVec = tmpCov*(diag([spTauScale(mc-1) spTauShift(mc-1)])*[spMuScale(mc);spMuShift(mc)]...

+ spTauEpsilon(mc-1)*tmpMat'*reshape(EICs(:,i,:),nbRT*nbEIC,1));

tmpSp = randnorm(1,tmpMuVec,[],tmpCov); % from lightspeed

spScale(i,mc) = tmpSp(1);

spShift(i,mc) = tmpSp(2);

end

% 1/var(ei)

tmpShape = hyShapeEpsilon + 0.5*nbRT*nbSamp*nbEIC;

repScale = repmat(spScale(:,mc)',nbRT,1);

vecScale = repScale(:);

SBSTilt = scale_rows(BSTilt,vecScale); % from lightspeed

repShift = repmat(spShift(:,mc)',nbRT,1);

vecShift = repShift(:);

vecTICHat = reshape(SBSTilt*spReg(:,:,mc),nbRT*nbSamp*nbEIC,1) + repmat(vecShift,nbEIC,1);

tmpRate = hyRateEpsilon + 0.5*(vecEICs(:)-vecTICHat)'*(vecEICs(:)-vecTICHat);

spTauEpsilon(mc) = gamrnd(tmpShape,1/tmpRate);

% 1/var(ai)

tmpShape = hyShapeScale + 0.5*nbSamp;

tmpRate = hyRateScale + 0.5*sum((spScale(:,mc)-spMuScale(mc)).^2);

spTauScale(mc) = gamrnd(tmpShape,1/tmpRate);

% 1/var(ci)

tmpShape = hyShapeShift + 0.5*nbSamp;

tmpRate = hyRateShift + 0.5*sum((spShift(:,mc)-spMuShift(mc)).^2);

spTauShift(mc) = gamrnd(tmpShape,1/tmpRate);

% 1/var for the prototype function

tmpShape = hyShapePsi + 0.5*nbReg*nbEIC;

tmpMat1 = spReg(:,1,mc)*spReg(:,1,mc)';

for b=2:nbEIC

tmpMat1 = tmpMat1 + spReg(:,b,mc)*spReg(:,b,mc)';

end

tmpRate = hyRatePsi + 0.5*trace(tmpMat1*SigmaReg);

spTauPsi(mc) = gamrnd(tmpShape,1/tmpRate);

%% Metropolis-Hastings algo

for i = 1:nbSamp

tmpMap = map(i).coeff;

tmpIdx = interp1(varKnot,tmpMap,timeGrid);

tmpBSReg = interp1(timeExt,BSReg,tmpIdx);

for b = 1:nbEIC

tmpEICs(:,b) = spScale(i,mc)*tmpBSReg*spReg(:,b,mc) + spShift(i,mc);

end

tmpEvaln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);

% generating blocks

rInd = randi(3);

switch rInd

case 1

rBound = 1;

case 2

rBound = 0.5;

case 3

rBound = 0.25;

end

idxBound = find(rand(1,nbMap-3)<rBound)+2; % block ends at nbMap-1

blkMH = [2,idxBound; idxBound-1,nbMap-1];

nbBlock = size(blkMH,2);

rStep = randi(2);

switch rStep

case 1

stepMH = stepMH1;

case 2

stepMH = stepMH2;

end

for m = 1:nbBlock

tmpMapProp = tmpMap;

% identify moveable range

tmpLB = tmpMap(blkMH(1,m)-1)-tmpMap(blkMH(1,m));

tmpUB = tmpMap(blkMH(2,m)+1)-tmpMap(blkMH(2,m));

% uniform proposal reflective on the boundary

unBound = true;

stepProp = stepMH*(2*rand(1)-1);

while unBound

if stepProp > tmpUB

stepProp = 2*tmpUB-stepProp;

elseif stepProp < tmpLB

stepProp = 2*tmpLB-stepProp;

else

unBound = false;

end

end

tmpMapProp(blkMH(1,m):blkMH(2,m)) = tmpMapProp(blkMH(1,m):blkMH(2,m)) + stepProp;

pRatioln = priorRatioLn(tmpMap, varKnot, blkMH(1,m):blkMH(2,m), stepProp, hyStd);

tmpIdx = interp1(varKnot,tmpMapProp,timeGrid);

tmpBSRegProp = interp1(timeExt,BSReg,tmpIdx);

for b = 1:nbEIC

tmpEICs(:,b) = spScale(i,mc)*tmpBSRegProp*spReg(:,b,mc) + spShift(i,mc);

end

tmpEvaPropln = -0.5*spTauEpsilon(mc)* ndsum((squeeze(EICs(:,i,:))-tmpEICs).^2, [1 2]);

lRatioln = tmpEvaPropln-tmpEvaln;

switch rStep

case 1

map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(2,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

case 2

map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(4,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

end

if rand(1) < min([1, exp(lRatioln+pRatioln)])

tmpMap = tmpMapProp;

tmpEvaln = tmpEvaPropln;

switch rStep

case 1

map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(1,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

case 2

map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) = map(i).acpt(3,varKnot(blkMH(1,m):blkMH(2,m))-1) + 1;

end

end

end

map(i).coeff = tmpMap;

spMap(:,i,mc) = tmpMap;

end

if rem(mc,500)==0

fprintf('Iteration %d, time %d \r', mc,toc/60);

end

end



%% retention time correction

postKnot = ndsum(spMap(:,:,5001:15000),3)/10000; % initial 5000 samples as burn-in

postMap = interp1(varKnot, postKnot, timeGrid);


crtRT = zeros(nbRT,nbSamp);

for i = 1:nbSamp

crtRT(:,i) = interp1(timeGrid,timeRes,postMap(:,i),'pchip');

end


for i = 1:nbSamp

in_text = ['sima\P1_' num2str(i) '.txt'];

out_text = ['mp4\P1_' num2str(i) '.txt'];

tmp = textread(in_text);

newRT = interp1(timeRes,crtRT(:,i),tmp(:,4),'pchip');

tmp(:,4) = round(newRT*100)/100;

dlmwrite(out_text, tmp, 'delimiter', '\t', 'precision', 10, 'newline', 'pc');

end