import sys import os import subprocess import glob import gzip import collections from scipy.stats import ks_2samp from scipy.stats import mstats import Statistics def correct_pvalues_for_multiple_testing(pvalues, correction_type): """ consistent with R - print correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05, 0.069, 0.07, 0.071, 0.09, 0.1]) """ from numpy import array, empty pvalues = array(pvalues) n = float(pvalues.shape[0]) new_pvalues = empty(n) if correction_type == "Bonferroni": new_pvalues = n * pvalues elif correction_type == "Bonferroni-Holm": values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ] values.sort() for rank, vals in enumerate(values): pvalue, i = vals new_pvalues[i] = (n-rank) * pvalue elif correction_type == "Benjamini-Hochberg": values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ] values.sort() values.reverse() new_values = [] for i, vals in enumerate(values): rank = n - i pvalue, index = vals new_values.append((n/rank) * pvalue) for i in xrange(0, int(n)-1): if new_values[i] < new_values[i+1]: new_values[i+1] = new_values[i] for i, vals in enumerate(values): pvalue, index = vals new_pvalues[index] = new_values[i] return new_pvalues def readDriverInfo(driverInputFile): cdsList = [] ncList = [] fileName = open(driverInputFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() if line.split()[9] == "snv": snpInfo = line.split()[4]+":"+"chr"+line.split()[5]+":"+line.split()[6]+":"+line.split()[7]+":"+line.split()[8] if line.split()[1] == "CDS" or line.split()[1] == "splice_sites": cdsList.append(snpInfo) if line.split()[1] not in ['CDS','splice_sites']: ncList.append(snpInfo) return cdsList,ncList ''' def readDriverList(inputDriverFile): driverList = [] fileName = open(inputDriverFile,'r') if fileName: for line in fileName: line = line.strip() if line.split()[11] == "known": driverSNPInfo = line.split()[0]+":chr"+line.split()[2]+":"+line.split()[3]+":"+line.split()[4]+":"+line.split()[5] driverList.append(driverSNPInfo) return driverList ''' def readTumorId(fileInfo): donorId = [] tumorId = [] tumorIdMap = {} fileName = open(fileInfo,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() donorId.append(line.split()[0]) tumorId.append(line.split()[1]) tumorIdMap = dict(zip(donorId,tumorId)) return tumorIdMap def extractTumorType(histopathFile): tumorTuple = () tumorType = [] fileName = open(histopathFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: tumorTuple = (line.split()[1],line.split()[2]) tumorType.append(tumorTuple) return list(set(tumorType)) def stratifyTumorId(histopathFile,tumorIdMap,tumorType): sampleIdList = [] outputSNVFile = '' workDir = '/SNV/somatic/' fileName = open(histopathFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() if (line.split()[0] in tumorIdMap.keys()): if (line.split()[1] == tumorType[0]) and (line.split()[2] == tumorType[1]): if len(tumorIdMap[line.split()[0]].split(',')) == 1: sampleIdList.append(tumorIdMap[line.split()[0]].split(',')[0]) elif len(tumorIdMap[line.split()[0]].split(',')) > 1: for element in tumorIdMap[line.split()[0]].split(','): sampleIdList.append(element) return list(set(sampleIdList)) def generateSigMatrix(inputFile): keyList = [] valueList = [] sigMatrixDict = {} fileName = open(inputFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() #sigKey = line.split()[0]+"_at_"+line.split()[1] sigKey = line.split()[60] #sigValue = line.split()[2:] sigValue = line.split()[0:60] keyList.append(sigKey) valueList.append(sigValue) sigMatrixDict = dict(zip(keyList,valueList)) return sigMatrixDict def calcSigDifferences(inputFile,signatureDict,sigIndex): signatureComposeList = [] fileName = open(inputFile,'r') if fileName: for line in fileName: line = line.strip() chromNum = line.split()[0] snvPos = line.split()[2] refAllele = line.split()[3] altAllele = line.split()[4] mutContext = line.split()[6] mutSigInfo = refAllele+">"+altAllele+"_at_"+mutContext if mutSigInfo in signatureDict.keys(): sigContribution = signatureDict[mutSigInfo][int(sigIndex)] signatureComposeList.append(float(sigContribution)) return signatureComposeList def main(): codingDriverList = [] ncDriverList = [] workDir = '/somatic/funseq_out/' impactDir = '/signatureAnalysis/dataFile/' signatureTuple = () signTupList = [] pValueList = [] medianSigHi = 0.0 medianSigLo = 0.0 pcawgTumor = readTumorId(sys.argv[1]) tumorClass = extractTumorType(sys.argv[2]) signatureMap = generateSigMatrix(sys.argv[3]) for item in tumorClass: if glob.glob(impactDir+item[0]+'.mutation.hImpact.updated.txt'): for index in range(0,60): sigHi = calcSigDifferences(impactDir+item[0]+'.mutation.hImpact.updated.txt',signatureMap,index) sigLo = calcSigDifferences(impactDir+item[0]+'.mutation.neutral.txt',signatureMap,index) meanSigHi = Statistics.mean(sigHi) #medianSigHi = Statistics.median(sigHi) meanSigLo = Statistics.mean(sigLo) #medianSigLo = Statistics.median(sigLo) significance = mstats.ks_twosamp(sigHi,sigLo)[1] signatureTuple = (item[0],str(index+1),meanSigHi,meanSigLo,str((meanSigHi-meanSigLo)/meanSigLo)) signTupList.append(signatureTuple) pValueList.append(significance) print item[0]+"\t"+str(index+1)+"\t"+str(meanSigHi)+"\t"+str(meanSigLo)+"\t"+str(meanSigHi-meanSigLo)+"\t"+str((meanSigHi-meanSigLo)/meanSigLo)+"\t"+str(significance) if __name__ == '__main__': main()