import sys import os import subprocess import glob import gzip import math import numpy as np import re #bedtools intersect -a /gpfs/scratch/fas/gerstein/cortex/PCAWG-AUG2016/SNV/somatic/spectrum_analysis/noncoding/hImpact/Lung-AdenoCA.noncoding.highImpact.txt -b ./VAF_info/Lung-AdenoCA.VAFInfo.updated.txt -r -f 1.0 -wo def readTumorId(fileInfo): donorId = [] tumorId = [] tumorIdMap = {} fileName = open(fileInfo,'r') if fileName: for line in fileName: line = line.strip() donorId.append(line.split()[0]) tumorId.append(line.split()[1]) tumorIdMap = dict(zip(donorId,tumorId)) return tumorIdMap def extractTumorType(histopathFile): tumorTuple = () tumorType = [] fileName = open(histopathFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: tumorTuple = (line.split()[1],line.split()[2]) tumorType.append(tumorTuple) return list(set(tumorType)) def stratifyTumorId(histopathFile,tumorIdMap,tumorType): sampleIdList = [] outputSNVFile = '' fileName = open(histopathFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() if (line.split()[0] in tumorIdMap.keys()): if (line.split()[1] == tumorType[0]) and (line.split()[2] == tumorType[1]): if len(tumorIdMap[line.split()[0]].split(',')) == 1: sampleIdList.append(tumorIdMap[line.split()[0]].split(',')[0]) elif len(tumorIdMap[line.split()[0]].split(',')) > 1: for element in tumorIdMap[line.split()[0]].split(','): sampleIdList.append(element) return list(set(sampleIdList)) def cancerGeneInfo(CGCFileInfo): cgcList = [] fileName = open(CGCFileInfo,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() cgcList.append(line.split()[0]) return cgcList def intersectFile(impactSNVFile,subcloneInputFile,flag): workDir = flag+'/noncoding/' intersect1 = 'bedtools intersect -a '+ workDir + impactSNVFile +' -b ./VAF_info/'+subcloneInputFile +' -r -f 1.0 -wo | awk \'{ print $1"\\t"$2"\\t"$3"\\t"$4"\\t"$5"\\t"$6"\\t"$15}\' > tmp.'+flag+'.bed' print intersect1 p0 = subprocess.Popen(intersect1,shell=True) p0.communicate() def calcMATHScore(tmpOutFile,sampleId,cancerType): vafList = [] updatedVAFList = [] mathScore= 'NA' fileName = open(tmpOutFile,'r') if fileName: for line in fileName: line = line.strip() if line.split()[5] == sampleId and re.match("^\d+?\.\d+?$", line.split()[6]) and float(line.split()[6]) > 0.075: vafList.append(float(line.split()[6])) if vafList: medianVAF = np.median(vafList) for VAF in vafList: updatedVAFList.append(abs(VAF-medianVAF)) if updatedVAFList: MATH=100*1.4826*(np.median(updatedVAFList)/medianVAF) mathScore = str(MATH) return mathScore def readMutationRate(inputFile): sampleIdList = [] mutRateList = [] fileName = open(inputFile,'r') if fileName: for index in range(1,2): fileName.next() for line in fileName: line = line.strip() sampleId = line.split()[1] mutRate = line.split()[3] sampleIdList.append(sampleId) mutRateList.append(mutRate) mutRateMap = dict(zip(sampleIdList,mutRateList)) return mutRateMap def main (): freqOrigMap = {} freqRandMap = {} normalizedFreqMap = {} pcawgTumor = {} freqList = [] tumorClass = [] pcawgTumor = readTumorId(sys.argv[1]) # read the released TumorId tumorClass = extractTumorType(sys.argv[2]) # map the tumorId to corresponding Tumor types and subtypes mutationRateMap = readMutationRate(sys.argv[3]) for item in tumorClass: if item[0] != 'NA': fileOut = open(item[0]+'.'+sys.argv[4]+'.mathScore.txt','a') sampId = stratifyTumorId(sys.argv[2],pcawgTumor,item) print sampId #intersectFile(item[0]+'.coding.nominal.txt',item[0]+'.VAFInfo.txt','nominal') intersectFile(item[0]+'.'+sys.argv[4]+'.neutral.txt',item[0]+'.VAFInfo.txt','neutral') intersectFile(item[0]+'.'+sys.argv[4]+'.hImpact.txt',item[0]+'.VAFInfo.txt','hImpact') for element in sampId: if element in mutationRateMap.keys(): mathValueHigh = calcMATHScore('tmp.hImpact.bed',element,item[0]) mathValueNeut = calcMATHScore('tmp.neutral.bed',element,item[0]) fileOut.write(item[0]+'\t'+element+'\t'+str(mathValueHigh)+"\t"+str(mathValueNeut)+'\t'+str(mutationRateMap[element])) fileOut.write("\n") if __name__ == '__main__': main()