## 1. Load in data frame containing VAF and GERP of each eligible variant library(data.table) options(stringsAsFactors = FALSE) goodIDs <- read.csv("META/release_aug2016.included.final.txt",sep = "\t",header = TRUE)[,2] x <- as.data.frame(fread("PROC/grandM2.txt")) names(x) <- c("subFile", "uvar", "chr", "pos", "sampleID", "GERP", "CDS", "GENE", "sens", "FSC", "FSN", "blank", "chr_again", "pos_end", "sampleID_again", "type2", "VAF") x <- x[,c(2,3,4,5,6,7,8,17)] x <- x[x$sampleID %in% goodIDs,] x <- x[!duplicated(x$uvar),] x$chr_pos <- paste(x$chr, x$pos, sep="_") ## 2. Exclude variants in regions of known or supsected copy number alterations delvar <- as.data.frame(fread("PROC/PtSV_obs/HAP/obs.som.del.uvar.txt", header=FALSE))[,1] dupvar <- as.data.frame(fread("PROC/PtSV_obs/SOM_DUP/obs.som.dup.uvar.txt", header=FALSE))[,1] badvar <- c(delvar, dupvar) x$DEL <- x$uvar %in% delvar x$DUP <- x$uvar %in% dupvar x <- x[!x$DUP & !x$DEL,] # known copy number alterations x <- x[x$VAF < 0.6,] # suspected copy number alterations ## 3. Cluster GERP values into bins for aggregation x$GERP <- as.numeric(x$GERP) x$VAF <- as.numeric(x$VAF) x <- x[!is.na(x$GERP) & !is.na(x$VAF),] x$GERP[x$GERP < 0] <- 0 x$GERP_bin_round <- round(5*x$GERP)/5 x <- x[!x$DUP & !x$DEL,] x$GERP_bin_round[x$GERP_bin_round>5] <- 5 ## 4. Split data set into # Variants in a high confidence set of driver genes # Variants in outside of a sensitive set driver genes # Variants in a high confidence set of driver genes # that have not been individually called as driver variants by PCAWG dr <- read.csv("META/beta_driver.txt", header = FALSE)[,1] x$dr = x$GENE %in% dr vogel <- read.csv("/home/fas/gerstein/wum2/scratch/VALID/META/vg.csv", header=TRUE) x_dr$vogel <- x_dr$GENE %in% vogel$Gene.Symbol drivers <- read.csv("/home/fas/gerstein/wum2/scratch/VALID/META/pcawg_whitelist_somatic_driver_mutations_beta.csv", sep="\t") drivers$uvar <- paste(drivers$sample, drivers$chr, drivers$pos, sep="_") x_dr <- x[x$dr,] x_ndr <- x[!x$dr,] x_dr_rec <- x_dr[x_dr$vogel,] x_ndr_r <- x_ndr x_dr$discDriver <- x_dr$uvar %in% drivers$uvar x_ld <- x_dr[!x_dr$discDriver & x_dr$vogel,] ## 5. Calculate mean VAF by GERP bin in each driver category, ## with bootstrapped confidence intervals library(boot) will_mean <- function(inp, i) {return(mean(inp[i]))} will_boot <- function(z) { b <- boot(data=z, statistic=will_mean, R=10) b.ci <- (as.numeric(quantile(as.numeric(unlist(b[2])), c(0.0275, 0.5, 0.975)))) paste(b.ci[1], b.ci[2], b.ci[3], sep="_") } rboot_dr <- aggregate(x_dr_rec$VAF, by=list(x_dr_rec$GERP_bin_round), FUN=will_boot) rboot_dr$l95 <- as.numeric(unlist(lapply(strsplit(rboot_dr[,2], "_"),head,1))) rboot_dr$median <- as.numeric(unlist(lapply(lapply(strsplit(rboot_dr[,2], "_"),head,2), tail, 1))) rboot_dr$u95 <- as.numeric(unlist(lapply(strsplit(rboot_dr[,2], "_"),tail,1))) rboot_dr <- rboot_dr[,-2] rboot_dr write.table(rboot_dr, "spline.boot_dr.txt", quote = FALSE, sep="\t", row.names = FALSE) rboot_ndr <- aggregate(x_ndr_r$VAF, by=list(x_ndr_r$GERP_bin_round), FUN=will_boot) rboot_ndr$l95 <- as.numeric(unlist(lapply(strsplit(rboot_ndr[,2], "_"),head,1))) rboot_ndr$median <- as.numeric(unlist(lapply(lapply(strsplit(rboot_ndr[,2], "_"),head,2), tail, 1))) rboot_ndr$u95 <- as.numeric(unlist(lapply(strsplit(rboot_ndr[,2], "_"),tail,1))) rboot_ndr <- rboot_ndr[,-2] rboot_ndr write.table(rboot_ndr, "spline.boot_ndr.txt", sep="\t", quote=FALSE, row.names = FALSE) rboot_ld <- aggregate(x_ld$VAF, by=list(x_ld$GERP_bin_round), FUN=will_boot) rboot_ld$l95 <- as.numeric(unlist(lapply(strsplit(rboot_ld[,2], "_"),head,1))) rboot_ld$median <- as.numeric(unlist(lapply(lapply(strsplit(rboot_ld[,2], "_"),head,2), tail, 1))) rboot_ld$u95 <- as.numeric(unlist(lapply(strsplit(rboot_ld[,2], "_"),tail,1))) rboot_ld <- rboot_ld[,-2] rboot_ld write.table(rboot_ld, "spline.boot_ld.txt", quote = FALSE, sep="\t", row.names = FALSE) ## 6. Plot results options(stringsAsFactors = F) library(ggplot2) pd <- position_dodge(0.1) df$`GERP bin`=as.factor(rep(c("< 0.5", "0.5-1.5", "1.5-2.5", "2.5-3.5", "3.5-4.5", "4.5 and above"),1)) b_ndr <- read.csv("spline.boot_ndr.txt", sep="\t") b_dr <- read.csv("spline.boot_dr.txt", sep="\t") b_ld <- read.csv("spline.boot_ld.txt", sep="\t") names(b_ndr) <- c("GERP bin", "l95", "mean VAF", "u95") names(b_dr) <- c("GERP bin", "l95", "mean VAF", "u95") names(b_ld) <- c("GERP bin", "l95", "mean VAF", "u95") b_ndr$Variant_Category <- "Outside driver gene-sets" b_dr$Variant_Category <- "Driver gene-set" b_ld$Variant_Category <- "Driver gene-set excluding identified driver variants" df <- rbind(b_ndr, b_dr, b_ld) df$`GERP bin`=as.factor(rep(b_ndr[,1],3)) df <- df[df$`GERP bin`!=0,] pdf("VAF_GERP_linear.pdf") ggplot(df, aes(x=`GERP bin`, y=`mean VAF`, colour=Variant_Category, group=Variant_Category)) + geom_point() + theme(text = element_text(size=10)) + geom_smooth(se = TRUE, method = "gam") dev.off()