## 1. Load in data frame containing VAF and GERP of each eligible variant

library(data.table)
options(stringsAsFactors = FALSE)
goodIDs <- read.csv("META/release_aug2016.included.final.txt",sep = "\t",header = TRUE)[,2]
x <- as.data.frame(fread("PROC/grandM2.txt"))
names(x) <- c("subFile", "uvar", "chr", "pos", "sampleID", "GERP", "CDS", "GENE",
              "sens", "FSC", "FSN", "blank", "chr_again", "pos_end", "sampleID_again",
              "type2", "VAF")

x <- x[,c(2,3,4,5,6,7,8,17)]
x <- x[x$sampleID %in% goodIDs,]
x <- x[!duplicated(x$uvar),]

x$chr_pos <- paste(x$chr, x$pos, sep="_")

## 2. Exclude variants in regions of known or supsected copy number alterations

delvar <- as.data.frame(fread("PROC/PtSV_obs/HAP/obs.som.del.uvar.txt", header=FALSE))[,1]
dupvar <- as.data.frame(fread("PROC/PtSV_obs/SOM_DUP/obs.som.dup.uvar.txt", header=FALSE))[,1]
badvar <- c(delvar, dupvar)

x$DEL <- x$uvar %in% delvar
x$DUP <- x$uvar %in% dupvar

x <- x[!x$DUP & !x$DEL,] # known copy number alterations
x <- x[x$VAF < 0.6,] # suspected copy number alterations

## 3.  Cluster GERP values into bins for aggregation

x$GERP <- as.numeric(x$GERP)
x$VAF <- as.numeric(x$VAF)
x <- x[!is.na(x$GERP) & !is.na(x$VAF),]
x$GERP[x$GERP < 0] <- 0
x$GERP_bin_round <- round(5*x$GERP)/5
x <- x[!x$DUP & !x$DEL,]
x$GERP_bin_round[x$GERP_bin_round>5] <- 5

## 4. Split data set into 
# Variants in a high confidence set of driver genes
# Variants in outside of a sensitive set driver genes
# Variants in a high confidence set of driver genes 
# that have not been individually called as driver variants by PCAWG

dr <- read.csv("META/beta_driver.txt", header = FALSE)[,1]
x$dr = x$GENE %in% dr
vogel <- read.csv("/home/fas/gerstein/wum2/scratch/VALID/META/vg.csv", header=TRUE)
x_dr$vogel <- x_dr$GENE %in% vogel$Gene.Symbol
drivers <- read.csv("/home/fas/gerstein/wum2/scratch/VALID/META/pcawg_whitelist_somatic_driver_mutations_beta.csv", sep="\t")
drivers$uvar <- paste(drivers$sample, drivers$chr, drivers$pos, sep="_")
x_dr <- x[x$dr,]
x_ndr <- x[!x$dr,]

x_dr_rec <- x_dr[x_dr$vogel,]
x_ndr_r <- x_ndr
x_dr$discDriver <- x_dr$uvar %in% drivers$uvar
x_ld <- x_dr[!x_dr$discDriver & x_dr$vogel,]

## 5. Calculate mean VAF by GERP bin in each driver category, 
## with bootstrapped confidence intervals

library(boot)
will_mean <- function(inp, i) {return(mean(inp[i]))} 
will_boot <- function(z) {
  b <- boot(data=z, statistic=will_mean, R=10)
  b.ci <- (as.numeric(quantile(as.numeric(unlist(b[2])), c(0.0275, 0.5, 0.975))))
  paste(b.ci[1], b.ci[2], b.ci[3], sep="_")
}

rboot_dr <- aggregate(x_dr_rec$VAF, by=list(x_dr_rec$GERP_bin_round), FUN=will_boot)
rboot_dr$l95 <- as.numeric(unlist(lapply(strsplit(rboot_dr[,2], "_"),head,1)))
rboot_dr$median <- as.numeric(unlist(lapply(lapply(strsplit(rboot_dr[,2], "_"),head,2), tail, 1)))
rboot_dr$u95 <- as.numeric(unlist(lapply(strsplit(rboot_dr[,2], "_"),tail,1)))
rboot_dr <- rboot_dr[,-2]
rboot_dr
write.table(rboot_dr, "spline.boot_dr.txt", quote = FALSE, sep="\t", row.names = FALSE)


rboot_ndr <- aggregate(x_ndr_r$VAF, by=list(x_ndr_r$GERP_bin_round), FUN=will_boot)
rboot_ndr$l95 <- as.numeric(unlist(lapply(strsplit(rboot_ndr[,2], "_"),head,1)))
rboot_ndr$median <- as.numeric(unlist(lapply(lapply(strsplit(rboot_ndr[,2], "_"),head,2), tail, 1)))
rboot_ndr$u95 <- as.numeric(unlist(lapply(strsplit(rboot_ndr[,2], "_"),tail,1)))
rboot_ndr <- rboot_ndr[,-2]
rboot_ndr
write.table(rboot_ndr, "spline.boot_ndr.txt", sep="\t", quote=FALSE, row.names = FALSE)


rboot_ld <- aggregate(x_ld$VAF, by=list(x_ld$GERP_bin_round), FUN=will_boot)
rboot_ld$l95 <- as.numeric(unlist(lapply(strsplit(rboot_ld[,2], "_"),head,1)))
rboot_ld$median <- as.numeric(unlist(lapply(lapply(strsplit(rboot_ld[,2], "_"),head,2), tail, 1)))
rboot_ld$u95 <- as.numeric(unlist(lapply(strsplit(rboot_ld[,2], "_"),tail,1)))
rboot_ld <- rboot_ld[,-2]
rboot_ld

write.table(rboot_ld, "spline.boot_ld.txt", quote = FALSE, sep="\t", row.names = FALSE)

## 6. Plot results

options(stringsAsFactors = F)
library(ggplot2)
pd <- position_dodge(0.1)

df$`GERP bin`=as.factor(rep(c("< 0.5", "0.5-1.5", "1.5-2.5", "2.5-3.5", "3.5-4.5", "4.5 and above"),1))

b_ndr <- read.csv("spline.boot_ndr.txt", sep="\t")
b_dr <- read.csv("spline.boot_dr.txt", sep="\t")
b_ld <- read.csv("spline.boot_ld.txt", sep="\t")

names(b_ndr) <- c("GERP bin", "l95", "mean VAF", "u95")
names(b_dr) <- c("GERP bin", "l95", "mean VAF", "u95")
names(b_ld) <- c("GERP bin", "l95", "mean VAF", "u95")

b_ndr$Variant_Category <- "Outside driver gene-sets"
b_dr$Variant_Category <- "Driver gene-set"
b_ld$Variant_Category <- "Driver gene-set excluding identified driver variants"
df <- rbind(b_ndr, b_dr, b_ld)
df$`GERP bin`=as.factor(rep(b_ndr[,1],3))

df <- df[df$`GERP bin`!=0,]
pdf("VAF_GERP_linear.pdf")
ggplot(df, aes(x=`GERP bin`, y=`mean VAF`, colour=Variant_Category, group=Variant_Category)) + 
  geom_point() + 
  theme(text = element_text(size=10)) +
  geom_smooth(se = TRUE, method = "gam")
dev.off()