library(data.table)
library(VariantAnnotation)
library(GenomicRanges)
library(ggplot2)
library(grid)
library(corrplot)
library(ROCR)
library(Matching)
library(gridExtra)

#' check whether all needed data files are available
stopifnot(file.exists('promoters_Gm12878rep2_dnase_validated.gff'))
stopifnot(file.exists('promoters_Gm12878rep2_hostgene.gff'))
stopifnot(file.exists('promoters_Gm12878rep2_intronic.gff'))
stopifnot(file.exists('promoters_unspecific.txt'))
stopifnot(file.exists('host_mirna_independence.txt'))
stopifnot(file.exists('cutting_points.bed'))
stopifnot(file.exists('hostgenes_mirna.txt'))
stopifnot(file.exists('hsa.gff2'))
stopifnot(file.exists('hsa.gff3'))
stopifnot(file.exists('id_mapping.txt'))
stopifnot(file.exists('disease_categories.txt'))
stopifnot(file.exists('gwascatalog.txt'))
stopifnot(file.exists('ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz'))
stopifnot(file.exists('wgEncodeBroadHmmGm12878HMM.bed.gz'))
stopifnot(file.exists('wgEncodeOpenChromDnaseGm12878Pk.narrowPeak.gz'))
stopifnot(file.exists('wgEncodeRegTfbsClusteredWithCellsV3.bed.gz'))
stopifnot(file.exists('EUR363.mi.cis.FDR5.all.rs137.txt.gz'))
stopifnot(file.exists('YRI89.mi.cis.FDR5.all.rs137.txt.gz'))
stopifnot(file.exists('GD480.MirnaQuantCount.txt.gz'))
stopifnot(file.exists('EUR373.gene.cis.FDR5.all.rs137.txt.gz'))
stopifnot(file.exists('EUR373.exon.cis.FDR5.all.rs137.txt.gz'))
stopifnot(file.exists('YRI89.exon.cis.FDR5.all.rs137.txt.gz'))
stopifnot(file.exists('YRI89.gene.cis.FDR5.all.rs137.txt.gz'))

###########################################################
##############     LAPPALAINEN DATA      ##################
###########################################################

#' get the data of Lappalainen et al., read counts for each mature miRNA in each sample
expr <- read.table('GD480.MirnaQuantCount.txt.gz', sep ='\t', stringsAsFactors=F, strip.white=TRUE, header=TRUE)
expr <- expr[expr$Chr != "X", c(-2,-3,-4)] #' only autosomal miRNAs

#' reproduce the miRNAs used in Lappalainen et al.
#' that are all miRNAs which have at least 1 read count in at least half (240) of the samples
mirnas <- tolower(expr$TargetID[apply(expr[,2:dim(expr)[2]], 1, function(x) sum(x > 0) > 240 )])

#' load the significant cis-eQTLs for miRNA expression
eur <- read.csv('EUR363.mi.cis.FDR5.all.rs137.txt.gz', sep ='\t')
yri <- read.csv('YRI89.mi.cis.FDR5.all.rs137.txt.gz', sep ='\t')
eur$GENE_ID <- tolower(as.character(eur$GENE_ID))
yri$GENE_ID <- tolower(as.character(yri$GENE_ID))
eur$SNP_ID <- as.character(eur$SNP_ID)
yri$SNP_ID <- as.character(yri$SNP_ID)

#' the data of Lappalainen is based on miRBase v18, we are using miRBase v20 now
#' there have been some name changes in the meantime
#'                v18               v20
#' MIMAT0005948   hsa-miR-664      hsa-miR-664a
#' MIMAT0005949   hsa-miR-664      hsa-miR-664a
#' MI0016843      hsa-mir-4482-1   hsa-mir-4482
#' MIMAT0004567   hsa-mir-219-1    hsa-mir-219a-1
#' MI0006437      hsa-mir-1280     DELETED
#' MI0006654      hsa-mir-720      DELETED
#' MI0016077      hsa-mir-3676     DELETED
pos <- which(substr(mirnas, 1, nchar(mirnas)-3) == "hsa-mir-664")
mirnas[pos] <- paste("hsa-mir-664a", substr(mirnas[pos], nchar(mirnas[pos])-2, nchar(mirnas[pos])), sep="")
pos <- which(substr(mirnas, 1, nchar(mirnas)-3) == "hsa-mir-219-1")
mirnas[pos] <- paste("hsa-mir-219a-1", substr(mirnas[pos], nchar(mirnas[pos])-2, nchar(mirnas[pos])), sep="")
pos <- which(substr(mirnas, 1, nchar(mirnas)-3) == "hsa-mir-4482-1")
mirnas[pos] <- paste("hsa-mir-4482", substr(mirnas[pos], nchar(mirnas[pos])-2, nchar(mirnas[pos])), sep="")
pos <- which(substr(mirnas, 1, nchar(mirnas)-3) %in% c("hsa-mir-1280", "hsa-mir-720", "hsa-mir-3676"))
mirnas <- mirnas[-pos]

#' the other changed miRNAs don't have eQTLs
pos <- which(substr(yri$GENE_ID, 1, nchar(yri$GENE_ID)-3) == "hsa-mir-4482-1")
yri$GENE_ID[pos] <- paste("hsa-mir-4482", substr(yri$GENE_ID[pos],nchar(yri$GENE_ID[pos])-2, nchar(yri$GENE_ID[pos])), sep="")
pos <- which(substr(eur$GENE_ID, 1, nchar(eur$GENE_ID)-3) == "hsa-mir-4482-1")
eur$GENE_ID[pos] <- paste("hsa-mir-4482", substr(eur$GENE_ID[pos], nchar(eur$GENE_ID[pos])-2, nchar(eur$GENE_ID[pos])), sep="")

#' clean up
rm(expr)

###########################################################
#############       COLLECTING SNPS      ##################
###########################################################

#' load the pre-miRNA genomic annotation according to miRBase v20
hsa <- read.table('hsa.gff2', sep='\t', skip = 8)
hsa$V1 <- substr(hsa$V1, 4, nchar(as.character(hsa$V1))) 

#' just select the pre-miRNAs we need
pos <- c()
for(x in 1:length(mirnas)){
  query <- paste(substr(mirnas[x], 1, nchar(mirnas[x])-3), ";", sep="")
  pos <- c(pos, which(grepl(query, hsa$V9, ignore.case = TRUE) == TRUE))
}
hsa <- hsa[pos,]

#' get all variants within +- 500kb of each pre-miRNA
variants <- data.table() # this will contain all unique SNPs + infos
feat.matrix <- data.table() # this will contain all observations and features

indexTabix('ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz', 'vcf')
range.mirna <- GRanges(seqnames = hsa$V1[1], strand = hsa$V7[1], ranges = IRanges(hsa$V4[1] - 500000, hsa$V5[1] + 500000))
vcf <- readVcf(file = 'ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz', genome = "GRCh37", range.mirna)
for(entry in 1:dim(hsa)[1]){
  sink("/dev/null")
  if(entry > 1) {
    prev.mirna <- substr(mirnas[entry-1], 1, nchar(mirnas[entry-1])-3)
    new.mirna <- substr(mirnas[entry], 1, nchar(mirnas[entry])-3)
    if(prev.mirna != new.mirna) {
      range.mirna <- GRanges(seqnames = hsa$V1[entry], strand = hsa$V7[entry], ranges = IRanges(hsa$V4[entry] - 500000, hsa$V5[entry] + 500000))
      vcf <- readVcf(file = 'ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz', genome = "GRCh37", range.mirna)
    }
  }
  sink()  
  #' filter for SNPs and MAF
  x <- which((info(vcf)$VT == "SNP") & ((info(vcf)$EUR_AF > 0.05 & info(vcf)$EUR_AF < 0.95) | (info(vcf)$AFR_AF > 0.05 & info(vcf)$AFR_AF < 0.95))) 
  variants <- rbindlist(list(variants, data.table(ID    = rownames(vcf)[x], 
                                                  CHROM = as.numeric(levels(seqnames(vcf)[x])), 
                                                  POS   = start(ranges(vcf))[x], 
                                                  REF   = as.character(rowRanges(vcf)$REF[x]),
                                                  ALT   = as.character(unlist(rowRanges(vcf)$ALT[x])))))
  feat.matrix <- rbind(feat.matrix, data.table(miRNA = mirnas[entry], variant = rownames(vcf)[x]))
  print(paste(entry, '/', dim(hsa)[1], sep=''))
}

#' some processing
variants <- unique(variants)
setkey(variants, ID)
setkey(feat.matrix, miRNA, variant)

#' expand the variants to match the number and order of observations
#' this simplifies a lot of subsequent computations 
expanded.var <- variants[feat.matrix$variant,2:3, with=FALSE]

#' clean up
rm(entry, new.mirna, prev.mirna, pos, query, range.mirna, x, vcf)
gc()

###########################################################
#############       MODEL RESPONSE       ##################
###########################################################

#' get the response vector y
#' for each miRNA/SNP pair: 1 = SNP is eQTL for this miRNA, 0 = not
feat.matrix[,y:=rep(0, dim(feat.matrix)[1])]
for(x in 1:dim(eur)[1]) {
  feat.matrix[list(eur$GENE_ID[x], eur$SNP_ID[x]), y := 1]
}
for(x in 1:dim(yri)[1]) {
  feat.matrix[list(yri$GENE_ID[x], yri$SNP_ID[x]), y := 1]
}

#' clean up
rm(x, eur, yri)

###########################################################
#############       MODEL FEATURES       ##################
###########################################################

#' returns a GRanges object containing all promoters for miRNAs in our dataset
get.ranges <- function(file.name) {
  promoter.file <- read.table(file.name,sep='\t', fill = TRUE, stringsAsFactors=FALSE, strip.white=TRUE)
  pos <- c()
  for(name in paste(substr(mirnas, 1, nchar(mirnas)-3),";",sep="")) {
    hits <- which(grepl(name, promoter.file$V9, ignore.case = TRUE) == TRUE)
    pos <- c(pos, hits)
  }
  prom.ranges <- GRanges(seqnames = substr(promoter.file$V1[pos],4,5), ranges = IRanges(promoter.file$V4[pos]-100, promoter.file$V5[pos]+100))
  return(prom.ranges)
}

query <- GRanges(seqnames = expanded.var$CHROM, ranges = IRanges(expanded.var$POS, expanded.var$POS))

#' promoter (specific)
prom.ranges <- get.ranges('promoters_Gm12878rep2_dnase_validated.gff')
feat.matrix[, promoter.specific := (!is.na(findOverlaps(query = query, subject = prom.ranges, select = "first")))*1]

#' promoter (unspecific)
prom.ranges <- get.ranges('promoters_unspecific.txt')
feat.matrix[, promoter.unspecific := (!is.na(findOverlaps(query = query, subject = prom.ranges, select = "first")))*1]

#' promoter (intronic) (subset of specific)
prom.ranges <- get.ranges('promoters_Gm12878rep2_intronic.gff')
feat.matrix[, promoter.intronic := (!is.na(findOverlaps(query = query, subject = prom.ranges, select = "first")))*1]

#' promoter (hostgene) (subset of specific)
prom.ranges <- get.ranges('promoters_Gm12878rep2_hostgene.gff')
feat.matrix[, promoter.host := (!is.na(findOverlaps(query = query, subject = prom.ranges, select = "first")))*1]

#' DNaseI
dnase <- read.table("wgEncodeOpenChromDnaseGm12878Pk.narrowPeak.gz", sep='\t', stringsAsFactors=FALSE)
dnase <- GRanges(seqnames = substr(dnase$V1, 4,5),ranges = IRanges(dnase$V2, dnase$V3))
feat.matrix[, dnase := (!is.na(findOverlaps(query = query, subject = dnase, select = "first")))*1]

#' ChromHMM
chromhmm <- read.table("wgEncodeBroadHmmGm12878HMM.bed.gz", sep='\t', stringsAsFactors=FALSE)
for(segment in unique(chromhmm$V4)) {
  seg <- chromhmm[which(chromhmm$V4 == segment),]
  seg <- GRanges(seqnames = substr(seg$V1,4,5), ranges = IRanges(seg$V2, seg$V3))
  feat.matrix[, (segment) := (!is.na(findOverlaps(query = query, subject = seg, select = "first")))*1]
}

#' TFBSs
tfbs <- fread("zcat wgEncodeRegTfbsClusteredWithCellsV3.bed.gz", sep='\t', stringsAsFactors=FALSE)
tfbs <- tfbs[grepl("GM12878", tfbs[,V6]),c(1:4), with=FALSE]
setkey(tfbs, V4)
for(tf in unique(tfbs[,V4])) {
  tmp <- tfbs[tf,]
  tmp <- GRanges(seqnames = substr(tmp[,V1],4,5), ranges = IRanges(tmp[,V2], tmp[,V3]))
  feat.matrix[, paste("TF_",tf,sep="") := (!is.na(findOverlaps(query = query, subject = tmp, select = "first")))*1]
}

#' insulator in between
hsa <- unique(hsa)
mirnas.pre <- unlist(strsplit(as.character(hsa$V9), "ID="))[seq(2,dim(hsa)[1]*2,2)]
mirnas.pre <- substr(mirnas.pre, 1, nchar(mirnas.pre)-1)
rownames(hsa) <- mirnas.pre
expanded.premirnas <- hsa[substr(feat.matrix[,miRNA], 1, nchar(feat.matrix[,miRNA])-3), c(4,5)]
colnames(expanded.premirnas) <- c('START','END')

insu <- chromhmm[intersect(which(chromhmm$V4 == "8_Insulator"), which(chromhmm$V1 != "chrX")),]
range.insu.up <- GRanges(seqnames = substr(insu$V1,4,5), ranges=IRanges(insu$V2,insu$V2))
range.insu.down <- GRanges(seqnames = substr(insu$V1,4,5), ranges=IRanges(insu$V3,insu$V3))
feat.matrix[,insulator.in.between := rep(0, dim(feat.matrix)[1])]

up <- which(expanded.var$POS < expanded.premirnas$START)
query <- GRanges(seqnames = expanded.var$CHROM[up], ranges = IRanges(expanded.var$POS[up], expanded.premirnas$START[up]))
feat.matrix[up, insulator.in.between := (!is.na(findOverlaps(query = query, subject = range.insu.up, select = "first")))*1]

down <- which(expanded.var$POS > expanded.premirnas$END)
query <- GRanges(seqnames = expanded.var$CHROM[down], ranges = IRanges(expanded.premirnas$END[down],expanded.var$POS[down]))
feat.matrix[down, insulator.in.between := (!is.na(findOverlaps(query = query, subject = range.insu.down, select = "first")))*1]

#' clean up
rm(chromhmm, tfbs, query, up, down, insu, dnase, range.insu.down, range.insu.up, seg,
   get.ranges, tmp, prom.ranges, segment, tf, expanded.premirnas)
gc()

###########################################################
#############       MIRNA FEATURES       ##################
#############    5p, 3p, loop, drosha    ##################
###########################################################

#' exact locations of mature miRNAs
locs.mirnas <- fread('hsa.gff3', sep='\t', stringsAsFactors = FALSE, skip = 13)

#' Drosha cut points
cut.points <- fread('cutting_points.bed', sep='\t', stringsAsFactors=FALSE)
setkey(cut.points, V4)

#' get the MIMAT IDs to select the correct mature miRNAs
id.mapping <- fread("id_mapping.txt", sep='\t', stringsAsFactors=FALSE, header = FALSE, skip = 2)

#' just select the mature miRNAs we need
ranges.mirnas <- data.table()
id.mapping <- id.mapping[V1 %in% mirnas.pre,]
for(row in 1:dim(id.mapping)[1]){
  mimats <- strsplit(id.mapping[row,V2], ',')[[1]]
  matures <- strsplit(id.mapping[row,V4], ',')[[1]]
  derived.from <- id.mapping[row, V3]
  if(length(matures) == 2) { # both 5p and 3p found
    pos.5p <- intersect(which(grepl(mimats[1], locs.mirnas[,V9])), which(grepl(derived.from, locs.mirnas[,V9])))
    pos.3p <- intersect(which(grepl(mimats[2], locs.mirnas[,V9])), which(grepl(derived.from, locs.mirnas[,V9])))
    if(locs.mirnas[pos.5p,V7] == '-') {
      tmp <- pos.5p
      pos.5p <- pos.3p
      pos.3p <- tmp
    }
    ranges.mirnas <- rbindlist(list(ranges.mirnas,
                                    data.table(ID       = id.mapping[row, V1], 
                                               START5P  = locs.mirnas[pos.5p, V4],
                                               END5P    = locs.mirnas[pos.5p, V5],
                                               START3P  = locs.mirnas[pos.3p, V4],
                                               END3P    = locs.mirnas[pos.3p, V5],
                                               DROSHA5P = locs.mirnas[pos.5p, V4]-1,
                                               DROSHA3P = locs.mirnas[pos.3p, V5]+1,
                                               LOOP5P   = locs.mirnas[pos.5p, V5]+1,
                                               LOOP3P   = locs.mirnas[pos.3p, V4]-1)))
  } else { # just one found, check to which end it is closer 
    pos.pre <- which(grepl(paste('Alias=', derived.from, sep=''),locs.mirnas[,V9]))
    pos.mature <- intersect(which(grepl(mimats[1], locs.mirnas[,V9])), which(grepl(derived.from, locs.mirnas[,V9])))
    is.5p <- locs.mirnas[pos.mature, V4] - locs.mirnas[pos.pre, V4] < locs.mirnas[pos.pre, V5] - locs.mirnas[pos.mature, V5]
    if(is.5p) {
      ranges.mirnas <- rbindlist(list(ranges.mirnas,
                                      data.table(ID       = id.mapping[row, V1], 
                                                 START5P  = locs.mirnas[pos.mature, V4],
                                                 END5P    = locs.mirnas[pos.mature, V5],
                                                 START3P  = 0, END3P   = 0,
                                                 DROSHA5P = locs.mirnas[pos.mature, V4]-1,
                                                 DROSHA3P = cut.points[id.mapping[row, V1], V3],
                                                 LOOP5P   = locs.mirnas[pos.mature, V5]+1,
                                                 LOOP3P   = cut.points[id.mapping[row, V1], V3]-1)))
    } else {
      ranges.mirnas <- rbindlist(list(ranges.mirnas,
                                      data.table(ID       = id.mapping[row, V1], 
                                                 START5P  = 0, END5P   = 0,
                                                 START3P  = locs.mirnas[pos.mature, V4], 
                                                 END3P    = locs.mirnas[pos.mature, V5],
                                                 DROSHA5P = cut.points[id.mapping[row, V1], V2],
                                                 DROSHA3P = locs.mirnas[pos.mature, V5]+1,
                                                 LOOP5P   = cut.points[id.mapping[row, V1], V2]+1,
                                                 LOOP3P   = locs.mirnas[pos.mature, V4]-1)))
    }
  }
}
setkey(ranges.mirnas, ID)
expanded.mirnas <- substr(feat.matrix[,miRNA], 1, nchar(feat.matrix[,miRNA])-3)

#' feature 5p arm
expanded.locs <- ranges.mirnas[expanded.mirnas,list(START5P, END5P)]
is.3p <- substr(feat.matrix[,miRNA], nchar(feat.matrix[,miRNA])-1, nchar(feat.matrix[,miRNA])) == '3p'
expanded.locs[is.3p, START5P := 0]
expanded.locs[is.3p, END5P := 0]
feat.matrix[, mirna.5p := ((expanded.var[,POS] >= expanded.locs[,START5P]) & (expanded.var[,POS] <= expanded.locs[,END5P]))*1]

#' feature 3p arm
expanded.locs <- ranges.mirnas[expanded.mirnas,list(START3P, END3P)]
is.5p <- substr(feat.matrix[,miRNA], nchar(feat.matrix[,miRNA])-1, nchar(feat.matrix[,miRNA])) == '5p'
expanded.locs[is.5p, START3P := 0]
expanded.locs[is.5p, END3P := 0]
feat.matrix[, mirna.3p := ((expanded.var[,POS] >= expanded.locs[,START3P]) & (expanded.var[,POS] <= expanded.locs[,END3P]))*1]

#' feature loop between 5p and 3p
expanded.locs <- ranges.mirnas[expanded.mirnas,list(LOOP5P, LOOP3P)]
feat.matrix[, mirna.loop := ((expanded.var[,POS] >= expanded.locs[,LOOP5P]) & (expanded.var[,POS] <= expanded.locs[,LOOP3P]))*1]

#' feature drosha flanking regions
expanded.locs <- data.table(STARTup   = ranges.mirnas[expanded.mirnas, DROSHA5P]-22,
                            ENDup     = ranges.mirnas[expanded.mirnas, DROSHA5P],
                            STARTdown = ranges.mirnas[expanded.mirnas, DROSHA3P],
                            ENDdown   = ranges.mirnas[expanded.mirnas, DROSHA5P]+22)
feat.matrix[, mirna.drosha := (((expanded.var$POS >= expanded.locs$STARTup) & (expanded.var$POS <= expanded.locs$ENDup)) 
                               | ((expanded.var$POS >= expanded.locs$STARTdown) & (expanded.var$POS <= expanded.locs$ENDdown)))*1]


#' test different drosha ranges, up until +22 eQTLs can be found, next one not until
#' +59 (just normal SNPs between these positions)
#for(x in 1:100) {
#  expanded.locs <- data.table(STARTup   = ranges.mirnas[expanded.mirnas, DROSHA5P]-x,
#                              ENDup     = ranges.mirnas[expanded.mirnas, DROSHA5P],
#                              STARTdown = ranges.mirnas[expanded.mirnas, DROSHA3P],
#                              ENDdown   = ranges.mirnas[expanded.mirnas, DROSHA5P]+x)
#  feat.matrix[, mirna.drosha := (((expanded.var$POS >= expanded.locs$STARTup) & (expanded.var$POS <= expanded.locs$ENDup)) 
#                                 | ((expanded.var$POS >= expanded.locs$STARTdown) & (expanded.var$POS <= expanded.locs$ENDdown)))*1]
#  print(paste(x, sum(feat.matrix[y==1, mirna.drosha]), sum(feat.matrix[y==0,mirna.drosha]), sep=','))
#}


#' clean up
rm(expanded.locs, expanded.mirnas, locs.mirnas, cut.points, id.mapping, ranges.mirnas,
   derived.from, matures, mimats, is.5p, pos.3p, pos.5p, pos.mature, pos.pre, row, tmp)
gc()

###########################################################
#############     REMAINING FEATURES     ##################
###########################################################

#' host eQTL and intragenic features
host.file <- read.table("hostgenes_mirna.txt", sep='\t', stringsAsFactors=FALSE)
hosts <- list()
for(mirna in mirnas.pre) {
  if(length(unique(host.file$V2[which(host.file$V1 == mirna)])) > 0) {
    hosts[[mirna]] <- unique(host.file$V2[which(host.file$V1 == mirna)])
  }
}

host.snps <- rbindlist(list(fread("zcat EUR373.exon.cis.FDR5.all.rs137.txt.gz",header=TRUE,sep='\t',stringsAsFactors=FALSE),
                            fread("zcat YRI89.exon.cis.FDR5.all.rs137.txt.gz", header=TRUE,sep='\t', stringsAsFactors=FALSE),
                            fread("zcat EUR373.gene.cis.FDR5.all.rs137.txt.gz",header=TRUE,sep='\t',stringsAsFactors=FALSE),
                            fread("zcat YRI89.gene.cis.FDR5.all.rs137.txt.gz", header=TRUE,sep='\t', stringsAsFactors=FALSE)))
host.snps <- host.snps[substr(host.snps$GENE_ID,1,15) %in% unname(unlist(hosts)),]

expanded.host.snps <- rep(0, dim(feat.matrix)[1])
old.host <- ""
for(x in 1:dim(feat.matrix)[1]) {
  name <- substr(feat.matrix[x,miRNA], 1, nchar(feat.matrix[x,miRNA])-3)
  for(host in hosts[[name]]) {
    if(host != old.host) {
      snps.host <- host.snps$SNP_ID[which(substr(host.snps$GENE_ID,1,15) == host)]
      old.host <- host
    }
    if(feat.matrix[x,variant] %in% snps.host) {
      expanded.host.snps[x] <- 1
      break
    }
  }
}
feat.matrix[, hosteqtl := expanded.host.snps]
feat.matrix[, intragenic := (substr(feat.matrix[,miRNA], 1, nchar(feat.matrix[,miRNA])-3) %in% names(hosts))*1]

#' distance feature
expanded.locs <- data.table(START=hsa[substr(feat.matrix[,miRNA],1,nchar(feat.matrix[,miRNA])-3),"V4"],
                              END=hsa[substr(feat.matrix[,miRNA],1,nchar(feat.matrix[,miRNA])-3),"V5"])
up <- expanded.var[,POS] <= expanded.locs[,START]
down <- expanded.var[,POS] >= expanded.locs[,END]
feat.matrix[, distance := rep(1,dim(feat.matrix)[1])]
feat.matrix[up, distance := 1 - (expanded.locs[up,START] - expanded.var[up,POS])/500000]
feat.matrix[down, distance := 1 - (expanded.var[down,POS] - expanded.locs[down,END])/500000]

#' clean up
rm(expanded.locs, expanded.host.snps, up, down, host.snps, host.file, hsa, mirna, name, old.host,
   x, hosts, snps.host, host, expanded.var)
gc()

###########################################################
##############     FEATURE SELECTION     ##################
###########################################################

#' run a regression with each feature individually, but always including the distance
coefs <- matrix(,ncol=4,nrow=0)
ci <- matrix(,ncol=2,nrow=0)
pos <- c(4:107)
for(x in pos) {
  m <- glm(y ~ ., data=feat.matrix[,unique(c(3,x,107)),with=FALSE],family="binomial")
  coefs <- rbind(coefs, summary(m)$coefficients[2,])
  ci <- rbind(ci, confint(m)[2,])
}
rownames(coefs) <- colnames(feat.matrix)[pos]
rownames(ci) <- colnames(feat.matrix)[pos]

###########################################################
################     TFBS SELECTION     ###################
###########################################################

coefs.tfbs <- coefs[substr(rownames(coefs), 1, 3) == 'TF_',]
ci.tfbs <- ci[substr(rownames(coefs), 1, 3) == 'TF_',]
#coefs.tfbs[ order(-coefs.tfbs[,1]), ] # sort by log-odds
#coefs.tfbs[ order(coefs.tfbs[,4]), ] # sort by p-value

#' create the merged tfbs feature out of all significant TFs
coefs.signi <- names(which(coefs.tfbs[,4] < 0.05))
feat.matrix[,tfbs := Reduce(`|`, feat.matrix[,coefs.signi,with=FALSE]) * 1]

#' plot it
rownames(coefs.tfbs) <- unlist(strsplit(rownames(coefs.tfbs),"_"))[seq(2,152,2)]
rownames(ci.tfbs) <- unlist(strsplit(rownames(ci.tfbs),"_"))[seq(2,152,2)]
#' remove ZNF274, it screws up the plot and it is not significant anyway
coefs.tfbs <- coefs.tfbs[-75,]
ci.tfbs <- ci.tfbs[-75,]

pdf(file = "plot_forest_tfbs.pdf", width = 7, height = 11)

sort.order <-  names(sort(coefs.tfbs[,1]))
x.labels <- factor(sort.order,levels=sort.order)
signi <- coefs.tfbs[sort.order,4] < 0.05
colors <- rep("black", length(signi))
colors[!signi] <- "#AAAAAA"
qplot(x.labels,sort(coefs.tfbs[,1]),ylab="log-odds ratio",xlab="",size = I(4),colour = I(colors))+
  geom_errorbar(aes(x=x.labels, ymin=ci.tfbs[sort.order,1], ymax=ci.tfbs[sort.order,2]), width=0.5,size = I(1),colour = I(colors))+ 
  geom_abline(slope=0, size=0.5) + 
  theme(axis.text.x = element_text(colour = "black", size=12),
        axis.text.y = element_text(colour = colors, size=12),
        text = element_text(size=12),
        panel.background = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"),
        #panel.background = element_rect(fill='white', colour='black'),
        panel.grid.minor = element_line(colour = "grey", size=0.2),
        panel.grid.major = element_line(colour = "grey", size=0.2),
        plot.margin=unit(c(0,0,0,0),"mm"))+
  coord_flip() 

dev.off()

###########################################################
################ FEATURE SELECTION PLOT ###################
###########################################################

#' run the regression for feature selection again for the new tfbs feature
m <- glm(y ~ ., data=feat.matrix[,unique(c(3,108,107)),with=FALSE],family="binomial")
coefs <- rbind(coefs, summary(m)$coefficients[2,])
ci <- rbind(ci, confint(m)[2,])
rownames(coefs)[105] <- "tfbs"
rownames(ci)[105] <- "tfbs"

#' plot it
coefs.features <- coefs[c(1, 2, 5:20, 97:105),]
ci.features <- ci[c(1, 2, 5:20, 97:105),]
pretty.names <- c("promoter (specific)", "promoter (unspecific)", "DNaseI", "15_Repetitive/CNV","13_Heterochrom/lo",   
                  "8_Insulator","11_Weak_Txn","7_Weak_Enhancer","10_Txn_Elongation","9_Txn_Transition",    
                  "2_Weak_Promoter","1_Active_Promoter","3_Poised_Promoter","12_Repressed","6_Weak_Enhancer",     
                  "14_Repetitive/CNV","5_Strong_Enhancer","4_Strong_Enhancer","insulator in between", "5p",             
                  "3p", "hairpin loop", "Drosha cut sites","mRNA-eQTL","intragenic","distance", "TFBS")
rownames(coefs.features) <- pretty.names
rownames(ci.features) <- pretty.names

pdf(file = "plot_forest_features.pdf", width=7, height=5.5)

sort.order <-  names(sort(coefs.features[,1]))
x.labels <- factor(sort.order,levels=sort.order)
signi <- coefs.features[sort.order,4] < 0.05
colors <- rep("black", length(signi))
colors[!signi] <- "#AAAAAA"
qplot(x.labels,sort(coefs.features[,1]),ylab="log-odds ratio",xlab="",size = I(4),colour = I(colors))+
  geom_errorbar(aes(x=x.labels, ymin=ci.features[sort.order,1], ymax=ci.features[sort.order,2]), width=0.5,size = I(1),colour = I(colors))+ 
  geom_abline(slope=0, size=0.5) + 
  theme(axis.text.x = element_text(colour = "black", size=12),
        axis.text.y = element_text(colour = colors, size=12),
        text = element_text(size=12),
        panel.background = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"),
        #panel.background = element_rect(fill='white', colour='black'),
        panel.grid.minor = element_line(colour = "grey", size=0.2),
        panel.grid.major = element_line(colour = "grey", size=0.2),
        plot.margin=unit(c(0,0,0,0),"mm"))+
  coord_flip() 

dev.off()

#' clean up
rm(m, coefs.tfbs, ci.tfbs, x.labels, signi, sort.order, coefs.features, coefs.signi,
   ci.features, pretty.names,pos,x,colors)
gc()

###########################################################
##############     CORRELATION MATRIX     #################
###########################################################

#' all features except single TFs
pos <- c(1:2, 5:20, 97:105)

coefs.features <- coefs[pos,]
sel.features <- which(colnames(feat.matrix) %in% rownames(coefs.features))

len <- length(sel.features)
cor.pvalues <- matrix(rep(NA, len**2), nrow=len, ncol=len)
cor.estimates <- matrix(rep(NA, len**2), nrow=len, ncol=len)
for(feat.row in 1:len) {
  for(feat.col in feat.row:len) {
    test <- cor.test(data.matrix(feat.matrix[,sel.features[feat.row],with=FALSE]),
                     data.matrix(feat.matrix[,sel.features[feat.col],with=FALSE]),
                     alternative="two.sided", method="pearson")
    cor.pvalues[feat.row, feat.col] <- cor.pvalues[feat.col, feat.row] <- test$p.value
    cor.estimates[feat.row, feat.col] <- cor.estimates[feat.col, feat.row] <- test$estimate
  }
}
rownames(cor.estimates) <- c("promoter (specific)", "promoter (unspecific)", "DNaseI", "15_Repetitive/CNV","13_Heterochrom/lo",   
                             "8_Insulator","11_Weak_Txn","7_Weak_Enhancer","10_Txn_Elongation","9_Txn_Transition",    
                             "2_Weak_Promoter","1_Active_Promoter","3_Poised_Promoter","12_Repressed","6_Weak_Enhancer",     
                             "14_Repetitive/CNV","5_Strong_Enhancer","4_Strong_Enhancer","insulator in between", "5p",             
                             "3p", "hairpin loop", "Drosha cut sites","mRNA-eQTL","intragenic","distance", "TFBS")
colnames(cor.estimates) <- rownames(cor.estimates)
reorder <- c("1_Active_Promoter","2_Weak_Promoter","3_Poised_Promoter","4_Strong_Enhancer","5_Strong_Enhancer",
             "6_Weak_Enhancer","7_Weak_Enhancer","8_Insulator","9_Txn_Transition","10_Txn_Elongation","11_Weak_Txn",
             "12_Repressed","13_Heterochrom/lo","14_Repetitive/CNV","15_Repetitive/CNV","5p","hairpin loop",
             "3p","Drosha cut sites","promoter (unspecific)","promoter (specific)","TFBS","DNaseI",
             "insulator in between","distance","mRNA-eQTL","intragenic")
cor.estimates <- cor.estimates[reorder,reorder]

#' plot it
pdf(file = "plot_corr_matrix.pdf", width = 7, height = 6)
corrplot(cor.estimates, p.mat=cor.pvalues , sig.level=0.05,method="color",addgrid.col="grey",
         tl.cex = 0.75, tl.col = "black", tl.srt = 45, cl.cex=0.75, pch.col="#888888", pch.cex = 0.75)
dev.off()

###########################################################
#################     MODEL SELECTION     #################
###########################################################

sig.features <- rownames(coefs.features)[coefs.features[,4] < 0.05]
chromhmm.set <- which(colnames(feat.matrix)[9:23] %in% sig.features) + 8
premirna.set <- 101:104

model.list <- list( c(107),                                 #distance
                    c(107,chromhmm.set),                    #distance + ChromHMM
                    c(107,premirna.set),                    #distance + pre-miRNA
                    c(107,chromhmm.set, premirna.set),      #distance + ChromHMM + pre-miRNA
                    c(107,chromhmm.set, premirna.set, 4),   #distance + ChromHMM + pre-miRNA + promoter
                    c(107,chromhmm.set, premirna.set, 108), #distance + ChromHMM + pre-miRNA + TFBS
                    c(107,chromhmm.set, premirna.set, 100), #distance + ChromHMM + pre-miRNA + insulator in between
                    c(107,chromhmm.set, premirna.set, 105), #distance + ChromHMM + pre-miRNA + mRNA eQTL
                    c(107,chromhmm.set, premirna.set, 106), #distance + ChromHMM + pre-miRNA + intragenic
                    c(107,chromhmm.set, premirna.set, 4,100,105,106,108) ) #everything
models.names <- c('                                              distance',
                  '                                   ChromHMM + distance',
                  '                                  pre-miRNA + distance', 
                  '                       pre-miRNA + ChromHMM + distance',
                  '           promoters + pre-miRNA + ChromHMM + distance',
                  '                TFBS + pre-miRNA + ChromHMM + distance',
                  'insulator in between + pre-miRNA + ChromHMM + distance',
                  '           mRNA-eQTL + pre-miRNA + ChromHMM + distance',
                  '          intragenic + pre-miRNA + ChromHMM + distance',
                  '                                            full model')

events <- which(feat.matrix$y==1) # all eQTL SNPs
non.events <- which(feat.matrix$y==0) # all non-eQTL SNPs
num.of.obs <- ceiling(length(events) * (3/4)) # we want to sample 3/4 of the eQTL SNPs

times <- 50 # how many samples we want to compute for for each model
set.seed(4)
models.aics <- list()
for(model in model.list) {
  aics <- rep(NA, times)
  for(x in 1:times) { # compute the AIC for each model
    train.events <- sample(events, num.of.obs)
    train.non.events <- sample(non.events, num.of.obs)
    m <- glm(y ~ ., data = feat.matrix[c(train.events, train.non.events), c(3,model),with=FALSE], family = "binomial")
    aics[x] <- m$aic
  }
  models.aics[[length(models.aics)+1]] <- aics
}

# prepare the plot, mean and sd
aics.mean <- sapply(models.aics, mean)
aics.sd <- sapply(models.aics,sd)
names(aics.mean) <- models.names
names(aics.sd) <- models.names

# sort by mean
aics.mean <- sort(aics.mean)
aics.sd <- aics.sd[names(aics.mean)]

# and prepare plot
pdf(file="plot_aic.pdf", width = 8, height=3)

labels <- factor(names(aics.mean),levels=names(aics.mean))
qplot(labels,aics.mean,ylab="AIC",xlab="",size = I(3.5))+
  geom_errorbar(ymin=aics.mean-aics.sd,ymax=aics.mean+aics.sd,width=0.4,size = I(1),colour = I('black'))+
  theme(axis.text.x = element_text(colour = "black", size=12),
        axis.text.y = element_text(colour = "black", size=12),
        text = element_text(size=12),
        panel.background = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"),
        panel.grid.minor = element_line(colour = "grey",size=0.2),
        panel.grid.major = element_line(colour = "grey",size=0.2),
        plot.margin=unit(c(0,1,0,0),"mm"))+
  coord_flip() 

dev.off()

###########################################################
#################   MODEL PERFORMANCE    ##################
###########################################################

model <- c(107,chromhmm.set, premirna.set, 4,100,105,106,108) #everything
sensi <- list()
speci <- list()
acc <- list()
prec.recall <- list()
for(x in 1:times) {
  tmp1 <- sample(events, num.of.obs)
  tmp0 <- sample(non.events, length(events)/(1+1/3))
  
  feat.train <- feat.matrix[tmp1, c(1:3,model), with=FALSE]
  feat.train <- rbind(feat.train, feat.matrix[tmp0, c(1:3,model), with=FALSE])
  
  test1 <- setdiff(events,tmp1)
  test0 <- sample(setdiff(non.events,tmp0), length(test1))
  feat.test <- feat.matrix[c(test1,test0), c(1:3,model), with=FALSE]
  
  m <- glm(y ~ ., data=feat.train[,3:dim(feat.train)[2],with=FALSE], family="binomial")
  pred <- ROCR::prediction(as.numeric(predict(m, feat.test, type='response')), feat.test$y)
  
  prec.recall[[length(prec.recall)+1]] <- performance(pred, measure="prec", x.measure="rec")
  sensi[[length(sensi)+1]] <- performance(pred, measure="sens")
  speci[[length(speci)+1]] <- performance(pred, measure="spec")
  acc[[length(acc)+1]] <- performance(pred, measure="acc")
  
}

# and plot it
gg_color_hue <- function(n) {
  hues = seq(15, 375, length=n+1)
  hcl(h=hues, l=65, c=100)[1:n]
}

pdf(file = "plot_performance.pdf", width=8, height=3)

par(mfrow=c(1,2),mar=c(4.1,4.1,0.2,3.5), bty="n")
plot(sensi[[1]],ylab="Percent",xlab = "Probability cutoff", col="#619CFF",lwd=1, xlim=c(0,1), ylim=c(0,1), ann=FALSE)
mtext(side = 2, text = "Percent", line = 2.5)
mtext(side = 1, text = "Probability cutoff", line = 2.2)
grid(lty=1, lwd=1)

for(x in 2:length(sensi)) lines(unlist(sensi[[x]]@x.values), unlist(sensi[[x]]@y.values), col="#619CFF", lwd=1)
for(x in 1:length(speci)) lines(unlist(speci[[x]]@x.values), unlist(speci[[x]]@y.values), col="#F8766D", lwd=1)
for(x in 1:length(acc)) lines(unlist(acc[[x]]@x.values), unlist(acc[[x]]@y.values), col="#00BA38", lwd=1)
legend("bottom",c("sensitivity","specificity","accuracy"),lty=c(1,1,1),
       lwd=c(2,2,2),col=c("#619CFF","#F8766D","#00BA38"),cex=1.0,bg="white")

plot(prec.recall[[1]],ylim=c(0,1),colorize=TRUE, colorize.palette=gg_color_hue(2000), cex.lab=1.0,cex.axis=1.0,lwd=1, ann=FALSE)
grid(lty=1, lwd=1)
mtext(side = 2, text = "Precision", line = 2.5)
mtext(side = 1, text = "Recall", line = 2.2)
mtext(side = 4, text = "Probability cutoff", line = 2.5)
for(x in 2:length(prec.recall)) plot(prec.recall[[x]],colorize=TRUE,colorize.palette=gg_color_hue(2000),add=TRUE,lwd=1)

dev.off()
par(mar=c(5.1,4.1,4.1,2.1))

#' clean up
rm(non.events, prec.recall, acc, sensi, speci, feat.train, feat.test, pred, events, train.events,
   train.non.events, tmp1, tmp0, cor.estimates, gg_color_hue, cor.pvalues, test1, test0,
   coefs.features, test, models.aics, reorder, model.list, labels, aics.mean, aics.sd,
   sig.features, models.names, aics, pos, sel.features, chromhmm.set, premirna.set, feat.col,
   feat.row, len, num.of.obs, times, x)
gc()


###########################################################
#################          GWAS          ##################
###########################################################

gwas <- fread("gwascatalog.txt", sep='\t', stringsAsFactors=FALSE, header = TRUE)
diseases <- read.table("disease_categories.txt", sep='\t', stringsAsFactors=FALSE, header = FALSE, fill = TRUE, strip.white = TRUE, quote="")
times <- 100
set.seed(5)

#' eCDF plot blood SNPs versus random non-GWAS SNPs
selected.pos <- c(which(grepl('blood', diseases$V1, ignore.case = TRUE) == TRUE), 
                  which(grepl('blood', diseases$V2, ignore.case = TRUE) == TRUE))
selected.pos <- gwas[,SNPs][gwas[,`Disease/Trait`] %in% diseases[selected.pos,1] & as.numeric(gwas[,`p-Value`]) < 5e-8]
selected.pos <- unique(unlist(strsplit(selected.pos,split = ', ')))
selected.pos <- unique(unlist(strsplit(selected.pos,split = ',')))
selected.pos <- selected.pos[which(selected.pos %in% variants[,ID])]

feat.neg <- gwas[as.numeric(gwas[,`p-Value`]) < 5e-8, SNPs]
feat.neg <- unique(unlist(strsplit(feat.neg,split = ', ')))
feat.neg <- unique(unlist(strsplit(feat.neg,split = ',')))
feat.neg <- feat.matrix[!feat.matrix[,variant] %in% feat.neg,]

ecdfs.pos = matrix(NA,nrow=length(seq(0,1,length=1e3)), ncol=times+1)
ecdfs.neg = matrix(NA,nrow=length(seq(0,1,length=1e3)), ncol=times+1)
ks.less <- 0
for(x in 1:times) {
  feat.test <- feat.matrix[variant %in% selected.pos, c(1:3,model),with=FALSE]
  num.gwas <- dim(feat.test)[1]
  feat.test <- rbind(feat.test, feat.neg[sample(1:dim(feat.neg)[1], num.gwas), c(1:3,model), with=FALSE])
  predictions <- predict(m, feat.test, type='response')
  
  if(ks.boot(predictions[1:num.gwas],predictions[(num.gwas+1):dim(feat.test)[1]],alternative = "less")$ks.boot.pvalue < 0.05) {
    ks.less <- ks.less + 1
  }
  
  ecdfs.neg[,x+1] <- ecdf(predictions[(num.gwas+1):dim(feat.test)[1]])(seq(0,1,length=1e3))
}
ecdfs.pos[,1] = ecdf(predictions[1:num.gwas])(seq(0,1,length=1e3))
print(paste('blood vs random non GWAS',ks.less, sep=', '))

ecdfs.neg <- as.data.frame(ecdfs.neg)
ecdfs.mean <- apply(ecdfs.neg[,2:times+1], 1, function(row) mean(row[-1]))
ecdfs.sd <- apply(ecdfs.neg[,2:times+1], 1, function(row) sd(row[-1]))
eb <- aes(ymax = ecdfs.mean + ecdfs.sd, ymin = ecdfs.mean - ecdfs.sd)
plot1 <- ggplot(data = ecdfs.neg[,2:times+1], aes(x = seq(0,1,length=1e3), y = ecdfs.mean)) + 
  geom_line(size = 1, colour="blue") + 
  geom_ribbon(eb, alpha = 0.2, fill="blue") +
  geom_line(aes(x=seq(0,1,length=1e3), y=ecdfs.pos[,1]),  size=1, colour = "red") +
  scale_x_continuous(name="predicted probability") +
  scale_y_continuous(name="eCDF percentage") + 
  theme(axis.title.x = element_text(colour="#000000", size=12),
        axis.title.y = element_text(colour="#000000", size=12),
        axis.text.x  = element_text(colour="#000000",size=10),
        axis.text.y  = element_text(colour="#000000",size=10),
        plot.title = element_text( face="bold", size=12, hjust=0.5),
        axis.line = element_line(colour = "black"),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.grid.minor = element_line(colour = "grey",size=0.2),
        panel.grid.major = element_line(colour = "grey",size=0.2),
        plot.margin=unit(c(0,0,0,0),"mm"))+
  ggtitle("blood (red) vs.\n random non-GWAS (blue)")

#' eCDF plot blood SNPs versus remaining random GWAS SNPs
selected.pos <- c(which(grepl('blood', diseases$V1, ignore.case = TRUE) == TRUE), 
                  which(grepl('blood', diseases$V2, ignore.case = TRUE) == TRUE))
selected.pos <- gwas[,SNPs][gwas[,`Disease/Trait`] %in% diseases[selected.pos,1] & as.numeric(gwas[,`p-Value`]) < 5e-8]
selected.pos <- unique(unlist(strsplit(selected.pos,split = ', ')))
selected.pos <- unique(unlist(strsplit(selected.pos,split = ',')))
feat.neg <- gwas[as.numeric(gwas[,`p-Value`]) < 5e-8, SNPs]
feat.neg <- unique(unlist(strsplit(feat.neg,split = ', ')))
feat.neg <- unique(unlist(strsplit(feat.neg,split = ',')))
feat.neg <- feat.neg[!feat.neg %in% selected.pos]
feat.neg <- feat.matrix[variant %in% feat.neg,]
selected.pos <- selected.pos[selected.pos %in% variants[,ID]]

ecdfs.pos2 = matrix(NA,nrow=length(seq(0,1,length=1e3)), ncol=times+1)
ecdfs.neg2 = matrix(NA,nrow=length(seq(0,1,length=1e3)), ncol=times+1)
ks.less <- 0
for(x in 1:times) {
  feat.test <- feat.matrix[variant %in% selected.pos, c(1:3,model), with=FALSE]
  num.gwas <- dim(feat.test)[1]
  feat.test <- rbind(feat.test, feat.neg[sample(1:dim(feat.neg)[1], num.gwas), c(1:3,model), with=FALSE])
  predictions <- predict(m, feat.test, type='response')
  
  if(ks.boot(predictions[1:num.gwas],predictions[(num.gwas+1):dim(feat.test)[1]],alternative = "less")$ks.boot.pvalue < 0.05) {
    ks.less <- ks.less + 1
  }

  ecdfs.neg2[,x+1] <- ecdf(predictions[(num.gwas+1):dim(feat.test)[1]])(seq(0,1,length=1e3))
}
ecdfs.pos2[,1] = ecdf(predictions[1:num.gwas])(seq(0,1,length=1e3))
print(paste('blood vs random remaining GWAS',ks.less, sep=', '))

ecdfs.neg2 <- as.data.frame(ecdfs.neg2)
ecdfs.mean2 <- apply(ecdfs.neg2[,2:times+1], 1, function(row) mean(row[-1]))
ecdfs.sd2 <- apply(ecdfs.neg2[,2:times+1], 1, function(row) sd(row[-1]))
eb2 <- aes(ymax = ecdfs.mean2 + ecdfs.sd2, ymin = ecdfs.mean2 - ecdfs.sd2)
plot2 <- ggplot(data = ecdfs.neg2[,2:times+1], aes(x = seq(0,1,length=1e3), y = ecdfs.mean2)) + 
  geom_line(size = 1, colour="#009E73") + 
  geom_ribbon(eb2, alpha = 0.2, fill="#009E73") +
  geom_line(aes(x=seq(0,1,length=1e3), y=ecdfs.pos2[,1]),  size=1, colour = "red") +
  scale_x_continuous(name="predicted probability") +
  scale_y_continuous(name="eCDF percentage") + 
  theme(axis.title.x = element_text(colour="#000000", size=12),
        axis.title.y = element_text(colour="#000000", size=12),
        axis.text.x  = element_text(colour="#000000",size=10),
        axis.text.y  = element_text(colour="#000000",size=10),
        plot.title = element_text( face="bold", size=12,hjust=0.5),
        axis.line = element_line(colour = "black"),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.grid.minor = element_line(colour = "grey",size=0.2),
        panel.grid.major = element_line(colour = "grey",size=0.2),
        plot.margin=unit(c(0,0,0,0),"mm"))+
  ggtitle("blood (red) vs.\n random GWAS (green)")

#' eCDF plot random GWAS versus random non-GWAS
selected.pos <- (gwas[,SNPs][as.numeric(gwas[,`p-Value`]) < 5e-8])
selected.pos <- unique(unlist(strsplit(selected.pos,split = ', ')))
selected.pos <- unique(unlist(strsplit(selected.pos,split = ',')))
selected.pos <- selected.pos[selected.pos %in% variants[,ID]]
feat.pos <- feat.matrix[ !feat.matrix[,variant] %in% selected.pos,]

ecdfs.pos3 = matrix(NA,nrow=length(seq(0,1,length=1e3)), ncol=times+1)
ecdfs.neg3 = matrix(NA,nrow=length(seq(0,1,length=1e3)), ncol=times+1)
ks.less <- 0
for(x in 1:times) {
  feat.test <- feat.pos[sample(1:dim(feat.pos)[1], num.gwas), c(1:3,model), with=FALSE]
  num.gwas <- dim(feat.test)[1]
  feat.test <- rbind(feat.test, feat.neg[sample(1:dim(feat.neg)[1], num.gwas), c(1:3,model), with=FALSE])
  predictions <- predict(m, feat.test, type='response')
  
  if(ks.boot(predictions[1:num.gwas], predictions[(num.gwas+1):dim(feat.test)[1]],alternative = "less")$ks.boot.pvalue < 0.05) {
    ks.less <- ks.less + 1
  }
  
  ecdfs.pos3[,x+1] <- ecdf(predictions[1:num.gwas])(seq(0,1,length=1e3))
  ecdfs.neg3[,x+1] <- ecdf(predictions[(num.gwas+1):dim(feat.test)[1]])(seq(0,1,length=1e3))
}
print(paste('random GWAS vs random non GWAS',ks.less, sep=', '))

ecdfs.pos3 <- as.data.frame(ecdfs.pos3)
ecdfs.pos.mean3 <- apply(ecdfs.pos3[,2:times+1], 1, function(row) mean(row[-1]))
ecdfs.pos.sd3 <- apply(ecdfs.pos3[,2:times+1], 1, function(row) sd(row[-1]))
eb.pos3 <- aes(ymax = ecdfs.pos.mean3 + ecdfs.pos.sd3, 
               ymin = ecdfs.pos.mean3 - ecdfs.pos.sd3, 
               x = seq(0,1,length=1e3))

ecdfs.neg3 <- as.data.frame(ecdfs.neg3)
ecdfs.neg.mean3 <- apply(ecdfs.neg3[,2:times+1], 1, function(row) mean(row[-1]))
ecdfs.neg.sd3 <- apply(ecdfs.neg3[,2:times+1], 1, function(row) sd(row[-1]))
eb.neg3 <- aes(ymax = ecdfs.neg.mean3 + ecdfs.neg.sd3, 
               ymin = ecdfs.neg.mean3 - ecdfs.neg.sd3,
               x = seq(0,1,length=1e3))

plot3 <- ggplot() + 
  geom_line(data = ecdfs.neg3[,2:times+1], aes(x = seq(0,1,length=1e3), y = ecdfs.neg.mean3), size=1, colour="#009E73") + 
  geom_ribbon(eb.neg3, alpha = 0.2, fill="#009E73") +
  geom_line(data = ecdfs.pos3[,2:times+1], aes(x = seq(0,1,length=1e3), y = ecdfs.pos.mean3), size=1, colour="blue") + 
  geom_ribbon(eb.pos3, alpha = 0.2, fill="blue") +
  scale_x_continuous(name="predicted probability") +
  scale_y_continuous(name="eCDF percentage") + 
  theme(axis.title.x = element_text(colour="#000000", size=12),
        axis.title.y = element_text(colour="#000000", size=12),
        axis.text.x  = element_text(colour="#000000",size=10),
        axis.text.y  = element_text(colour="#000000",size=10),
        plot.title = element_text( face="bold", size=12, hjust=0.5),
        axis.line = element_line(colour = "black"),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.grid.minor = element_line(colour = "grey",size=0.2),
        panel.grid.major = element_line(colour = "grey",size=0.2),
        plot.margin=unit(c(0,0,0,0),"mm"))+
  ggtitle("random GWAS (green) vs.\n random non-GWAS (blue)")

pdf(file = "plot_gwas.pdf", width = 10.3, height = 3)
grid.arrange(plot1, plot2, plot3, nrow=1, ncol=3)
dev.off()

#' clean up
rm(list=ls(pattern="ecdfs"))
rm(list=ls(pattern="plot"))
rm(feat.neg, gwas, m, feat.pos, selected.pos, diseases, feat.test, predictions, selected.neg,
   eb.neg3, eb.pos3, eb, eb2, ks.less, ks.greater, ks.two, times, x, num.gwas)
gc()

###########################################################
#################   DISTANCE BIAS PLOT   ##################
###########################################################

pos <- c(4,5,8:23,100:106,108)
dist.feats.eqtls <- list()
dist.feats.all <- list()
for(x in pos){
  # just eQTLs
  dist.feats.eqtls[[length(dist.feats.eqtls)+1]] <- (1 - feat.matrix[y==1 & as.vector(feat.matrix[,x,with=FALSE]==1) ,distance]) * 500000
  # all SNPs
  dist.feats.all[[length(dist.feats.all)+1]] <- (1 - feat.matrix[as.vector(feat.matrix[,x,with=FALSE]==1) ,distance]) * 500000
}
names(dist.feats.eqtls) <- c("promoter (specific)", "promoter (unspecific)", "DNaseI", "15_Repetitive/CNV","13_Heterochrom/lo",   
                             "8_Insulator","11_Weak_Txn","7_Weak_Enhancer","10_Txn_Elongation","9_Txn_Transition",    
                             "2_Weak_Promoter","1_Active_Promoter","3_Poised_Promoter","12_Repressed","6_Weak_Enhancer",     
                             "14_Repetitive/CNV","5_Strong_Enhancer","4_Strong_Enhancer","insulator in between", "5p",             
                             "3p", "hairpin loop", "Drosha cut sites","mRNA-eQTL","intragenic", "TFBS")
names(dist.feats.all) <- names(dist.feats.eqtls)

#' plot dist eQTLs
dist.feats.eqtls <- dist.feats.eqtls[order(vapply(dist.feats.eqtls, median, 0))]
dist.feats.eqtls <- data.frame(x = unlist(dist.feats.eqtls), 
                         grp = rep(names(dist.feats.eqtls),times = sapply(dist.feats.eqtls,length)))
dist.feats.eqtls$grp2 <- factor(dist.feats.eqtls$grp, as.character(dist.feats.eqtls$grp))

pdf(file = "plot_dist_eqtl.pdf", height = 6)

options(scipen=10000)
ggplot(dist.feats.eqtls, aes(x = grp2, y = x)) +
  geom_boxplot(size=0.85) + 
  ylab("distance") +
  theme(axis.title.x = element_text(colour="#000000",size=12, vjust=0.2),
        axis.title.y = element_blank(),
        axis.text.x  = element_text(colour="#000000",size=12),
        axis.text.y  = element_text(colour="#000000",size=12),
        plot.title = element_text( face="bold", size=12, hjust=0.5, vjust = 1.5),
        panel.background = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"),
        panel.grid.minor = element_line(colour = "grey",size=0.2),
        panel.grid.major = element_line(colour = "grey",size=0.2),
        plot.margin=unit(c(4,5,2,4),"mm"))+
  ggtitle("distance of miRNA-eQTL SNPs") +
  coord_flip() 

dev.off()

#' plot dist all SNPs
dist.feats.all <- dist.feats.all[order(vapply(dist.feats.all, median, 0))]
dist.feats.all <- data.frame(x = unlist(dist.feats.all), 
                               grp = rep(names(dist.feats.all),times = sapply(dist.feats.all,length)))
dist.feats.all$grp2 <- factor(dist.feats.all$grp, as.character(dist.feats.all$grp))

pdf(file = "plot_dist_all.pdf", height = 6)

options(scipen=10000)
ggplot(dist.feats.all, aes(x = grp2, y = x)) +
  geom_boxplot(size=0.85) + 
  ylab("distance") +
  theme(axis.title.x = element_text(colour="#000000",size=12, vjust=0.2),
        axis.title.y = element_blank(),
        axis.text.x  = element_text(colour="#000000",size=12),
        axis.text.y  = element_text(colour="#000000",size=12),
        plot.title = element_text( face="bold", size=12, hjust=0.5, vjust = 1.5),
        panel.background = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(colour = "black"),
        panel.grid.minor = element_line(colour = "grey",size=0.2),
        panel.grid.major = element_line(colour = "grey",size=0.2),
        plot.margin=unit(c(4,5,2,4),"mm"))+
  ggtitle("distance of all SNPs") +
  coord_flip() 

dev.off()

###########################################################
############   enrichment mirna/host-eQTL     #############
###########################################################

feat.eqtl <- feat.matrix[intragenic==1 & ( y == 1 | hosteqtl == 1),]
pos <- c(setdiff(model, 105:107), 24:99, 5:7) # check model features and all individual TFs

#### shared eQTLs #####
enrichment <- c()
for(x in pos) {
  common <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$y==1 & feat.eqtl$hosteqtl==1),
                                                      which(feat.eqtl[,x,with=FALSE] == 1))]))
  not.common <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 1])) - common
  m3 <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$y==1 & feat.eqtl$hosteqtl==1),
                                                  which(feat.eqtl[,x,with=FALSE] == 0))]))
  m4 <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 0])) - m3
  enrichment <- c(enrichment,fisher.test(matrix(c(common, not.common,m3,m4),nrow = 2, ncol = 2, byrow = TRUE),
                                         alternative="greater")$p.value)
}
names(enrichment) <- colnames(feat.matrix[,pos,with=FALSE])
enrich.common <- as.matrix(enrichment)

#### miRNA-only eQTLs #####
enrichment <- c()
for(x in pos) {
  only <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$y==1 & feat.eqtl$hosteqtl==0),
                                                    which(feat.eqtl[,x,with=FALSE] == 1))]))
  not.only <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 1])) - only
  m3 <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$y==1 & feat.eqtl$hosteqtl==0),
                                                  which(feat.eqtl[,x,with=FALSE] == 0))]))
  m4 <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 0]))  - m3
  enrichment <- c(enrichment, fisher.test(matrix(c(only, not.only, m3, m4), nrow = 2, ncol = 2, byrow = TRUE),
                                          alternative="greater")$p.value)
}
names(enrichment) <- colnames(feat.matrix[,pos,with=FALSE])
enrich.mirna.only <- as.matrix(enrichment)

#### host-only eQTLs #####
enrichment <- c()
for(x in pos) {
  only <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$y==0 & feat.eqtl$hosteqtl==1),
                                                    which(feat.eqtl[,x,with=FALSE] == 1))]))
  not.only <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 1])) - only
  m3 <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$y==0 & feat.eqtl$hosteqtl==1),
                                                  which(feat.eqtl[,x,with=FALSE] == 0))]))
  m4 <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 0])) - m3
  enrichment <- c(enrichment, fisher.test(matrix(c(only, not.only,m3,m4), nrow = 2, ncol = 2, byrow = TRUE),
                                          alternative="greater")$p.value)
}
names(enrichment) <- colnames(feat.matrix[,pos,with=FALSE])
enrich.host.only <- as.matrix(enrichment)

# load shared/independent classification
independence <- read.table("host_mirna_independence.txt", sep='\t', stringsAsFactors=FALSE, header=TRUE)
feat.matrix[,independent := rep(0,dim(feat.matrix)[1])]
for(x in 1:dim(independence)[1]) {
  is.independent <- (independence$independent[x] == TRUE & independence$shared[x] == TRUE) * 1 
  feat.matrix[feat.matrix[,miRNA] == independence$mirna[x] & feat.matrix[,variant] == independence$snp[x], independent := is.independent]
}
feat.eqtl <- feat.matrix[intragenic==1 & ( y == 1 | hosteqtl == 1),]

#### independent eQTLs #####
enrichment <- c()
for(x in pos) {
  common <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$independent==1),
                                                      which(feat.eqtl[,x,with=FALSE] == 1))]))
  not.common <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 1])) - common
  m3 <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$independent==1),
                                                  which(feat.eqtl[,x,with=FALSE] == 0))]))
  m4 <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 0])) - m3
  enrichment <- c(enrichment,fisher.test(matrix(c(common, not.common,m3,m4),nrow = 2, ncol = 2, byrow = TRUE),
                                         alternative="greater")$p.value)
}
names(enrichment) <- colnames(feat.matrix[,pos,with=FALSE])
enrich.independent <- as.matrix(enrichment)

#### not independent eQTLs #####
enrichment <- c()
for(x in pos) {
  only <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$independent==0),
                                                    which(feat.eqtl[,x,with=FALSE] == 1))]))
  not.only <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 1])) - only
  m3 <- length(unique(feat.eqtl$variant[intersect(which(feat.eqtl$independent==0),
                                                  which(feat.eqtl[,x,with=FALSE] == 0))]))
  m4 <- length(unique(feat.eqtl$variant[feat.eqtl[,x,with=FALSE] == 0]))  - m3
  enrichment <- c(enrichment, fisher.test(matrix(c(only, not.only, m3, m4), nrow = 2, ncol = 2, byrow = TRUE),
                                          alternative="greater")$p.value)
}
names(enrichment) <- colnames(feat.matrix[,pos,with=FALSE])
enrich.not.independent <- as.matrix(enrichment)

enrichments <- as.data.frame(cbind(enrich.mirna.only, enrich.common, enrich.host.only,
                                   enrich.independent, enrich.not.independent))
rownames(enrichments) <- rownames(enrich.common)
colnames(enrichments) <- c('mirna only', 'common', 'host only', 'independent (subset of common)', 'not independent (subset of common)')

#' clean up
rm(dist.feats.all, feat.eqtl, is.3p, dist.feats.eqtls, enrich.common, enrich.host.only,
   enrich.mirna.only, enrichment, pos, common, m3,m4,not.common, not.only, only,x,model,mirnas,
   enrich.independent, enrich.not.independent, independence, mirnas.pre, is.independent)
gc()

#' variables which are left now: 'feat.matrix', 'coefs', 'ci', 'enrichments', 'variants'
