Description

Figures

# Restore the saved workspace objects; the join below expects the .Rda to
# provide `kgca_filtered_mr_gtype3_expr`.
load("~/GIT/tcgaMut/ppts/kgca_filtered_mr_gtype3_expr.Rda")

# Per-gene covariate table, keyed by its `gene` column.
covar <- fread("/home/tc/DATA/dataset/pan20/covar")

# Attach the covariates to each gene row (left join: every gene row is kept).
kgca_filtered_mr_gtype3_expr <- left_join(
  kgca_filtered_mr_gtype3_expr,
  covar,
  by = c("Hugo_Symbol" = "gene")
)

# Genes highlighted in the figures as the pyrimidine pathway.
Pyrimidine_pathway <- c("CAD", "UMPS", "DHODH", "CPS1")

# List of 22 gene symbols; not referenced by the visible plotting code.
sig22 <- c(
  "TP53", "PIK3CA", "PTEN", "RB1",
  "KRAS", "NRAS", "BRAF", "CDKN2A",
  "FBXW7", "ARID1A",
  "MLL2", # not found
  "STAG2", "ATM", "CASP8", "CTCF",
  "ERBB3", "HLA-A", "HRAS", "IDH1",
  "NF1", "NFE2L2", "PIK3R1"
)
# Manual colour palette for the Genetype-coloured points.
oppallate <- c("#000000", "#0072B2")

# Figure 1: gene mutation rate (log10 y axis) against log2 expression level.
gg <- ggplot(
  data = kgca_filtered_mr_gtype3_expr,
  mapping = aes(x = log2(expr), y = mrca)
) +
  # Trend line over all genes; the smoothing method is left at ggplot2's default.
  geom_smooth(se = FALSE, colour = "grey") +
  # Binned median with lower/upper quantiles. `median_hilow` defaults to
  # conf.int = 0.95 (2.5% / 97.5% quantiles); the figure description in this
  # document states 25% / 75%, so the interquartile range is requested
  # explicitly.
  stat_summary_bin(
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "errorbar",
    bins = 20,
    width = 0.2,
    colour = "grey"
  ) +
  # Same summary drawn as point + range. `width` is not a pointrange
  # parameter, so it is no longer passed here.
  stat_summary_bin(
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "pointrange",
    bins = 20,
    colour = "grey"
  ) +
  # Oncogenes and TSGs as open circles coloured by class.
  geom_point(
    data = kgca_filtered_mr_gtype3_expr %>%
      filter(Genetype %in% c("oncogene", "TSG")),
    mapping = aes(colour = Genetype),
    shape = 21
  ) +
  # Rows whose Genetype is "Pyrimidine_biosyn" in red.
  geom_point(
    data = kgca_filtered_mr_gtype3_expr %>%
      filter(Genetype %in% c("Pyrimidine_biosyn")),
    colour = "red"
  ) +
  scale_colour_manual(values = oppallate) +
  scale_y_log10() +
  # Label the pyrimidine-pathway genes by symbol.
  geom_text_repel(
    data = kgca_filtered_mr_gtype3_expr %>%
      filter(Hugo_Symbol %in% c(Pyrimidine_pathway)),
    mapping = aes(label = Hugo_Symbol),
    colour = "red"
  ) +
  xlab("Gene expression level (log2)") +
  ylab("Gene mutation rate (#Mutations / bp)") +
  theme_classic()
print(gg)

#fread("~/Downloads/Census_allSat May 20 05-12-33 2017.csv") ->cgc
#cgc %>% filter(grepl("Mis|N|F|S",`Mutation Types`)) ->cgc.filtered
#cgc %>% filter(grepl("Mis|N|F|S",`Mutation Types`)) ->cgc.filtered
#cgc.filtered %>% mutate(Hugo_Symbol=`Gene Symbol`,Genetype=`Role in Cancer`) %>% select(Hugo_Symbol,Genetype) ->cgc.filtered.min
#load("~/GIT/tcgaMut/data/CG.uni_cgc_filtered.rda")
# Census table export (presumably the COSMIC Cancer Gene Census -- confirm).
cgc <- fread("~/Downloads/Census_allSat May 20 05-12-33 2017.csv")

# Keep rows whose `Mutation Types` field matches any of Mis, N, F or S.
cgc.filtered <- filter(cgc, grepl("Mis|N|F|S", `Mutation Types`))

# Reduce to the two columns used for annotation, renamed to the names used
# by the rest of the analysis.
cgc.filtered.min <- select(
  mutate(
    cgc.filtered,
    Hugo_Symbol = `Gene Symbol`,
    Genetype = `Role in Cancer`
  ),
  Hugo_Symbol,
  Genetype
)

# Per-gene significance table (the plots label its `q` column as the
# MutsigCV Q-value).
pan20.sigg <- fread("~/DATA/dataset/pan20/pan20_out.sig_genes.txt")

# Annotate every gene with its census role, then:
#   q2       - q floored at 2.2e-16 so it can be drawn on a log axis
#   Genetype - genes without a census role become "Non-cancer"
pan20.sigg.cgc <- pan20.sigg %>%
  left_join(cgc.filtered.min, by = c("gene" = "Hugo_Symbol")) %>%
  mutate(
    q2 = ifelse(q < 2.2e-16, 2.2e-16, q),
    Genetype = ifelse(is.na(Genetype), "Non-cancer", Genetype)
  )
# pan20.sigg.cgc <-
#   left_join(pan20.sigg.cgc,covar,by="gene")
  
  
# Figure 2: MutsigCV q-value against log2 expression level.
gg <- ggplot(
  data = pan20.sigg.cgc %>%
    filter(Genetype %in% c("oncogene", "TSG", "Non-cancer")),
  mapping = aes(x = log2(expr), y = q)
) +
  # Binned summary of the non-cancer background. `median_hilow` defaults to
  # conf.int = 0.95 (2.5% / 97.5% quantiles); the figure description in this
  # document states 25% / 75%, so the interquartile range is requested
  # explicitly.
  stat_summary_bin(
    data = pan20.sigg.cgc %>% filter(Genetype %in% c("Non-cancer")),
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "errorbar",
    bins = 10,
    width = 0.2,
    colour = "grey"
  ) +
  # Same summary drawn as point + range. `width` is not a pointrange
  # parameter, so it is no longer passed here.
  stat_summary_bin(
    data = pan20.sigg.cgc %>% filter(Genetype %in% c("Non-cancer")),
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "pointrange",
    bins = 10,
    colour = "grey"
  ) +
  # Oncogenes and TSGs as open circles coloured by class (the stray trailing
  # comma that left an empty argument in this call has been removed).
  geom_point(
    data = pan20.sigg.cgc %>% filter(Genetype %in% c("oncogene", "TSG")),
    mapping = aes(colour = Genetype),
    shape = 21
  ) +
  # Pyrimidine-pathway genes: larger red points with blue labels.
  geom_point(
    data = pan20.sigg.cgc %>% filter(gene %in% Pyrimidine_pathway),
    colour = "red",
    size = 2
  ) +
  geom_text_repel(
    data = pan20.sigg.cgc %>% filter(gene %in% Pyrimidine_pathway),
    mapping = aes(label = gene),
    colour = "blue"
  ) +
  # Limits given high-to-low reverse the axis, so small q-values plot on top.
  ylim(1, 0) +
  scale_colour_manual(values = oppallate) +
  # Explicit x label, matching the wording used for figure 1.
  xlab("Gene expression level (log2)") +
  ylab("MutsigCV Q-value") +
  theme_classic()
print(gg)

  # Jittered scatter of the floored q-value (q2) per gene class, log10 y axis.
  # Pyrimidine-pathway genes are overlaid as yellow points with blue labels.
  gg <- ggplot(
    data = filter(
      pan20.sigg.cgc,
      Genetype %in% c("oncogene", "TSG", "Non-cancer")
    ),
    mapping = aes(x = Genetype, y = q2)
  ) +
    geom_jitter() +
    geom_point(
      data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
      colour = "yellow"
    ) +
    geom_text_repel(
      data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
      mapping = aes(label = gene),
      colour = "blue"
    ) +
    scale_y_log10() +
    ylab("MutsigCV Q-value")

  print(gg)
# Violin plot of the floored q-value (q2) per gene class, log10 y axis.
# Pyrimidine-pathway genes are overlaid as red points with blue labels.
# (A disabled variant relabelled those genes as their own
# "Pyrimidine_biosyn" Genetype instead of overlaying them.)
gg <- ggplot(
  data = filter(
    pan20.sigg.cgc,
    Genetype %in% c("oncogene", "TSG", "Non-cancer")
  ),
  mapping = aes(x = Genetype, y = q2)
) +
  geom_violin() +
  geom_point(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    colour = "red"
  ) +
  geom_text_repel(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    mapping = aes(label = gene),
    colour = "blue"
  ) +
  scale_y_log10() +
  ylab("MutsigCV Q-value")

print(gg)

# Boxplot of the floored q-value (q2) per gene class, log10 y axis.
# Pyrimidine-pathway genes are overlaid as red points with blue labels.
# (A disabled variant relabelled those genes as their own
# "Pyrimidine_biosyn" Genetype instead of overlaying them.)
gg <- ggplot(
  data = filter(
    pan20.sigg.cgc,
    Genetype %in% c("oncogene", "TSG", "Non-cancer")
  ),
  mapping = aes(x = Genetype, y = q2)
) +
  geom_boxplot() +
  geom_point(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    colour = "red"
  ) +
  geom_text_repel(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    mapping = aes(label = gene),
    colour = "blue"
  ) +
  scale_y_log10() +
  ylab("MutsigCV Q-value")

print(gg)

---
title: "R Notebook"
output:
  bookdown::html_document2:
    fig_caption: yes
    toc: no
    number_sections: no
  html_notebook: default
---


- method parag
- descript

- caption

```{r setup, include=FALSE}
# Document-wide chunk defaults: cache chunk results, hide the source code,
# emit results as-is, centre figures and suppress warnings in the output.
knitr::opts_chunk$set(cache=TRUE, echo = FALSE,results = 'asis',fig.align = "center",warning = FALSE)

#' Install any missing packages, then attach all requested packages.
#'
#' @param x Character vector of package names.
#' @return Named logical vector with one element per entry of `x`: `TRUE`
#'   when the package was attached, `FALSE` otherwise. The result is always
#'   logical, including `logical(0)` for empty input (the previous `sapply()`
#'   call returned an empty list in that case).
download_and_or_load <- function(x) {
  # Packages that are requested but not yet in any library.
  missing_pkgs <- x[!(x %in% installed.packages()[, "Package"])]
  if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs, dependencies = TRUE)
  }

  # require() reports failure by returning FALSE rather than erroring, so
  # collect the flags type-stably and surface any failures explicitly.
  loaded <- vapply(x, require, logical(1), character.only = TRUE)
  if (!all(loaded)) {
    warning(
      "Failed to load package(s): ",
      paste(names(loaded)[!loaded], collapse = ", "),
      call. = FALSE
    )
  }
  loaded
}

# Packages this notebook depends on, attached in the order listed.
my_libraries <- c(
  "ggplot2",
  "dplyr",
  "tidyr",
  "citr",
  "knitr",
  "xtable",
  "pander",
  "data.table",
  "ggrepel"
)
download_and_or_load(my_libraries)



```

## Description

- Figure \@ref(fig:fig1)
    - Scatterplot of gene mutation rate against average expression level across 91 cell lines in the CCLE (Microarray)
    - Error bars show the median and the 25% and 75% quantiles of the gene mutation rate
    - smooth line is calculated using generalized additive model (GAM) in ggplot2
    
- Figure \@ref(fig:fig2) shows the MutsigCV q-value against expression level.
    - Error bars show the median and the 25% and 75% quantiles of the q-value; these are compressed to 1.00, meaning they are not significant



- Figure \@ref(fig:fig3) might be more interesting. It is a violin plot of the MutsigCV q-value in non-cancer genes, oncogenes and TSGs, which shows more detail on the density than the boxplot in Figure \@ref(fig:fig4).

## Figures


```{r}
# Restore the saved workspace objects; the join below expects the .Rda to
# provide `kgca_filtered_mr_gtype3_expr`.
load("~/GIT/tcgaMut/ppts/kgca_filtered_mr_gtype3_expr.Rda")

# Per-gene covariate table, keyed by its `gene` column.
covar <- fread("/home/tc/DATA/dataset/pan20/covar")

# Attach the covariates to each gene row (left join: every gene row is kept).
kgca_filtered_mr_gtype3_expr <- left_join(
  kgca_filtered_mr_gtype3_expr,
  covar,
  by = c("Hugo_Symbol" = "gene")
)

# Genes highlighted in the figures as the pyrimidine pathway.
Pyrimidine_pathway <- c("CAD", "UMPS", "DHODH", "CPS1")

# List of 22 gene symbols; not referenced by the visible plotting code.
sig22 <- c(
  "TP53", "PIK3CA", "PTEN", "RB1",
  "KRAS", "NRAS", "BRAF", "CDKN2A",
  "FBXW7", "ARID1A",
  "MLL2", # not found
  "STAG2", "ATM", "CASP8", "CTCF",
  "ERBB3", "HLA-A", "HRAS", "IDH1",
  "NF1", "NFE2L2", "PIK3R1"
)

```











```{r fig1, warning=FALSE,message=FALSE,fig.cap="Gene mutation rate against expression level"}

# Manual colour palette for the Genetype-coloured points.
oppallate <- c("#000000", "#0072B2")

# Figure 1: gene mutation rate (log10 y axis) against log2 expression level.
gg <- ggplot(
  data = kgca_filtered_mr_gtype3_expr,
  mapping = aes(x = log2(expr), y = mrca)
) +
  # Trend line over all genes; the smoothing method is left at ggplot2's default.
  geom_smooth(se = FALSE, colour = "grey") +
  # Binned median with lower/upper quantiles. `median_hilow` defaults to
  # conf.int = 0.95 (2.5% / 97.5% quantiles); the figure description in this
  # document states 25% / 75%, so the interquartile range is requested
  # explicitly.
  stat_summary_bin(
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "errorbar",
    bins = 20,
    width = 0.2,
    colour = "grey"
  ) +
  # Same summary drawn as point + range. `width` is not a pointrange
  # parameter, so it is no longer passed here.
  stat_summary_bin(
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "pointrange",
    bins = 20,
    colour = "grey"
  ) +
  # Oncogenes and TSGs as open circles coloured by class.
  geom_point(
    data = kgca_filtered_mr_gtype3_expr %>%
      filter(Genetype %in% c("oncogene", "TSG")),
    mapping = aes(colour = Genetype),
    shape = 21
  ) +
  # Rows whose Genetype is "Pyrimidine_biosyn" in red.
  geom_point(
    data = kgca_filtered_mr_gtype3_expr %>%
      filter(Genetype %in% c("Pyrimidine_biosyn")),
    colour = "red"
  ) +
  scale_colour_manual(values = oppallate) +
  scale_y_log10() +
  # Gene-symbol labels are disabled in this figure:
  #   geom_text_repel(data=kgca_filtered_mr_gtype3_expr %>% filter(Hugo_Symbol %in% c(Pyrimidine_pathway)),aes(label = Hugo_Symbol),colour="red")+
  xlab("Gene expression level (log2)") +
  ylab("Gene mutation rate (#Mutations / bp)") +
  theme_classic()

print(gg)


```

```{r}

```


```{r}
#fread("~/Downloads/Census_allSat May 20 05-12-33 2017.csv") ->cgc
#cgc %>% filter(grepl("Mis|N|F|S",`Mutation Types`)) ->cgc.filtered
#cgc %>% filter(grepl("Mis|N|F|S",`Mutation Types`)) ->cgc.filtered

#cgc.filtered %>% mutate(Hugo_Symbol=`Gene Symbol`,Genetype=`Role in Cancer`) %>% select(Hugo_Symbol,Genetype) ->cgc.filtered.min

#load("~/GIT/tcgaMut/data/CG.uni_cgc_filtered.rda")

# Census table export (presumably the COSMIC Cancer Gene Census -- confirm).
cgc <- fread("~/Downloads/Census_allSat May 20 05-12-33 2017.csv")

# Keep rows whose `Mutation Types` field matches any of Mis, N, F or S.
cgc.filtered <- filter(cgc, grepl("Mis|N|F|S", `Mutation Types`))

# Reduce to the two columns used for annotation, renamed to the names used
# by the rest of the analysis.
cgc.filtered.min <- select(
  mutate(
    cgc.filtered,
    Hugo_Symbol = `Gene Symbol`,
    Genetype = `Role in Cancer`
  ),
  Hugo_Symbol,
  Genetype
)

# Per-gene significance table (the plots label its `q` column as the
# MutsigCV Q-value).
pan20.sigg <- fread("~/DATA/dataset/pan20/pan20_out.sig_genes.txt")

# Annotate every gene with its census role, then:
#   q2       - q floored at 2.2e-16 so it can be drawn on a log axis
#   Genetype - genes without a census role become "Non-cancer"
pan20.sigg.cgc <- pan20.sigg %>%
  left_join(cgc.filtered.min, by = c("gene" = "Hugo_Symbol")) %>%
  mutate(
    q2 = ifelse(q < 2.2e-16, 2.2e-16, q),
    Genetype = ifelse(is.na(Genetype), "Non-cancer", Genetype)
  )

# pan20.sigg.cgc <-
#   left_join(pan20.sigg.cgc,covar,by="gene")
  
  
```


```{r fig2,fig.cap="MutsigCV q-value against expression level"}
# Figure 2: MutsigCV q-value against log2 expression level.
gg <- ggplot(
  data = pan20.sigg.cgc %>%
    filter(Genetype %in% c("oncogene", "TSG", "Non-cancer")),
  mapping = aes(x = log2(expr), y = q)
) +
  # Binned summary of the non-cancer background. `median_hilow` defaults to
  # conf.int = 0.95 (2.5% / 97.5% quantiles); the figure description in this
  # document states 25% / 75%, so the interquartile range is requested
  # explicitly.
  stat_summary_bin(
    data = pan20.sigg.cgc %>% filter(Genetype %in% c("Non-cancer")),
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "errorbar",
    bins = 10,
    width = 0.2,
    colour = "grey"
  ) +
  # Same summary drawn as point + range. `width` is not a pointrange
  # parameter, so it is no longer passed here.
  stat_summary_bin(
    data = pan20.sigg.cgc %>% filter(Genetype %in% c("Non-cancer")),
    fun.data = "median_hilow",
    fun.args = list(conf.int = 0.5),
    geom = "pointrange",
    bins = 10,
    colour = "grey"
  ) +
  # Oncogenes and TSGs as open circles coloured by class (the stray trailing
  # comma that left an empty argument in this call has been removed).
  geom_point(
    data = pan20.sigg.cgc %>% filter(Genetype %in% c("oncogene", "TSG")),
    mapping = aes(colour = Genetype),
    shape = 21
  ) +
  # Pyrimidine-pathway genes: larger red points with blue labels.
  geom_point(
    data = pan20.sigg.cgc %>% filter(gene %in% Pyrimidine_pathway),
    colour = "red",
    size = 2
  ) +
  geom_text_repel(
    data = pan20.sigg.cgc %>% filter(gene %in% Pyrimidine_pathway),
    mapping = aes(label = gene),
    colour = "blue"
  ) +
  # Limits given high-to-low reverse the axis, so small q-values plot on top.
  ylim(1, 0) +
  scale_colour_manual(values = oppallate) +
  # Explicit x label, matching the wording used for figure 1.
  xlab("Gene expression level (log2)") +
  ylab("MutsigCV Q-value") +
  theme_classic()

print(gg)
```



```{r eval=FALSE}


  # Jittered scatter of the floored q-value (q2) per gene class, log10 y axis.
  # Pyrimidine-pathway genes are overlaid as yellow points with blue labels.
  gg <- ggplot(
    data = filter(
      pan20.sigg.cgc,
      Genetype %in% c("oncogene", "TSG", "Non-cancer")
    ),
    mapping = aes(x = Genetype, y = q2)
  ) +
    geom_jitter() +
    geom_point(
      data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
      colour = "yellow"
    ) +
    geom_text_repel(
      data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
      mapping = aes(label = gene),
      colour = "blue"
    ) +
    scale_y_log10() +
    ylab("MutsigCV Q-value")

  print(gg)
```






```{r fig3,fig.cap= "MutsigCV Q-value distribution"}
# Violin plot of the floored q-value (q2) per gene class, log10 y axis.
# Pyrimidine-pathway genes are overlaid as red points with blue labels.
# (A disabled variant relabelled those genes as their own
# "Pyrimidine_biosyn" Genetype instead of overlaying them.)
gg <- ggplot(
  data = filter(
    pan20.sigg.cgc,
    Genetype %in% c("oncogene", "TSG", "Non-cancer")
  ),
  mapping = aes(x = Genetype, y = q2)
) +
  geom_violin() +
  geom_point(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    colour = "red"
  ) +
  geom_text_repel(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    mapping = aes(label = gene),
    colour = "blue"
  ) +
  scale_y_log10() +
  ylab("MutsigCV Q-value")

print(gg)

```



```{r fig4, fig.cap="MutsigCV Q-value distribution (boxplot)"}
# Boxplot of the floored q-value (q2) per gene class, log10 y axis.
# Pyrimidine-pathway genes are overlaid as red points with blue labels.
# (A disabled variant relabelled those genes as their own
# "Pyrimidine_biosyn" Genetype instead of overlaying them.)
gg <- ggplot(
  data = filter(
    pan20.sigg.cgc,
    Genetype %in% c("oncogene", "TSG", "Non-cancer")
  ),
  mapping = aes(x = Genetype, y = q2)
) +
  geom_boxplot() +
  geom_point(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    colour = "red"
  ) +
  geom_text_repel(
    data = filter(pan20.sigg.cgc, gene %in% Pyrimidine_pathway),
    mapping = aes(label = gene),
    colour = "blue"
  ) +
  scale_y_log10() +
  ylab("MutsigCV Q-value")

print(gg)

```

