1.

The histogram shows a unimodal distribution. The distribution is skewed right. Most of the genes from this histogram are not highly expressed.

library(dplyr)
library(magrittr)
setwd("/Volumes/Exabyte/702/part1_rnaseq/")

# Load starter code
# source("part1.R")
counts.table <- read.csv("counts_table.csv")
exon.table <- read.csv("exon_table.csv")

hist(log10(counts.table$CXH_counts), breaks = 50)

2.

dplyr::select(exon.table, gene.id, CXH_RPKM) %>% arrange(desc(CXH_RPKM)) %>% dplyr::slice(1:10)
##            gene.id CXH_RPKM
## 1  ENSG00000198804 15774731
## 2  ENSG00000210082  7771213
## 3  ENSG00000237973  7650812
## 4  ENSG00000198712  6547179
## 5  ENSG00000198886  4614334
## 6  ENSG00000198888  4041698
## 7  ENSG00000198727  3212273
## 8  ENSG00000198938  3101698
## 9  ENSG00000230043  3076523
## 10 ENSG00000196205  2663679
dplyr::select(exon.table, gene.id, CXI_RPKM) %>% arrange(desc(CXI_RPKM)) %>% dplyr::slice(1:10)
##            gene.id CXI_RPKM
## 1  ENSG00000198804 14682736
## 2  ENSG00000210082  7650961
## 3  ENSG00000237973  6991689
## 4  ENSG00000198712  5194261
## 5  ENSG00000198886  4276665
## 6  ENSG00000230043  3551071
## 7  ENSG00000198888  3035025
## 8  ENSG00000196205  2775812
## 9  ENSG00000198938  2616787
## 10 ENSG00000244457  2504184
dplyr::select(exon.table, gene.id, DPG_RPKM) %>% arrange(desc(DPG_RPKM)) %>% dplyr::slice(1:10)
##            gene.id DPG_RPKM
## 1  ENSG00000210082 15520274
## 2  ENSG00000198804 13846785
## 3  ENSG00000198886  9721084
## 4  ENSG00000198712  6792592
## 5  ENSG00000244457  6637650
## 6  ENSG00000198786  5800161
## 7  ENSG00000198888  5085901
## 8  ENSG00000198763  4176377
## 9  ENSG00000211459  3032629
## 10 ENSG00000237973  2880082
dplyr::select(exon.table, gene.id, DPK_RPKM) %>% arrange(desc(DPK_RPKM)) %>% dplyr::slice(1:10)
##            gene.id DPK_RPKM
## 1  ENSG00000210082 16488875
## 2  ENSG00000198804 13957243
## 3  ENSG00000198886  7347955
## 4  ENSG00000198712  6417172
## 5  ENSG00000244457  4595578
## 6  ENSG00000198786  4184517
## 7  ENSG00000198888  4000233
## 8  ENSG00000237973  3377160
## 9  ENSG00000211459  3130758
## 10 ENSG00000198763  3128507

3.

rbind(
dplyr::select(exon.table, gene.id, CXH_counts, CXH_RPKM) %>%
  filter(gene.id == "ENSG00000143761"),
dplyr::select(exon.table, gene.id, CXH_counts, CXH_RPKM) %>%
  filter(gene.id == "ENSG00000240463")
)
##           gene.id CXH_counts  CXH_RPKM
## 1 ENSG00000143761       7647  11603.85
## 2 ENSG00000240463       7636 405550.65
coverage1 <- read.csv("coverage_table_chr1.csv")
coverage14 <- read.csv("coverage_table_chr14.csv")
plot(coverage1, type = "h", main = "ENSG00000143761 on Chromosome 1",
  xlab = "Position", ylab = "Expression", xlim = c(228260000, 228310000),
  ylim = c(0, 7000))
segments(228270361, 5500, 228286912, 5500)

plot(coverage14, type = "h", main = "ENSG00000240463 on Chromosome 14",
  xlab = "Position", ylab = "Expression", xlim = c(35037000, 35039000),
  ylim = c(0, 10000))
segments(35037895, 8000, 35038335, 8000)

4.

rbind(
dplyr::select(exon.table, gene.id, lengths, CXH_counts, CXH_RPKM) %>%
  filter(gene.id == "ENSG00000198744"),
dplyr::select(exon.table, gene.id, lengths, CXH_counts, CXH_RPKM) %>%
  filter(gene.id == "ENSG00000184009")
)
##           gene.id lengths CXH_counts CXH_RPKM
## 1 ENSG00000198744       1       5452 289557.6
## 2 ENSG00000184009      40     217505 288794.2
coverage17 <- read.csv("coverage_table_chr17.csv")

plot(coverage1, type = "h", main = "ENSG00000198744 on Chromosome 1",
  xlab = "Position", ylab = "Expression", xlim = c(569000, 571000),
  ylim = c(0, 20000))
segments(569756, 10000, 570302, 10000)

plot(coverage17, type = "h", main = "ENSG00000184009 on Chromosome 17",
  xlab = "Position", ylab = "Expression", xlim = c(79475000, 79500000),
  ylim = c(0, 40000))
segments(79476997, 37000, 79490873, 37000)

5.

To decide if there is differential gene expression between GM12878 and HepG2 I am using a two-sample t-test. This seems to be the most straightforward way to compare the expression levels of two genes.

ENSG00000250303 is significant (p < .01)

exon.table %>% dplyr::select(gene.id, CXH_RPKM, CXI_RPKM, DPG_RPKM, DPK_RPKM, p_value) %>%
  dplyr::filter(gene.id == "ENSG00000250303")
##           gene.id CXH_RPKM CXI_RPKM DPG_RPKM DPK_RPKM    p_value
## 1 ENSG00000250303 8.851725 50.17945  48.5407  52.5154 0.00723444
coverage11 <- read.csv("coverage_table_chr11.csv")

plot(coverage11, type = "h", main = "ENSG00000250303 on Chromosome 11 GM12878",
  xlab = "Position", ylab = "Expression", xlim = c(112141000, 112233500),
  ylim = c(0, 300))
segments(112141472, 250, 112233257, 250)

coverage11dpg <- read.csv("coverage_table_chr11_dpg.csv")

plot(coverage11dpg, type = "h", main = "ENSG00000250303 on Chromosome 11 HepG2",
  xlab = "Position", ylab = "Expression", xlim = c(112141000, 112233500),
  ylim = c(0, 300))
segments(112141472, 250, 112233257, 250)

ENSG00000227077 is not significant (p > .99)

exon.table %>% dplyr::select(gene.id, CXH_RPKM, CXI_RPKM, DPG_RPKM, DPK_RPKM, p_value) %>%
  filter(gene.id == "ENSG00000227077")
##           gene.id CXH_RPKM CXI_RPKM DPG_RPKM DPK_RPKM   p_value
## 1 ENSG00000227077 36008.82 30967.89 5048.233 5716.676 0.9999164
plot(coverage17, type = "h", main = "ENSG00000227077 on Chromosome 17 GM12878",
  xlab = "Position", ylab = "Expression", xlim = c(18476000, 18476450),
  ylim = c(0, 3000))
segments(18476045, 2000, 18476415, 2000)

coverage17dpg <- read.csv("coverage_table_chr17_dpg.csv")

plot(coverage17dpg, type = "h", main = "ENSG00000227077 on Chromosome 17 HepG2",
  xlab = "Position", ylab = "Expression", xlim = c(18476000, 18476450),
  ylim = c(0, 3000))
segments(18476045, 2000, 18476415, 2000)