The histogram shows a unimodal distribution. The distribution is skewed right. Most of the genes from this histogram are not highly expressed.
library(dplyr)
library(magrittr)
setwd("/Volumes/Exabyte/702/part1_rnaseq/")
# Load starter code
# source("part1.R")
counts.table <- read.csv("counts_table.csv")
exon.table <- read.csv("exon_table.csv")
hist(log10(counts.table$CXH_counts), breaks = 50)
dplyr::select(exon.table, gene.id, CXH_RPKM) %>% arrange(desc(CXH_RPKM)) %>% dplyr::slice(1:10)
## gene.id CXH_RPKM
## 1 ENSG00000198804 15774731
## 2 ENSG00000210082 7771213
## 3 ENSG00000237973 7650812
## 4 ENSG00000198712 6547179
## 5 ENSG00000198886 4614334
## 6 ENSG00000198888 4041698
## 7 ENSG00000198727 3212273
## 8 ENSG00000198938 3101698
## 9 ENSG00000230043 3076523
## 10 ENSG00000196205 2663679
dplyr::select(exon.table, gene.id, CXI_RPKM) %>% arrange(desc(CXI_RPKM)) %>% dplyr::slice(1:10)
## gene.id CXI_RPKM
## 1 ENSG00000198804 14682736
## 2 ENSG00000210082 7650961
## 3 ENSG00000237973 6991689
## 4 ENSG00000198712 5194261
## 5 ENSG00000198886 4276665
## 6 ENSG00000230043 3551071
## 7 ENSG00000198888 3035025
## 8 ENSG00000196205 2775812
## 9 ENSG00000198938 2616787
## 10 ENSG00000244457 2504184
dplyr::select(exon.table, gene.id, DPG_RPKM) %>% arrange(desc(DPG_RPKM)) %>% dplyr::slice(1:10)
## gene.id DPG_RPKM
## 1 ENSG00000210082 15520274
## 2 ENSG00000198804 13846785
## 3 ENSG00000198886 9721084
## 4 ENSG00000198712 6792592
## 5 ENSG00000244457 6637650
## 6 ENSG00000198786 5800161
## 7 ENSG00000198888 5085901
## 8 ENSG00000198763 4176377
## 9 ENSG00000211459 3032629
## 10 ENSG00000237973 2880082
dplyr::select(exon.table, gene.id, DPK_RPKM) %>% arrange(desc(DPK_RPKM)) %>% dplyr::slice(1:10)
## gene.id DPK_RPKM
## 1 ENSG00000210082 16488875
## 2 ENSG00000198804 13957243
## 3 ENSG00000198886 7347955
## 4 ENSG00000198712 6417172
## 5 ENSG00000244457 4595578
## 6 ENSG00000198786 4184517
## 7 ENSG00000198888 4000233
## 8 ENSG00000237973 3377160
## 9 ENSG00000211459 3130758
## 10 ENSG00000198763 3128507
rbind(
dplyr::select(exon.table, gene.id, CXH_counts, CXH_RPKM) %>%
filter(gene.id == "ENSG00000143761"),
dplyr::select(exon.table, gene.id, CXH_counts, CXH_RPKM) %>%
filter(gene.id == "ENSG00000240463")
)
## gene.id CXH_counts CXH_RPKM
## 1 ENSG00000143761 7647 11603.85
## 2 ENSG00000240463 7636 405550.65
coverage1 <- read.csv("coverage_table_chr1.csv")
coverage14 <- read.csv("coverage_table_chr14.csv")
plot(coverage1, type = "h", main = "ENSG00000143761 on Chromosome 1",
xlab = "Position", ylab = "Expression", xlim = c(228260000, 228310000),
ylim = c(0, 7000))
segments(228270361, 5500, 228286912, 5500)
plot(coverage14, type = "h", main = "ENSG00000240463 on Chromosome 14",
xlab = "Position", ylab = "Expression", xlim = c(35037000, 35039000),
ylim = c(0, 10000))
segments(35037895, 8000, 35038335, 8000)
rbind(
dplyr::select(exon.table, gene.id, lengths, CXH_counts, CXH_RPKM) %>%
filter(gene.id == "ENSG00000198744"),
dplyr::select(exon.table, gene.id, lengths, CXH_counts, CXH_RPKM) %>%
filter(gene.id == "ENSG00000184009")
)
## gene.id lengths CXH_counts CXH_RPKM
## 1 ENSG00000198744 1 5452 289557.6
## 2 ENSG00000184009 40 217505 288794.2
coverage17 <- read.csv("coverage_table_chr17.csv")
plot(coverage1, type = "h", main = "ENSG00000198744 on Chromosome 1",
xlab = "Position", ylab = "Expression", xlim = c(569000, 571000),
ylim = c(0, 20000))
segments(569756, 10000, 570302, 10000)
plot(coverage17, type = "h", main = "ENSG00000184009 on Chromosome 17",
xlab = "Position", ylab = "Expression", xlim = c(79475000, 79500000),
ylim = c(0, 40000))
segments(79476997, 37000, 79490873, 37000)
To decide if there is differential gene expression between GM12878 and HepG2 I am using a two-sample t-test. This seems to be the most straightforward way to compare the expression levels of two genes.
ENSG00000250303 is significant (p < .01)
exon.table %>% dplyr::select(gene.id, CXH_RPKM, CXI_RPKM, DPG_RPKM, DPK_RPKM, p_value) %>%
dplyr::filter(gene.id == "ENSG00000250303")
## gene.id CXH_RPKM CXI_RPKM DPG_RPKM DPK_RPKM p_value
## 1 ENSG00000250303 8.851725 50.17945 48.5407 52.5154 0.00723444
coverage11 <- read.csv("coverage_table_chr11.csv")
plot(coverage11, type = "h", main = "ENSG00000250303 on Chromosome 11 GM12878",
xlab = "Position", ylab = "Expression", xlim = c(112141000, 112233500),
ylim = c(0, 300))
segments(112141472, 250, 112233257, 250)
coverage11dpg <- read.csv("coverage_table_chr11_dpg.csv")
plot(coverage11dpg, type = "h", main = "ENSG00000250303 on Chromosome 11 HepG2",
xlab = "Position", ylab = "Expression", xlim = c(112141000, 112233500),
ylim = c(0, 300))
segments(112141472, 250, 112233257, 250)
ENSG00000227077 is not significant (p > .99)
exon.table %>% dplyr::select(gene.id, CXH_RPKM, CXI_RPKM, DPG_RPKM, DPK_RPKM, p_value) %>%
filter(gene.id == "ENSG00000227077")
## gene.id CXH_RPKM CXI_RPKM DPG_RPKM DPK_RPKM p_value
## 1 ENSG00000227077 36008.82 30967.89 5048.233 5716.676 0.9999164
plot(coverage17, type = "h", main = "ENSG00000227077 on Chromosome 17 GM12878",
xlab = "Position", ylab = "Expression", xlim = c(18476000, 18476450),
ylim = c(0, 3000))
segments(18476045, 2000, 18476415, 2000)
coverage17dpg <- read.csv("coverage_table_chr17_dpg.csv")
plot(coverage17dpg, type = "h", main = "ENSG00000227077 on Chromosome 17 HepG2",
xlab = "Position", ylab = "Expression", xlim = c(18476000, 18476450),
ylim = c(0, 3000))
segments(18476045, 2000, 18476415, 2000)