1 | #Date: 12-Dec-2013
|
---|
2 | #gender check on all run1 samples
|
---|
3 | source("http://bioconductor.org/biocLite.R")
|
---|
4 | biocLite("edgeR")
|
---|
5 | biocLite("limma")
|
---|
6 | biocLite("biomaRt")
|
---|
7 | library(limma)
|
---|
8 | library(edgeR)
|
---|
9 | library(biomaRt)
|
---|
10 |
|
---|
11 | data <- read.table("/virdir/Scratch/RP3_data/run1_count_files/combined_gene_count_run_1.txt",sep="\t",header=TRUE, row.names=1)
|
---|
12 | dim(data)
|
---|
13 | #[1] 61776 2330
|
---|
14 |
|
---|
15 |
|
---|
16 | listMarts()
|
---|
17 | #ENSEMBL GENES 73
|
---|
18 |
|
---|
19 | #count number of zeros (genes with no expression) in all samples
|
---|
20 | zero.count <- array(0, dim=c(nrow(data), ncol(data)))
|
---|
21 | zero.count[data==0] <- TRUE
|
---|
22 | zero.counts.col <- apply(zero.count, 2, sum)
|
---|
23 | summary(zero.counts.col)
|
---|
24 | # Min. 1st Qu. Median Mean 3rd Qu. Max.
|
---|
25 | # 24910 30200 30930 31060 31730 61730
|
---|
26 |
|
---|
27 | #retain only genes present in at least 50% of samples
|
---|
28 | threshold <- 0.5
|
---|
29 | sum.zero <- apply(zero.count, 1, sum)
|
---|
30 | sum(sum.zero <= threshold*ncol(data)) #[1] 29833
|
---|
31 | data.filt <- data[sum.zero <= threshold*ncol(data),]
|
---|
32 | colnames(data.filt) <- colnames(data)
|
---|
33 | rownames(data.filt) <- rownames(data[sum.zero <= threshold*ncol(data),])
|
---|
34 |
|
---|
35 | #attach annotation data
|
---|
36 | mart <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="www.biomart.org")
|
---|
37 | ann <- getGene(id = rownames(data), type = "ensembl_gene_id", mart = mart)
|
---|
38 | which(!row.names(data)%in%as.character(ann[,1]))
|
---|
39 | ann2 <- getBM(attributes=c("ensembl_gene_id", "gene_biotype"),filters = c("ensembl_gene_id"), values = rownames(data),mart = mart)
|
---|
40 | ann.merge <- merge(ann, ann2, by.x=1, by.y=1, all.x=T)
|
---|
41 | which(!row.names(data) %in% as.character(ann.merge[,1]))
|
---|
42 |
|
---|
43 | ann.merge <- ann.merge[duplicated(ann.merge$ensembl_gene_id)==F,]
|
---|
44 | matchid <- match( row.names(data), as.character(ann.merge[,1]))
|
---|
45 | annotation <- ann.merge[matchid,]
|
---|
46 | identical(as.character(annotation[,1]), row.names(data)) #FALSE but order is the same; just a few missing annotations
|
---|
47 | annotation.filt <- annotation[sum.zero <= threshold*ncol(data),]
|
---|
48 |
|
---|
49 | D <- DGEList(as.matrix(data.filt),lib.size=colSums(data.filt), genes=rownames(data.filt) )
|
---|
50 | d <- calcNormFactors(D)
|
---|
51 |
|
---|
52 | XIST <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "XIST"),]
|
---|
53 | EIF1AY <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "EIF1AY"),]
|
---|
54 | Y <- apply(cpm(d)[which(as.character(annotation.filt$chromosome_name) == "Y" & as.character(annotation.filt$gene_biotype) == "protein_coding"),],2,sum)
|
---|
55 |
|
---|
56 | write.table(cbind(colnames(data.filt), XIST, EIF1AY, Y), file="normalized_expression_run1_XIST_Y_EIF1AY.txt", sep="\t", row.names=F, quote=F)
|
---|
57 |
|
---|
58 |
|
---|
59 | pdf("plot_XIST_EIF1AY_normalized_run1.pdf")
|
---|
60 | plot(as.numeric(XIST), as.numeric(EIF1AY), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="EIF1AY (cpm)")
|
---|
61 | dev.off()
|
---|
62 |
|
---|
63 | pdf("plot_XIST_Y_normalized_run1.pdf")
|
---|
64 | plot(as.numeric(XIST), as.numeric(Y), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="Y (sum cpm)")
|
---|
65 | dev.off()
|
---|
66 |
|
---|
67 | sessionInfo()
|
---|
68 |
|
---|
69 | ------------------------------------------------------------------------------
|
---|
70 | R version 2.15.1 (2012-06-22)
|
---|
71 | Platform: x86_64-unknown-linux-gnu (64-bit)
|
---|
72 |
|
---|
73 | locale:
|
---|
74 | [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
|
---|
75 | [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
|
---|
76 | [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
|
---|
77 | [7] LC_PAPER=C LC_NAME=C
|
---|
78 | [9] LC_ADDRESS=C LC_TELEPHONE=C
|
---|
79 | [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
|
---|
80 |
|
---|
81 | attached base packages:
|
---|
82 | [1] stats graphics grDevices utils datasets methods base
|
---|
83 |
|
---|
84 | other attached packages:
|
---|
85 | [1] biomaRt_2.14.0 edgeR_3.0.8 limma_3.14.4
|
---|
86 | [4] BiocInstaller_1.8.3
|
---|
87 |
|
---|
88 | loaded via a namespace (and not attached):
|
---|
89 | [1] RCurl_1.95-4.1 tools_2.15.1 XML_3.98-1.1
|
---|
90 |
|
---|
91 | ------------------------------------------------------------------------------ |
---|