BIOS_SampleBlacklist: gender_analysis.r

File gender_analysis.r, 3.6 KB (added by jamverlouw, 8 years ago)
Line 
1#Date: 12-Dec-2013
2#gender check on all run1 samples
3source("http://bioconductor.org/biocLite.R")
4biocLite("edgeR")
5biocLite("limma")
6biocLite("biomaRt")
7library(limma)
8library(edgeR)
9library(biomaRt)
10
11data <- read.table("/virdir/Scratch/RP3_data/run1_count_files/combined_gene_count_run_1.txt",sep="\t",header=TRUE, row.names=1)
12dim(data)
13#[1] 61776  2330
14
15
16listMarts()
17#ENSEMBL GENES 73
18
19#count number of zeros (genes with no expression) in all samples
20zero.count <- array(0, dim=c(nrow(data), ncol(data)))
21zero.count[data==0] <- TRUE
22zero.counts.col <- apply(zero.count, 2, sum)
23summary(zero.counts.col)
24#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
25#  24910   30200   30930   31060   31730   61730
26
27#retain only genes present in at least 50% of samples
28threshold <- 0.5
29sum.zero <- apply(zero.count, 1, sum)
30sum(sum.zero <= threshold*ncol(data)) #[1] 29833
31data.filt <- data[sum.zero <= threshold*ncol(data),]
32colnames(data.filt) <- colnames(data)
33rownames(data.filt) <- rownames(data[sum.zero <= threshold*ncol(data),])
34
35#attach annotation data
36mart <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="www.biomart.org")
37ann <- getGene(id = rownames(data), type = "ensembl_gene_id", mart = mart)
38which(!row.names(data)%in%as.character(ann[,1]))
39ann2 <- getBM(attributes=c("ensembl_gene_id", "gene_biotype"),filters = c("ensembl_gene_id"), values = rownames(data),mart = mart)
40ann.merge <- merge(ann, ann2, by.x=1, by.y=1, all.x=T)
41which(!row.names(data) %in% as.character(ann.merge[,1]))
42
43ann.merge <- ann.merge[duplicated(ann.merge$ensembl_gene_id)==F,]
44matchid <- match( row.names(data), as.character(ann.merge[,1]))
45annotation <- ann.merge[matchid,]
46identical(as.character(annotation[,1]), row.names(data)) #FALSE but order is the same; just a few missing annotations
47annotation.filt <- annotation[sum.zero <= threshold*ncol(data),]
48
49D <- DGEList(as.matrix(data.filt),lib.size=colSums(data.filt), genes=rownames(data.filt) )
50d <- calcNormFactors(D)
51
52XIST <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "XIST"),]
53EIF1AY <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "EIF1AY"),]
54Y <- apply(cpm(d)[which(as.character(annotation.filt$chromosome_name) == "Y" & as.character(annotation.filt$gene_biotype) == "protein_coding"),],2,sum)
55
56write.table(cbind(colnames(data.filt), XIST, EIF1AY, Y), file="normalized_expression_run1_XIST_Y_EIF1AY.txt", sep="\t", row.names=F, quote=F)
57
58
59pdf("plot_XIST_EIF1AY_normalized_run1.pdf")
60plot(as.numeric(XIST), as.numeric(EIF1AY), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="EIF1AY (cpm)")
61dev.off()
62
63pdf("plot_XIST_Y_normalized_run1.pdf")
64plot(as.numeric(XIST), as.numeric(Y), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="Y (sum cpm)")
65dev.off()
66
67sessionInfo()
68
69------------------------------------------------------------------------------
70R version 2.15.1 (2012-06-22)
71Platform: x86_64-unknown-linux-gnu (64-bit)
72
73locale:
74 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C             
75 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8   
76 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
77 [7] LC_PAPER=C                 LC_NAME=C                 
78 [9] LC_ADDRESS=C               LC_TELEPHONE=C           
79[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
80
81attached base packages:
82[1] stats     graphics  grDevices utils     datasets  methods   base     
83
84other attached packages:
85[1] biomaRt_2.14.0      edgeR_3.0.8         limma_3.14.4       
86[4] BiocInstaller_1.8.3
87
88loaded via a namespace (and not attached):
89[1] RCurl_1.95-4.1 tools_2.15.1   XML_3.98-1.1
90
91------------------------------------------------------------------------------