Context Navigation

Back to BIOS_SampleBlacklist

BIOS_SampleBlacklist: gender_analysis.r

File gender_analysis.r, 3.6 KB (added by jamverlouw, 8 years ago)

Line
1	#Date: 12-Dec-2013
2	#gender check on all run1 samples
3	source("http://bioconductor.org/biocLite.R")
4	biocLite("edgeR")
5	biocLite("limma")
6	biocLite("biomaRt")
7	library(limma)
8	library(edgeR)
9	library(biomaRt)
10
11	data <- read.table("/virdir/Scratch/RP3_data/run1_count_files/combined_gene_count_run_1.txt",sep="\t",header=TRUE, row.names=1)
12	dim(data)
13	#[1] 61776 2330
14
15
16	listMarts()
17	#ENSEMBL GENES 73
18
19	#count number of zeros (genes with no expression) in all samples
20	zero.count <- array(0, dim=c(nrow(data), ncol(data)))
21	zero.count[data==0] <- TRUE
22	zero.counts.col <- apply(zero.count, 2, sum)
23	summary(zero.counts.col)
24	# Min. 1st Qu. Median Mean 3rd Qu. Max.
25	# 24910 30200 30930 31060 31730 61730
26
27	#retain only genes present in at least 50% of samples
28	threshold <- 0.5
29	sum.zero <- apply(zero.count, 1, sum)
30	sum(sum.zero <= threshold*ncol(data)) #[1] 29833
31	data.filt <- data[sum.zero <= threshold*ncol(data),]
32	colnames(data.filt) <- colnames(data)
33	rownames(data.filt) <- rownames(data[sum.zero <= threshold*ncol(data),])
34
35	#attach annotation data
36	mart <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="www.biomart.org")
37	ann <- getGene(id = rownames(data), type = "ensembl_gene_id", mart = mart)
38	which(!row.names(data)%in%as.character(ann[,1]))
39	ann2 <- getBM(attributes=c("ensembl_gene_id", "gene_biotype"),filters = c("ensembl_gene_id"), values = rownames(data),mart = mart)
40	ann.merge <- merge(ann, ann2, by.x=1, by.y=1, all.x=T)
41	which(!row.names(data) %in% as.character(ann.merge[,1]))
42
43	ann.merge <- ann.merge[duplicated(ann.merge$ensembl_gene_id)==F,]
44	matchid <- match( row.names(data), as.character(ann.merge[,1]))
45	annotation <- ann.merge[matchid,]
46	identical(as.character(annotation[,1]), row.names(data)) #FALSE but order is the same; just a few missing annotations
47	annotation.filt <- annotation[sum.zero <= threshold*ncol(data),]
48
49	D <- DGEList(as.matrix(data.filt),lib.size=colSums(data.filt), genes=rownames(data.filt) )
50	d <- calcNormFactors(D)
51
52	XIST <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "XIST"),]
53	EIF1AY <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "EIF1AY"),]
54	Y <- apply(cpm(d)[which(as.character(annotation.filt$chromosome_name) == "Y" & as.character(annotation.filt$gene_biotype) == "protein_coding"),],2,sum)
55
56	write.table(cbind(colnames(data.filt), XIST, EIF1AY, Y), file="normalized_expression_run1_XIST_Y_EIF1AY.txt", sep="\t", row.names=F, quote=F)
57
58
59	pdf("plot_XIST_EIF1AY_normalized_run1.pdf")
60	plot(as.numeric(XIST), as.numeric(EIF1AY), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="EIF1AY (cpm)")
61	dev.off()
62
63	pdf("plot_XIST_Y_normalized_run1.pdf")
64	plot(as.numeric(XIST), as.numeric(Y), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="Y (sum cpm)")
65	dev.off()
66
67	sessionInfo()
68
69	------------------------------------------------------------------------------
70	R version 2.15.1 (2012-06-22)
71	Platform: x86_64-unknown-linux-gnu (64-bit)
72
73	locale:
74	[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
75	[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
76	[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
77	[7] LC_PAPER=C LC_NAME=C
78	[9] LC_ADDRESS=C LC_TELEPHONE=C
79	[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
80
81	attached base packages:
82	[1] stats graphics grDevices utils datasets methods base
83
84	other attached packages:
85	[1] biomaRt_2.14.0 edgeR_3.0.8 limma_3.14.4
86	[4] BiocInstaller_1.8.3
87
88	loaded via a namespace (and not attached):
89	[1] RCurl_1.95-4.1 tools_2.15.1 XML_3.98-1.1
90
91	------------------------------------------------------------------------------

Download in other formats:

Original Format