#Date: 12-Dec-2013 #gender check on all run1 samples source("http://bioconductor.org/biocLite.R") biocLite("edgeR") biocLite("limma") biocLite("biomaRt") library(limma) library(edgeR) library(biomaRt) data <- read.table("/virdir/Scratch/RP3_data/run1_count_files/combined_gene_count_run_1.txt",sep="\t",header=TRUE, row.names=1) dim(data) #[1] 61776 2330 listMarts() #ENSEMBL GENES 73 #count number of zeros (genes with no expression) in all samples zero.count <- array(0, dim=c(nrow(data), ncol(data))) zero.count[data==0] <- TRUE zero.counts.col <- apply(zero.count, 2, sum) summary(zero.counts.col) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 24910 30200 30930 31060 31730 61730 #retain only genes present in at least 50% of samples threshold <- 0.5 sum.zero <- apply(zero.count, 1, sum) sum(sum.zero <= threshold*ncol(data)) #[1] 29833 data.filt <- data[sum.zero <= threshold*ncol(data),] colnames(data.filt) <- colnames(data) rownames(data.filt) <- rownames(data[sum.zero <= threshold*ncol(data),]) #attach annotation data mart <- useMart("ensembl", dataset="hsapiens_gene_ensembl", host="www.biomart.org") ann <- getGene(id = rownames(data), type = "ensembl_gene_id", mart = mart) which(!row.names(data)%in%as.character(ann[,1])) ann2 <- getBM(attributes=c("ensembl_gene_id", "gene_biotype"),filters = c("ensembl_gene_id"), values = rownames(data),mart = mart) ann.merge <- merge(ann, ann2, by.x=1, by.y=1, all.x=T) which(!row.names(data) %in% as.character(ann.merge[,1])) ann.merge <- ann.merge[duplicated(ann.merge$ensembl_gene_id)==F,] matchid <- match( row.names(data), as.character(ann.merge[,1])) annotation <- ann.merge[matchid,] identical(as.character(annotation[,1]), row.names(data)) #FALSE but order is the same; just a few missing annotations annotation.filt <- annotation[sum.zero <= threshold*ncol(data),] D <- DGEList(as.matrix(data.filt),lib.size=colSums(data.filt), genes=rownames(data.filt) ) d <- calcNormFactors(D) XIST <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "XIST"),] EIF1AY <- cpm(d)[which(as.character(annotation.filt$hgnc_symbol) == "EIF1AY"),] Y <- apply(cpm(d)[which(as.character(annotation.filt$chromosome_name) == "Y" & as.character(annotation.filt$gene_biotype) == "protein_coding"),],2,sum) write.table(cbind(colnames(data.filt), XIST, EIF1AY, Y), file="normalized_expression_run1_XIST_Y_EIF1AY.txt", sep="\t", row.names=F, quote=F) pdf("plot_XIST_EIF1AY_normalized_run1.pdf") plot(as.numeric(XIST), as.numeric(EIF1AY), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="EIF1AY (cpm)") dev.off() pdf("plot_XIST_Y_normalized_run1.pdf") plot(as.numeric(XIST), as.numeric(Y), pch=19,cex=0.8, xlab="XIST (cpm)", ylab="Y (sum cpm)") dev.off() sessionInfo() ------------------------------------------------------------------------------ R version 2.15.1 (2012-06-22) Platform: x86_64-unknown-linux-gnu (64-bit) locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 [7] LC_PAPER=C LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] biomaRt_2.14.0 edgeR_3.0.8 limma_3.14.4 [4] BiocInstaller_1.8.3 loaded via a namespace (and not attached): [1] RCurl_1.95-4.1 tools_2.15.1 XML_3.98-1.1 ------------------------------------------------------------------------------