BIOS_PreparedData: get_RNA_seq_and_covariates.R

File get_RNA_seq_and_covariates.R, 5.3 KB (added by Rick, 9 months ago)
Line 
1
2# To work with latest R type R-3.0.3 in the terminal of R-3.1.2
3# fast R for matrix multiplication/PCA: type  /usr/bin/R
4
5
6##########################                    get gene count data
7
8RNA=read.table('/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1.txt',sep='\t',header=TRUE,check.names=FALSE) # per row one gene
9save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1_R_object.RData')
10load('/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1_R_object.RData')
11
12##     get IDs
13source('/home/rjansen/R  scripts/getView.R') # getView.R By Maarten van Iterson see https://git.lumc.nl/rp3/rscripts  for last version
14freeze1 <- getView(view="rnaseqFreeze1", design="identifiers") # 2116 records
15
16##  take Freeze 1 and replace with BIOS ID
17
18RNA = RNA[,colnames(RNA) %in% freeze1$rnaseq_run_id]
19colnames(RNA)=freeze1$uuid[match(colnames(RNA),freeze1$rnaseq_run_id)]
20
21##  remove genes with standard deviation = 0
22
23nz=which(sd(t(RNA))!=0);length(nz);nrow(RNA)
24RNA=RNA[nz,]
25
26##   TMM normalization per subject
27
28library(edgeR)
29D <- DGEList(counts=RNA)
30d <- calcNormFactors(D)
31scalar <- d$samples$lib.size*d$samples$norm.factors/exp(mean(log(d$samples$lib.size*d$samples$norm.factors)))
32scal.mat <- outer(rep(1,nrow(d$counts)), scalar)
33RNAs =  d$counts/scal.mat
34
35save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/genecounts/gene_count_freeze1_R_object.RData')
36save(RNAs,file='/virdir/Backup/RP3_data/run1_count_files/genecounts/gene_count_freeze1_TMM_normalized_R_object.RData')
37
38##########################                  get Exon count data
39
40load('/virdir/Backup/pthoen/run1_exon_count_analysis/combined_exon_count_run_1_R_object.R')
41
42##     get IDs
43source('/home/rjansen/R  scripts/getView.R')
44freeze1 <- getView(view="rnaseqFreeze1", design="identifiers") # 2116 records
45
46##  take Freeze 1 and replace with BIOS ID
47
48idtemp=colnames(RNA)
49for (i in 1: length(idtemp)){
50  idtemp[i]=gsub("[.]","-", idtemp[i]) }
51
52RNA = RNA[,idtemp %in% freeze1$rnaseq_run_id]
53idtemp=idtemp[idtemp %in% freeze1$rnaseq_run_id]
54colnames(RNA)=freeze1$uuid[match(idtemp,freeze1$rnaseq_run_id)]
55
56# ##  remove exons with standard deviation = 0 (none in exon counts)
57#
58# nz=which(sd(t(RNA))!=0);length(nz);nrow(RNA)
59# RNA=RNA[nz,]
60
61##   TMM normalization per subject
62
63library(edgeR)
64D <- DGEList(counts=RNA)
65d <- calcNormFactors(D)
66scalar <- d$samples$lib.size*d$samples$norm.factors/exp(mean(log(d$samples$lib.size*d$samples$norm.factors)))
67scal.mat <- outer(rep(1,nrow(d$counts)), scalar)
68RNAs =  d$counts/scal.mat
69
70save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/exoncounts/exon_count_freeze1_R_object.RData')
71save(RNAs,file='/virdir/Backup/RP3_data/run1_count_files/exoncounts/exon_count_freeze1_TMM_normalized_R_object.RData')
72
73##########################           get transcript counts
74
75RNA=read.table('/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.txt',as.is=TRUE,sep='\t',check.names=FALSE,header=TRUE)
76save(RNA, file='/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.RData')
77load('/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.RData')
78
79##     get IDs
80source('/home/rjansen/R  scripts/getView.R')
81freeze1 <- getView(view="rnaseqFreeze1", design="identifiers") # 2116 records
82
83##  take Freeze 1 and replace with BIOS ID
84
85rownames(RNA)=RNA[,1]
86RNA=RNA[,-1]
87RNA = RNA[,colnames(RNA) %in% freeze1$rnaseq_run_id]
88colnames(RNA)=freeze1$uuid[match(colnames(RNA),freeze1$rnaseq_run_id)]
89
90# ##  remove transcripts with standard deviation = 0 (none)
91#
92 #nz=which(sd(t(RNA))!=0);length(nz);nrow(RNA)
93# RNA=RNA[nz,]
94
95##   TMM normalization per subject
96
97library(edgeR)
98D <- DGEList(counts=RNA)
99d <- calcNormFactors(D)
100scalar <- d$samples$lib.size*d$samples$norm.factors/exp(mean(log(d$samples$lib.size*d$samples$norm.factors)))
101scal.mat <- outer(rep(1,nrow(d$counts)), scalar)
102RNAs =  d$counts/scal.mat
103
104save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/transcript_count_freeze1_R_object.RData')
105save(RNAs,file='/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/transcript_count_freeze1_TMM_normalized_R_object.RData')
106
107##########################   get Phenotypes and technical covariates
108
109 
110source('/home/rjansen/R  scripts/getView.R')
111
112S=getView(view="rnaseq", design="samplesheets") # 4039 records
113save(S, file='/virdir/Backup/RP3_data/Phenotypes/rna_seq_sample_sheets.RData')
114write.table(S,file='/virdir/Backup/RP3_data/Phenotypes/rna_seq_sample_sheets.csv',col.names=TRUE,row.names=FALSE, quote=FALSE,sep='\t')
115
116P = getView(design="phenotypes", view="allPhenotypes") # 4564
117save(P, file='/virdir/Backup/RP3_data/Phenotypes/BIOS_Phenotypes.RData')
118write.table(P,file='/virdir/Backup/RP3_data/Phenotypes/BIOS_Phenotypes.csv',col.names=TRUE,row.names=FALSE, quote=FALSE,sep='\t')
119
120F <- getView(view="getIds", design="identifiers") # 6037 records
121
122# get lane and index from run id
123rid=F[,5];lane=NULL;index=NULL
124for(i in 1: length(rid)){
125  lane[i]=strsplit(rid[i],'-')[[1]][2]
126  index[i]=strsplit(rid[i],'-')[[1]][3]}
127
128F=as.data.frame(cbind(F,lane,index))
129
130save(F, file='/virdir/Backup/RP3_data/Phenotypes/BIOS_IDs.RData')
131write.table(F,file='/virdir/Backup/RP3_data/Phenotypes/BIOS_IDs.csv',col.names=TRUE,row.names=FALSE, quote=FALSE,sep='\t')
132 
133