1 | |
---|
2 | # To work with latest R type R-3.0.3 in the terminal of R-3.1.2 |
---|
3 | # fast R for matrix multiplication/PCA: type /usr/bin/R |
---|
4 | |
---|
5 | |
---|
6 | ########################## get gene count data |
---|
7 | |
---|
8 | RNA=read.table('/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1.txt',sep='\t',header=TRUE,check.names=FALSE) # per row one gene |
---|
9 | save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1_R_object.RData') |
---|
10 | load('/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1_R_object.RData') |
---|
11 | |
---|
12 | ## get IDs |
---|
13 | source('/home/rjansen/R scripts/getView.R') # getView.R By Maarten van Iterson see https://git.lumc.nl/rp3/rscripts for last version |
---|
14 | freeze1 <- getView(view="rnaseqFreeze1", design="identifiers") # 2116 records |
---|
15 | |
---|
16 | ## take Freeze 1 and replace with BIOS ID |
---|
17 | |
---|
18 | RNA = RNA[,colnames(RNA) %in% freeze1$rnaseq_run_id] |
---|
19 | colnames(RNA)=freeze1$uuid[match(colnames(RNA),freeze1$rnaseq_run_id)] |
---|
20 | |
---|
21 | ## remove genes with standard deviation = 0 |
---|
22 | |
---|
23 | nz=which(sd(t(RNA))!=0);length(nz);nrow(RNA) |
---|
24 | RNA=RNA[nz,] |
---|
25 | |
---|
26 | ## TMM normalization per subject |
---|
27 | |
---|
28 | library(edgeR) |
---|
29 | D <- DGEList(counts=RNA) |
---|
30 | d <- calcNormFactors(D) |
---|
31 | scalar <- d$samples$lib.size*d$samples$norm.factors/exp(mean(log(d$samples$lib.size*d$samples$norm.factors))) |
---|
32 | scal.mat <- outer(rep(1,nrow(d$counts)), scalar) |
---|
33 | RNAs = d$counts/scal.mat |
---|
34 | |
---|
35 | save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/genecounts/gene_count_freeze1_R_object.RData') |
---|
36 | save(RNAs,file='/virdir/Backup/RP3_data/run1_count_files/genecounts/gene_count_freeze1_TMM_normalized_R_object.RData') |
---|
37 | |
---|
38 | ########################## get Exon count data |
---|
39 | |
---|
40 | load('/virdir/Backup/pthoen/run1_exon_count_analysis/combined_exon_count_run_1_R_object.R') |
---|
41 | |
---|
42 | ## get IDs |
---|
43 | source('/home/rjansen/R scripts/getView.R') |
---|
44 | freeze1 <- getView(view="rnaseqFreeze1", design="identifiers") # 2116 records |
---|
45 | |
---|
46 | ## take Freeze 1 and replace with BIOS ID |
---|
47 | |
---|
48 | idtemp=colnames(RNA) |
---|
49 | for (i in 1: length(idtemp)){ |
---|
50 | idtemp[i]=gsub("[.]","-", idtemp[i]) } |
---|
51 | |
---|
52 | RNA = RNA[,idtemp %in% freeze1$rnaseq_run_id] |
---|
53 | idtemp=idtemp[idtemp %in% freeze1$rnaseq_run_id] |
---|
54 | colnames(RNA)=freeze1$uuid[match(idtemp,freeze1$rnaseq_run_id)] |
---|
55 | |
---|
56 | # ## remove exons with standard deviation = 0 (none in exon counts) |
---|
57 | # |
---|
58 | # nz=which(sd(t(RNA))!=0);length(nz);nrow(RNA) |
---|
59 | # RNA=RNA[nz,] |
---|
60 | |
---|
61 | ## TMM normalization per subject |
---|
62 | |
---|
63 | library(edgeR) |
---|
64 | D <- DGEList(counts=RNA) |
---|
65 | d <- calcNormFactors(D) |
---|
66 | scalar <- d$samples$lib.size*d$samples$norm.factors/exp(mean(log(d$samples$lib.size*d$samples$norm.factors))) |
---|
67 | scal.mat <- outer(rep(1,nrow(d$counts)), scalar) |
---|
68 | RNAs = d$counts/scal.mat |
---|
69 | |
---|
70 | save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/exoncounts/exon_count_freeze1_R_object.RData') |
---|
71 | save(RNAs,file='/virdir/Backup/RP3_data/run1_count_files/exoncounts/exon_count_freeze1_TMM_normalized_R_object.RData') |
---|
72 | |
---|
73 | ########################## get transcript counts |
---|
74 | |
---|
75 | RNA=read.table('/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.txt',as.is=TRUE,sep='\t',check.names=FALSE,header=TRUE) |
---|
76 | save(RNA, file='/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.RData') |
---|
77 | load('/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.RData') |
---|
78 | |
---|
79 | ## get IDs |
---|
80 | source('/home/rjansen/R scripts/getView.R') |
---|
81 | freeze1 <- getView(view="rnaseqFreeze1", design="identifiers") # 2116 records |
---|
82 | |
---|
83 | ## take Freeze 1 and replace with BIOS ID |
---|
84 | |
---|
85 | rownames(RNA)=RNA[,1] |
---|
86 | RNA=RNA[,-1] |
---|
87 | RNA = RNA[,colnames(RNA) %in% freeze1$rnaseq_run_id] |
---|
88 | colnames(RNA)=freeze1$uuid[match(colnames(RNA),freeze1$rnaseq_run_id)] |
---|
89 | |
---|
90 | # ## remove transcripts with standard deviation = 0 (none) |
---|
91 | # |
---|
92 | #nz=which(sd(t(RNA))!=0);length(nz);nrow(RNA) |
---|
93 | # RNA=RNA[nz,] |
---|
94 | |
---|
95 | ## TMM normalization per subject |
---|
96 | |
---|
97 | library(edgeR) |
---|
98 | D <- DGEList(counts=RNA) |
---|
99 | d <- calcNormFactors(D) |
---|
100 | scalar <- d$samples$lib.size*d$samples$norm.factors/exp(mean(log(d$samples$lib.size*d$samples$norm.factors))) |
---|
101 | scal.mat <- outer(rep(1,nrow(d$counts)), scalar) |
---|
102 | RNAs = d$counts/scal.mat |
---|
103 | |
---|
104 | save(RNA,file='/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/transcript_count_freeze1_R_object.RData') |
---|
105 | save(RNAs,file='/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/transcript_count_freeze1_TMM_normalized_R_object.RData') |
---|
106 | |
---|
107 | ########################## get Phenotypes and technical covariates |
---|
108 | |
---|
109 | |
---|
110 | source('/home/rjansen/R scripts/getView.R') |
---|
111 | |
---|
112 | S=getView(view="rnaseq", design="samplesheets") # 4039 records |
---|
113 | save(S, file='/virdir/Backup/RP3_data/Phenotypes/rna_seq_sample_sheets.RData') |
---|
114 | write.table(S,file='/virdir/Backup/RP3_data/Phenotypes/rna_seq_sample_sheets.csv',col.names=TRUE,row.names=FALSE, quote=FALSE,sep='\t') |
---|
115 | |
---|
116 | P = getView(design="phenotypes", view="allPhenotypes") # 4564 |
---|
117 | save(P, file='/virdir/Backup/RP3_data/Phenotypes/BIOS_Phenotypes.RData') |
---|
118 | write.table(P,file='/virdir/Backup/RP3_data/Phenotypes/BIOS_Phenotypes.csv',col.names=TRUE,row.names=FALSE, quote=FALSE,sep='\t') |
---|
119 | |
---|
120 | F <- getView(view="getIds", design="identifiers") # 6037 records |
---|
121 | |
---|
122 | # get lane and index from run id |
---|
123 | rid=F[,5];lane=NULL;index=NULL |
---|
124 | for(i in 1: length(rid)){ |
---|
125 | lane[i]=strsplit(rid[i],'-')[[1]][2] |
---|
126 | index[i]=strsplit(rid[i],'-')[[1]][3]} |
---|
127 | |
---|
128 | F=as.data.frame(cbind(F,lane,index)) |
---|
129 | |
---|
130 | save(F, file='/virdir/Backup/RP3_data/Phenotypes/BIOS_IDs.RData') |
---|
131 | write.table(F,file='/virdir/Backup/RP3_data/Phenotypes/BIOS_IDs.csv',col.names=TRUE,row.names=FALSE, quote=FALSE,sep='\t') |
---|
132 | |
---|
133 | |
---|