RNA-TNBC

(一)GSE65212

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
setwd("path to\\GSE65212")

#读入样本
sample<-read.csv("sample.csv")
TNBC<-sample$Sample_geo_accession[1:55]
normal<-sample$Sample_geo_accession[56:66]
group_list<-c(rep('TNBC',55),rep('Normal',11))

#读入矩阵
GSE65212<-read.table('GSE65212_series_matrix.txt.gz',
sep='\t',quote = "",fill = T,
comment.char="!",header=T)
colname<-colnames(GSE65212)
colname<-gsub("\\.","",colname)
colname<-gsub("X","",colname)
colnames(GSE65212)<-colname
GSE65212_TNBC_normal<-GSE65212[,c(TNBC,normal)]
rownames(GSE65212_TNBC_normal)<-GSE65212$ID_REF
rownames(GSE65212_TNBC_normal)<-gsub('\\"',"",rownames(GSE65212_TNBC_normal))

##ID转换
GPL14877<-read.table('GPL14877_HGU133Plus2_Hs_ENTREZG_mapping.txt.gz',
sep='\t',quote = "",fill = T,
comment.char="!",header=T)
table(rownames(GSE65212_TNBC_normal)%in%GPL14877$Probe.Set.Name)#TRUE:18123
exprSet<-GSE65212_TNBC_normal
#step1 Probe.Set.Name转成Affy.Probe.Set.Name
exprSet<-GSE65212_TNBC_normal
table(rownames(GSE65212_TNBC_normal)%in%GPL14877$Probe.Set.Name)
dim(exprSet)
exprSet<-exprSet[rownames(exprSet)%in%GPL14877$Probe.Set.Name,]
dim(exprSet)
GPL14877<-GPL14877[match(rownames(exprSet),GPL14877$Probe.Set.Name),]
head(GPL14877)
exprSet[1:5,1:5]
tmp<-by(exprSet,GPL14877$Affy.Probe.Set.Name,function(x)rownames(x)[which.max(rowMeans(x))])
probes<-as.character(tmp)
dim(exprSet)
exprSet<-exprSet[rownames(exprSet)%in%probes,]
dim(exprSet)
rownames(exprSet)<-GPL14877[match(rownames(exprSet),GPL14877$Probe.Set.Name),7]
exprSet[1:5,1:5]
#step2 Affy.Probe.Set.Name转成GPL750的ID
GPL570<-read.table("GPL570-55999.txt",sep='\t',quote = "",fill = T,
comment.char="#",header=T)
GPL570<-GPL570[,c(1,11)]
table(GPL14877$Affy.Probe.Set.Name%in%GPL570$ID)#TRUE:18123

table(rownames(exprSet)%in%GPL570$ID)
dim(exprSet)
head(GPL570)
GPL570<-GPL570[match(rownames(exprSet),GPL570$ID),]
head(GPL570)
exprSet[1:5,1:5]
tmp<-by(exprSet,GPL570$Gene.Symbol,function(x)rownames(x)[which.max(rowMeans(x))])
probes<-as.character(tmp)
dim(exprSet)
exprSet<-exprSet[rownames(exprSet)%in%probes,]
dim(exprSet)
rownames(exprSet)<-GPL570[match(rownames(exprSet),GPL570$ID),2]
exprSet[1:5,1:5]

#limma差异分析
suppressMessages(library(limma))
design<-model.matrix(~0+factor(group_list))
colnames(design)<-levels(factor(group_list))
rownames(design)<-colnames(exprSet)
design
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse="-"),levels = design)
contrast.matrix
fit<-lmFit(exprSet,design)
fit2<-contrasts.fit(fit,contrast.matrix)
fit2<-eBayes(fit2)
tempOutput<-topTable(fit2,coef = 1,n=Inf)
nrDEG=na.omit(tempOutput)
head(nrDEG)
test<-nrDEG
test$id<-rownames(nrDEG)

topT <- as.data.frame(nrDEG)
logFC_Cutoff<-with(topT,mean(abs(logFC))+2*sd(abs(logFC)))##2.5

#拿到显著差异的基因
select_gene=(abs(nrDEG$logFC)>=logFC_Cutoff)&(nrDEG$adj.P.Val<0.05)
select_gene.sig=nrDEG[select_gene,]
write.csv(select_gene.sig,'GSE65212_TNBC_Normal_select_gene.sig.csv')

(二)GSE76250

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
setwd("path to\\GSE76250")

#读入样本
sample<-read.csv("sample.csv")
TNBC<-sample$锘縏NBC
TNBC<-gsub(" ","",TNBC)
normal<-sample$paired_normal
normal<-gsub(" ","",normal)
group_list<-c(rep('TNBC',33),rep('Normal',33))

#读入矩阵
GSE76250<-read.table('GSE76250_series_matrix.txt.gz',
sep='\t',quote = "",fill = T,
comment.char="!",header=T)
colname<-colnames(GSE76250)
colname<-gsub("\\.","",colname)
colname<-gsub("X","",colname)
colnames(GSE76250)<-colname
rownames(GSE76250)<-GSE76250$ID_REF
rownames(GSE76250)<-gsub('\\"',"",rownames(GSE76250))
GSE76250<-GSE76250[,-1]
GSE76250_TNBC_Normal<-GSE76250[,c(TNBC,normal)]

##加载R包
#其实这里可以直接下载GPL文件的,多走了一步,就当了解了一个新东西
library(GEOquery)
gse <- getGEO(filename = "GSE76250_family.soft.gz",destdir = ".")
str(gse)
length(gse)

id_probe <- gse@gpls$GPL17586@dataTable@table
dim(id_probe)
id_probe[1:4,1:15]
probe2gene <- id_probe[,c(2,8)]

#提取基因名称
library(stringr)
probe2gene$symbol=trimws(str_split(probe2gene$gene_assignment,'//',simplify = T)[,2])
plot(table(table(probe2gene$symbol)),xlim=c(1,50))
dim(probe2gene)
View(head(probe2gene))
ids2 <- probe2gene[,c(1,3)]
View(head(ids2))
ids2[1:20,1:2]#含有缺失值
table(table(unique(ids2$symbol)))#30907 ,30906个基因,一个空字符
save(ids2,probe2gene,file='gse-probe2gene.Rdata')

#过滤表达矩阵
exprSet<-GSE76250_TNBC_Normal
exprSet <- exprSet[rownames(exprSet) %in% ids2$probeset_id,]
dim(exprSet)
exprSet[1:5,1:5]

#ids过滤探针
ids <- ids2[match(rownames(exprSet),ids2$probeset_id),]
dim(ids)
ids[1:5,1:2]
tmp<-by(exprSet,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))])
probes<-as.character(tmp)
dim(exprSet)
exprSet<-exprSet[rownames(exprSet)%in%probes,]
dim(exprSet)
rownames(exprSet)<-ids[match(rownames(exprSet),ids$probeset_id),2]
exprSet[1:5,1:5]

#limma
suppressMessages(library(limma))
design<-model.matrix(~0+factor(group_list))
colnames(design)<-levels(factor(group_list))
rownames(design)<-colnames(exprSet)
design
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse="-"),levels = design)
contrast.matrix
fit<-lmFit(exprSet,design)
fit2<-contrasts.fit(fit,contrast.matrix)
fit2<-eBayes(fit2)
tempOutput<-topTable(fit2,coef = 1,n=Inf)
nrDEG=na.omit(tempOutput)
head(nrDEG)
test<-nrDEG
test$id<-rownames(nrDEG)

topT <- as.data.frame(nrDEG)
logFC_Cutoff<-with(topT,mean(abs(logFC))+2*sd(abs(logFC)))##0.584

#拿到显著差异的基因
select_gene=(abs(nrDEG$logFC)>=logFC_Cutoff)&(nrDEG$adj.P.Val<0.05)
select_gene.sig=nrDEG[select_gene,]
write.csv(select_gene.sig,'GSE76250_TNBC_Normal_select_gene.sig.csv')

(三)获取两个数据集里面同时上调的差异基因

1
2
3
4
5
6
7
8
9
10
setwd("path to\\GSE65212")
GSE65212_gene<-read.csv("GSE65212_TNBC_Normal_select_gene.sig.csv")
GSE65212_gene<-GSE65212_gene[GSE65212_gene$logFC>0,]

setwd("path to\\GSE76250")
GSE76250_gene<-read.csv("GSE76250_TNBC_Normal_select_gene.sig.csv")
GSE76250_gene<-GSE76250_gene[GSE76250_gene$logFC>0,]

overlap_gene<-intersect(GSE65212_gene$X,GSE76250_gene$X)
write.csv(over_lapgene,"TNBC_upregulation_gene.csv")
Author

Yuqi Gong

Posted on

2021-07-04

Updated on

2022-11-26

Licensed under