scRNA-TNBC

8个TNBC患者的单细胞样本合并+分析

(一)数据合并

step 1、组织文件格式

1
2
#下载好数据,不同病人样本的matrix.mtx.gz、features.tsv.gz、barcodes.tsv.gz文件放在各自的文件夹下面。
cd /home/gongyuqi/project/scRNA-seq/TNBC

看一下文件结构

1
2
3
4
5
#记得解压gz文件,不然Read10X()会报错
find ./ -name "*.gz"|xargs gunzip
#Read10X()中,文件的命名一定要是"barcodes.tsv" "genes.tsv""matrix.mtx"
#所以我们将features.tsv改成genes.tsv
find ./ -name "features*" | while read id; do mv $id ${id%/*}/genes.tsv; done

再次看一下现在的文件结构

step 2、合并数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
setwd("/home/gongyuqi/project/scRNA-seq/TNBC")

#Read10函数要对目录进行操作,先提取目录名
folders<-list.files("./")
folders

library(Seurat)
scelist<-lapply(folders, function(folder){
CreateSeuratObject(counts = Read10X(folder),
project = folder)
})
#scelist只是8个10X对象的集合
scelist

#现在开始真正的合并
TNBC_scRNA<-merge(scelist[[1]],
y=c(scelist[[2]],scelist[[3]],scelist[[4]],scelist[[5]],
scelist[[6]],scelist[[7]],scelist[[8]]),
add.cell.ids=folders,
project="TNBC")
table(TNBC_scRNA$orig.ident)
GetAssayData(TNBC_scRNA)[1:20,1:25]#感受一下矩阵的内容,是个稀疏矩阵
save(TNBC_scRNA,file = "TNBC_scRNA.Rdata")

Suerat下游分析

step 1、过滤

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#去除线粒体基因
grep(pattern = "^MT-",rownames(TNBC_scRNA),value = T)
TNBC_scRNA[["percent.mt"]] <- PercentageFeatureSet(TNBC_scRNA, pattern = "^MT-")
head(TNBC_scRNA@meta.data,5)
summary(TNBC_scRNA@meta.data$nCount_RNA)

#可视化单细胞数据质量
VlnPlot(TNBC_scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
data<-TNBC_scRNA@meta.data
library(ggplot2)
p<-ggplot(data = data,aes(x=nFeature_RNA))+geom_density()
p
#根据上一步质控的结果进行参数的设定
#同时结合原文的参数设置,我们这里只保留RNA含量再500到6000之间,且线粒体含量小于20%的细胞
TNBC_scRNA <- subset(TNBC_scRNA, subset = nFeature_RNA > 500 & nFeature_RNA < 6000 & percent.mt < 20)
TNBC_scRNA
VlnPlot(TNBC_scRNA, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

step 2、标准化&&降维

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#标准化
TNBC_scRNA[["RNA"]]@data[1:20,1:10]
TNBC_scRNA <- NormalizeData(TNBC_scRNA, normalization.method = "LogNormalize", scale.factor = 10000)
#标准化后的值保存在TNBC_scRNA[["RNA"]]@data
normalized.data<-TNBC_scRNA[["RNA"]]@data
normalized.data[1:20,1:10]

#鉴定高变基因
TNBC_scRNA <- FindVariableFeatures(TNBC_scRNA, selection.method = "vst", nfeatures = 2000)

#全部基因进行归一化
all.genes <- rownames(TNBC_scRNA)
TNBC_scRNA <- ScaleData(TNBC_scRNA, features = all.genes)

#PCA降维
TNBC_scRNA <- RunPCA(TNBC_scRNA, features = VariableFeatures(object = TNBC_scRNA))
# Examine and visualize PCA results a few different ways
print(TNBC_scRNA[["pca"]], dims = 1:5, nfeatures = 5)
#visualization
VizDimLoadings(TNBC_scRNA, dims = 1:2, reduction = "pca")
DimPlot(TNBC_scRNA, reduction = "pca")
DimHeatmap(TNBC_scRNA, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(TNBC_scRNA, dims = 1:20, cells = 500, balanced = TRUE)

#PCA个数的确定
TNBC_scRNA <- JackStraw(TNBC_scRNA, num.replicate = 100)
TNBC_scRNA <- ScoreJackStraw(TNBC_scRNA, dims = 1:20)
ScoreJackStraw(TNBC_scRNA, dims = 1:20)
ElbowPlot(TNBC_scRNA)#可视化不同PCA个数对分群的影响

step 3、细胞聚类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#细胞聚类
#resolution = 0.1, cluster=11(结合UMAP和tSNE,11个亚群是不能再合并了,但还可以再细分)
#resolution = 0.2, cluster=13(同resolution = 0.25的结果几乎一致)
#####################################################################
#结合bulk RNA-seq的差异基因在不同亚群的分布,13个亚群的单细胞数据比较合适
#resolution = 0.25, cluster=13
#####################################################################
#resolution = 0.35, cluster=16(同resolution = 0.4的结果几乎一致)
#resolution = 0.4, cluster=16
#resolution = 0.45, cluster=18
#resolution = 0.5, cluster=20
#resolution = 0.6, cluster=21(和resolution = 0.5时差异不大,tSNE结果显示有过度分群的趋势)
TNBC_scRNA <- FindNeighbors(TNBC_scRNA, dims = 1:10)
TNBC_scRNA <- FindClusters(TNBC_scRNA, resolution = 0.25)

#UMAP可视化细胞亚群分布
TNBC_scRNA <- RunUMAP(TNBC_scRNA, dims = 1:10)
DimPlot(TNBC_scRNA, reduction = "umap",label = T,label.size = 5)
#tSNE可视化细胞亚群分布
set.seed(1918)
TNBC_scRNA <- RunTSNE(TNBC_scRNA, dims = 1:10)
DimPlot(TNBC_scRNA, reduction = "tsne",label = T,label.size = 5)

step 4、探寻亚群特异性的差异基因

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
siggene<-read.csv("TNBC_upregulation_gene.csv")#RNA-TNBC分析得到的文件
siggene<-siggene$x#一共177个基因

#写了循环可视化177个在bulk RNA-seq数据集中共同上调的差异基因,挑选亚群特异性差异基因
siggene<-read.csv("TNBC_upregulation_gene.csv")
siggene<-siggene$x
plot_list = list()
for (i in 1:6) {
front=i*30-29
back=i*30
plot_list[[i]] = DotPlot(TNBC_scRNA, features = siggene[front:back]) + coord_flip()
print(plot_list[[i]])
}
############################################################################################
#上面的代码提示没有在scRNA-seq数据中找到KIAA0101,EPT1,HN1基因,于是找到这三个基因的所有alias
#但是,scRNA-seq数据中仍然没有这三个基因~~~所以就不要这三个基因咯
KIAA0101<-c("L5", "NS5ATP9", "OEATC", "OEATC-1", "OEATC1", "PAF", "PAF15", "p15(PAF)",
"p15/PAF", "p15PAF")
EPT1<-c("SELI", "SEPI")
HN1<-c("ARM2", "HN1A")
DotPlot(TNBC_scRNA, features = HN1) + coord_flip()
#############################################################################################

#第一步初筛选定了下列基因
#69 candidate genes
select_siggene<-read.csv("first_filter.csv")
select_siggene<-select_siggene$candidate
#有了初筛后的基因,我们再次写循环,这次在tSNE中看看这些基因的分布情况,确定最终的亚群特异性差异基因
plot_list = list()
for (i in 1:12) {
front=i*6-5
back=i*6
plot_list[[i]]=FeaturePlot(TNBC_scRNA, features = select_siggene[front:back],
reduction="tsne",
cols = c("grey", "red"))
print(plot_list[[i]])
}

#可视化上述基因后,进一步选定下列基因为最终的亚群特异性基因
last_select_gene<-c("CEP55","PRC1","PBK","CXCL10","SULF1","BCL2A1",
"PLA2G7","CXCL9","LYZ","COL11A1","MMP9","TDO2",
"TNFSF13B","ADAMDEC1","MMP12","SLAMF8",
"MMP1","COL10A1","NCF2")

plot_list = list()
for (i in 1:4) {
front=i*6-5
back=i*6
plot_list[[i]]=FeaturePlot(TNBC_scRNA, features = last_select_gene[front:back],
reduction="tsne",
cols = c("grey", "red"))
print(plot_list[[i]])
}

p <- DotPlot(TNBC_scRNA, features = last_select_gene) + coord_flip()
p

这里简单看一下不同亚群都是些什么细胞

下一步计划

1、细胞亚群的注释(下周完成)
2、亚群特异性基因的预后意义(下周完成)
3、如果有多个预后意义显著的基因,准备进行预后模型的构建

疑惑

1、上述亚群特异性基因主要集中在免疫细胞群里面?要怎么去解释这个现象呢
2、我们拿到这样的亚群特异性基因,如何更好的给它上价值呢?这一点我比较困惑。

Author

Yuqi Gong

Posted on

2021-06-29

Updated on

2022-11-26

Licensed under