…
SUPPA2实战(DLBCL)
因为最近ascp无法使用了,只好使用preftch进行数据下载SRA数据,再自己转成fastq。
1
| nohup prefetch --option-file SRR_Acc_List1.txt &
|
1 2 3 4 5 6 7
| scp -P 6001 ./test.csv gongyuqi@124.77.170.70:/home/gongyuqi
#port: 6654 gmb39@81.70.205.254:/home/data/gmb39/project/AS/SUPPA2/SRA
#port: 6001 gongyuqi@124.77.170.70:/home/gongyuqi/project/AS/SUPPA2/SRA
|
conda安装参考:https://www.jianshu.com/p/edaa744ea47d
数据下载、转换
1 2 3 4 5 6 7 8 9 10
| ls *.sra | while read id do nohup fastq-dump --split-3 $id & done
ls *.fastq | while read id do nohup pigz -p 8 $id & done
|
质控
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| nohup fastqc -t 16 *gz -o ./fastqc & nohup multiqc ./fastqc/*.zip -o ./multiqc &
out_dir=/home/gongyuqi/project/AS/SUPPA2/clean trim_galore --paired --quality 20 -a AGATCGGAAGAGC -a2 AGATCGGAAGAGC --length 20 -o $out_dir R1.fq.gz R2.fq.gz
out_dir=/home/gongyuqi/project/AS/SUPPA2/clean cat sample.txt|while read id do arr=(${id}) fq1=${arr[0]} fq2=${arr[1]} nohup trim_galore --paired --quality 25 -e 0.1 --length 36 -a AGATCGGAAGAGC -a2 AGATCGGAAGAGC -o $out_dir $fq1 $fq2 & done
|
(1)、运行salmon软件定量
1 2 3 4 5 6 7 8
| cat clean_sample.txt|while read id do arr=(${id}) fq1=${arr[0]} fq2=${arr[1]} dir=${fq1%%_*} nohup salmon quant -i /home/gongyuqi/ref/hg38/ensembl/SUPPA_ref/Ensembl_hg19_salmon_index -l ISF --gcBias -1 $fq1 -2 $fq2 -p 8 -o /home/gongyuqi/project/AS/SUPPA2/salmon_quant/$dir & done
|
(2)、提取所有样本的TPM值并合并为一个文件
1 2
| suppa_dir=/home/gongyuqi/miniconda3/envs/SUPPA2_3.9/bin/ nohup python $suppa_dir/multipleFieldSelection.py -i /home/gongyuqi/project/AS/SUPPA2/salmon_quant/*/quant.sf -k 1 -f 4 -o /home/gongyuqi/project/AS/SUPPA2/output/iso_tpm.txt &
|
(3)、使iso_tpm.txt文件中的转录本id同下载的gtf文件id一致
运行此R脚本,会生成iso_tpm_formatted.txt文件
1
| Rscript format_Ensembl_ids.R iso_tpm.txt
|
三、PSI计算
1、根据参考基因组注释文件生成可变剪切事件文件
-i
GTF文件
-o
输出文件前缀
-e
输出文件中包含的可变剪切类型
-f
输出的格式
1 2 3 4 5 6 7 8 9 10
| suppa_dir=/home/gongyuqi/miniconda3/envs/SUPPA2_3.9/bin/ gtf_dir=/home/gongyuqi/ref/hg38/ensembl/SUPPA_ref python $suppa_dir/suppa.py generateEvents -i $gtf_dir/Homo_sapiens.GRCh37.75.formatted.gtf -o ensembl_hg19.events -e SE SS MX RI FL -f ioe
cd $gtf_dir awk ' FNR==1 && NR!=1 { while (/^<header>/) getline; } 1 {print} ' *.ioe > ensembl_hg19.events.ioe
|
结果如下
2、计算样本的PSI值
1 2 3
| cd /home/gongyuqi/project/AS/SUPPA2 ioe_merge_file=~/ref/hg38/ensembl/SUPPA_ref/hg19_event/ensembl_hg19.events.ioe nohup python $suppa_dir/suppa.py psiPerEvent -i $ioe_merge_file -e iso_tpm_formatted.txt -o TRA2_events &
|
根据ensembl ID获取基因名
参考资料:https://www.sohu.com/a/245475759_777125
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| library(biomaRt)
listMarts()
human_mart<-useMart("ENSEMBL_MART_ENSEMBL")
datasets<-listDatasets(human_mart) view(datasets)
my_datasets<-useDataset("hsapiens_gene_ensembl",mart=human_nart)
ensembl_id<-"ENSG00000149554"
ensembl_symbol<-getBM(attributes = c("ensembl_gene_id","chromosome_name","hgnc_symbol","hgnc_id"), filters = "ensembl_gene_id", values = ensembl_id, mart = my_datasets)
|
ENSG00000000457