来源:根据徐洲更B站视频整理。
1.分析准备:软件的安装
1.1安装miniconda3
wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-4.7.10-Linux-x86_64.sh
1.2安装miniconda3
sh Miniconda3-4.6.14-Linux-×86_64.sh
1.3按照https:/bioconda.github.io/的要求,我们添加如下频道
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
conda config --set show_channel_urls yes
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/
1.4使用conda安装软件
conda search sra
conda install sra-tools
conda create -n rna-seq sra-tools fastqc cutadapt trimmomatic star hisat2 samtools subread htseq
1.5 启动conda工作环境
conda activate rna-seq
2.下载生物信息学数据
2.1先建立工作目录
mkdir -p rna-seq-project
cd rna-seq-project
2.2下载基因组序列
mkdir -p reference
cd reference
wget -c ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
2.3下载cDNA序列和GTF注释文件
wget -c ftp://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
wget -c ftp://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz
3.下载高通量数据
3.1安装aspera
wget https://download.asperasoft.com/download/sw/connect/3.9.5/ibm-aspera-connect-3.9.5.172984-linux-g2.12-64.tar.gz
#解压缩
tar xf ibm-aspera-connect-3.9.5.172984-linux-g2.12-64.tar.gz
#安装
./ibm-aspera-connect-3.9.5.172984-linux-g2.12-64.sh
3.2使用prefetch下载数据
cd..
mkdir -p sra
prefetch -O sra SRR4820707
4.数据质控
4.1sra转fastq
mkdir-p fastq
fasterq-dump -p -3 -O fastq sra/SRR4820707
4.2fastq合并
cat SRR4820707.sra_1.fastq SRR4820708.sra_1.fastq > EZH1_1_R1.fastq
cat SRR4820707.sra_2.fastq SRR4820708.sra_2.fastq > EZH1_1_R2.fastq
4.3质量控制
rm SRR48207*
mkdir qc
fastqc -o qc fastq/*.fastq
输出结果:html格式
5.建立HISAT2索引
5.1解压缩reference文件
gunzip Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz &
gunzip Homo _sapiens.GRCh38.97.gtf.gz &
5.2使用hisat2
conda activate rna-seq
hisat2_extract_exons.py Homo sapiens.GRCh38.97.gtf > exons.txt
hisat2_extract _splice_sites.py Homo_sapiens.GRCh38.97.gtf > ss.txt
hisat2-build -p 4 --ss ss.txt --exon exons.txt Homo_sapiens.GRCh38.dna.primary_assembly.fa GRCh38
6.将RNA-seq数据比对到参考基因组
mkdir bam
conda activate rna-seq
hisat2 --new-summary --thread 4 -x reference/grch38/genome -1 fastq/EZH1_1_R1.fastq -2 fastq/EZH1_1_R2.fastq 2 > qc/EZH1_1.ht2.log | samtools sort > bam/EZH1_1.bam &
samtools index -@ 4 EZH1_1.bam
7.用multiqc整合RNA-seq分析的质控信息
conda activate rna-seq
conda install multiqc
cd ~/rna-seq-project
mutiqc qc
在rna-seq-project文件夹下查看report
8.对RNA-seq比对的BAM文件进行定量
cd ~/rna-seq-project
mkdir counts
featureCounts -a reference/Homo_sapiens.GRCh38.97.gtf -F gtf -g gene_id -p -T 4 -o counts/expr_matrix.txt bam/*.bam