# Log in to Sigma (sigma.nsc.liu.se).
ssh -X x_aleoh@sigma.nsc.liu.se # terminal login; -X enables X11 forwarding
# We will book one interactive node for 2 hours.
interactive -t 02:00:00 -n1 --reservation=devel
# Load required modules
module load Anaconda/2022.05-nsc1
# activate the conda environment
conda activate 16S-Part1
# go to your own directory and create the directories as suggested above, or as you prefer.
cd /proj/liu-compute-2023-21/users/x_aleoh
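# For example, the directories used later in this tutorial can be created in one go
# (the names below are taken from the paths used further down; adjust them if you chose your own layout):
mkdir -p 16S_Part1/quality_check_raw 16S_Part1/trimmed_data 16S_Part1/Routput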
# run fastqc
fastqc -o /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_raw /proj/liu-compute-2023-21/16S_amplicon/raw_data/data1/*.fastq.gz
#run multiqc (it will automatically generate a directory)
multiqc /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_raw -o /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_raw
# Visualize the html files either by:
# opening them directly from the terminal if you are connected via ThinLinc.
# transferring the html files to your local computer. In the Unix/Linux HPC Lab tutorial you will find instructions on how to transfer the files under the section “File Transfer”.
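# A minimal transfer example (run this in a terminal on your own computer, not on Sigma;
# the username and paths are the ones used in this tutorial, so adjust them as needed):
scp "x_aleoh@sigma.nsc.liu.se:/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_raw/*.html" .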
# We want to run all the samples in one go, and we don't want to manually enter the names of all the samples.
# For that, we will first create a file called "samples" which contains the sample names, derived from the raw fastq file names.
cd /proj/liu-compute-2023-21/16S_amplicon/raw_data/data1
ls *fastq.gz | cut -f1-2 -d "_" > /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/samples
# go back to the "working path"
cd /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1
# Then we will write a loop so that the trimming and filtering is performed on each sample (based on the sample names in the file "samples").
# First we will trim the primers and the 5' end to Q35
for sample in $(cat /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/samples); \
do echo "On sample: $sample"; bbduk.sh in=/proj/liu-compute-2023-21/16S_amplicon/raw_data/data1/"$sample"_L001_R1_001.fastq.gz in2=/proj/liu-compute-2023-21/16S_amplicon/raw_data/data1/"$sample"_L001_R2_001.fastq.gz \
out=/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/"$sample"_R1_trimmed1.fastq.gz out2=/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/"$sample"_R2_trimmed1.fastq.gz \
literal=CCTACGGGNGGCWGCAG,GACTACHVGGGTATCTAATCC k=17 ordered=t \
ktrim=l copyundefined ftm=5 qtrim=l trimq=35 minlen=100; \
done 2> /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimming1_stats.txt
# Then we will trim the 3' end to Q30.
for sample in $(cat /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/samples); \
do echo "On sample: $sample"; bbduk.sh in=/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/"$sample"_R1_trimmed1.fastq.gz in2=/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/"$sample"_R2_trimmed1.fastq.gz \
out=/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/"$sample"_R1_trimmed2.fastq.gz out2=/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/"$sample"_R2_trimmed2.fastq.gz \
qtrim=r trimq=30 minlen=100; \
done 2> /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimming2_stats.txt
# run fastqc again, this time on the trimmed data (create a separate output directory for it first)
mkdir -p /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_trimmed
fastqc -o /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_trimmed /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data/*_trimmed2.fastq.gz
# run multiqc
multiqc /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_trimmed -o /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/quality_check_trimmed
# Visualize the html files either by:
# opening them directly from the terminal if you are connected via ThinLinc.
# transferring the html files to your local computer. In the Unix/Linux HPC Lab tutorial you will find instructions on how to transfer the files under the section “File Transfer”.
# Go to the Routput directory to start R
cd /proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/Routput
# Start R
R
# dada2 is a Bioconductor package; if it is not already available in the environment, install it via BiocManager:
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("dada2")
# Load packages
library(dada2); packageVersion("dada2")
# We set the seed to 123 so that the results can be reproduced between runs.
set.seed(123)
#path to the directory containing the sequences (fastq files)
path <- "/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/trimmed_data"
list.files(path)
# First we sort the files to ensure forward and reverse reads are in the same order.
# Forward and reverse fastq filenames normally have the format SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq.
# In our case we changed the file names in the trimming step.
fnFs <- sort(list.files(path, pattern="_R1_trimmed2.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_trimmed2.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
sample.names
# Define the path and file names for the filtered reads (they will be placed in a "filtered" subdirectory).
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
# filter the reads and remove phiX contamination
filtered_out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, rm.phix=TRUE)
# show the number of reads that went into and came out of the quality filtering.
head(filtered_out)
# Learn error rates. This action will take several minutes.
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
# Plot the estimated error rates for the forward and reverse reads.
plotErrors(errF, nominalQ=TRUE)
plotErrors(errR, nominalQ=TRUE)
# Visualize the plots either by:
# viewing them directly if you are connected via ThinLinc, or
# saving them to file (see the sketch below) and transferring them to your local computer.
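# If you work in a plain terminal session, one way to save these plots is the same pdf()/dev.off()
# pattern used further down in this tutorial (the file names here are just examples):
pdf("error_rates_F.pdf"); plotErrors(errF, nominalQ=TRUE); dev.off()
pdf("error_rates_R.pdf"); plotErrors(errR, nominalQ=TRUE); dev.off()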
# We are now ready to apply the core sample inference algorithm to the dereplicated data.
# We will use pseudo-pooling. This will take some minutes.
# Make sure you have 2 nodes booked.
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
dadaFs <- dada(derepFs, err=errF, multithread=TRUE, pool="pseudo")
dadaRs <- dada(derepRs, err=errR, multithread=TRUE, pool="pseudo")
# Information on unique sequence variants will appear in the shell and can be inspected using:
dadaFs[[1]]
dadaRs[[1]]
# Merge the forward and reverse reads (the paired reads must overlap).
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
# We can now construct the ASV count table.
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
#to save the table:
write.table(table(nchar(getSequences(seqtab))),"seqlength_pseudo.txt",sep="\t")
#create a plot
require(ggplot2)
table <- as.data.frame(table(nchar(colnames(seqtab))))
colnames(table) <- c("LENGTH","COUNT")
p1 <- ggplot(table, aes(x=LENGTH, y=COUNT)) +
geom_col() +
ggtitle("Sequence Lengths by SEQ Count") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5,size=10)) +
theme(axis.text.y=element_text(size=10))
# save the plot
pdf("p1.pdf")
p1 + scale_y_log10()
dev.off()
# Visualize the pdf file either by:
# transferring it to your local computer, or
# opening it directly from the terminal if you are connected via ThinLinc.
# Remove long and short sequences (non-target-length sequences). In this example, sequences from 300 to 460 nucleotides long are kept.
# Adjust the lengths based on the table and plot you just generated, and the expected length.
seqtab2 <- seqtab[,nchar(colnames(seqtab)) %in% seq(300,460)]
# inspect the distribution of sequence lengths again after the length selection.
table2 <- as.data.frame(table(nchar(colnames(seqtab2))))
colnames(table2) <- c("LENGTH","COUNT")
p2 <- ggplot(table2, aes(x=LENGTH, y=COUNT)) +
geom_col() +
ggtitle("Sequence Lengths by SEQ Count") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5,size=10)) +
theme(axis.text.y=element_text(size=10))
# save the plot.
pdf("p2.pdf")
p2 + scale_y_log10()
dev.off()
# Visualize the pdf file by:
# opening it directly from the terminal if you are connected via ThinLinc.
# Remove chimeric sequences (the steps below use seqtab.nochim).
seqtab.nochim <- removeBimeraDenovo(seqtab2, multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)
# set a helper function that counts the reads in a dada2 object
getN <- function(x) sum(getUniques(x))
# make a table tracking the number of reads retained at each step
summary_tab <- data.frame(row.names=sample.names, dada2_input=filtered_out[,1], filtered=filtered_out[,2], denoisedF=sapply(dadaFs, getN), denoisedR=sapply(dadaRs, getN), merged=sapply(mergers, getN), nonchim=rowSums(seqtab.nochim), final_perc_reads_retained=round(rowSums(seqtab.nochim)/filtered_out[,1]*100, 1))
write.table(summary_tab,"summary_tab_pseudo.txt",sep="\t")
write.table(summary_tab, "read-count-tracking.tsv", quote=FALSE, sep="\t", col.names=NA)
# Assign taxonomy against the SILVA v138.1 reference database.
taxa <- assignTaxonomy(seqtab.nochim, "/proj/liu-compute-2023-21/16S_amplicon/RefSeq_silva/silva_nr99_v138.1_train_set.fa.gz", multithread=TRUE, tryRC=TRUE, minBoot=80)
#Add species
taxa <- addSpecies(taxa, "/proj/liu-compute-2023-21/16S_amplicon/RefSeq_silva/silva_species_assignment_v138.1.fa.gz",allowMultiple=TRUE)
#Inspect the Taxa file
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
# giving our seq headers more manageable names (ASV_1, ASV_2...)
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
for (i in 1:dim(seqtab.nochim)[2]) {
asv_headers[i] <- paste(">ASV", i, sep="_")
}
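# (optional) The ">" prefix above is the fasta header format, so a fasta file of the ASV sequences
# can also be written out; a small sketch, assuming the file name "ASVs.fa":
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")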
# count table:
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(asv_tab, "ASVs_counts.txt", sep="\t", quote=F, col.names=NA)
# tax table:
asv_tax <- taxa
row.names(asv_tax) <- sub(">", "", asv_headers)
write.table(asv_tax, "ASVs_taxonomy.txt", sep="\t", quote=F, col.names=NA)
# download the output tables (ASVs_counts.txt and ASVs_taxonomy.txt) to your computer to proceed with Part 2.
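# For example, from a terminal on your own computer (the username and path follow this tutorial; adjust as needed):
scp "x_aleoh@sigma.nsc.liu.se:/proj/liu-compute-2023-21/users/x_aleoh/16S_Part1/Routput/ASVs_*.txt" .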
For your individual assignment you can choose dataset 2 or dataset 3.
The individual assignment must be in the form of an HTML file derived from an .Rmd file. All the code must be provided, along with documentation of the performed steps and of the (subjective) decisions made where necessary.
The questions that you encountered throughout the tutorial are meant as a guide for you to reflect on certain steps and to understand what you did and why. The questions must be answered in your report.
Question 1: Did all the samples pass the quality control? Are there any modules that failed (which ones)? Is there a biological explanation for why they failed?
Answer: No, some failed. The data contained a lot of repeated sequences, so no sample passed the sequence duplication module. This could be due to sample contamination. Another module that also failed was the overrepresented sequences module, which supports the contamination explanation. However, all samples passed the primer/adapter sequence analysis, so that was not the problem.
Question 2: Which trimming settings did you use and why?
Answer:
- In the first trimming step, the primers (CCTACGGGNGGCWGCAG and GACTACHVGGGTATCTAATCC) were removed from the 5' end using a k-mer length of 17, the 5' end was quality-trimmed to Q35, and reads shorter than 100 bases were discarded.
- In the second trimming step, the 3' end was quality-trimmed to Q30, again keeping only reads of at least 100 bases.
Question 3: Keep track of the reads lost in that process: how many (N) input reads did you have and how many (N and %) output reads do you have? How many (N and %) of the reads were removed by the trimming?
Answer:
- The trimming of the forward reads removed around 0.18% of the reads and about 10% of the bases.
- The trimming of the reverse reads removed approximately 30-40% of the reads and 55.36% of the bases.
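One way to pull these counts out of the bbduk logs saved during trimming (the exact label text, for example "Input:" and "Result:", can differ between bbduk versions, so treat this as a sketch):
grep -E "Input:|Total Removed:|Result:" trimming1_stats.txt trimming2_stats.txt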
Question 4: Is the compromise between quality and number of reads acceptable, or should you revise the trimming and filtering?
Answer: This is a very high rate of removal for the reverse reads. It can be explained by the reverse reads being less accurate towards their ends.
Question 5: Which version of DADA2 did you work with (and why is this information relevant)?
Answer: We used version 1.28.0. This is important so that others are able to reproduce the analysis.
Question 6: Which type and how many sequences were filtered?
Answer: All the forward reads were removed, and 65 reads were removed from the reverse reads.
Question 7: Do the error rates look reasonable? Do you have a good fit between the observed and estimated error rates? Do the forward and reverse look similar?
Answer: Most of the error models fit the data well. Exceptions are the T2A, T2C, T2G and C2T transitions. The measurements that deviated matched between the forward and reverse reads. In this case the default settings were adequate.
Question 9: What is the expected overlap of your paired-end sequences here?
Answer: This is obtained by calculating how much the forward and reverse reads overlap; here it gives an overlap of about 140 bp.
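As a rough check, the expected overlap is read length (R1) + read length (R2) - amplicon length. Assuming 2 x 300 bp reads and a roughly 460 bp V3-V4 amplicon (341F/805R primers), this gives about 300 + 300 - 460 = 140 bp before quality trimming; the read lengths and amplicon size used here are assumptions, so adjust them to your own data.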
Question 10: What is the expected length? Does the length of the majority of ASVs fall within the expected range? Which range of sequence lengths did you keep?
Answer: The sequences found were 239-428 bp, which is longer than expected.
Question 11: How many sequences were lost? And how abundant were they?
Answer: 171 sequences, which is about 5% of the sequences.
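A quick way to check this from the objects created earlier (seqtab before the length filter, seqtab2 after it):
ncol(seqtab) - ncol(seqtab2) # number of ASVs removed by the length filter
round(100 * (sum(seqtab) - sum(seqtab2)) / sum(seqtab), 2) # percentage of the total reads they carried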
Question 12: Summary table. Where did the major loss of reads happen?
Answer: In the denoisedR step.
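A quick way to see where reads are lost, using the summary_tab built earlier (the column names are those defined above):
round(100 * colSums(summary_tab[, c("dada2_input","filtered","denoisedF","denoisedR","merged","nonchim")]) / sum(summary_tab$dada2_input), 1)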