First we need to add packages ### Load Libraries
# Bioconductor
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("phyloseq")
BiocManager::install("microbiome")
BiocManager::install("ANCOMBC")
BiocManager::install("caret")
# CRAN
install.packages("ggplot2")
install.packages("data.table")
install.packages("vegan")
install.packages("rstatix")
install.packages("dplyr")
install.packages("indicspecies")
# Devtools
#install.packages("devtools")
#library(devtools)
#install_github("Russel88/MicEco")
library(phyloseq)
library(ggplot2)
library(data.table)
library(vegan)
library(microbiome)
library(ANCOMBC)
library(DESeq2)
library(MicEco)
library(indicspecies)
# setwd
setwd("/Volumes/KODEN/2023/microbiome/")
count_tab <- read.table("ASVs_counts.txt", header=T, row.names=1, check.names=F)
tax_tab <- as.matrix(read.table("ASVs_taxonomy.txt", header=T, row.names=1, check.names=F, na.strings="", sep="\t"))
data<-read.csv("metadata1.csv",row.names=1)
#create a phyloseq object. We will call the phyloseq object "ps".
ps <- phyloseq(otu_table(count_tab, taxa_are_rows=TRUE),
tax_table(tax_tab),
sample_data(data))
ps
# explore the data
ntaxa(ps)
nsamples(ps)
sample_names(ps)
head(rank_names(ps))
sample_variables(ps)
# we will
ps1<-subset_taxa(ps, Kingdom == "Bacteria" &
Family!= "mitochondria" &
Phylum != "Cyanobacteria" &
Phylum != "NA" &
Order != "Chloroplast")
# check how many taxa are left after the pre-processing
ps1
# An easy way to graphically identify samples with an even sequence depth is with a bar plot showing the number of reads per sample.
PlotBar<-plot_bar(ps1)+ geom_bar(stat = "identity", position = "stack")
PlotBar
Question 1: do the
samples have an even distribution of reads? Based on this plot, would
you remove any sample?
No. some samples were pressent at verry low levels compared to some others.
# summarize the total reads per sample
sort(sample_sums(ps1))
# visualize distribution of samples sequencing depth
sdt = data.table(as(sample_data(ps1), "data.frame"),
TotalReads = sample_sums(ps1), keep.rownames = TRUE)
setnames(sdt, "rn", "SampleID")
ggplot(sdt, aes(x = TotalReads)) +
geom_histogram(color = "black", fill = "indianred", binwidth = 2500) +
ggtitle("Distribution of sample sequencing depth") +
xlab("Read counts")
theme(axis.title.y = element_blank())
# visualize the sequencing depth by variable of interest. In this example "batch"
pSeqDepth = ggplot(sdt, aes(TotalReads)) + geom_histogram(bins = 20) + ggtitle("Sequencing Depth") + facet_wrap(~Term)
pSeqDepth
sample_sums(ps)
Question 2:
Why shouldn’t samples with a poor sequencing depth be considered for
subsequent analyses?
The samples can be due to contamination and sequencing errors.
Are there any samples that should be removed or rarefied? Which ones,
and why?
The onces that have few reads and low depth.
Can you find any biological explanation to why some samples have so
few reads?
Use the previous plots and sample_sums()
to
decide which sampels you want to remove
Remoove 110-1v, 130-1v, 203-1V,
# We will fro examle remove all the samples with less than 10000 reads.
# you can change number of reads to your convenience.
ps2 <- prune_samples(sample_sums(ps1)>=10000, ps1)
# check how many samples do you have now
ps2
# To rarefy selected samples.
# for other datasets or other decisions, here you must change the sample name ("1036-1v","1033-1v"") and the number of reds to be rarefied to (84613).
asv = as(otu_table(ps2), "matrix")
asvt<-as.data.frame(t(asv))
rare<-asvt[c("1036-1v","1033-1v"),] # change sample(s) name
asvt<-asvt[!row.names(asvt) %in% row.names(rare),]
set.seed(3683)
asv.rare<-rrarefy(rare,84613) # change number of reads
asv.rare<-rbind(asvt,asv.rare)
sum(sample_sums(ps2)) # total of reads
data.table(summarize_phyloseq(ps2))
# construct the phyloseq object with the rarefied samples
ps3 <- phyloseq(otu_table(t(asv.rare), taxa_are_rows=TRUE),
sample_data(data),
tax_table(tax_tab))
# unless we clearly specify it, taxa with no counts will be still left.
# Here we remove all taxa (if any) that has no counts after the rarefaction.
ps3<- prune_taxa(taxa_sums(ps3) > 0, ps3)
# summarize the outcome of pre-processing
sum(sample_sums(ps3)) # total of reads
ps3
sort(sample_sums(ps3))
data.table(summarize_phyloseq(ps3))
sum(otu_table(ps3))
sum(sample_sums(ps3))
Question 3: Provide
a summary of the IN and OUT for each preprocessing step:
- total
number of reads IN and OUT. You can use for example or
* For Ps2
total number of reads 2048520, Min 105902 and max is 1127913. * For Ps3
min numbers was 105902 and max is 846133. Total 2019379. * Numbers
remmoved 2048520-2019379=29141
It is usually a good idea to remove ASVs that have a very low
prevalence and when detected they have very few reads. This is a measure
to remove potential contaminants and also ASVs with a small mean and
large coefficient of variance (i.e. “noise”) that will influence the
downstream statistical analyses.
Filtering out low abundance taxa,
for instance, will prevent spurious diversity to be considered in your
dataset.
Remember filtering is often
subjective. You should be honest about that and keep focused on
your goal, but always justify and keep a record for your decision for
filtering.
# Plot a histogram of counts per ASV.
# The Histogram shows the number of taxa (count) according to the number of reads (TotalReads) across samples.
tdt = data.table(tax_table(ps3),
TotalReads = taxa_sums(ps3),
OTU = taxa_names(ps3))
ggplot(tdt, aes(TotalReads)) + scale_x_log10() +
geom_histogram (bins=100) +
ggtitle("Histogram of Total Counts")
# Here we will remove ASVs detected in only 1 sample and with less than 30 reads.
#First we will count the ASVs prevalence
prev1 = apply(X = otu_table(ps3),
MARGIN = 1,
FUN = function(x){sum(x > 0)})
# Then we will create a table including the prevalence, the total abundance, and the taxonomy.
prev1 = data.frame(Prevalence = prev1,
TotalAbundance = taxa_sums(ps3),
tax_table(ps3))
# We define the ASVs we want to keep: prevalence>=1 with > 30 reads.
keepTaxa = rownames(prev1)[(prev1$Prevalence >= 1) & (prev1$TotalAbundance > 30)]
ps4 = prune_taxa(keepTaxa, ps3)
ps4
# check the distributions of total reds per taxa again. Has it changed? Were taxa with few total reads filtered out?
tdt = data.table(tax_table(ps4),
TotalReads = taxa_sums(ps4),
OTU = taxa_names(ps4))
ggplot(tdt, aes(TotalReads)) + scale_x_log10() +
geom_histogram (bins=100) +
ggtitle("Histogram of Total Counts")
Question 4: Why do we want to remove low prevalence reads? How many taxa were “lost” after the prevalence filtering?
We lost 110 taxa durring the filtering. If it is only present one time it probably lacks function or is a result of contamination or enviromental factors.
Considerations for your dataset: Check if you have an imbalanced
dataset and act accordingly.
For instance, you may need to filter
out samples of a very poor sequencing depth compared to the rest of the
samples in the dataset or to use a rarefaction process to normalise the
number of reads per sample (of all or some samples).
Use the graphs
and table with sorted number of reads to decide what is a reasonable
number of reads/sample.
# bar chart using Categories
pseq.fam <- aggregate_rare(ps4, level="Family", detection = 500, prevalence = 2/10)
p.fam <- plot_composition(pseq.fam, sample.sort = NULL,
otu.sort = NULL,
group_by = "Term",
plot.type = "barplot",
verbose = FALSE) +
theme_bw() + scale_fill_brewer("Family", palette = "Paired")
p.fam
Question 5: why the
abundance differs that much across samples? Can you notice some
difference already between the two groups?
Answer: The abundance in the full term babies have had longer time to grow. The preterm bacterium is less diverse and contains mostly Staphylococcaceae bacteria while there is no clear dominant in the fullterm group. It also seems like there is less bacteria compared to fullterm in the samples.
#Transform to relative abundance. Save as new object.
ps4_rel = transform_sample_counts(ps4, function(x){x / sum(x)})
# Repeat the previous plot
pseq.fam <- aggregate_rare(ps4_rel, level="Family", detection = 0.1, prevalence = 1/10)
p.fam <- plot_composition(pseq.fam, sample.sort = NULL,
otu.sort = NULL,
group_by = "Term",
plot.type = "barplot",
verbose = FALSE) +
theme_bw() + scale_fill_brewer("Family", palette = "Paired")
p.fam
#Calculate alpha metrics; richness and Shannon diversity index
alpha_div<-estimate_richness(ps4, measures=c("Observed", "Shannon"))
alpha_div
# Plot the Alpha diversity measures according to Term
P1 <- plot_richness(ps4, x = "Term", measures=c("Observed", "Shannon"), color = "Term") + geom_boxplot()
P1
#extract metadata file from the filtered and rarefied phyloseq object.
metadata = as(sample_data(ps4), "matrix")
# Coerce to data.frame named metadata
metadata = as.data.frame(metadata)
# Add Shannon diversity values to the metadata file, now meta2
metadata$Shannon <- alpha_div$Shannon
metadata$Observed <- alpha_div$Observed
# check equality of variances
require(dplyr)
require(rstatix)
metadata %>%
levene_test(Shannon ~ Term)
metadata %>%
levene_test(Observed ~ Term)
# check for normality
shapiro.test(alpha_div$Shannon)
shapiro.test(alpha_div$Observed)
# We will apply a t-test based on the assumptions checked above.
# Change to the appropriate test if necessary. You can follow the tutorials on the Links above.
# unpaired two-sample t-test Shannon
x <- filter(metadata, Term == "Preterm") %>% select(Shannon)
y <- filter(metadata, Term == "Fullterm") %>% select(Shannon)
t.test(x, y, alternative = "two.sided", var.equal = FALSE)
# unpaired two-sample t-test Observed
z <- filter(metadata, Term == "Preterm") %>% select(Observed)
a <- filter(metadata, Term == "Fullterm") %>% select(Observed)
t.test(a, z, alternative = "two.sided", var.equal = FALSE)
Adapt the code above for the alpha-diversity indicator “Observed”
<span style="color: red;">**Question 6**</span>: Make sure you used the correct statistical test. Interpret the alpha-diversity results. What can you conclude?
Answers:
The shannon data was found to be non-normaly distributed so a wilcon test would be used (I didnt get it to work)
The observed data was found to be normaly distributed the t tes shall be used. This showed a difference between preterm and Fullterm baybies.
### Beta diversity
#### Visualization
```r
# Ordination of Samples: NMDS based on Bray Curtis distances
nmds_bray <- ordinate(ps4_rel, method="NMDS", distance="bray")
# you can test many other methods and distances here.
plot_nmds_bray = plot_ordination(ps4_rel, nmds_bray, type="samples", color="Term", shape="Term", label="Term") + geom_point(size=3) + ggtitle("NMDS ordination of samples (Bray Curtis)")
plot_nmds_bray
# Ordination of Samples: NMDS based on VST
# First need to to the VST
dds = phyloseq_to_deseq2(ps4, ~Term) # we must define the variable we want to VST to.
deseq_counts <- estimateSizeFactors(dds, type = "poscounts") # add pseudcounts
NormVST = t(assay(varianceStabilizingTransformation(deseq_counts))) # normalization VST and pull out transformed table
euc_dist <- dist(t(NormVST)) # calculate euclidean distance on the VST data
ps4.VST = ps4
otu_table(ps4.VST)<-otu_table(NormVST,FALSE) # new phyloseq object with otu_table replaced by VST
# NMDS based on VST
pcoa_vst<- ordinate(ps4.VST, method="MDS", distance="euclidean")
plot_pco_vst = plot_ordination(ps4.VST, pcoa_vst, type="samples", color="Term", shape="Term", label="Term") + geom_point(size=3) + ggtitle("MDS ordination of samples (VST)")
plot_pco_vst
# Extract ASV table from the phyloseq object
ASV = as(otu_table(ps4), "matrix")
# transpose if necessary
if(taxa_are_rows(ps4)){ASV <- t(ASV)}
# Coerce to data.frame named metadata
ASVdf= as.data.frame(ASV)
# Extract Taxonomy table from the phyloseq object
TAXA = as(tax_table(ps4), "matrix")
# transpose if necessary
if(taxa_are_rows(ps4)){TAXA <- t(TAXA)}
# Coerce to a data.frame named TAXAdf
TAXAdf = as.data.frame(TAXA)
# Bray-Curtis dissimilarity index with hellinger transformation of data
hell.dist <- vegdist(decostand(ASVdf, method="hellinger"), method = "bray")
dim(hell.dist)
# betadisper calculation
betadisper.factor <- betadisper(hell.dist, as.factor(metadata$Term)) # you had extracted the metadata above. Here you must change the variable of interest.
permutest(betadisper.factor, permutations=999, pairwise = TRUE)
# Order factor levels
betadisper.factor$group <- factor(betadisper.factor$group, levels = c("Preterm","Fullterm"))
# # Posthoc comparisons for when you have more than two categories
(factor.HSD <- TukeyHSD(betadisper.factor))
plot(factor.HSD)
# betadisper graph
plot_betadisper <-qplot(x = factor(betadisper.factor$group), y = betadisper.factor$distances, geom="boxplot") + theme_bw()+
geom_boxplot(aes(fill = factor(betadisper.factor$group), alpha=0.6))+ geom_jitter(width=0.25, alpha=0.4, size=2, color = "black") + labs(x="Term", y="Distance to centroid", caption = "")+
theme(legend.position="none", axis.text.x=element_text(size = rel(1), angle=0, vjust=0, hjust = 0.5), aspect.ratio = 4/4)
plot_betadisper
# OPTION 1. anosim test
ps4_anosim <- anosim(x = hell.dist, grouping = metadata$Term, permutations = 999)
summary (ps4_anosim)
plot (ps4_anosim) #plot option of summary data
# OPTION 2. Adonis test. We can't use adonis, but it is written here so you know how to run it.
ps4_adonis <- adonis(hell.dist ~ Term, data = metadata, permutations = 999)
ps4_adonis
Question 7:
Interpret the beta-diversity results.
Which plotting method works
better?
Answer: The Beta values shows a broader variance in the variance of bacterium.
##Venn diagrams. Venn diagrams show in a very easy way the number of ASV that are exclussive for a category
require(MicEco)
Venn_Treat <- ps_venn(
ps4,
group= 'Term',
fraction = 0,
weight = FALSE,
relative = FALSE,
plot = TRUE, quantities = list(type=c("counts")), labels = list(cex = 1), fill= c("#4682B4", "#CC0033"), alpha = 0.6, adjust.labels = TRUE)
#here I used manual colors, If more than three categories in your data, add more colors
Venn_Treat
Take advantage of the additional feature of this Venn diagram by
settingrelative=TRUE
and weigth=TRUE
to get an
idea of the relative importance of the ASVs for the community.
## Perform the plot again with the arguments modification
Venn_Treat <- ps_venn(
ps4,
group= 'Term',
fraction = 0,
weight = TRUE,
relative = TRUE,
plot = TRUE, quantities = list(type=c("counts")), labels = list(cex = 1), fill= c("#4682B4", "#CC0033"), alpha = 0.6, adjust.labels = TRUE) #here I used manual colors, If more than three categories in your data, add more colors
Venn_Treat
Question 8:
Interpret the results. How many different ASVs (taxa) are shared between
the two groups? How much do they represent in terms of relative
abundance?
ANSWER: 5 ASV:s are shared between fullterm and preterm. This makes
up aproxmatly 7.3%
require(indicspecies)
# You will need an ASV table file and a metadata file. Please, note in this tutorial ASVdf and metadata files have been extracted previously from the phyloseq object and can be used here. In the code below we also define the factor (factor1) to be used for comparisons, "Term" in this example.
dt.iv <- data.frame("sample_id"=rownames(ASVdf), "factor1"=metadata$Term, ASVdf)
#extract dimension of the data frame. This info is needed to set the colSums([VALUES]) in the next line.
dim(dt.iv)
head(names(dt.iv))
# check and remove if any column contains all zero values
dt.iv.f <- as.data.frame(dt.iv[ , colSums(dt.iv[3:365]) != 0] )
# the [3:365] refer to all the ASV in our date set. This information is obtained from the dim(dt.iv)
dim(dt.iv.f)
df.iv <- dt.iv.f
levels(as.factor(dt.iv$factor1))
indval2 = multipatt(x = dt.iv[,-c(1:2)], cluster = dt.iv$factor1, func="IndVal", control = how(nperm=999))
decode <- colnames(indval2$comb)
# Lists these species with significant association to one combination,
# alpha: Significance level for selected species in the summary.
# minstat: Minimum value of the statistic for selecting species in the summary
# This results informas as of each ASV are singifiant but without knowing their assigned taxonomy it is imossible to interpret.
summary(indval2, alpha=0.01, minstat = 0.5)
# You capture output as a txt file
capture.output(summary(indval2, alpha=0.01, minstat = 0.5),file = "./output_indval2.txt")
# Here we add taxonomy information to IndVal results and explore. TAXAdf obtained earlier is needed.
# Include a new column with ASV names
TAXA<-as.data.frame(t(TAXAdf))
TAXA$ASV_id <- rownames(TAXA)
# convert indVal2 sign file from the output to a data frame
indval2.df <- as.data.frame(indval2$sign)
# Include ASV_id to indVal result and to TAXAdf
indval2.df$ASV_id <- rownames(indval2.df);names(indval2.df)
indval2.df.tax <- inner_join(x = indval2.df, y = TAXA, by = "ASV_id")
head(indval2.df.tax)
# Capture output as a txt file,now including the taxonomy.
write.table(indval2.df.tax, file = "indval2.df.tax.txt", sep = "\t", na = "NA", row.names = F, quote = F)
# explore the indval2.df.tax file
Question 9: Did you find any indicator taxa? Interpete the results.
Answer: The main families were Enterobacterias and Bacteroidaceae.
Bacteroides, staphylococer and Veillonella was the biggest genus and ###
Differential abundance We will use ANCOMBC. Several papers suggest this
is the best option.
But, Here you could encounter problems
installing the “mia” package. If this is the case, you can’t run this
part. But at last you have it so you know how to do it.
If this is
thecase, then procees with DESeq2, also commonly used, althouh it
retuerns more false-positives.
# prerequisists:
remotes::install_github("FelixErnst/mia")
library(mia)
require(ANCOMBC)
require(tidyverse)
require(caret)
require(DT)
tse<-makeTreeSEFromPhyloseq(ps4)
colData(tse)$outcome = factor(colData(tse)$outcome, levels = c("Fullterm", "Preterm"))
levels(colData(tse)$outcome)
set.seed(123)
output = ancombc2(data = tse, assay_name = "counts", tax_level = NULL,
fix_formula = "outcome", rand_formula = NULL,
p_adj_method = "fdr", pseudo = 0, pseudo_sens = TRUE,
prv_cut = 0.10, lib_cut = 0, s0_perc = 0.05,
struc_zero = FALSE, neg_lb = FALSE,
alpha = 0.05, n_cl = 2, verbose = TRUE,
global = FALSE, pairwise = FALSE,
dunnet = FALSE, trend = FALSE,
iter_control = list(tol = 1e-5, max_iter = 20,
verbose = FALSE),
em_control = list(tol = 1e-5, max_iter = 100),
lme_control = NULL, mdfdr_control = NULL,
trend_control = NULL)
output$res
#Option 3. Differential abundance
require (DESeq2)
# Abvoe you already convert phyloseq data to DESeq format and run basic analysis (dds = phyloseq_to_deseq2(ps4, ~Term))
dds_deseq = DESeq(dds, test="Wald", fitType="parametric")
#To analyze the results, I used an alpha parameter (probability) of 0.01 for differences to be accepted. The method computes a log2fold change in the order you entered the categories.
deseq_res = results(dds_deseq, cooksCutoff = FALSE, alpha = 0.01, contrast=c("Fullterm", "Preterm"))
summary(deseq_res)
sigtab = deseq_res[which(deseq_res$padj < 0.01), ]
#we now add taxonomy in a data frame
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps4)[rownames(sigtab), ], "matrix"))
head(sigtab)
#we can also plot data as a graph. I used the Phylum and Class ranks since you may see from sigtab some of the differentially abundant taxa are not classified at lower taxonomic ranks
library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
scale_fill_brewer(palette = palname, ...)
}
# Phylum ordering of data
x = tapply(sigtab$log2FoldChange, sigtab$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtab$Phylum = factor(as.character(sigtab$Phylum), levels=names(x))
# Class ordering of data
x = tapply(sigtab$log2FoldChange, sigtab$Class, function(x) max(x))
x = sort(x, TRUE)
sigtab$Class = factor(as.character(sigtab$Class), levels=names(x))
# plot data
ggplot(sigtab, aes(x=Class, y=log2FoldChange, color=Phylum)) + geom_point(size=3) +
theme(axis.text.x = element_text(angle = -45, hjust = 0, vjust=-0.1))
Question 10:
Hopefully you could run some of the differential abundance test. Can you
interpret the results?
No I couldn´t get it work.