First we need to add packages ### Load Libraries

# Bioconductor
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("phyloseq")
BiocManager::install("microbiome")
BiocManager::install("ANCOMBC")
BiocManager::install("caret")

# CRAN
install.packages("ggplot2")
install.packages("data.table")
install.packages("vegan")
install.packages("rstatix")
install.packages("dplyr")
install.packages("indicspecies")

# Devtools
#install.packages("devtools")
#library(devtools)
#install_github("Russel88/MicEco")

library(phyloseq)
library(ggplot2)
library(data.table)
library(vegan)
library(microbiome)
library(ANCOMBC)
library(DESeq2)
library(MicEco)
library(indicspecies)

PHYLOSEQ OBJECT

# setwd
setwd("/Volumes/KODEN/2023/microbiome/")
count_tab <- read.table("ASVs_counts.txt", header=T, row.names=1, check.names=F) 
tax_tab <- as.matrix(read.table("ASVs_taxonomy.txt", header=T, row.names=1, check.names=F, na.strings="", sep="\t"))
data<-read.csv("metadata1.csv",row.names=1)
#create a phyloseq object. We will call the phyloseq object "ps".

ps <- phyloseq(otu_table(count_tab, taxa_are_rows=TRUE),
      tax_table(tax_tab),
      sample_data(data))
ps

# explore the data
ntaxa(ps)
nsamples(ps)
sample_names(ps)
head(rank_names(ps))
sample_variables(ps)

PREPROCESSING

Remove unclassified ASVs

# we will
ps1<-subset_taxa(ps, Kingdom == "Bacteria" & 
                   Family!= "mitochondria" &
                   Phylum != "Cyanobacteria" &
                   Phylum != "NA" &
                   Order != "Chloroplast")

# check how many taxa are left after the pre-processing
ps1

Normalisation of library size (Remove Samples)

# An easy way to graphically identify samples with an even sequence depth is with a bar plot showing the number of reads per sample.
PlotBar<-plot_bar(ps1)+ geom_bar(stat = "identity", position = "stack")
PlotBar

Question 1: do the samples have an even distribution of reads? Based on this plot, would you remove any sample?

No. some samples were pressent at verry low levels compared to some others.

# summarize the total reads per sample 
sort(sample_sums(ps1))

# visualize distribution of samples sequencing depth
sdt = data.table(as(sample_data(ps1), "data.frame"),
   TotalReads = sample_sums(ps1), keep.rownames = TRUE)
setnames(sdt, "rn", "SampleID")

ggplot(sdt, aes(x = TotalReads)) + 
  geom_histogram(color = "black", fill = "indianred", binwidth = 2500) +
  ggtitle("Distribution of sample sequencing depth") + 
  xlab("Read counts") 
  theme(axis.title.y = element_blank())

# visualize the sequencing depth by variable of interest. In this example "batch"
pSeqDepth = ggplot(sdt, aes(TotalReads)) + geom_histogram(bins = 20) + ggtitle("Sequencing Depth") + facet_wrap(~Term)
pSeqDepth

sample_sums(ps)

Question 2:
Why shouldn’t samples with a poor sequencing depth be considered for subsequent analyses?

The samples can be due to contamination and sequencing errors.

Are there any samples that should be removed or rarefied? Which ones, and why?

The onces that have few reads and low depth.

Can you find any biological explanation to why some samples have so few reads?
Use the previous plots and sample_sums() to decide which sampels you want to remove

Remoove 110-1v, 130-1v, 203-1V,


# We will fro examle remove all the samples with less than 10000 reads. 
# you can change number of reads to your convenience.
ps2 <- prune_samples(sample_sums(ps1)>=10000, ps1) 

# check how many samples do you have now
ps2

# To rarefy selected samples. 
# for other datasets or other decisions, here you must change the sample name ("1036-1v","1033-1v"") and the number of reds to be rarefied to (84613).
asv = as(otu_table(ps2), "matrix")
asvt<-as.data.frame(t(asv))
rare<-asvt[c("1036-1v","1033-1v"),] # change sample(s) name
asvt<-asvt[!row.names(asvt) %in% row.names(rare),]
set.seed(3683)
asv.rare<-rrarefy(rare,84613) # change number of reads
asv.rare<-rbind(asvt,asv.rare)


sum(sample_sums(ps2)) # total of reads 
data.table(summarize_phyloseq(ps2)) 

# construct the phyloseq object with the rarefied samples
ps3 <- phyloseq(otu_table(t(asv.rare), taxa_are_rows=TRUE), 
                sample_data(data), 
                tax_table(tax_tab))

# unless we clearly specify it, taxa with no counts will be still left.
# Here we remove all taxa (if any) that has no counts after the rarefaction.
ps3<- prune_taxa(taxa_sums(ps3) > 0, ps3)

# summarize the outcome of pre-processing
sum(sample_sums(ps3)) # total of reads 

ps3
sort(sample_sums(ps3))
data.table(summarize_phyloseq(ps3)) 

sum(otu_table(ps3))
sum(sample_sums(ps3))

Question 3: Provide a summary of the IN and OUT for each preprocessing step:
- total number of reads IN and OUT. You can use for example or
* For Ps2 total number of reads 2048520, Min 105902 and max is 1127913. * For Ps3 min numbers was 105902 and max is 846133. Total 2019379. * Numbers remmoved 2048520-2019379=29141

Prevalence filtering (Remove Taxa)

It is usually a good idea to remove ASVs that have a very low prevalence and when detected they have very few reads. This is a measure to remove potential contaminants and also ASVs with a small mean and large coefficient of variance (i.e. “noise”) that will influence the downstream statistical analyses.
Filtering out low abundance taxa, for instance, will prevent spurious diversity to be considered in your dataset.
Remember filtering is often subjective. You should be honest about that and keep focused on your goal, but always justify and keep a record for your decision for filtering.

# Plot a histogram of counts per ASV.
# The Histogram shows the number of taxa (count) according to the number of reads (TotalReads) across samples. 
tdt = data.table(tax_table(ps3),
   TotalReads = taxa_sums(ps3),
   OTU = taxa_names(ps3))
ggplot(tdt, aes(TotalReads)) + scale_x_log10() + 
 geom_histogram (bins=100) + 
 ggtitle("Histogram of Total Counts")


# Here we will remove ASVs detected in only 1 sample and with less than 30 reads.

#First we will count the ASVs prevalence
prev1 = apply(X = otu_table(ps3),
              MARGIN = 1,
              FUN = function(x){sum(x > 0)})
# Then we will create a table including the prevalence, the total abundance, and the taxonomy.
prev1 = data.frame(Prevalence = prev1,
                   TotalAbundance = taxa_sums(ps3),
                   tax_table(ps3))

# We define the ASVs we want to keep: prevalence>=1 with > 30 reads.
keepTaxa = rownames(prev1)[(prev1$Prevalence >= 1) & (prev1$TotalAbundance > 30)]
ps4 = prune_taxa(keepTaxa, ps3)

ps4

# check the distributions of total reds per taxa again. Has it changed? Were taxa with few total reads filtered out?
tdt = data.table(tax_table(ps4),
   TotalReads = taxa_sums(ps4),
   OTU = taxa_names(ps4))
ggplot(tdt, aes(TotalReads)) + scale_x_log10() + 
 geom_histogram (bins=100) + 
 ggtitle("Histogram of Total Counts")

Question 4: Why do we want to remove low prevalence reads? How many taxa were “lost” after the prevalence filtering?

We lost 110 taxa durring the filtering. If it is only present one time it probably lacks function or is a result of contamination or enviromental factors.

SUMMARY OF PREPROCESSIING

Considerations for your dataset: Check if you have an imbalanced dataset and act accordingly.
For instance, you may need to filter out samples of a very poor sequencing depth compared to the rest of the samples in the dataset or to use a rarefaction process to normalise the number of reads per sample (of all or some samples).
Use the graphs and table with sorted number of reads to decide what is a reasonable number of reads/sample.

MICROBIAL COMMUNITY COMPOSITION

# bar chart using Categories
pseq.fam <- aggregate_rare(ps4, level="Family", detection = 500, prevalence = 2/10)
p.fam <- plot_composition(pseq.fam, sample.sort = NULL, 
    otu.sort = NULL,
    group_by = "Term",
    plot.type = "barplot",
    verbose = FALSE) + 
 theme_bw() + scale_fill_brewer("Family", palette = "Paired")
p.fam

Question 5: why the abundance differs that much across samples? Can you notice some difference already between the two groups?

Answer: The abundance in the full term babies have had longer time to grow. The preterm bacterium is less diverse and contains mostly Staphylococcaceae bacteria while there is no clear dominant in the fullterm group. It also seems like there is less bacteria compared to fullterm in the samples.

#Transform to relative abundance. Save as new object.
ps4_rel = transform_sample_counts(ps4, function(x){x / sum(x)})

# Repeat the previous plot
pseq.fam <- aggregate_rare(ps4_rel, level="Family", detection = 0.1, prevalence = 1/10)
p.fam <- plot_composition(pseq.fam, sample.sort = NULL, 
    otu.sort = NULL,
    group_by = "Term",
    plot.type = "barplot", 
    verbose = FALSE) + 
 theme_bw() + scale_fill_brewer("Family", palette = "Paired")
p.fam

MICROBIAL COMMUNITY STRUCTURE

Alpha diversity

Visualization

#Calculate alpha metrics; richness and Shannon diversity index
alpha_div<-estimate_richness(ps4, measures=c("Observed", "Shannon"))
alpha_div

# Plot the Alpha diversity measures according to Term
P1 <- plot_richness(ps4, x = "Term", measures=c("Observed", "Shannon"), color = "Term") + geom_boxplot()
P1

Testing

#extract metadata file from the filtered and rarefied phyloseq object.
metadata = as(sample_data(ps4), "matrix")
# Coerce to data.frame named metadata
metadata = as.data.frame(metadata)

# Add Shannon diversity values to the metadata file, now meta2
metadata$Shannon <- alpha_div$Shannon 
metadata$Observed <- alpha_div$Observed 

# check equality of variances
require(dplyr)
require(rstatix)

metadata %>%
  levene_test(Shannon ~ Term)

metadata %>%
  levene_test(Observed ~ Term)


# check for normality
shapiro.test(alpha_div$Shannon)
shapiro.test(alpha_div$Observed)

# We will apply a t-test based on the assumptions checked above.
# Change to the appropriate test if necessary. You can follow the tutorials on the Links above.
# unpaired two-sample t-test Shannon
x <- filter(metadata, Term == "Preterm") %>% select(Shannon)
y <- filter(metadata, Term == "Fullterm") %>% select(Shannon)        
t.test(x, y, alternative = "two.sided", var.equal = FALSE)


# unpaired two-sample t-test Observed
z <- filter(metadata, Term == "Preterm") %>% select(Observed)
a <- filter(metadata, Term == "Fullterm") %>% select(Observed)        
t.test(a, z, alternative = "two.sided", var.equal = FALSE)

Adapt the code above for the alpha-diversity indicator “Observed”

<span style="color: red;">**Question 6**</span>: Make sure you used the correct statistical test. Interpret the alpha-diversity results. What can you conclude?

Answers:
The shannon data was found to be non-normaly distributed so a wilcon test would be used (I didnt get it to work)
The observed data was found to be normaly distributed the t tes shall be used. This showed a difference between preterm and Fullterm baybies. 


### Beta diversity

#### Visualization

```r
# Ordination of Samples: NMDS based on Bray Curtis distances
nmds_bray <- ordinate(ps4_rel, method="NMDS", distance="bray") 
# you can test many other methods and distances here. 
plot_nmds_bray = plot_ordination(ps4_rel, nmds_bray, type="samples", color="Term", shape="Term", label="Term") + geom_point(size=3) + ggtitle("NMDS ordination of samples (Bray Curtis)")
plot_nmds_bray

# Ordination of Samples: NMDS based on VST
# First need to to the VST
dds = phyloseq_to_deseq2(ps4, ~Term) # we must define the variable we want to VST to.
deseq_counts <- estimateSizeFactors(dds, type = "poscounts") # add pseudcounts
NormVST = t(assay(varianceStabilizingTransformation(deseq_counts))) # normalization VST and pull out transformed table
euc_dist <- dist(t(NormVST)) # calculate euclidean distance on the VST data
ps4.VST = ps4
otu_table(ps4.VST)<-otu_table(NormVST,FALSE) # new phyloseq object with otu_table replaced by VST

# NMDS based on VST
pcoa_vst<- ordinate(ps4.VST, method="MDS", distance="euclidean")
plot_pco_vst = plot_ordination(ps4.VST, pcoa_vst, type="samples", color="Term", shape="Term", label="Term") + geom_point(size=3) + ggtitle("MDS ordination of samples (VST)")
plot_pco_vst

Testing

# Extract ASV table from the phyloseq object 
ASV = as(otu_table(ps4), "matrix")
# transpose if necessary
if(taxa_are_rows(ps4)){ASV <- t(ASV)}
# Coerce to data.frame named metadata
ASVdf= as.data.frame(ASV)

# Extract Taxonomy table from the  phyloseq object 
TAXA = as(tax_table(ps4), "matrix")
# transpose if necessary
if(taxa_are_rows(ps4)){TAXA <- t(TAXA)}
# Coerce to a data.frame named TAXAdf
TAXAdf = as.data.frame(TAXA)

# Bray-Curtis dissimilarity index with hellinger transformation of data
hell.dist <- vegdist(decostand(ASVdf, method="hellinger"), method = "bray")
dim(hell.dist)

# betadisper calculation
betadisper.factor <- betadisper(hell.dist, as.factor(metadata$Term)) # you had extracted the metadata above. Here you must change the variable of interest.
permutest(betadisper.factor, permutations=999, pairwise = TRUE)
# Order factor levels
betadisper.factor$group <- factor(betadisper.factor$group, levels = c("Preterm","Fullterm"))

# # Posthoc comparisons for when you have more than two categories
(factor.HSD <- TukeyHSD(betadisper.factor))
plot(factor.HSD)

# betadisper graph
plot_betadisper <-qplot(x = factor(betadisper.factor$group), y = betadisper.factor$distances, geom="boxplot") + theme_bw()+
  geom_boxplot(aes(fill = factor(betadisper.factor$group), alpha=0.6))+ geom_jitter(width=0.25, alpha=0.4, size=2, color = "black") + labs(x="Term", y="Distance to centroid", caption = "")+
  theme(legend.position="none", axis.text.x=element_text(size = rel(1), angle=0, vjust=0, hjust = 0.5), aspect.ratio = 4/4)
plot_betadisper
# OPTION 1. anosim test
ps4_anosim <- anosim(x = hell.dist, grouping = metadata$Term, permutations = 999) 
summary (ps4_anosim)
plot (ps4_anosim) #plot option of summary data

# OPTION 2. Adonis test. We can't use adonis, but it is written here so you know how to run it.
ps4_adonis <- adonis(hell.dist ~ Term, data = metadata, permutations = 999)
ps4_adonis

Question 7: Interpret the beta-diversity results.
Which plotting method works better?

Answer: The Beta values shows a broader variance in the variance of bacterium.

DIFFERENTIAL ABUNDANCE

Venn diagrams

##Venn diagrams. Venn diagrams show in a very easy way the number of ASV that are exclussive for a category
require(MicEco)

Venn_Treat <- ps_venn(
  ps4,
  group= 'Term',
  fraction = 0,
  weight = FALSE,
  relative = FALSE,
  plot = TRUE, quantities = list(type=c("counts")), labels = list(cex = 1), fill= c("#4682B4", "#CC0033"), alpha = 0.6, adjust.labels = TRUE) 
  #here I used manual colors, If more than three categories in your data, add more colors
Venn_Treat

Take advantage of the additional feature of this Venn diagram by settingrelative=TRUE and weigth=TRUE to get an idea of the relative importance of the ASVs for the community.

## Perform the plot again with the arguments modification

Venn_Treat <- ps_venn(
  ps4,
  group= 'Term',
  fraction = 0,
  weight = TRUE,
  relative = TRUE,
  plot = TRUE, quantities = list(type=c("counts")), labels = list(cex = 1), fill= c("#4682B4", "#CC0033"), alpha = 0.6, adjust.labels = TRUE) #here I used manual colors, If more than three categories in your data, add more colors
Venn_Treat

Question 8: Interpret the results. How many different ASVs (taxa) are shared between the two groups? How much do they represent in terms of relative abundance?

ANSWER: 5 ASV:s are shared between fullterm and preterm. This makes up aproxmatly 7.3%

Indicator species

require(indicspecies) 
# You  will need an ASV table file and a metadata file. Please, note in this tutorial ASVdf and metadata files have been extracted previously from the phyloseq object and can be used here. In the code below we also define the factor (factor1) to be used for comparisons, "Term" in this example.
dt.iv <- data.frame("sample_id"=rownames(ASVdf), "factor1"=metadata$Term, ASVdf)

#extract dimension of the data frame. This info is needed to set the colSums([VALUES]) in the next line.
dim(dt.iv) 
head(names(dt.iv))

# check and remove if any column contains all zero values
dt.iv.f <- as.data.frame(dt.iv[ , colSums(dt.iv[3:365]) != 0] ) 
# the [3:365] refer to all the ASV in our date set. This information is obtained from the dim(dt.iv)
dim(dt.iv.f)
df.iv <- dt.iv.f
levels(as.factor(dt.iv$factor1))
indval2 = multipatt(x = dt.iv[,-c(1:2)], cluster = dt.iv$factor1, func="IndVal", control = how(nperm=999))
decode <- colnames(indval2$comb)

# Lists these species with significant association to one combination, 
# alpha: Significance level for selected species in the summary.
# minstat: Minimum value of the statistic for selecting species in the summary
# This results informas as of each ASV are singifiant but without knowing their assigned taxonomy it is imossible to interpret.
summary(indval2, alpha=0.01, minstat = 0.5)

# You capture output as a txt file
capture.output(summary(indval2, alpha=0.01, minstat = 0.5),file = "./output_indval2.txt")

# Here we add taxonomy information to IndVal results and explore. TAXAdf obtained earlier is needed.
# Include a new column with ASV names
TAXA<-as.data.frame(t(TAXAdf))
TAXA$ASV_id <- rownames(TAXA)
# convert indVal2 sign file from the output to a data frame
indval2.df <- as.data.frame(indval2$sign)
# Include ASV_id to indVal result and to TAXAdf
indval2.df$ASV_id <- rownames(indval2.df);names(indval2.df)
indval2.df.tax <- inner_join(x = indval2.df, y = TAXA, by = "ASV_id")
head(indval2.df.tax)
# Capture output as a txt file,now including the taxonomy.
write.table(indval2.df.tax, file = "indval2.df.tax.txt", sep = "\t", na = "NA", row.names = F, quote = F)

# explore the indval2.df.tax file

Question 9: Did you find any indicator taxa? Interpete the results.

Answer: The main families were Enterobacterias and Bacteroidaceae. Bacteroides, staphylococer and Veillonella was the biggest genus and ### Differential abundance We will use ANCOMBC. Several papers suggest this is the best option.
But, Here you could encounter problems installing the “mia” package. If this is the case, you can’t run this part. But at last you have it so you know how to do it.
If this is thecase, then procees with DESeq2, also commonly used, althouh it retuerns more false-positives.

Ancom

# prerequisists:
remotes::install_github("FelixErnst/mia")

library(mia)
require(ANCOMBC)
require(tidyverse)
require(caret)
require(DT)


tse<-makeTreeSEFromPhyloseq(ps4)
colData(tse)$outcome = factor(colData(tse)$outcome, levels = c("Fullterm", "Preterm"))
levels(colData(tse)$outcome)


set.seed(123)
output = ancombc2(data = tse, assay_name = "counts",  tax_level = NULL,
                  fix_formula = "outcome", rand_formula = NULL,
                  p_adj_method = "fdr", pseudo = 0, pseudo_sens = TRUE,
                  prv_cut = 0.10, lib_cut = 0, s0_perc = 0.05,
               struc_zero = FALSE, neg_lb = FALSE,
                  alpha = 0.05, n_cl = 2, verbose = TRUE,
                  global = FALSE, pairwise = FALSE, 
                  dunnet = FALSE, trend = FALSE,
                  iter_control = list(tol = 1e-5, max_iter = 20, 
                                      verbose = FALSE),
                  em_control = list(tol = 1e-5, max_iter = 100),
                  lme_control = NULL, mdfdr_control = NULL, 
                  trend_control = NULL)

 output$res

DESeq2

#Option 3. Differential abundance
require (DESeq2)

# Abvoe you already convert phyloseq data to DESeq format and run basic analysis (dds = phyloseq_to_deseq2(ps4, ~Term))
dds_deseq = DESeq(dds, test="Wald", fitType="parametric")

#To analyze the results, I used an alpha parameter (probability) of 0.01 for differences to be accepted. The method computes a log2fold change in the order you entered the categories. 
deseq_res = results(dds_deseq, cooksCutoff = FALSE, alpha = 0.01, contrast=c("Fullterm", "Preterm"))
summary(deseq_res) 
sigtab = deseq_res[which(deseq_res$padj < 0.01), ]

#we now add taxonomy in a data frame
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps4)[rownames(sigtab), ], "matrix"))
head(sigtab)

#we can also plot data as a graph. I used the Phylum and Class ranks since you may see from sigtab some of the differentially abundant taxa are not classified at lower taxonomic ranks
library("ggplot2")
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
  scale_fill_brewer(palette = palname, ...)
}

# Phylum ordering of data
x = tapply(sigtab$log2FoldChange, sigtab$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtab$Phylum = factor(as.character(sigtab$Phylum), levels=names(x))

# Class ordering of data
x = tapply(sigtab$log2FoldChange, sigtab$Class, function(x) max(x))
x = sort(x, TRUE)
sigtab$Class = factor(as.character(sigtab$Class), levels=names(x))
# plot data
ggplot(sigtab, aes(x=Class, y=log2FoldChange, color=Phylum)) + geom_point(size=3) + 
  theme(axis.text.x = element_text(angle = -45, hjust = 0, vjust=-0.1))

Question 10: Hopefully you could run some of the differential abundance test. Can you interpret the results?
No I couldn´t get it work.