11  Figure 5

11.1 Figure 5 code

11.1.1 Panel A code

# import and prep data
raw_data = read.delim("./data/p02-05/grouped phenotype data with annotations.csv", sep = ",", header = TRUE)

raw_data = raw_data %>%
  dplyr::filter(group != "WL004") %>%
  dplyr::filter(group != "LA001")
raw_data$Industry = ifelse(
  raw_data$region %in% c("South-West Norway", "Lithuania", "North-West Norway", "Russia",
                         "Central-Eastern Norway", "Latvia", "South-Eastern Norway"),
  "Farmhouse",
  raw_data$region
)
raw_data$Industry = ifelse(
  raw_data$Outside == "yes",
  "Allochthonous\nyeast",
  raw_data$Industry
)
raw_data$region = ifelse(
  raw_data$Industry == "Farmhouse",
  raw_data$region,
  "NA"
)

raw_data$region = ifelse(
  raw_data$culture %in% c("7", "38"),
  "South-West Norway",
  ifelse(
    raw_data$culture == "40",
    "Russia",
    ifelse(
      raw_data$culture == "45",
      "Latvia",
      ifelse(
        raw_data$culture == "57",
        "Central-Eastern Norway",
        raw_data$region
      )
    )
  )
)


raw_data = raw_data %>%
  dplyr::mutate(region = dplyr::case_when(
    region == "South-West Norway" ~ "SW Norway",
    region == "North-West Norway" ~ "NW Norway",
    region == "Central-Eastern Norway" ~ "CE Norway",
    region == "South-Eastern Norway" ~ "SE Norway",
    .default = as.character(region)
  ))

metadata = raw_data %>%
  dplyr::select(c("group", "culture", "region", "Industry", "Outside", "method"))
rownames(metadata) = metadata$group

rownames(raw_data) = raw_data$group
counts = raw_data %>%
  dplyr::select(-c("group", "culture", "region", "Industry", "Outside", "method"))

# import final clade list
final_clades = read.table(
  "data/p02-05/final_clades_for_pub.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# replace
for(i in 1:nrow(final_clades)){

  strain = final_clades[i, "Strain"]
  clade = final_clades[i, "Clade"]
  metadata[which(metadata$group == strain), "Industry"] = clade

}
metadata$Industry = ifelse(metadata$Industry == "Beer", "Beer2", metadata$Industry)
metadata$Industry = ifelse(metadata$group == "BI001", "Asia", metadata$Industry)
metadata$Industry = ifelse(metadata$group == "BR003", "Mixed", metadata$Industry)
metadata$Industry = ifelse(metadata$group == "BI002", "Other", metadata$Industry)
metadata$Industry = ifelse(metadata$group == "BI004", "Asia", metadata$Industry)
metadata$Industry = ifelse(metadata$group == "BI005", "Other", metadata$Industry)


## background: no farmhouse
metadata_no_farm = metadata[which(metadata$Industry != "Farmhouse"), ]
counts_no_farm = counts[which(rownames(counts) %in% metadata_no_farm$group), ]

## farmhouse focus
metadata_farm_only = metadata[which(metadata$Industry %in% c("Farmhouse", "Allochthonous\nyeast")), ]
colnames(metadata_farm_only)[5] = "Allochthonous"
metadata_farm_only$Allochthonous = ifelse(metadata_farm_only$Allochthonous == "yes", "yes", "no")
counts_farm_only = counts[which(rownames(counts) %in% metadata_farm_only$group), ]


# sPLS-DA
counts_splsda = sapply(
  counts,
  function(counts) (counts-mean(counts))/stats::sd(counts)
)

# initial sPLS_DA
initial_splsda = mixOmics::splsda(
  counts_splsda,
  factor(metadata$Industry),
  ncomp = 10
) # set ncomp to 10 for performance assessment later

metadata$mock = "mock"
p_all_spls = mixOmics::plotIndiv(
  initial_splsda,
  comp = c(1, 2),
  group = factor(metadata$Industry),
  pch = factor(metadata$mock),
  ind.names = FALSE,
  ellipse = TRUE,
  legend = TRUE,
  legend.position = "left",
  legend.title = "Clade",
  title = "",
  col = c("red", "#df536b", "#61d04f", "#28e2e5", "#2297e6", "#cd0bbc", "grey75", "#bcf60c")
)

p_all_spls = p_all_spls$graph + theme(strip.background = element_blank())

11.1.2 Panel B code

# initial sPLS_DA
initial_splsda = mixOmics::splsda(
  counts_farm_only,
  factor(metadata_farm_only$region),
  ncomp = 10
) # set ncomp to 10 for performance assessment later

p_farm_spls = mixOmics::plotIndiv(
  initial_splsda,
  comp = c(1, 2),
  group = factor(metadata_farm_only$region),
  pch = factor(metadata_farm_only$Allochthonous),
  ind.names = FALSE,
  ellipse = TRUE,
  legend = TRUE,
  legend.position = "left",
  legend.title = "region",
  legend.title.pch = "Allochthnous\nyeast",
  title = "",
  col = c('#0571B0', '#FBA01D',"#FFDA00", "steelblue", '#A6611A',"#008470",'#92C5DE')
)

p_farm_spls = p_farm_spls$graph + theme(strip.background = element_blank())

11.1.3 Panel C code

# Load the dataset
mydata <- read.csv('./data/p02-05/grouped phenotype data with annotations.csv')

# Define region colors
RegionColors <- c(
  'North-West Norway' = '#0571B0',
  'South-West Norway' = '#92C5DE',
  'Central-Eastern Norway' = '#018571',
  'South-Eastern Norway' = '#80CDC1',
  'Latvia' = '#FFDA00',
  'Lithuania' = '#FBA01D',
  'Russia' = '#A6611A',
  'Beer' = '#FF0000',
  'Wine' = 'limegreen',
  'Outside' = '#000000'  # Black for the Outside group
)

# Create a new grouping variable:
# - "Outside" if Outside == "yes"
# - culture if available; otherwise, use region
mydata$grouping <- ifelse(
  mydata$Outside == "yes", "Outside",
  ifelse(is.na(mydata$culture), mydata$region, as.character(mydata$culture))
)

# Reorder the grouping factor with "Outside" first, then by region order
group_levels <- c("Outside", unique(mydata$grouping[mydata$grouping != "Outside"][order(mydata$region)]))
mydata$grouping <- factor(mydata$grouping, levels = group_levels)

# Select phenotypic data columns
phenotypic_data <- mydata %>% dplyr::select(Biggy:T43C)

# Calculate mean pairwise distance (multivariate diversity) per group
groupwise_distance <- mydata %>%
  dplyr::group_by(grouping) %>%
  dplyr::summarise(
    Mean_Pairwise_Distance = mean(
      vegdist(phenotypic_data[cur_group_rows(), ], method = "euclidean"),
      na.rm = TRUE
    ),
    n_individuals = n()
  )

# Combine the diversity metrics with region information
total_diversity <- groupwise_distance %>%
  left_join(mydata %>% dplyr::select(grouping, region) %>% distinct(), by = "grouping")

# (Optional) Check the diversity summary
print(total_diversity)
# A tibble: 55 × 4
   grouping Mean_Pairwise_Distance n_individuals region              
   <fct>                     <dbl>         <int> <chr>               
 1 Outside                    66.8            24 Russia              
 2 Outside                    66.8            24 South-West Norway   
 3 Outside                    66.8            24 Latvia              
 4 Outside                    66.8            24 South-Eastern Norway
 5 Wild                       56.3             4 Wild                
 6 Beer                      105.             72 Beer                
 7 Wine                       72.1            18 Wine                
 8 Spirits                    69.5             6 Spirits             
 9 Bread                      80.8             4 Bread               
10 Sake                       55.6             7 Sake                
# ℹ 45 more rows
# Remove specific rows if needed
total_diversity <- total_diversity[-c(1:10, 54:55), ]

# Test correlation between Mean Pairwise Distance and Number of Individuals
cor_test_distance <- cor.test(total_diversity$n_individuals, total_diversity$Mean_Pairwise_Distance, method = "pearson")
print(cor_test_distance)

    Pearson's product-moment correlation

data:  total_diversity$n_individuals and total_diversity$Mean_Pairwise_Distance
t = 3.0334, df = 41, p-value = 0.004183
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1466415 0.6454742
sample estimates:
      cor 
0.4281293 
# Extract correlation results for annotation
r_squared <- round(cor_test_distance$estimate^2, 2)
p_value <- signif(cor_test_distance$p.value, 3)

# Plot: Mean Pairwise Distance vs. Number of Individuals (colored by Region)
# Remove the legend from this plot since the color scheme is shared.
p_diversity_individuals <- ggplot(total_diversity, aes(x = n_individuals, y = Mean_Pairwise_Distance, color = region)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  scale_color_manual(values = RegionColors) +
  labs(
    x = "Number of individuals in culture",
    y = "Phenotypic diversity (mean pairwise distance)"
  ) +
  theme_classic(base_size = 14) +
  theme(
    legend.position = "none",
    axis.line.x = element_line(color = "black"),
    axis.line.y = element_line(color = "black"),
    axis.ticks = element_line(color = "black")
  ) +
  annotate("text", 
           x = max(total_diversity$n_individuals) * 0.8, 
           y = max(total_diversity$Mean_Pairwise_Distance) * 0.9,
           label = paste0("R² = ", r_squared, "\np = ", p_value), 
           size = 5, hjust = 0)

# ------------------------------
# Load the pairwise similarities dataset
pairwise_similarities <- read.csv('./data/p02-05/pairwise-similarities.csv')

# Ensure the culture column is character
pairwise_similarities$culture <- as.character(pairwise_similarities$culture)

# Add culture info to total_diversity from the original dataset
total_diversity <- total_diversity %>%
  left_join(mydata %>% dplyr::select(grouping, culture) %>% distinct(), by = "grouping")

# Merge datasets based on the culture column
combined_data <- total_diversity %>%
  left_join(pairwise_similarities, by = "culture")

# (Optional) Check the merged data structure
print(head(combined_data))
# A tibble: 6 × 6
  grouping Mean_Pairwise_Distance n_individuals region        culture similarity
  <fct>                     <dbl>         <int> <chr>         <chr>        <dbl>
1 7                          41.9             3 South-West N… 7             97.4
2 13                         35.9            18 North-West N… 13            75.2
3 27                         30.6            22 Central-East… 27            47.8
4 8                          44.4            15 North-West N… 8             78.8
5 5                          46.7            33 North-West N… 5             76.9
6 9                          39.4            22 North-West N… 9             79.9
# Test correlation between Mean Pairwise Distance and Similarity
cor_test_similarity_distance <- cor.test(combined_data$similarity, combined_data$Mean_Pairwise_Distance, method = "pearson")
print(cor_test_similarity_distance)

    Pearson's product-moment correlation

data:  combined_data$similarity and combined_data$Mean_Pairwise_Distance
t = -2.1192, df = 41, p-value = 0.04018
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.56155297 -0.01530232
sample estimates:
       cor 
-0.3142021 
# Extract correlation results for annotation
r_squared2 <- round(cor_test_similarity_distance$estimate^2, 2)
p_value2 <- signif(cor_test_similarity_distance$p.value, 3)

# Plot: Mean Pairwise Distance vs. Similarity
p_similarity_distance <- ggplot(combined_data, aes(x = similarity, y = Mean_Pairwise_Distance, color = region)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  scale_color_manual(values = RegionColors) +
  labs(
    x = "Culture strain similarity",
    y = "Phenotypic diversity (mean pairwise distance)",
    color = "Region"
  ) +
  theme_classic(base_size = 14) +
  theme(
    axis.line.x = element_line(color = "black"),
    axis.line.y = element_line(color = "black"),
    axis.ticks = element_line(color = "black"),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  ) +
  annotate("text", 
           x = (max(combined_data$similarity)-min(combined_data$similarity)) * 0.8 + min(combined_data$similarity), 
           y = max(combined_data$Mean_Pairwise_Distance) * 0.9,
           label = paste0("R² = ", r_squared2, "\np = ", p_value2), 
           size = 5, hjust = 0)

# ------------------------------
# Combine and display the two plots side by side
panel_c <- p_diversity_individuals + p_similarity_distance

11.1.4 Panel D code

pheno_list = c(
  #"Biggy",
  "T12C",
  "T39C",
  "T41C",
  "T43C"
  #"T37C",
  #"EtOH_12",
  #"CuSO4_0.1",
  #"EtOH_10"
)

counts_long1 = counts %>%
  dplyr::mutate(group = rownames(counts)) %>%
  dplyr::left_join(metadata, by = "group") %>%
  reshape2::melt() %>%
  dplyr::filter(Industry %in% c("Farmhouse", "Allochthonous\nyeast")) %>%
  dplyr::filter(variable %in% pheno_list)
counts_long1$category = counts_long1$region
counts_long1$Industry = "Farmhouse"

counts_long2 = counts %>%
  dplyr::mutate(group = rownames(counts)) %>%
  dplyr::left_join(metadata, by = "group") %>%
  reshape2::melt() %>%
  dplyr::filter(variable %in% pheno_list)
counts_long2$category = ifelse(
  counts_long2$Industry %in% c("Farmhouse", "Allochthonous\nyeast"),
  "Farmhouse",
  counts_long2$Industry
)
counts_long2$region = counts_long2$category

counts_long3 = rbind(counts_long1, counts_long2)%>%
  dplyr::filter(region != "Other")
counts_long3$category = ifelse(
  counts_long3$category == "Allochthonous\nyeast", "Farmhouse", counts_long3$category
) 
# counts_long3$region = ifelse(
#   is.na(counts_long3$region),
#   counts_long3$category,
#   counts_long3$region
# ) 

counts_long3$region = factor(
  counts_long3$region,
  levels = c(
    "SW Norway", "Lithuania", "NW Norway", "Russia",
    "CE Norway", "Latvia", "SE Norway",
    "Farmhouse", "Asia", "Beer1", "Beer2", "Mixed", "Wine"
    )
)

counts_long3$facet_factor = ifelse(
  counts_long3$region %in% c("SW Norway", "Lithuania", "NW Norway", "Russia",
                             "CE Norway", "Latvia", "SE Norway"),
  "Farmhouse, by region",
  ifelse(
    counts_long3$region %in% c("Asia", "Beer1", "Beer2", "Mixed", "Wine"),
    "Industrial",
    "Farmhouse"
  )
) %>% factor(
  levels = c("Farmhouse", "Industrial", "Farmhouse, by region")
)


text_annot = matrix(nrow = 13, ncol = 3) %>% data.frame
colnames(text_annot) = c("category", "facet_factor", "value")
text_annot$category = c(
  "Farmhouse",
  "CE Norway", "Latvia", "Lithuania", "NW Norway", "Russia", "SE Norway", "SW Norway",
  "Asia", "Beer1", "Beer2", "Mixed", "Wine"
)
text_annot$facet_factor = c(
  "Farmhouse",
  rep("Farmhouse, by region", 7),
  rep("Industrial", 5)
)
text_annot$value = c(
  661,
  54, 17, 43, 380, 23, 21, 123,
  32, 172, 68, 56, 96
)


color_annot = c(
  '#df536b', # Asia
  '#61d04f', # Beer1
  '#46f0f0', # Beer2
  "#008470", # CE Norway
  '#2297e6', # Farmhouse
  "#FFDA00", # Latvia
  '#FBA01D', # Lithuania
  '#cd0bbc', # Mixed
  '#0571B0', # NW Norway
  '#A6611A', # Russia
  "steelblue", # SE Norway
  '#92C5DE', # SW Norway
  '#bcf60c' # wine
)
color_annot = stats::setNames(
  color_annot,
  counts_long3$category %>% factor() %>% levels()
)


p_violins_final = list()

for(k in 1:length(pheno_list)){
  
  my_pheno = pheno_list[[k]]
  tmp_df = counts_long3 %>% dplyr::filter(variable == my_pheno)
  
  tests_pheno = tmp_df %>% 
    dplyr::filter(facet_factor %in% c("Farmhouse", "Industrial")) %>%
    rstatix::t_test(value ~ Industry, ref.group = "Farmhouse")
  tests_pheno_annot = matrix(nrow = 5, ncol = 3) %>% data.frame
  colnames(tests_pheno_annot) = c("category", "facet_factor", "value")
  tests_pheno_annot$category = c("Asia", "Beer1", "Beer2", "Mixed", "Wine")
  tests_pheno_annot$facet_factor = c(rep("Industrial", 5))
  tests_pheno_annot$value = c(
    tests_pheno[2, "p.adj.signif"], tests_pheno[3, "p.adj.signif"], tests_pheno[4, "p.adj.signif"],
    tests_pheno[5, "p.adj.signif"], tests_pheno[6, "p.adj.signif"]
  )

  p_violins_final[[k]] = ggplot(tmp_df,
                                aes(x = forcats::fct_reorder(category, value, median),
                                    y = value)) +
    geom_violin(aes(fill = category),
                alpha = 0.5,
                scale = "width",
                trim = FALSE) +
    geom_point(data = tmp_df %>% dplyr::filter(Outside == "yes"),
               aes(x = category, y = value), fill = "red",
               position = position_jitter(width = 0.25),
               shape = 21, 
               alpha = 0.375,
               size = 3) + 
    geom_text(data = text_annot,
              aes(x = category, y = 115, label = paste0("n=", as.character(value)))) +
    geom_text(data = tests_pheno_annot,
              aes(x = category, y = 105, label = as.character(value))) +
    stat_summary(fun = "median", colour = "red", geom = "crossbar") +
    facet_grid(~ factor(facet_factor, levels = c("Farmhouse", "Industrial", "Farmhouse, by region")), scales = "free_x", space = "free") +
    scale_y_continuous(breaks = c(0, 25, 50, 75, 100), limits = c(0, 120), ) +
    scale_fill_manual(values = color_annot) +
    scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 10)) +
    labs(y = my_pheno) +
    theme(title = element_blank(),
          axis.text.x = element_text(size = 12),
          axis.text.y = element_text(size = 14),
          axis.title.x = element_blank(),
          axis.title.y = element_text(size = 18),
          legend.position = "none",
          panel.background = element_rect(colour = "black", fill = NA),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          panel.grid.major.y = element_line(colour = "grey75"),
          panel.grid.minor.y = element_blank(),
          strip.background = element_rect(colour = "black", fill = NA),
          strip.text = element_text(size = 12))
  
}

11.1.5 Merge

final_plot = cowplot::plot_grid(
  cowplot::plot_grid(
    p_all_spls, p_farm_spls,
    nrow = 1,
    rel_widths = c(1, 1),
    labels = c("A", "B")
  ),
  panel_c,
  cowplot::plot_grid(
    p_violins_final[[1]], p_violins_final[[2]], p_violins_final[[3]],
    p_violins_final[[4]],
    ncol = 1,
    labels = c("D", NA, NA, NA)
  ),
  nrow = 3,
  rel_heights = c(1.5, 1, 2.5)
)

11.2 Figure 5 plot

Figure 5: High-throughput phenotyping of farmhouse and industrial yeasts.

11.3 Session Information

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: openSUSE Tumbleweed

Matrix products: default
BLAS/LAPACK: /home/andrea/miniforge3/envs/moai/lib/libmkl_rt.so.2;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=it_IT.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Brussels
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] vegan_2.7-1        permute_0.9-7      umap_0.2.10.0      RColorBrewer_1.1-3
 [5] PCAtools_2.14.0    patchwork_1.3.0    mixOmics_6.26.0    lattice_0.22-6    
 [9] MASS_7.3-60.0.1    ggrepel_0.9.6      ggplot2_3.5.1      forcats_1.0.0     
[13] dplyr_1.1.4       

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1          farver_2.1.2             
 [3] fastmap_1.2.0             digest_0.6.37            
 [5] rsvd_1.0.5                lifecycle_1.0.4          
 [7] cluster_2.1.8             magrittr_2.0.3           
 [9] compiler_4.3.2            rlang_1.1.5              
[11] tools_4.3.2               utf8_1.2.4               
[13] igraph_2.1.4              yaml_2.3.10              
[15] knitr_1.49                labeling_0.4.3           
[17] askpass_1.2.1             S4Arrays_1.2.1           
[19] dqrng_0.4.1               rARPACK_0.11-0           
[21] htmlwidgets_1.6.4         reticulate_1.40.0        
[23] DelayedArray_0.28.0       plyr_1.8.9               
[25] abind_1.4-8               BiocParallel_1.36.0      
[27] withr_3.0.2               purrr_1.0.2              
[29] BiocGenerics_0.48.1       grid_4.3.2               
[31] stats4_4.3.2              beachmat_2.18.1          
[33] colorspace_2.1-1          scales_1.3.0             
[35] cli_3.6.3                 ellipse_0.5.0            
[37] rmarkdown_2.29            crayon_1.5.3             
[39] generics_0.1.3            RSpectra_0.16-2          
[41] reshape2_1.4.4            DelayedMatrixStats_1.24.0
[43] stringr_1.5.1             splines_4.3.2            
[45] zlibbioc_1.48.2           parallel_4.3.2           
[47] XVector_0.42.0            matrixStats_1.5.0        
[49] vctrs_0.6.5               Matrix_1.6-5             
[51] carData_3.0-5             jsonlite_1.8.9           
[53] car_3.1-3                 BiocSingular_1.18.0      
[55] IRanges_2.36.0            S4Vectors_0.40.2         
[57] rstatix_0.7.2             Formula_1.2-5            
[59] irlba_2.3.5.1             tidyr_1.3.1              
[61] glue_1.8.0                codetools_0.2-20         
[63] cowplot_1.1.3             stringi_1.8.4            
[65] gtable_0.3.6              ScaledMatrix_1.10.0      
[67] munsell_0.5.1             tibble_3.2.1             
[69] pillar_1.10.1             htmltools_0.5.8.1        
[71] openssl_2.3.1             R6_2.5.1                 
[73] sparseMatrixStats_1.14.0  evaluate_1.0.3           
[75] backports_1.5.0           png_0.1-8                
[77] broom_1.0.7               corpcor_1.6.10           
[79] Rcpp_1.0.14               nlme_3.1-167             
[81] gridExtra_2.3             SparseArray_1.2.4        
[83] mgcv_1.9-1                xfun_0.50                
[85] MatrixGenerics_1.14.0     pkgconfig_2.0.3