---
title: "Comprehensive Guide to the RHISEA Package for Mixed-Stock Analysis"
author:
- name: Sosthene Akia
  email: "sosthene.akia@dfo-mpo.ca"
  affiliation: Fisheries and Oceans Canada
- name: Alex Hanke
  email: "alex.hanke@dfo-mpo.ca"
  affiliation: Fisheries and Oceans Canada
date: "2026-03-16"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Comprehensive Guide to the RHISEA Package for Mixed-Stock Analysis}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---


# 1) Data Preparation

We start by loading the baseline and mixture datasets and preparing them.


``` r
baseline_file <- system.file("extdata", "baseline.rda", package = "RHISEA")
mixture_file <- system.file("extdata", "mixture.rda", package = "RHISEA")
load(baseline_file)
load(mixture_file)

baseline$population <- as.factor(baseline$population)
stocks_names <- levels(baseline$population)
np <- length(stocks_names)
nv <- 2
Nsamps <- 1000
Nmix <- nrow(mixture)
resample_baseline <- FALSE
resampled_baseline_sizes <- c(50, 50)
stock_labels <- c("East", "West")
```

# 2) Using `run_hisea_all()` with Built-in Models

Run RHISEA analyses using the built-in Random Forest and LDA models, specifying `phi_method` as either "standard" or "cv".


``` r
# Run LDA with standard phi matrix
LDA_results <- run_hisea_all(
  type = "ANALYSIS",
  np = np,
  phi_method = "standard",
  nv = nv,
  resample_baseline = resample_baseline,
  resampled_baseline_sizes = resampled_baseline_sizes,
  seed_val = 123,
  nsamps = Nsamps,
  Nmix = Nmix,
  actual = NULL,
  baseline_input = baseline,
  mix_input = mixture,
  method_class = "LDA",
  stocks_names = stock_labels,
  stock_col = "population",
  var_cols_std = c("d13c", "d18o"),
  var_cols_mix = c("d13c_ukn", "d18o_ukn")
)

round(LDA_results$mean, 4)
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3771 0.3376 0.3376 0.3376 0.3217
#> West 0.6229 0.6624 0.6624 0.6624 0.6783
```


``` r
# Run RF with cross-validation phi matrix
RF_results <- run_hisea_all(
  type = "ANALYSIS",
  np = np,
  phi_method = "cv",
  nv = nv,
  resample_baseline = resample_baseline,
  resampled_baseline_sizes = resampled_baseline_sizes,
  seed_val = 123,
  nsamps = Nsamps,
  Nmix = Nmix,
  actual = NULL,
  baseline_input = baseline,
  mix_input = mixture,
  method_class = "RF",
  stocks_names = stock_labels,
  stock_col = "population",
  var_cols_std = c("d13c", "d18o"),
  var_cols_mix = c("d13c_ukn", "d18o_ukn"),
  ntree = 500
)

round(RF_results$mean, 4)
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3807 0.3793 0.3793 0.3793 0.3516
#> West 0.6193 0.6207 0.6207 0.6207 0.6484
```

# 3) Testing Parameters: `phi_method` and `type` in RF Model

Run analyses comparing `phi_method` ("standard" vs "cv") and types ("ANALYSIS" vs "BOOTSTRAP") for Random Forest.


``` r
phi_methods <- c("standard", "cv")
types <- c("ANALYSIS", "BOOTSTRAP")
results_list <- list()

for (phi_m in phi_methods) {
  for (typ in types) {
    cat(sprintf("Running RF with type = %s, phi_method = %s\n", typ, phi_m))
    args <- list(
      type = typ,
      np = np,
      phi_method = phi_m,
      nv = nv,
      resample_baseline = resample_baseline,
      resampled_baseline_sizes = resampled_baseline_sizes,
      seed_val = 123,
      nsamps = Nsamps,
      Nmix = Nmix,
      actual = NULL,
      baseline_input = baseline,
      mix_input = mixture,
      method_class = "RF",
      stocks_names = stock_labels,
      stock_col = "population",
      var_cols_std = c("d13c", "d18o"),
      var_cols_mix = c("d13c_ukn", "d18o_ukn"),
      ntree = 500
    )
    res <- do.call(run_hisea_all, args)
    key <- paste("RF", typ, phi_m, sep = "_")
    results_list[[key]] <- res
    cat("Mean stock estimates:\n")
    print(round(res$mean_estimates, 4))
    cat("\n\n")
  }
}
#> Running RF with type = ANALYSIS, phi_method = standard
#> Mean stock estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3789 0.3789 0.3789 0.3789 0.3484
#> West 0.6211 0.6211 0.6211 0.6211 0.6516
#> 
#> 
#> Running RF with type = BOOTSTRAP, phi_method = standard
#> Mean stock estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3787 0.3787 0.3787 0.3787 0.3485
#> West 0.6213 0.6213 0.6213 0.6213 0.6515
#> 
#> 
#> Running RF with type = ANALYSIS, phi_method = cv
#> Mean stock estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3807 0.3793 0.3793 0.3793 0.3516
#> West 0.6193 0.6207 0.6207 0.6207 0.6484
#> 
#> 
#> Running RF with type = BOOTSTRAP, phi_method = cv
#> Mean stock estimates:
#>         RAW COOK COOKC   EM     ML
#> East 0.3907 0.39  0.39 0.39 0.3588
#> West 0.6093 0.61  0.61 0.61 0.6412
```

# 4) Using `run_hisea_estimates()` with External Models and Comparison

Train external LDA and RF models and provide their outputs with appropriate phi matrices to `run_hisea_estimates()` for estimation, comparing internal and external results.


``` r
# Function to perform stratified k-fold CV and compute phi matrix for LDA
get_phi_results_lda_standard <- function(data, formula) {
  # Entraînement du modèle LDA sur l'ensemble complet
  model <- lda(formula, data = data)

  # Prédiction des classes et probabilités sur le même jeu de données
  pred <- predict(model, data)
  all_predictions <- pred$class
  all_probabilities <- pred$posterior

  # Matrice de confusion (in-sample)
  conf_matrix <- table(Predicted = all_predictions, Actual = data$population)

  # Matrice phi normalisée par colonne (probabilité conditionnelle)
  phi_matrix <- prop.table(conf_matrix, margin = 2)

  list(
    confusion_matrix = conf_matrix,
    phi_matrix = phi_matrix,
    predictions = all_predictions,
    probabilities = all_probabilities
  )
}

# External LDA phi and estimates
lda_formula <- population ~ d13c + d18o
lda_cv_phi <- get_phi_results_lda_standard(baseline, lda_formula)

lda_model <- lda(lda_formula, data = baseline)
mix_prepared <- data.frame(
  d13c = as.numeric(as.character(mixture$d13c_ukn)),
  d18o = as.numeric(as.character(mixture$d18o_ukn))
)
lda_pred <- predict(lda_model, mix_prepared)
lda_classes <- as.integer(lda_pred$class)
lda_probs <- lda_pred$posterior

lda_ext_results <- run_hisea_estimates(
  pseudo_classes = lda_classes,
  likelihoods = lda_probs,
  phi_matrix = as.matrix(lda_cv_phi$phi_matrix),
  np = np,
  type = "ANALYSIS",
  stocks_names = stocks_names,
  export_csv = FALSE
)

print(round(lda_ext_results$mean_estimates, 4))
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3771 0.3376 0.3376 0.3376 0.3217
#> West 0.6229 0.6624 0.6624 0.6624 0.6783
```


``` r
# External RF phi and estimates
get_cv_results_rf <- function(data, formula, k = 10, ntree = 500) {
  set.seed(123)
  folds <- createFolds(data$population, k = k, list = TRUE)
  all_predictions <- factor(rep(NA, nrow(data)), levels = levels(data$population))
  all_probabilities <- matrix(NA,
    nrow = nrow(data), ncol = length(levels(data$population)),
    dimnames = list(NULL, levels(data$population))
  )
  for (i in seq_along(folds)) {
    test_idx <- folds[[i]]
    train_data <- data[-test_idx, ]
    test_data <- data[test_idx, ]
    model <- randomForest(formula, data = train_data, ntree = ntree)
    all_predictions[test_idx] <- predict(model, test_data)
    all_probabilities[test_idx, ] <- predict(model, test_data, type = "prob")
  }
  conf_matrix <- table(Predicted = all_predictions, Actual = data$population)
  phi_matrix <- prop.table(conf_matrix, margin = 2)
  list(confusion_matrix = conf_matrix, phi_matrix = phi_matrix, predictions = all_predictions, probabilities = all_probabilities)
}

rf_cv_phi <- get_cv_results_rf(baseline, lda_formula, ntree = 500)
rf_model_ext <- randomForest(lda_formula, data = baseline, ntree = 500)
rf_probs_ext <- predict(rf_model_ext, mix_prepared, type = "prob")
rf_classes_ext <- as.integer(predict(rf_model_ext, mix_prepared))

rf_ext_results <- run_hisea_estimates(
  pseudo_classes = rf_classes_ext,
  likelihoods = rf_probs_ext,
  phi_matrix = as.matrix(rf_cv_phi$phi_matrix),
  np = np,
  type = "ANALYSIS",
  stocks_names = stocks_names,
  export_csv = FALSE
)

print(round(rf_ext_results$mean_estimates, 4))
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3807 0.3793 0.3793 0.3793 0.3516
#> West 0.6193 0.6207 0.6207 0.6207 0.6484
```

***

# 5) Tuning Built-in Random Forest Models with `run_hisea_all()`

Evaluate the effect of RF hyperparameters (`ntree`, `mtry`, `nodesize`) on estimation results.


``` r
rf_params_list <- list(
  list(ntree = 100, mtry = 1, nodesize = 1),
  list(ntree = 500, mtry = 2, nodesize = 1),
  list(ntree = 1000, mtry = 2, nodesize = 5),
  list(ntree = 2000, mtry = 1, nodesize = 10)
)
results_rf_tuning <- list()

for (i in seq_along(rf_params_list)) {
  params <- rf_params_list[[i]]
  cat(sprintf("Running RF with ntree=%d, mtry=%d, nodesize=%d\n", params$ntree, params$mtry, params$nodesize))

  res <- run_hisea_all(
    type = "ANALYSIS",
    np = np,
    phi_method = "cv",
    nv = nv,
    resample_baseline = resample_baseline,
    resampled_baseline_sizes = resampled_baseline_sizes,
    seed_val = 123,
    nsamps = Nsamps,
    Nmix = Nmix,
    actual = NULL,
    baseline_input = baseline,
    mix_input = mixture,
    method_class = "RF",
    stocks_names = stock_labels,
    stock_col = "population",
    var_cols_std = c("d13c", "d18o"),
    var_cols_mix = c("d13c_ukn", "d18o_ukn"),
    ntree = params$ntree,
    mtry = params$mtry,
    nodesize = params$nodesize
  )
  key <- paste0("RF_ntree", params$ntree, "_mtry", params$mtry, "_nodesize", params$nodesize)
  results_rf_tuning[[key]] <- res

  cat("Mean estimates:\n")
  print(round(res$mean_estimates, 4))
  cat("\n\n")
}
#> Running RF with ntree=100, mtry=1, nodesize=1
#> Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3829 0.3772 0.3772 0.3772 0.3549
#> West 0.6171 0.6228 0.6228 0.6228 0.6451
#> 
#> 
#> Running RF with ntree=500, mtry=2, nodesize=1
#> Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3807 0.3793 0.3793 0.3793 0.3516
#> West 0.6193 0.6207 0.6207 0.6207 0.6484
#> 
#> 
#> Running RF with ntree=1000, mtry=2, nodesize=5
#> Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3784 0.3796 0.3796 0.3796 0.3477
#> West 0.6216 0.6204 0.6204 0.6204 0.6523
#> 
#> 
#> Running RF with ntree=2000, mtry=1, nodesize=10
#> Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3793 0.3806 0.3806 0.3806 0.3491
#> West 0.6207 0.6194 0.6194 0.6194 0.6509
```

# 6) Tuning External Random Forest Models with `run_hisea_estimates()` and Comparison

This section evaluates the effect of RF hyperparameters on external RF models,
then compares these estimates with the built-in RF tuning results from section 5.


``` r
results_rf_external_tuning <- list()

for (i in seq_along(rf_params_list)) {
  params <- rf_params_list[[i]]
  cat(sprintf(
    "Training external RF with ntree=%d, mtry=%d, nodesize=%d\n",
    params$ntree, params$mtry, params$nodesize
  ))

  # Train external RF model on baseline
  ext_rf_model <- randomForest(
    formula = population ~ d13c + d18o,
    data = baseline,
    ntree = params$ntree,
    mtry = params$mtry,
    nodesize = params$nodesize
  )

  # Prepare mixture data as before
  mix_data_prepared <- mixture %>%
    mutate(
      d13c = as.numeric(as.character(d13c_ukn)),
      d18o = as.numeric(as.character(d18o_ukn))
    )

  # Predict posterior class probabilities and pseudo-classes
  pred_probs <- predict(ext_rf_model, newdata = mix_data_prepared, type = "prob")
  pred_classes <- as.integer(predict(ext_rf_model, newdata = mix_data_prepared))

  # Compute phi matrix by cross-validation on baseline with same params
  get_cv_results_rf <- function(data, formula, k = 10, ntree = 500, mtry = NULL, nodesize = NULL) {
    set.seed(123)
    folds <- createFolds(data$population, k = k, list = TRUE)
    all_predictions <- factor(rep(NA, nrow(data)), levels = levels(data$population))
    all_probabilities <- matrix(NA,
      nrow = nrow(data), ncol = length(levels(data$population)),
      dimnames = list(NULL, levels(data$population))
    )
    for (i in seq_along(folds)) {
      test_idx <- folds[[i]]
      train_data <- data[-test_idx, ]
      test_data <- data[test_idx, ]
      model <- randomForest(
        formula,
        data = train_data,
        ntree = ntree,
        mtry = ifelse(is.null(mtry), floor(sqrt(ncol(train_data) - 1)), mtry),
        nodesize = ifelse(is.null(nodesize), 1, nodesize)
      )
      all_predictions[test_idx] <- predict(model, test_data)
      all_probabilities[test_idx, ] <- predict(model, test_data, type = "prob")
    }
    conf_matrix <- table(Predicted = all_predictions, Actual = data$population)
    phi_matrix <- prop.table(conf_matrix, margin = 2)
    list(
      confusion_matrix = conf_matrix, phi_matrix = phi_matrix,
      predictions = all_predictions, probabilities = all_probabilities
    )
  }

  cv_results <- get_cv_results_rf(baseline, population ~ d13c + d18o,
    k = 10,
    ntree = params$ntree,
    mtry = params$mtry,
    nodesize = params$nodesize
  )

  # Run HISEA estimates with external RF results
  ext_rf_estimates <- run_hisea_estimates(
    pseudo_classes = pred_classes,
    likelihoods = pred_probs,
    phi_matrix = as.matrix(cv_results$phi_matrix),
    np = np,
    type = "ANALYSIS",
    stocks_names = stocks_names,
    export_csv = FALSE,
    verbose = FALSE
  )

  key <- paste0("Ext_RF_ntree", params$ntree, "_mtry", params$mtry, "_nodesize", params$nodesize)
  results_rf_external_tuning[[key]] <- ext_rf_estimates

  cat("External RF Mean estimates:\n")
  print(round(ext_rf_estimates$mean_estimates, 4))
  cat("\n\n")
}
#> Training external RF with ntree=100, mtry=1, nodesize=1
#> External RF Mean estimates:
#>         RAW   COOK  COOKC     EM    ML
#> East 0.3831 0.3775 0.3775 0.3775 0.353
#> West 0.6169 0.6225 0.6225 0.6225 0.647
#> 
#> 
#> Training external RF with ntree=500, mtry=2, nodesize=1
#> External RF Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3689 0.3701 0.3701 0.3701 0.3583
#> West 0.6311 0.6299 0.6299 0.6299 0.6417
#> 
#> 
#> Training external RF with ntree=1000, mtry=2, nodesize=5
#> External RF Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3976 0.4047 0.4047 0.4047 0.3695
#> West 0.6024 0.5953 0.5953 0.5953 0.6305
#> 
#> 
#> Training external RF with ntree=2000, mtry=1, nodesize=10
#> External RF Mean estimates:
#>         RAW   COOK  COOKC     EM     ML
#> East 0.3613 0.3518 0.3518 0.3518 0.3523
#> West 0.6387 0.6482 0.6482 0.6482 0.6477

# Comparison between built-in RF tuning (section 5) and external RF tuning (section 6)

# Preparing data.frames for comparison
library(reshape2)

compare_df <- bind_rows(
  lapply(names(results_rf_tuning), function(name) {
    res <- results_rf_tuning[[name]]
    df <- as.data.frame(res$mean_estimates)
    df$Stock <- rownames(df)
    df_long <- reshape2::melt(df, id.vars = "Stock", variable.name = "Estimator", value.name = "Proportion")
    df_long$Model <- paste0("BuiltIn_", name)
    df_long
  }),
  lapply(names(results_rf_external_tuning), function(name) {
    res <- results_rf_external_tuning[[name]]
    df <- as.data.frame(res$mean_estimates)
    df$Stock <- rownames(df)
    df_long <- reshape2::melt(df, id.vars = "Stock", variable.name = "Estimator", value.name = "Proportion")
    df_long$Model <- paste0("External_", name)
    df_long
  }),
  .id = NULL
)

# Plot comparison
ggplot(compare_df, aes(x = Estimator, y = Proportion, fill = Stock)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  facet_wrap(~Model, scales = "free_x") +
  theme_minimal() +
  labs(
    title = "Comparison of Estimated Stock Proportions\nBuilt-in RF vs External RF Models with Various Tunings",
    x = "Estimator", y = "Estimated Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

![plot of chunk unnamed-chunk-8](figure/unnamed-chunk-8-1.png)


***

This structured vignette provides a full, stepwise walkthrough of RHISEA usage from fundamental data steps, through modelling and estimation, to tuning and comparison of internal vs external methods, supporting robust mixed-stock fishery assessments.