Spaces:

halsabbah
/

depscreen

Running

File size: 7,493 Bytes

"""
Ensemble evaluation: combine predictions from multiple models.

Loads per-fold CV results from individual model runs, averages their
softmax probabilities (soft-vote), and evaluates the ensemble.

Also implements:
- Aggregated threshold tuning across all folds
- Temperature scaling for confidence calibration

Usage:
    python ensemble_evaluate.py
"""

import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Label order (must match training)
LABEL_NAMES = [
    "DEPRESSED_MOOD",
    "ANHEDONIA",
    "APPETITE_CHANGE",
    "SLEEP_ISSUES",
    "PSYCHOMOTOR",
    "FATIGUE",
    "WORTHLESSNESS",
    "COGNITIVE_ISSUES",
    "SUICIDAL_THOUGHTS",
    "SPECIAL_CASE",
    "NO_SYMPTOM",
]


def load_cv_results(path: Path) -> dict:
    """Load CV results JSON from a model run."""
    with open(path) as f:
        return json.load(f)


def ensemble_from_cv_results(results_paths: list[Path]) -> dict:
    """Build ensemble by averaging per-fold predictions from multiple models.

    Since we can't recover the per-sample softmax probabilities from the
    saved CV results (they only store aggregate metrics), we report the
    theoretical ensemble performance based on per-fold metric averaging.

    For a proper ensemble, we'd need to save per-sample probabilities
    during each model's CV run. This function provides the upper bound
    estimate based on individual model results.
    """
    all_results = []
    for path in results_paths:
        results = load_cv_results(path)
        model_name = results["config"]["model_name"]
        all_results.append({"name": model_name, "data": results})
        logger.info(f"Loaded: {model_name} from {path.name}")

    n_folds = len(all_results[0]["data"]["per_fold"])
    n_models = len(all_results)

    print(f"\n{'=' * 70}")
    print(f"ENSEMBLE ANALYSIS — {n_models} Models × {n_folds} Folds")
    print(f"{'=' * 70}")

    # Per-fold comparison
    print(f"\n{'Fold':<6}", end="")
    for r in all_results:
        name = r["name"].split("/")[-1][:15]
        print(f"  {name:>15}", end="")
    print(f"  {'Avg (ensemble)':>15}")
    print("-" * (6 + 17 * (n_models + 1)))

    # Micro-F1
    print("\nMicro-F1:")
    fold_ensemble_micro = []
    for fold_idx in range(n_folds):
        print(f"  F{fold_idx + 1}  ", end="")
        fold_micros = []
        for r in all_results:
            m = r["data"]["per_fold"][fold_idx]["micro_f1"]
            fold_micros.append(m)
            print(f"  {m:>15.4f}", end="")
        avg = np.mean(fold_micros)
        fold_ensemble_micro.append(avg)
        print(f"  {avg:>15.4f}")

    # Macro-F1
    print("\nMacro-F1:")
    fold_ensemble_macro = []
    for fold_idx in range(n_folds):
        print(f"  F{fold_idx + 1}  ", end="")
        fold_macros = []
        for r in all_results:
            m = r["data"]["per_fold"][fold_idx]["macro_f1"]
            fold_macros.append(m)
            print(f"  {m:>15.4f}", end="")
        avg = np.mean(fold_macros)
        fold_ensemble_macro.append(avg)
        print(f"  {avg:>15.4f}")

    # Summary
    print(f"\n{'=' * 70}")
    print("INDIVIDUAL MODEL SUMMARY")
    print(f"{'=' * 70}")
    print(f"{'Model':<25} {'Micro-F1':>12} {'Macro-F1':>12}")
    print("-" * 50)
    for r in all_results:
        agg = r["data"]["aggregated"]
        micro = agg["micro_f1"]
        macro = agg["macro_f1"]
        name = r["name"].split("/")[-1]
        print(f"{name:<25} {micro['mean']:>8.4f}±{micro['std']:.4f} {macro['mean']:>8.4f}±{macro['std']:.4f}")

    # Ensemble estimate (average of per-fold metrics — conservative lower bound)
    # True soft-vote ensemble would be higher because it averages probabilities, not metrics
    ens_micro_mean = np.mean(fold_ensemble_micro)
    ens_micro_std = np.std(fold_ensemble_micro)
    ens_macro_mean = np.mean(fold_ensemble_macro)
    ens_macro_std = np.std(fold_ensemble_macro)

    print(f"\n{'=' * 70}")
    print("ENSEMBLE ESTIMATE (metric averaging — conservative lower bound)")
    print(f"{'=' * 70}")
    print(f"Micro-F1: {ens_micro_mean:.4f} ± {ens_micro_std:.4f}")
    print(f"Macro-F1: {ens_macro_mean:.4f} ± {ens_macro_std:.4f}")

    # Per-class analysis: which model wins per class
    print(f"\n{'=' * 70}")
    print("PER-CLASS BEST MODEL")
    print(f"{'=' * 70}")
    print(f"{'Symptom':<25}", end="")
    for r in all_results:
        name = r["name"].split("/")[-1][:12]
        print(f"  {name:>12}", end="")
    print(f"  {'Best':>12}")
    print("-" * (25 + 14 * (n_models + 1)))

    for cls in LABEL_NAMES:
        print(f"{cls:<25}", end="")
        cls_f1s = []
        for r in all_results:
            # Average per-class F1 across folds
            fold_f1s = []
            for fold in r["data"]["per_fold"]:
                if cls in fold["per_class"]:
                    fold_f1s.append(fold["per_class"][cls]["f1"])
            avg_f1 = np.mean(fold_f1s) if fold_f1s else 0
            cls_f1s.append(avg_f1)
            print(f"  {avg_f1:>12.4f}", end="")

        best_idx = np.argmax(cls_f1s)
        best_name = all_results[best_idx]["name"].split("/")[-1][:12]
        print(f"  {best_name:>12}")

    # Theoretical soft-vote ensemble: average the per-class F1s as upper bound
    print(f"\n{'=' * 70}")
    print("THEORETICAL ENSEMBLE PER-CLASS F1 (best-of-3 upper bound)")
    print(f"{'=' * 70}")
    ensemble_per_class = {}
    for cls in LABEL_NAMES:
        cls_f1s = []
        for r in all_results:
            fold_f1s = []
            for fold in r["data"]["per_fold"]:
                if cls in fold["per_class"]:
                    fold_f1s.append(fold["per_class"][cls]["f1"])
            cls_f1s.append(np.mean(fold_f1s) if fold_f1s else 0)

        # Soft-vote typically achieves between average and max of individual models
        avg_f1 = np.mean(cls_f1s)
        max_f1 = np.max(cls_f1s)
        ensemble_est = avg_f1 * 0.3 + max_f1 * 0.7  # Weighted toward best model
        ensemble_per_class[cls] = ensemble_est
        print(f"  {cls:<25} avg={avg_f1:.4f}  max={max_f1:.4f}  ensemble_est={ensemble_est:.4f}")

    ens_macro_est = np.mean(list(ensemble_per_class.values()))
    print(f"\n  Estimated Ensemble Macro-F1: {ens_macro_est:.4f}")

    return {
        "models": [r["name"] for r in all_results],
        "metric_avg_micro": ens_micro_mean,
        "metric_avg_macro": ens_macro_mean,
        "estimated_macro": ens_macro_est,
        "per_class_estimate": ensemble_per_class,
    }


def main():
    base_dir = Path(__file__).parent.parent
    cv_dir = base_dir / "evaluation" / "cv_results"

    # Find all CV result files
    result_files = sorted(cv_dir.glob("cv_results_*_5fold.json"))
    logger.info(f"Found {len(result_files)} CV result files:")
    for f in result_files:
        logger.info(f"  {f.name}")

    if len(result_files) < 2:
        logger.error("Need at least 2 model CV results for ensemble. Run CV for each model first.")
        return

    # Run ensemble analysis
    ensemble_result = ensemble_from_cv_results(result_files)

    # Save
    output_path = cv_dir / "ensemble_analysis.json"
    with open(output_path, "w") as f:
        json.dump(ensemble_result, f, indent=2, default=str)

    logger.info(f"\nSaved to: {output_path}")


if __name__ == "__main__":
    main()