techatcreated commited on
Commit
66d45ea
·
verified ·
1 Parent(s): 6f5fd0f
app.py ADDED
@@ -0,0 +1,1823 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SWAN Menopause Stage Prediction & Forecasting — Gradio UI
3
+ Hugging Face Spaces deployment-ready.
4
+
5
+ Run locally: python app.py
6
+ Deploy: Push to a HF Space with SDK=gradio
7
+
8
+ Output structure (per execution):
9
+ swan_ml_output/
10
+ <YYYYMMDD_HHMMSS>/
11
+ charts/ ← PNG visualizations
12
+ predictions/ ← CSV result files
13
+ reports/ ← TXT summary reports
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import warnings
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+ from typing import Optional
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+ import matplotlib
26
+ matplotlib.use("Agg")
27
+ import matplotlib.pyplot as plt
28
+
29
+ warnings.filterwarnings("ignore")
30
+
31
+ # ── Gradio ────────────────────────────────────────────────────────────────────
32
+ import gradio as gr
33
+
34
+ # ── Local ML module ───────────────────────────────────────────────────────────
35
+ try:
36
+ from menopause import (
37
+ MenopauseForecast,
38
+ SymptomCycleForecaster,
39
+ load_forecast_model,
40
+ )
41
+ _MODULE_AVAILABLE = True
42
+ except ImportError:
43
+ _MODULE_AVAILABLE = False
44
+
45
+ # ── Model loading ─────────────────────────────────────────────────────────────
46
+ FORECAST_DIR = os.environ.get("FORECAST_DIR", "swan_ml_output")
47
+ OUTPUT_BASE = Path(FORECAST_DIR)
48
+
49
+ _forecast: Optional[MenopauseForecast] = None # type: ignore[type-arg]
50
+ _metadata: dict = {}
51
+
52
+
53
def _load_models():
    """Attempt to load saved joblib pipelines. Returns (success, message)."""
    global _forecast, _metadata

    if not _MODULE_AVAILABLE:
        return False, "menopause.py not found. Make sure it is in the same directory."

    base = Path(FORECAST_DIR)
    artifacts = {
        "meta": base / "forecast_metadata.json",
        "rf": base / "rf_pipeline.pkl",
        "lr": base / "lr_pipeline.pkl",
    }

    # All three artifacts must exist before we attempt any deserialization.
    if any(not p.exists() for p in artifacts.values()):
        message = (
            f"Model artifacts not found in '{FORECAST_DIR}'. "
            "Run `python menopause.py` to train and save the models first."
        )
        return False, message

    try:
        _forecast = load_forecast_model(FORECAST_DIR)
        with open(artifacts["meta"]) as fh:
            _metadata = json.load(fh)
        feature_count = len(_metadata.get("feature_names", []))
        return True, f"✅ Models loaded — {feature_count} features"
    except Exception as exc:
        return False, f"Error loading models: {exc}"
78
+
79
+
80
+ _MODEL_OK, _MODEL_MSG = _load_models()
81
+
82
+
83
+ # ── Output directory management ───────────────────────────────────────────────
84
+
85
def _make_run_dir() -> Path:
    """Create and return a unique timestamped run directory under swan_ml_output/.

    Bug fix: the original used a second-granularity timestamp with
    ``exist_ok=True``, so two runs started within the same second silently
    shared one directory and overwrote each other's charts/predictions/reports.
    A numeric suffix is now appended until an unused directory name is found.
    """
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = OUTPUT_BASE / ts
    suffix = 1
    while run_dir.exists():
        run_dir = OUTPUT_BASE / f"{ts}_{suffix}"
        suffix += 1
    for sub in ("charts", "predictions", "reports"):
        (run_dir / sub).mkdir(parents=True, exist_ok=True)
    return run_dir
93
+
94
+
95
+ def _get_file_path(file_obj) -> Optional[str]:
96
+ """
97
+ Safely extract a file-system path from a Gradio file component value.
98
+
99
+ Gradio ≤ 3.x → returns a file-like object with a .name attribute.
100
+ Gradio 4.x → returns a str path (or NamedString subclass).
101
+ This helper handles both.
102
+ """
103
+ if file_obj is None:
104
+ return None
105
+ if hasattr(file_obj, "name"):
106
+ return file_obj.name
107
+ return str(file_obj)
108
+
109
+
110
+ # ── Constants & helpers ───────────────────────────────────────────────────────
111
+
112
+ STAGE_COLORS = {"pre": "#16a34a", "peri": "#d97706", "post": "#7c3aed"}
113
+ STAGE_EMOJI = {"pre": "🟢", "peri": "🟡", "post": "🟣"}
114
+ STAGE_LABELS = {
115
+ "pre": "Pre-Menopausal",
116
+ "peri": "Peri-Menopausal",
117
+ "post": "Post-Menopausal",
118
+ }
119
+
120
+ STAGE_INFO = {
121
+ "pre": {
122
+ "title": "Pre-Menopausal",
123
+ "description": "Regular menstrual cycles with typical hormonal fluctuations. Ovarian function is normal.",
124
+ "symptoms": ["Regular periods", "Normal hormone levels", "Potential mild PMS"],
125
+ "guidance": "Maintain regular check-ups. Track your cycle and note any changes.",
126
+ },
127
+ "peri": {
128
+ "title": "Peri-Menopausal (Transition)",
129
+ "description": "Hormonal changes begin — estrogen and progesterone levels fluctuate. Cycles become irregular.",
130
+ "symptoms": ["Irregular periods", "Hot flashes", "Sleep disturbances", "Mood changes", "Night sweats"],
131
+ "guidance": "Consult your healthcare provider. Lifestyle adjustments (diet, exercise, sleep) can help.",
132
+ },
133
+ "post": {
134
+ "title": "Post-Menopausal",
135
+ "description": "12+ months since last menstrual period. Estrogen remains at consistently lower levels.",
136
+ "symptoms": ["No periods", "Possible continued hot flashes", "Vaginal dryness", "Bone density changes"],
137
+ "guidance": "Focus on bone health, cardiovascular health, and regular screenings. Discuss HRT options.",
138
+ },
139
+ }
140
+
141
+ # Feature descriptions keyed by the model's canonical feature names
142
+ FEATURE_DESCRIPTIONS = {
143
+ "PAIN17": "Pain indicator (visit-specific)",
144
+ "PAINTW17": "Pain two-week indicator",
145
+ "PAIN27": "Secondary pain indicator",
146
+ "PAINTW27": "Secondary pain two-week indicator",
147
+ "SLEEP17": "Sleep disturbance pattern 1",
148
+ "SLEEP27": "Sleep disturbance pattern 2",
149
+ "BCOHOTH7": "Birth control — other method",
150
+ "EXERCIS7": "General exercise indicator",
151
+ "EXERHAR7": "Vigorous exercise",
152
+ "EXEROST7": "Osteoporosis exercise",
153
+ "EXERMEN7": "Exercise — mental health",
154
+ "EXERLOO7": "Exercise lookalike",
155
+ "EXERMEM7": "Exercise — memory",
156
+ "EXERPER7": "Exercise perception",
157
+ "EXERGEN7": "General exercise type",
158
+ "EXERWGH7": "Weight exercise",
159
+ "EXERADV7": "Exercise advice indicator",
160
+ "EXEROTH7": "Other exercise",
161
+ "EXERSPE7": "Specific exercise",
162
+ "ABBLEED7": "Abnormal bleeding (0=no, 1=yes)", # ← correct feature name
163
+ "BLEEDNG7": "Bleeding pattern",
164
+ "LMPDAY7": "Last menstrual period day",
165
+ "DEPRESS7": "Depression indicator",
166
+ "SEX17": "Sexual activity indicator 1",
167
+ "SEX27": "Sexual activity indicator 2",
168
+ "SEX37": "Sexual activity indicator 3",
169
+ "SEX47": "Sexual activity indicator 4",
170
+ "SEX57": "Sexual activity indicator 5",
171
+ "SEX67": "Sexual activity indicator 6",
172
+ "SEX77": "Sexual activity indicator 7",
173
+ "SEX87": "Sexual activity indicator 8",
174
+ "SEX97": "Sexual activity indicator 9",
175
+ "SEX107": "Sexual activity indicator 10",
176
+ "SEX117": "Sexual activity indicator 11",
177
+ "SEX127": "Sexual activity indicator 12",
178
+ "SMOKERE7": "Smoking status",
179
+ "HOTFLAS7": "Hot flash severity (1=none, 5=very severe)",
180
+ "NUMHOTF7": "Number of hot flashes per week",
181
+ "BOTHOTF7": "How bothersome are hot flashes",
182
+ "IRRITAB7": "Irritability level",
183
+ "VAGINDR7": "Vaginal dryness",
184
+ "MOODCHG7": "Mood change frequency",
185
+ "SLEEPQL7": "Sleep quality score",
186
+ "PHYSILL7": "Physical illness indicators",
187
+ "HOTHEAD7": "Hot flashes with headache",
188
+ "EXER12H7": "Exercise in last 12 hours",
189
+ "ALCO24H7": "Alcohol in last 24h",
190
+ "AGE7": "Age (years)",
191
+ "RACE": "Race (1=White, 2=Black, 3=Chinese, 4=Japanese, 5=Hispanic)",
192
+ "LANGINT7": "Interview language indicator",
193
+ }
194
+
195
+
196
+ def _confidence_color(conf: float) -> str:
197
+ if conf >= 0.8:
198
+ return "#16a34a"
199
+ elif conf >= 0.6:
200
+ return "#d97706"
201
+ return "#dc2626"
202
+
203
+
204
+ # ── Chart builders ────────────────────────────────────────────────────────────
205
+
206
def _make_proba_chart(
    probabilities: dict,
    predicted_stage: str,
    save_path: Optional[Path] = None,
) -> plt.Figure:
    """Render stage probabilities as a horizontal bar chart.

    The bar for *predicted_stage* gets a white outline. When *save_path*
    is given, the figure is additionally written to disk as a PNG.
    """
    fig, ax = plt.subplots(figsize=(6, 3.5))
    fig.patch.set_facecolor("#1a1a2e")
    ax.set_facecolor("#16213e")

    stage_names = list(probabilities.keys())
    pct_values = [probabilities[name] * 100 for name in stage_names]
    bar_colors = [STAGE_COLORS.get(name, "#607d8b") for name in stage_names]
    outlines = ["white" if name == predicted_stage else "none" for name in stage_names]
    outline_widths = [2.5 if name == predicted_stage else 0 for name in stage_names]

    bars = ax.barh(stage_names, pct_values, color=bar_colors, edgecolor=outlines,
                   linewidth=outline_widths, height=0.5, zorder=3)

    # Percentage labels; x-position clamped at 98 so text stays inside the axis.
    for rect, pct in zip(bars, pct_values):
        y_mid = rect.get_y() + rect.get_height() / 2
        ax.text(min(pct + 1, 98), y_mid, f"{pct:.1f}%",
                va="center", ha="left", color="white",
                fontsize=11, fontweight="bold")

    ax.set_yticks(range(len(stage_names)))
    ax.set_yticklabels([STAGE_LABELS.get(name, name) for name in stage_names],
                       color="white", fontsize=10)
    ax.set_xlim(0, 105)
    ax.tick_params(colors="white", labelsize=11)
    ax.spines[["top", "right", "left", "bottom"]].set_visible(False)
    ax.xaxis.set_visible(False)
    for spine in ax.spines.values():
        spine.set_color("#333")
    ax.set_title("Stage Probabilities", color="white", fontsize=12,
                 pad=10, fontweight="bold")
    ax.grid(axis="x", color="#333", linestyle="--", linewidth=0.5, zorder=0)
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    return fig
250
+
251
+
252
def _make_cycle_chart(
    cycle_day: int,
    cycle_length: int = 28,
    hot_prob: Optional[float] = None,
    mood_prob: Optional[float] = None,
    save_path: Optional[Path] = None,
) -> plt.Figure:
    """Circular (polar) visualization of the current cycle day.

    Parameters
    ----------
    cycle_day : 1-based day within the cycle; the marker is skipped when None.
    cycle_length : total days in the cycle (number of colored wedges).
    hot_prob, mood_prob : optional symptom probabilities appended to the title.
    save_path : when given, the figure is also saved as a PNG.
    """
    fig, ax = plt.subplots(figsize=(5, 5), subplot_kw=dict(polar=True))
    fig.patch.set_facecolor("#1a1a2e")
    ax.set_facecolor("#16213e")

    # One translucent wedge per cycle day, colored along a RdYlGn gradient.
    days = np.linspace(0, 2 * np.pi, cycle_length, endpoint=False)
    for i, d in enumerate(days):
        phase = i / cycle_length
        color = plt.cm.RdYlGn(1 - phase)
        ax.bar(d, 1, width=2 * np.pi / cycle_length * 0.9,
               bottom=0.5, color=color, alpha=0.4, zorder=1)

    if cycle_day is not None:
        angle = (cycle_day - 1) / cycle_length * 2 * np.pi
        ax.scatter([angle], [1.05], s=200, color="#ff6b6b", zorder=5, linewidths=2)
        ax.annotate(
            f"Day {cycle_day}",
            xy=(angle, 1.05), xytext=(0, 0),
            textcoords="offset points", ha="center", va="center",
            color="white", fontsize=12, fontweight="bold",
        )

    ax.set_rticks([])
    ax.set_xticks([i * 2 * np.pi / 4 for i in range(4)])
    ax.set_xticklabels(["Day 1", "Day 7", "Day 14", "Day 21"],
                       color="#aaa", fontsize=9)
    ax.set_yticklabels([])
    ax.spines["polar"].set_color("#333")
    ax.grid(color="#333", linewidth=0.5)

    title = "Cycle Position"
    # Bug fix: the original guarded only on hot_prob but formatted mood_prob
    # too, raising TypeError when mood_prob was None. Each probability is now
    # appended independently; both present reproduces the original title.
    extras = []
    if hot_prob is not None:
        extras.append(f"🔥 {hot_prob:.0%}")
    if mood_prob is not None:
        extras.append(f"😤 {mood_prob:.0%}")
    if extras:
        title += "\n" + " ".join(extras)
    ax.set_title(title, color="white", fontsize=11, pad=20, fontweight="bold")
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    return fig
299
+
300
+
301
def _make_batch_summary_chart(results_df: pd.DataFrame,
                              save_path: Optional[Path] = None) -> None:
    """Stage-distribution pie + confidence histogram for batch runs.

    Saves a PNG when *save_path* is given and always closes the figure
    (this chart is only written to disk, never returned to the UI).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    fig.patch.set_facecolor("#1a1a2e")

    # Stage distribution pie
    stage_counts = results_df["predicted_stage"].value_counts()
    colors = [STAGE_COLORS.get(s, "#607d8b") for s in stage_counts.index]
    ax1.set_facecolor("#16213e")
    wedges, texts, autotexts = ax1.pie(
        stage_counts.values, labels=stage_counts.index,
        colors=colors, autopct="%1.0f%%",
        textprops={"color": "white", "fontsize": 10},
    )
    for at in autotexts:
        at.set_color("white")
    ax1.set_title("Stage Distribution", color="white", fontsize=11, fontweight="bold")

    # Confidence histogram
    ax2.set_facecolor("#16213e")
    if "confidence" in results_df.columns:
        conf = results_df["confidence"].dropna()
        # Bug fix: with an all-NaN confidence column `conf` is empty and
        # bins=min(10, 0) == 0 made ax2.hist raise ValueError. Only draw
        # the histogram when at least one value survives dropna().
        if not conf.empty:
            ax2.hist(conf, bins=min(10, len(conf)), color="#3B82F6",
                     edgecolor="#1a1a2e", alpha=0.8)
            ax2.axvline(0.8, color="#4CAF50", linestyle="--",
                        linewidth=1.5, label="High (0.80)")
            ax2.axvline(0.6, color="#FF9800", linestyle="--",
                        linewidth=1.5, label="Med (0.60)")
            ax2.legend(fontsize=8, labelcolor="white", facecolor="#0d0d1a")
    ax2.set_xlabel("Confidence", color="#aaa", fontsize=9)
    ax2.set_ylabel("Count", color="#aaa", fontsize=9)
    ax2.tick_params(colors="white", labelsize=9)
    for sp in ["top", "right"]:
        ax2.spines[sp].set_visible(False)
    for sp in ["left", "bottom"]:
        ax2.spines[sp].set_color("#333")
    ax2.set_title("Confidence Distribution", color="white",
                  fontsize=11, fontweight="bold")

    fig.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    plt.close(fig)
346
+
347
+
348
+ # ── Text report writers ───────────────────────────────────────────────────────
349
+
350
def _write_single_stage_report(
    path: Path,
    stage: str,
    confidence: float,
    probabilities: dict,
    model: str,
    comparison: dict,
    input_features: dict,
):
    """Write the single-prediction TXT summary report to *path* (UTF-8)."""
    rule = "=" * 60
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    out = [
        rule,
        "SWAN MENOPAUSE STAGE PREDICTION REPORT",
        f"Generated : {stamp}",
        rule,
        "",
        f"Predicted Stage : {STAGE_LABELS.get(stage, stage)}",
        f"Model : {model}",
        f"Confidence : {confidence:.1%}",
        "",
        "Stage Probabilities:",
    ]
    # One ASCII-bar line per stage (20 chars == probability 1.0).
    for name, prob in probabilities.items():
        out.append(f" {name:<6} : {prob:.4f} {'█' * int(prob * 20)}")

    rf = comparison["RandomForest"]
    lr = comparison["LogisticRegression"]
    out += [
        "",
        "Model Comparison:",
        f" RandomForest → {rf['stage']} ({rf.get('confidence', 0):.1%})",
        f" LogisticRegression → {lr['stage']} ({lr.get('confidence', 0):.1%})",
        "",
        "Input Features (non-NaN):",
    ]
    for key, value in input_features.items():
        missing = value is None or (isinstance(value, float) and np.isnan(value))
        if not missing:
            out.append(f" {key:<12} = {value}")

    out += [
        "",
        "⚠️ For research/educational use only. Not a clinical diagnosis.",
        rule,
    ]
    path.write_text("\n".join(out), encoding="utf-8")
393
+
394
+
395
+ def _write_batch_report(
396
+ path: Path,
397
+ results: pd.DataFrame,
398
+ model: str,
399
+ run_dir: Path,
400
+ ):
401
+ total = len(results)
402
+ dist = results["predicted_stage"].value_counts().to_dict() \
403
+ if "predicted_stage" in results.columns else {}
404
+ if "confidence" in results.columns:
405
+ conf = results["confidence"]
406
+ mean_c = conf.mean(); min_c = conf.min(); max_c = conf.max()
407
+ high = int((conf > 0.8).sum())
408
+ medium = int(((conf > 0.6) & (conf <= 0.8)).sum())
409
+ low = int((conf <= 0.6).sum())
410
+ else:
411
+ mean_c = min_c = max_c = high = medium = low = 0
412
+
413
+ lines = [
414
+ "=" * 60,
415
+ "SWAN BATCH STAGE PREDICTION REPORT",
416
+ f"Generated : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
417
+ f"Model : {model}",
418
+ "=" * 60,
419
+ "",
420
+ f"Total Individuals : {total}",
421
+ "",
422
+ "Stage Distribution:",
423
+ ]
424
+ for stage in ["pre", "peri", "post"]:
425
+ count = dist.get(stage, 0)
426
+ pct = count / total * 100 if total else 0
427
+ lines.append(f" {stage:<6} : {count} ({pct:.1f}%)")
428
+ lines += [
429
+ "",
430
+ "Confidence Scores:",
431
+ f" Mean : {mean_c:.4f}",
432
+ f" Min : {min_c:.4f}",
433
+ f" Max : {max_c:.4f}",
434
+ "",
435
+ "Confidence Distribution:",
436
+ f" High (>0.80) : {high}/{total} ({high/total*100:.1f}%)" if total else " N/A",
437
+ f" Medium (0.60-0.80) : {medium}/{total} ({medium/total*100:.1f}%)" if total else " N/A",
438
+ f" Low (≤0.60) : {low}/{total} ({low/total*100:.1f}%)" if total else " N/A",
439
+ "",
440
+ f"Output Directory : {run_dir}",
441
+ "",
442
+ "⚠️ For research/educational use only. Not a clinical diagnosis.",
443
+ "=" * 60,
444
+ ]
445
+ path.write_text("\n".join(lines), encoding="utf-8")
446
+
447
+
448
+ def _write_symptom_report(
449
+ path: Path,
450
+ individual_id: str,
451
+ lmp: str,
452
+ target_date: str,
453
+ cycle_day: int,
454
+ cycle_length: int,
455
+ hot_prob: float,
456
+ hot_pred: bool,
457
+ mood_prob: float,
458
+ mood_pred: bool,
459
+ ):
460
+ hp = float(hot_prob) if (hot_prob is not None and not np.isnan(hot_prob)) else 0.0
461
+ mp = float(mood_prob) if (mood_prob is not None and not np.isnan(mood_prob)) else 0.0
462
+ lines = [
463
+ "=" * 60,
464
+ "SWAN SYMPTOM CYCLE FORECAST REPORT",
465
+ f"Generated : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
466
+ "=" * 60,
467
+ "",
468
+ f"Individual : {individual_id or 'N/A'}",
469
+ f"LMP : {lmp}",
470
+ f"Target Date : {target_date or 'Today'}",
471
+ f"Cycle Length : {cycle_length} days",
472
+ f"Cycle Day : {cycle_day}",
473
+ "",
474
+ "Symptom Probabilities:",
475
+ f" Hot Flash : {hp:.4f} {'[ELEVATED RISK]' if hot_pred else '[LOW RISK]'}",
476
+ f" Mood Change : {mp:.4f} {'[ELEVATED RISK]' if mood_pred else '[LOW RISK]'}",
477
+ "",
478
+ "⚠️ For research/educational use only. Not a clinical diagnosis.",
479
+ "=" * 60,
480
+ ]
481
+ path.write_text("\n".join(lines), encoding="utf-8")
482
+
483
+
484
+ def _write_batch_symptom_report(
485
+ path: Path,
486
+ results: pd.DataFrame,
487
+ cycle_length: int,
488
+ run_dir: Path,
489
+ ):
490
+ total = len(results)
491
+ hot_flags = int(results["hotflash_pred"].sum()) \
492
+ if "hotflash_pred" in results.columns else 0
493
+ mood_flags = int(results["mood_pred"].sum()) \
494
+ if "mood_pred" in results.columns else 0
495
+ mean_hot = float(results["hotflash_prob"].mean()) \
496
+ if "hotflash_prob" in results.columns else 0.0
497
+ mean_mood = float(results["mood_prob"].mean()) \
498
+ if "mood_prob" in results.columns else 0.0
499
+ lines = [
500
+ "=" * 60,
501
+ "SWAN BATCH SYMPTOM FORECAST REPORT",
502
+ f"Generated : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
503
+ f"Cycle Length : {cycle_length} days",
504
+ "=" * 60,
505
+ "",
506
+ f"Total Individuals : {total}",
507
+ f"Hot Flash Risk : {hot_flags}/{total} elevated",
508
+ f"Mood Change Risk : {mood_flags}/{total} elevated",
509
+ f"Avg Hot Flash Prob : {mean_hot:.4f}",
510
+ f"Avg Mood Prob : {mean_mood:.4f}",
511
+ "",
512
+ f"Output Directory : {run_dir}",
513
+ "",
514
+ "⚠️ For research/educational use only. Not a clinical diagnosis.",
515
+ "=" * 60,
516
+ ]
517
+ path.write_text("\n".join(lines), encoding="utf-8")
518
+
519
+
520
+ # ── Core prediction functions ─────────────────────────────────────────────────
521
+
522
def predict_single_stage(
    age, race, langint,
    hot_flash, num_hot_flash, bothersome_hf,
    sleep_quality, depression_indicator, mood_change, irritability,
    pain_indicator, abbleed, vaginal_dryness, lmp_day,
    model_choice,
):
    """
    Single-person stage prediction.

    Maps the UI inputs onto the model's canonical SWAN feature names,
    predicts with the chosen pipeline, persists chart/CSV/TXT artifacts
    into a fresh timestamped run directory, and builds the result HTML.

    Returns (stage_html, chart_fig, conf_note, compare_html, csv_download_path).
    On any prediction error, returns an error string with None/empty fillers
    in the remaining slots so the Gradio outputs stay well-formed.
    """
    # Models failed to load at import time — surface the stored message.
    if not _MODEL_OK:
        return f"⚠️ {_MODEL_MSG}", None, "Models unavailable.", "", None

    # Build feature dict using the model's canonical feature names
    def _v(x):
        # Unset UI inputs become NaN so the pipeline's imputer handles them.
        return float(x) if x is not None else np.nan

    feature_dict = {
        "AGE7": _v(age),
        "RACE": _v(race),
        "LANGINT7": _v(langint),
        "HOTFLAS7": _v(hot_flash),
        "NUMHOTF7": _v(num_hot_flash),
        "BOTHOTF7": _v(bothersome_hf),
        "SLEEPQL7": _v(sleep_quality),
        "DEPRESS7": _v(depression_indicator),
        "MOODCHG7": _v(mood_change),
        "IRRITAB7": _v(irritability),
        "PAIN17": _v(pain_indicator),
        "ABBLEED7": _v(abbleed),  # ← correct feature name (was ABLEED7)
        "VAGINDR7": _v(vaginal_dryness),
        # NOTE(review): a lmp_day of 0 is treated as missing here (falsy) —
        # confirm day 0 is never a legitimate value.
        "LMPDAY7": _v(lmp_day) if lmp_day else np.nan,
    }

    try:
        result = _forecast.predict_single(feature_dict, model=model_choice, return_proba=True)
        stage = result["stage"]
        # `or 0.0` also coerces an explicit 0/None confidence to 0.0.
        confidence = result.get("confidence") or 0.0
        proba = result.get("probabilities") or {}

        # ── Create timestamped run directory ──────────────────────────────────
        run_dir = _make_run_dir()

        # ── Save probability chart (PNG) ──────────────────────────────────────
        chart_path = run_dir / "charts" / "stage_probabilities.png"
        chart_fig = _make_proba_chart(proba, stage, save_path=chart_path) if proba else None

        # ── Save prediction CSV ───────────────────────────────────────────────
        pred_row = {
            "predicted_stage": stage,
            "model": model_choice,
            "confidence": round(confidence, 4),
            **{f"prob_{k}": round(v, 4) for k, v in proba.items()},
            "timestamp": datetime.now().isoformat(),
        }
        csv_path = run_dir / "predictions" / "stage_prediction.csv"
        pd.DataFrame([pred_row]).to_csv(csv_path, index=False)

        # ── Model comparison ──────────────────────────────────────────────────
        # Runs BOTH pipelines regardless of model_choice so the UI can show
        # whether they agree.
        comparison = _forecast.compare_models(feature_dict)
        rf_stage = comparison["RandomForest"]["stage"]
        lr_stage = comparison["LogisticRegression"]["stage"]
        agree = rf_stage == lr_stage

        # ── Save text report ──────────────────────────────────────────────────
        txt_path = run_dir / "reports" / "prediction_summary.txt"
        _write_single_stage_report(
            txt_path, stage, confidence, proba,
            model_choice, comparison, feature_dict,
        )

        # ── Build result card HTML ────────────────────────────────────────────
        info = STAGE_INFO.get(stage, {})
        emoji = STAGE_EMOJI.get(stage, "⚪")
        color = STAGE_COLORS.get(stage, "#607d8b")
        conf_color = _confidence_color(confidence)

        # One pill-style <span> per known symptom for the predicted stage.
        symptom_tags = "".join(
            f'<span style="background:{color}14;color:{color};padding:4px 10px;'
            f'border-radius:20px;border:1px solid {color}44;font-size:12px;'
            f'font-weight:500">{s}</span>'
            for s in info.get("symptoms", [])
        )

        stage_html = f"""
        <div class="result-card" style="border-left:4px solid {color}">
          <div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
            <span style="font-size:40px;flex-shrink:0">{emoji}</span>
            <div style="flex:1;min-width:140px">
              <div style="color:#6b7280;font-size:12px;text-transform:uppercase;letter-spacing:2px">
                Predicted Stage
              </div>
              <div style="color:{color};font-size:26px;font-weight:700">
                {STAGE_LABELS.get(stage, stage)}
              </div>
            </div>
            <div style="text-align:right;flex-shrink:0">
              <div style="color:#6b7280;font-size:11px">Confidence</div>
              <div style="color:{conf_color};font-size:28px;font-weight:700">
                {confidence:.0%}
              </div>
            </div>
          </div>
          <hr style="border:none;border-top:1px solid #e2e8f0;margin:12px 0">
          <p style="color:#374151;font-size:14px;margin:8px 0">
            {info.get('description', '')}
          </p>
          <div style="margin-top:12px">
            <div style="color:#6b7280;font-size:11px;text-transform:uppercase;
                        letter-spacing:1px;margin-bottom:6px">Common Symptoms</div>
            <div style="display:flex;flex-wrap:wrap;gap:6px">{symptom_tags}</div>
          </div>
          <div style="background:{color}0d;border-left:3px solid {color};
                      padding:10px 14px;margin-top:14px;border-radius:0 8px 8px 0">
            <span style="color:{color};font-size:12px;font-weight:600">💡 Guidance: </span>
            <span style="color:#374151;font-size:13px">{info.get('guidance', '')}</span>
          </div>
          <div style="color:#9ca3af;font-size:11px;margin-top:12px">
            Model: {model_choice} · {datetime.now().strftime('%Y-%m-%d %H:%M')}
          </div>
        </div>
        """

        # Confidence note — thresholds match _confidence_color (0.8 / 0.6).
        if confidence >= 0.8:
            conf_note = "✅ High confidence — the model is quite certain about this stage."
        elif confidence >= 0.6:
            conf_note = ("⚠️ Moderate confidence — consider providing more feature values "
                         "or consulting a clinician.")
        else:
            conf_note = ("🔴 Low confidence — prediction is uncertain; "
                         "clinical consultation is strongly recommended.")

        # Model comparison panel + run-dir info
        compare_html = f"""
        <div class="result-card" style="margin-top:0">
          <div style="color:#6b7280;font-size:11px;text-transform:uppercase;
                      letter-spacing:1px;margin-bottom:10px;font-weight:600">
            Model Comparison
          </div>
          <div class="stat-grid-2">
            <div class="stat-item" style="border-top:3px solid #16a34a">
              <div style="color:#16a34a;font-size:11px;font-weight:600">Random Forest</div>
              <div style="color:#111827;font-size:17px;margin-top:4px">
                {STAGE_EMOJI.get(rf_stage,'')} {STAGE_LABELS.get(rf_stage, rf_stage)}
              </div>
              <div style="color:#6b7280;font-size:12px">
                {comparison['RandomForest'].get('confidence', 0):.0%} confidence
              </div>
            </div>
            <div class="stat-item" style="border-top:3px solid #2563eb">
              <div style="color:#2563eb;font-size:11px;font-weight:600">
                Logistic Regression
              </div>
              <div style="color:#111827;font-size:17px;margin-top:4px">
                {STAGE_EMOJI.get(lr_stage,'')} {STAGE_LABELS.get(lr_stage, lr_stage)}
              </div>
              <div style="color:#6b7280;font-size:12px">
                {comparison['LogisticRegression'].get('confidence', 0):.0%} confidence
              </div>
            </div>
          </div>
          <div style="margin-top:10px;padding:8px;border-radius:8px;
                      background:{'#d1fae5' if agree else '#fef2f2'};
                      color:{'#065f46' if agree else '#9f1239'};
                      font-size:13px;text-align:center;font-weight:500">
            {"✅ Both models agree — prediction is robust"
             if agree else
             "⚠️ Models disagree — interpret with caution"}
          </div>
          <div class="output-path-box">
            <div class="output-path-title">📁 Outputs saved to:</div>
            <div class="output-path-dir">{run_dir}/</div>
            <div class="output-path-files">
              charts/stage_probabilities.png<br>
              predictions/stage_prediction.csv<br>
              reports/prediction_summary.txt
            </div>
          </div>
        </div>
        """

        return stage_html, chart_fig, conf_note, compare_html, str(csv_path)

    except Exception as exc:
        # Broad catch is deliberate here: any failure must degrade to an
        # error card rather than crash the Gradio callback.
        return f"❌ Prediction error: {exc}", None, "", "", None
710
+
711
+
712
def predict_batch_stage(file, model_choice):
    """
    Batch stage prediction from uploaded CSV.

    Parameters
    ----------
    file : gradio file object (or path-like accepted by ``_get_file_path``)
        Uploaded CSV; one row per individual, columns drawn from the
        training feature set.
    model_choice : str
        Model key passed through to ``_forecast.predict_batch``
        (e.g. "RandomForest" or "LogisticRegression").

    Returns (csv_download_path, summary_html, preview_df).
    On any failure the first element is None and the second is an
    error/warning message string.
    """
    # Models must be loaded at import time; _MODEL_MSG carries the reason
    # when they are not.
    if not _MODEL_OK:
        return None, f"⚠️ {_MODEL_MSG}", None

    if file is None:
        return None, "Please upload a CSV file.", None

    file_path = _get_file_path(file)
    try:
        df = pd.read_csv(file_path)
    except Exception as exc:
        return None, f"Could not read CSV: {exc}", None

    if df.empty:
        return None, "Uploaded CSV is empty.", None

    # Identify ID column — first match from a list of common spellings wins.
    id_col_candidates = ["individual", "Individual", "ID", "id",
                         "SWANID", "subject", "Subject"]
    id_col = next((c for c in id_col_candidates if c in df.columns), None)

    # Validate features: compare uploaded columns against the training
    # feature list stored in the model metadata.
    feature_names = _metadata.get("feature_names", [])
    matching = [c for c in df.columns if c in feature_names]
    # max(..., 1) guards against division by zero when metadata is empty.
    missing_pct = 1 - len(matching) / max(len(feature_names), 1)

    warnings_list = []
    if not matching:
        return None, (
            "❌ No matching feature columns found. "
            "Please include columns from the training feature set "
            "(see 'Feature Reference' tab)."
        ), None
    if missing_pct > 0.5:
        # Soft warning: prediction still runs with >50% features missing.
        warnings_list.append(
            f"⚠️ {missing_pct:.0%} of training features are missing — "
            "prediction accuracy may be reduced."
        )

    try:
        results = _forecast.predict_batch(df, model=model_choice, return_proba=True)

        # Insert individual ID as the first column; synthesize Row_N labels
        # when no ID column was found.
        # NOTE(review): assumes predict_batch preserves row count/order of
        # df — confirm against the forecaster implementation.
        if id_col:
            results.insert(0, "individual", df[id_col].values)
        else:
            results.insert(0, "individual",
                           [f"Row_{i+1}" for i in range(len(results))])

        results["model"] = model_choice
        results["notes"] = ""
        # Flag rows under the 0.6 confidence threshold for manual review.
        if "confidence" in results.columns:
            low_mask = results["confidence"] < 0.6
            results.loc[low_mask, "notes"] = "Low confidence — review manually"

        # ── Create timestamped run directory ──────────────────────────────
        # (presumably _make_run_dir also creates the predictions/charts/
        # reports subdirectories — verify, the writes below rely on it)
        run_dir = _make_run_dir()

        # ── Save predictions CSV ──────────────────────────────────────────
        csv_path = run_dir / "predictions" / "batch_stage_predictions.csv"
        results.to_csv(csv_path, index=False)

        # ── Save confidence/distribution chart (PNG) ──────────────────────
        chart_path = run_dir / "charts" / "batch_summary_chart.png"
        _make_batch_summary_chart(results, save_path=chart_path)

        # ── Save text report ──────────────────────────────────────────────
        txt_path = run_dir / "reports" / "batch_summary.txt"
        _write_batch_report(txt_path, results, model_choice, run_dir)

        # ── Build summary HTML ────────────────────────────────────────────
        total = len(results)
        dist = results["predicted_stage"].value_counts().to_dict()
        mean_conf = results["confidence"].mean() \
            if "confidence" in results.columns else 0.0
        high_conf = int((results["confidence"] > 0.8).sum()) \
            if "confidence" in results.columns else 0

        # One progress-bar row per stage, in fixed pre/peri/post order.
        dist_bars = ""
        for stage in ["pre", "peri", "post"]:
            count = dist.get(stage, 0)
            pct = count / total * 100
            dist_bars += f"""
            <div style="margin:6px 0">
              <div style="display:flex;justify-content:space-between;margin-bottom:2px">
                <span style="color:#374151;font-size:13px">
                  {STAGE_EMOJI.get(stage,'')} {STAGE_LABELS.get(stage, stage)}
                </span>
                <span style="color:#6b7280;font-size:12px">{count} ({pct:.0f}%)</span>
              </div>
              <div style="background:#e2e8f0;border-radius:4px;height:8px">
                <div style="background:{STAGE_COLORS.get(stage,'#6b7280')};
                            width:{pct}%;height:8px;border-radius:4px"></div>
              </div>
            </div>"""

        warn_html = "".join(
            f'<div style="color:#d97706;font-size:12px;margin-top:4px">{w}</div>'
            for w in warnings_list
        )

        summary_html = f"""
        <div class="result-card">
          <div style="color:#111827;font-size:16px;font-weight:700;margin-bottom:14px">
            📊 Batch Results — {total} individuals
          </div>
          {warn_html}
          <div class="stat-grid-3">
            <div class="stat-item">
              <div class="stat-label">Total</div>
              <div class="stat-value">{total}</div>
            </div>
            <div class="stat-item">
              <div class="stat-label">Avg Confidence</div>
              <div class="stat-value" style="color:{_confidence_color(mean_conf)}">
                {mean_conf:.0%}
              </div>
            </div>
            <div class="stat-item">
              <div class="stat-label">High Conf (&gt;80%)</div>
              <div class="stat-value" style="color:#16a34a">{high_conf}/{total}</div>
            </div>
          </div>
          <div style="margin-top:12px">{dist_bars}</div>
          <div class="output-path-box">
            <div class="output-path-title">📁 Outputs saved to:</div>
            <div class="output-path-dir">{run_dir}/</div>
            <div class="output-path-files">
              predictions/batch_stage_predictions.csv<br>
              charts/batch_summary_chart.png<br>
              reports/batch_summary.txt
            </div>
          </div>
        </div>
        """

        # Preview is capped at 20 rows for the UI table.
        return str(csv_path), summary_html, results.head(20)

    except Exception as exc:
        # Broad catch is deliberate: any failure in prediction or file
        # writing surfaces as a user-visible message, never a traceback.
        return None, f"❌ Batch prediction error: {exc}", None
857
+
858
+
859
def predict_symptoms(individual_id, lmp_input, target_date_input, cycle_length):
    """
    Cycle-based symptom forecasting (single person).

    Parameters
    ----------
    individual_id : str or None
        Optional display label; falls back to "N/A"/"Forecast" in outputs.
    lmp_input : str
        Last Menstrual Period date (required; format expected by
        ``SymptomCycleForecaster.predict_single`` — verify against that class).
    target_date_input : str or None
        Date to forecast for; None means "today".
    cycle_length : int-like or falsy
        Cycle length in days; defaults to 28 when empty.

    Returns (result_html, chart_fig, csv_download_path).
    On error returns (error_message, None, None).
    """
    if not lmp_input:
        return "Please enter your Last Menstrual Period date.", None, None

    try:
        cycle_length = int(cycle_length) if cycle_length else 28
        fore = SymptomCycleForecaster(cycle_length=cycle_length)
        target_date = target_date_input if target_date_input else None
        result = fore.predict_single(lmp=lmp_input, target_date=target_date)

        cycle_day = result.get("cycle_day")
        hot_prob = result.get("hotflash_prob", 0)
        hot_pred = result.get("hotflash_pred", False)
        mood_prob = result.get("mood_prob", 0)
        mood_pred = result.get("mood_pred", False)

        # Safe float helpers: coerce to float, treating None/NaN as 0.0 so
        # the percentage bars and CSV never receive NaN.
        hp = float(hot_prob) if (hot_prob is not None and not np.isnan(hot_prob)) else 0.0
        mp = float(mood_prob) if (mood_prob is not None and not np.isnan(mood_prob)) else 0.0

        # ── Create timestamped run directory ──────────────────────────────
        run_dir = _make_run_dir()

        # ── Save cycle chart (PNG) ────────────────────────────────────────
        chart_path = run_dir / "charts" / "cycle_position.png"
        chart_fig = _make_cycle_chart(
            cycle_day, cycle_length, hp, mp, save_path=chart_path
        )

        # ── Save forecast CSV (single-row) ────────────────────────────────
        csv_path = run_dir / "predictions" / "symptom_forecast.csv"
        pd.DataFrame([{
            "individual": individual_id or "N/A",
            "LMP": lmp_input,
            "date": target_date_input or datetime.now().strftime("%Y-%m-%d"),
            "cycle_day": cycle_day,
            "hotflash_prob": round(hp, 6),
            "hotflash_pred": bool(hot_pred),
            "mood_prob": round(mp, 6),
            "mood_pred": bool(mood_pred),
        }]).to_csv(csv_path, index=False)

        # ── Save text report ──────────────────────────────────────────────
        txt_path = run_dir / "reports" / "symptom_summary.txt"
        _write_symptom_report(
            txt_path, individual_id, lmp_input, target_date_input,
            cycle_day, cycle_length, hp, hot_pred, mp, mood_pred,
        )

        # ── Build result HTML ─────────────────────────────────────────────
        def _prob_bar(prob, label, color):
            # Render one labelled horizontal probability bar; width is
            # clamped to 100% in case prob > 1.
            pct = min(prob * 100, 100)
            return f"""
            <div style="margin:10px 0">
              <div style="display:flex;justify-content:space-between;margin-bottom:4px">
                <span style="color:#374151;font-size:14px">{label}</span>
                <span style="color:{color};font-size:16px;font-weight:700">{pct:.0f}%</span>
              </div>
              <div style="background:#e2e8f0;border-radius:6px;height:10px">
                <div style="background:{color};width:{pct}%;height:10px;
                            border-radius:6px;transition:width 0.5s"></div>
              </div>
            </div>"""

        hot_alert = "🔴 Elevated risk" if hot_pred else "🟢 Low risk"
        mood_alert = "🔴 Elevated risk" if mood_pred else "🟢 Low risk"

        html = f"""
        <div class="result-card">
          <div style="color:#111827;font-size:18px;font-weight:700;margin-bottom:4px">
            {individual_id or 'Forecast'} — Cycle Day {cycle_day or '?'}
          </div>
          <div style="color:#6b7280;font-size:13px;margin-bottom:20px">
            LMP: {lmp_input} | Target: {target_date_input or 'Today'}
            | Cycle: {cycle_length} days
          </div>
          {_prob_bar(hp, '🔥 Hot Flash Probability', '#ef4444')}
          <div style="color:#6b7280;font-size:12px;margin:-6px 0 10px 2px">{hot_alert}</div>
          {_prob_bar(mp, '😤 Mood Change Probability', '#7c3aed')}
          <div style="color:#6b7280;font-size:12px;margin:-6px 0 10px 2px">{mood_alert}</div>
          <div style="background:#f8fafc;border:1px solid #e2e8f0;border-radius:8px;
                      padding:12px;margin-top:14px;font-size:12px;color:#6b7280">
            ℹ️ Probabilities are computed from a cycle-phase model (Gaussian heuristic).
            They represent symptom likelihood based on cycle day, not a clinical diagnosis.
          </div>
          <div class="output-path-box">
            <div class="output-path-title">📁 Outputs saved to:</div>
            <div class="output-path-dir">{run_dir}/</div>
            <div class="output-path-files">
              charts/cycle_position.png<br>
              predictions/symptom_forecast.csv<br>
              reports/symptom_summary.txt
            </div>
          </div>
        </div>
        """

        return html, chart_fig, str(csv_path)

    except Exception as exc:
        # Deliberate catch-all: surface any failure as a user-visible string.
        return f"❌ Error: {exc}", None, None
965
+
966
+
967
def predict_symptoms_batch(file, lmp_col_name, date_col_name, cycle_length):
    """
    Batch symptom forecasting from CSV.

    Parameters
    ----------
    file : gradio file object (or path-like accepted by ``_get_file_path``)
        Uploaded CSV with one row per individual.
    lmp_col_name : str
        Name of the column holding Last Menstrual Period dates (required).
    date_col_name : str or None
        Optional column with per-row target dates; ignored when absent
        from the CSV.
    cycle_length : int-like or falsy
        Cycle length in days; defaults to 28 when empty.

    Returns (csv_download_path, summary_html, preview_df).
    On any failure the first element is None and the second is an
    error message string.
    """
    if file is None:
        return None, "Please upload a CSV file.", None

    file_path = _get_file_path(file)
    try:
        df = pd.read_csv(file_path)
    except Exception as exc:
        return None, f"Could not read CSV: {exc}", None

    # Guard against an empty upload (consistent with predict_batch_stage);
    # without this the summary would render NaN averages for 0 rows.
    if df.empty:
        return None, "Uploaded CSV is empty.", None

    if lmp_col_name not in df.columns:
        return None, (
            f"LMP column '{lmp_col_name}' not found in CSV. "
            f"Columns present: {list(df.columns)}"
        ), None

    try:
        cycle_length = int(cycle_length) if cycle_length else 28
        fore = SymptomCycleForecaster(cycle_length=cycle_length)
        # Only pass the date column through when it actually exists.
        date_col = date_col_name \
            if (date_col_name and date_col_name in df.columns) else None
        results = fore.predict_df(df, lmp_col=lmp_col_name, date_col=date_col)

        # ── Create timestamped run directory ──────────────────────────────
        run_dir = _make_run_dir()

        # ── Save predictions CSV ──────────────────────────────────────────
        csv_path = run_dir / "predictions" / "batch_symptom_forecast.csv"
        results.to_csv(csv_path, index=False)

        # ── Save text report ──────────────────────────────────────────────
        txt_path = run_dir / "reports" / "batch_symptom_summary.txt"
        _write_batch_symptom_report(txt_path, results, cycle_length, run_dir)

        # ── Build summary HTML ────────────────────────────────────────────
        # Each stat tolerates a missing column by falling back to zero.
        total = len(results)
        hot_flags = int(results["hotflash_pred"].sum()) \
            if "hotflash_pred" in results.columns else 0
        mood_flags = int(results["mood_pred"].sum()) \
            if "mood_pred" in results.columns else 0
        mean_hot = float(results["hotflash_prob"].mean()) \
            if "hotflash_prob" in results.columns else 0.0
        mean_mood = float(results["mood_prob"].mean()) \
            if "mood_prob" in results.columns else 0.0

        summary_html = f"""
        <div class="result-card">
          <div style="color:#111827;font-size:16px;font-weight:700;margin-bottom:14px">
            🌊 Symptom Forecast — {total} individuals
          </div>
          <div class="stat-grid-3">
            <div class="stat-item">
              <div class="stat-label">Total</div>
              <div class="stat-value">{total}</div>
            </div>
            <div class="stat-item">
              <div class="stat-label">🔥 Hot Flash Risk</div>
              <div class="stat-value" style="color:#ef4444">{hot_flags}</div>
            </div>
            <div class="stat-item">
              <div class="stat-label">😤 Mood Risk</div>
              <div class="stat-value" style="color:#7c3aed">{mood_flags}</div>
            </div>
          </div>
          <div class="stat-grid-2">
            <div class="stat-item">
              <div class="stat-label">Avg Hot Flash Prob</div>
              <div class="stat-value" style="color:#ef4444;font-size:18px">
                {mean_hot:.1%}
              </div>
            </div>
            <div class="stat-item">
              <div class="stat-label">Avg Mood Prob</div>
              <div class="stat-value" style="color:#7c3aed;font-size:18px">
                {mean_mood:.1%}
              </div>
            </div>
          </div>
          <div class="output-path-box">
            <div class="output-path-title">📁 Outputs saved to:</div>
            <div class="output-path-dir">{run_dir}/</div>
            <div class="output-path-files">
              predictions/batch_symptom_forecast.csv<br>
              reports/batch_symptom_summary.txt
            </div>
          </div>
        </div>
        """

        return str(csv_path), summary_html, results

    except Exception as exc:
        # Deliberate catch-all: surface any failure as a user-visible string.
        return None, f"❌ Error: {exc}", None
1065
+
1066
+
1067
+ # ── Feature reference & model status ─────────────────────────────────────────
1068
+
1069
def get_feature_reference() -> str:
    """Render the training-feature reference table as an HTML fragment.

    Shows at most the first 60 features from the model metadata (falling
    back to the keys of FEATURE_DESCRIPTIONS) plus a trailing row noting
    how many one-hot encoded features were omitted.
    """
    feature_names = _metadata.get("feature_names", list(FEATURE_DESCRIPTIONS.keys()))

    # Build each <tr> as a list entry and join once at the end.
    row_chunks = []
    for num, feat in enumerate(feature_names[:60], start=1):
        description = FEATURE_DESCRIPTIONS.get(feat, feat.split("_")[0])
        row_chunks.append(f"""
        <tr>
          <td class="feature-num">{num}</td>
          <td class="feature-code">{feat}</td>
          <td class="feature-desc">{description}</td>
        </tr>""")

    # Trailing note for anything beyond the 60-row display cap.
    remaining = len(feature_names) - 60
    if remaining > 0:
        row_chunks.append(f"""
        <tr>
          <td colspan="3" style="padding:8px;color:#9ca3af;font-size:12px;text-align:center">
            … and {remaining} more features (one-hot encoded categories)
          </td>
        </tr>""")

    rows = "".join(row_chunks)

    return f"""
    <div class="feature-table-wrap">
      <div style="color:#111827;font-size:16px;font-weight:700;margin-bottom:14px">
        📋 Training Features ({len(feature_names)} total after encoding)
      </div>
      <table>
        <thead>
          <tr>
            <th>#</th>
            <th>Feature</th>
            <th>Description</th>
          </tr>
        </thead>
        <tbody>{rows}</tbody>
      </table>
    </div>
    """
1108
+
1109
+
1110
def get_model_status() -> str:
    """Render the model-status card as an HTML fragment.

    Returns a "loaded" card with feature/stage counts when _MODEL_OK is
    True, otherwise a "not loaded" card containing _MODEL_MSG plus the
    training instructions for regenerating the pickled pipelines.
    """
    if _MODEL_OK:
        fc = len(_metadata.get("feature_names", []))
        sc = _metadata.get("stage_classes", ["pre", "peri", "post"])
        # One colored pill badge per stage class; hex suffixes ("18", "44")
        # add alpha to the base stage color.
        badges = "".join(
            f'<span style="background:{STAGE_COLORS.get(s,"#607d8b")}18;'
            f'color:{STAGE_COLORS.get(s,"#555")};padding:4px 12px;'
            f'border-radius:20px;border:1px solid {STAGE_COLORS.get(s,"#607d8b")}44;'
            f'font-size:13px;font-weight:600">{STAGE_EMOJI.get(s,"")} {s}</span>'
            for s in sc
        )
        return f"""
        <div class="status-card">
          <div style="display:flex;align-items:center;gap:10px;margin-bottom:14px">
            <span style="font-size:24px">✅</span>
            <div>
              <div style="color:#059669;font-size:16px;font-weight:700">
                Models Loaded Successfully
              </div>
              <div style="color:#6b7280;font-size:12px">Ready for predictions</div>
            </div>
          </div>
          <div class="stat-grid-3">
            <div class="stat-item">
              <div class="stat-label">Features</div>
              <div class="stat-value">{fc}</div>
            </div>
            <div class="stat-item">
              <div class="stat-label">Models</div>
              <div class="stat-value">2</div>
            </div>
            <div class="stat-item">
              <div class="stat-label">Stages</div>
              <div class="stat-value">{len(sc)}</div>
            </div>
          </div>
          <div style="margin-top:14px">
            <div style="color:#6b7280;font-size:11px;text-transform:uppercase;
                        letter-spacing:0.5px;margin-bottom:6px">Available Stages</div>
            <div style="display:flex;gap:8px;flex-wrap:wrap">{badges}</div>
          </div>
        </div>
        """
    # Fallback card: models missing — show the reason and setup steps.
    return f"""
    <div class="status-card">
      <div style="display:flex;align-items:center;gap:10px;margin-bottom:10px">
        <span style="font-size:24px">⚠️</span>
        <div>
          <div style="color:#dc2626;font-size:16px;font-weight:700">
            Models Not Loaded
          </div>
          <div style="color:#6b7280;font-size:12px">{_MODEL_MSG}</div>
        </div>
      </div>
      <div style="background:#fef2f2;border:1px solid #fecaca;border-radius:8px;
                  padding:12px;color:#9f1239;font-size:13px">
        To train and save models:<br>
        <code style="background:#1e293b;color:#a3e635;padding:4px 8px;border-radius:4px;
                     margin-top:6px;display:inline-block">python menopause.py</code>
        <br><br>
        This generates <code style="background:#e2e8f0;padding:2px 5px;border-radius:3px;
        color:#1e293b">swan_ml_output/rf_pipeline.pkl</code>,
        <code style="background:#e2e8f0;padding:2px 5px;border-radius:3px;
        color:#1e293b">lr_pipeline.pkl</code>, and
        <code style="background:#e2e8f0;padding:2px 5px;border-radius:3px;
        color:#1e293b">forecast_metadata.json</code>.
      </div>
    </div>
    """
1179
+
1180
+
1181
+ # ── Education content ─────────────────────────────────────────────────────────
1182
+ EDUCATION_HTML = """
1183
+ <div class="edu-card">
1184
+ <h2>🌸 Understanding Menopause</h2>
1185
+ <p>Menopause is a natural biological process marking the end of menstrual cycles.
1186
+ It is officially diagnosed after 12 consecutive months without a menstrual period
1187
+ and typically occurs in women in their late 40s to early 50s.</p>
1188
+
1189
+ <h3>Three Stages</h3>
1190
+ <div class="stage-cards-grid">
1191
+ <div class="stage-card-pre">
1192
+ <div style="color:#16a34a;font-weight:700;margin-bottom:8px">🟢 Pre-Menopause</div>
1193
+ <p style="font-size:13px;margin:0;color:#374151">Regular ovarian function. Periods are predictable.
1194
+ Hormones (estrogen, progesterone) follow a consistent monthly pattern.</p>
1195
+ </div>
1196
+ <div class="stage-card-peri">
1197
+ <div style="color:#d97706;font-weight:700;margin-bottom:8px">🟡 Peri-Menopause</div>
1198
+ <p style="font-size:13px;margin:0;color:#374151">Transition phase — usually begins in the mid-40s.
1199
+ Hormone levels fluctuate. Periods become irregular.
1200
+ Hot flashes and sleep issues may begin.</p>
1201
+ </div>
1202
+ <div class="stage-card-post">
1203
+ <div style="color:#7c3aed;font-weight:700;margin-bottom:8px">🟣 Post-Menopause</div>
1204
+ <p style="font-size:13px;margin:0;color:#374151">12+ months after the last period.
1205
+ Lower estrogen levels. Risk factors for osteoporosis and
1206
+ cardiovascular disease increase.</p>
1207
+ </div>
1208
+ </div>
1209
+
1210
+ <h3>Common Symptoms by Stage</h3>
1211
+ <table style="width:100%;border-collapse:collapse;font-size:13px">
1212
+ <thead>
1213
+ <tr style="background:#f8fafc">
1214
+ <th style="padding:8px;text-align:left;color:#6b7280;font-weight:600">Symptom</th>
1215
+ <th style="padding:8px;text-align:center;color:#16a34a;font-weight:600">Pre</th>
1216
+ <th style="padding:8px;text-align:center;color:#d97706;font-weight:600">Peri</th>
1217
+ <th style="padding:8px;text-align:center;color:#7c3aed;font-weight:600">Post</th>
1218
+ </tr>
1219
+ </thead>
1220
+ <tbody>
1221
+ <tr style="border-bottom:1px solid #e2e8f0">
1222
+ <td style="padding:8px;color:#374151">Hot flashes</td>
1223
+ <td style="text-align:center;color:#9ca3af">–</td>
1224
+ <td style="text-align:center">✅</td>
1225
+ <td style="text-align:center">✅</td>
1226
+ </tr>
1227
+ <tr style="border-bottom:1px solid #e2e8f0">
1228
+ <td style="padding:8px;color:#374151">Irregular periods</td>
1229
+ <td style="text-align:center;color:#9ca3af">–</td>
1230
+ <td style="text-align:center">✅</td>
1231
+ <td style="text-align:center;color:#9ca3af">N/A</td>
1232
+ </tr>
1233
+ <tr style="border-bottom:1px solid #e2e8f0">
1234
+ <td style="padding:8px;color:#374151">Sleep disturbances</td>
1235
+ <td style="text-align:center;color:#6b7280">Mild</td>
1236
+ <td style="text-align:center">✅</td>
1237
+ <td style="text-align:center">✅</td>
1238
+ </tr>
1239
+ <tr style="border-bottom:1px solid #e2e8f0">
1240
+ <td style="padding:8px;color:#374151">Mood changes</td>
1241
+ <td style="text-align:center;color:#6b7280">PMS</td>
1242
+ <td style="text-align:center">✅</td>
1243
+ <td style="text-align:center;color:#6b7280">Possible</td>
1244
+ </tr>
1245
+ <tr style="border-bottom:1px solid #e2e8f0">
1246
+ <td style="padding:8px;color:#374151">Vaginal dryness</td>
1247
+ <td style="text-align:center;color:#9ca3af">–</td>
1248
+ <td style="text-align:center;color:#6b7280">Possible</td>
1249
+ <td style="text-align:center">✅</td>
1250
+ </tr>
1251
+ <tr>
1252
+ <td style="padding:8px;color:#374151">Bone density changes</td>
1253
+ <td style="text-align:center;color:#9ca3af">–</td>
1254
+ <td style="text-align:center;color:#6b7280">Begins</td>
1255
+ <td style="text-align:center">✅</td>
1256
+ </tr>
1257
+ </tbody>
1258
+ </table>
1259
+
1260
+ <h3>About This Tool</h3>
1261
+ <p style="font-size:13px">This application uses machine learning models trained on the
1262
+ SWAN (Study of Women's Health Across the Nation) dataset — a landmark multisite,
1263
+ multiethnic longitudinal study. The models were trained on self-reported symptom and
1264
+ behavioral data to predict menopausal stage.</p>
1265
+ <div class="disclaimer-box">
1266
+ ⚠️ <strong style="color:#d97706">Disclaimer:</strong>
1267
+ This tool is for educational and research purposes only.
1268
+ Predictions should not substitute clinical diagnosis.
1269
+ Always consult a qualified healthcare provider for medical advice.
1270
+ </div>
1271
+ </div>
1272
+ """
1273
+
1274
+
1275
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
1276
+ CUSTOM_CSS = """
1277
+ /* ── Core ────────────────────────────────────────────────────────────── */
1278
+ .gradio-container {
1279
+ max-width: 1200px !important;
1280
+ margin: 0 auto !important;
1281
+ font-family: 'Segoe UI', system-ui, -apple-system, sans-serif !important;
1282
+ background: #f0f4f8 !important;
1283
+ }
1284
+
1285
+ /* ── Header banner ──────────────────────────────────────────────────── */
1286
+ .header-banner {
1287
+ background: linear-gradient(135deg, #faf5ff 0%, #fff0f9 50%, #eff6ff 100%);
1288
+ border: 1px solid #e9d5ff;
1289
+ border-radius: 16px;
1290
+ padding: 28px 32px;
1291
+ margin-bottom: 20px;
1292
+ box-shadow: 0 2px 8px rgba(139,92,246,0.08);
1293
+ position: relative;
1294
+ overflow: hidden;
1295
+ }
1296
+ .header-banner::before {
1297
+ content: '';
1298
+ position: absolute;
1299
+ top: -40%; right: -5%;
1300
+ width: 280px; height: 280px;
1301
+ background: radial-gradient(circle, rgba(139,92,246,0.08) 0%, transparent 70%);
1302
+ pointer-events: none;
1303
+ }
1304
+
1305
+ /* ── Reusable info boxes ─────────────────────────────────────────────── */
1306
+ .info-box {
1307
+ background: #f8fafc;
1308
+ border: 1px solid #e2e8f0;
1309
+ border-left: 3px solid #3b82f6;
1310
+ border-radius: 8px;
1311
+ padding: 12px 16px;
1312
+ color: #475569;
1313
+ font-size: 13px;
1314
+ margin-bottom: 16px;
1315
+ line-height: 1.5;
1316
+ }
1317
+ .info-box code {
1318
+ background: #e2e8f0;
1319
+ color: #1e293b;
1320
+ padding: 1px 5px;
1321
+ border-radius: 3px;
1322
+ font-family: monospace;
1323
+ font-size: 0.9em;
1324
+ }
1325
+ .section-label {
1326
+ color: #2563eb;
1327
+ font-size: 12px;
1328
+ font-weight: 700;
1329
+ text-transform: uppercase;
1330
+ letter-spacing: 0.6px;
1331
+ margin-bottom: 10px;
1332
+ margin-top: 10px;
1333
+ }
1334
+ .format-hint {
1335
+ background: #f8fafc;
1336
+ border: 1px solid #e2e8f0;
1337
+ border-radius: 8px;
1338
+ padding: 14px;
1339
+ margin-top: 10px;
1340
+ font-size: 12px;
1341
+ color: #475569;
1342
+ }
1343
+ .format-hint-title { color: #2563eb; font-weight: 600; margin-bottom: 6px; }
1344
+ .format-hint pre { color: #475569; margin: 0; font-size: 11px; white-space: pre-wrap; }
1345
+ .format-hint-note { color: #94a3b8; font-size: 11px; margin-top: 8px; }
1346
+ .placeholder-msg { color: #9ca3af; text-align: center; padding: 40px; font-size: 14px; }
1347
+ .section-divider { border: none; border-top: 1px solid #e2e8f0; margin: 24px 0; }
1348
+ .batch-section-label { color: #2563eb; font-size: 14px; font-weight: 600; margin-bottom: 12px; }
1349
+
1350
+ /* ── Result & summary cards ─────────────────────────────────────────── */
1351
+ .result-card {
1352
+ background: #ffffff;
1353
+ border: 1px solid #e2e8f0;
1354
+ border-radius: 16px;
1355
+ padding: 24px;
1356
+ box-shadow: 0 1px 4px rgba(0,0,0,0.06);
1357
+ font-family: 'Segoe UI', system-ui, sans-serif;
1358
+ }
1359
+ .stat-grid-3 { display:grid; grid-template-columns:repeat(3,1fr); gap:12px; margin:14px 0; }
1360
+ .stat-grid-2 { display:grid; grid-template-columns:1fr 1fr; gap:10px; margin-top:10px; }
1361
+ .stat-item { background:#f8fafc; border:1px solid #e2e8f0; padding:12px; border-radius:8px; text-align:center; }
1362
+ .stat-label { color:#6b7280; font-size:11px; text-transform:uppercase; letter-spacing:0.4px; }
1363
+ .stat-value { color:#111827; font-size:22px; font-weight:700; line-height:1.2; margin-top:2px; }
1364
+ .output-path-box { background:#f0fdf4; border:1px solid #bbf7d0; border-radius:8px; padding:10px 14px; margin-top:12px; font-family:monospace; }
1365
+ .output-path-title { color:#059669; font-size:12px; font-weight:600; }
1366
+ .output-path-dir { color:#065f46; font-size:11px; margin-top:4px; }
1367
+ .output-path-files { color:#6b7280; font-size:10px; margin-top:4px; line-height:1.6; }
1368
+
1369
+ /* ── Code blocks ────────────────────────────────────────────────────── */
1370
+ .code-block {
1371
+ background: #1e293b;
1372
+ color: #a3e635;
1373
+ border-radius: 8px;
1374
+ padding: 12px;
1375
+ font-size: 12px;
1376
+ font-family: monospace;
1377
+ white-space: pre;
1378
+ overflow-x: auto;
1379
+ }
1380
+
1381
+ /* ── Setup instructions card ─────────────────────────────────────────── */
1382
+ .setup-card { background:#ffffff; border:1px solid #e2e8f0; border-radius:12px; padding:20px; margin-top:16px; font-family:'Segoe UI',system-ui,sans-serif; }
1383
+ .setup-title { color:#111827; font-size:15px; font-weight:700; margin-bottom:12px; }
1384
+ .setup-step { color:#374151; font-size:13px; line-height:1.8; }
1385
+ .setup-step strong { color:#2563eb; }
1386
+
1387
+ /* ── Education ──────────────────────────────────────────────────────── */
1388
+ .edu-card { background:#ffffff; border:1px solid #e2e8f0; border-radius:16px; padding:28px; font-family:'Segoe UI',system-ui,sans-serif; color:#374151; line-height:1.7; }
1389
+ .edu-card h2 { color:#111827; font-size:22px; margin-top:0; }
1390
+ .edu-card h3 { color:#7c3aed; font-size:16px; margin-top:20px; }
1391
+ .stage-cards-grid { display:grid; grid-template-columns:repeat(3,1fr); gap:16px; margin:14px 0; }
1392
+ .stage-card-pre { background:#f0fdf4; border-top:4px solid #16a34a; padding:16px; border-radius:10px; }
1393
+ .stage-card-peri { background:#fffbeb; border-top:4px solid #d97706; padding:16px; border-radius:10px; }
1394
+ .stage-card-post { background:#faf5ff; border-top:4px solid #7c3aed; padding:16px; border-radius:10px; }
1395
+ .disclaimer-box { background:#fffbeb; border-left:3px solid #d97706; padding:12px 16px; border-radius:0 8px 8px 0; margin-top:14px; font-size:13px; color:#374151; }
1396
+
1397
+ /* ── Feature reference table ────────────────────────────────────────── */
1398
+ .feature-table-wrap { background:#ffffff; border:1px solid #e2e8f0; border-radius:12px; padding:20px; max-height:500px; overflow-y:auto; font-family:'Segoe UI',system-ui,sans-serif; }
1399
+ .feature-table-wrap table { width:100%; border-collapse:collapse; }
1400
+ .feature-table-wrap thead tr { background:#f8fafc; }
1401
+ .feature-table-wrap th { padding:8px; color:#6b7280; font-size:11px; text-align:left; text-transform:uppercase; letter-spacing:0.4px; }
1402
+ .feature-table-wrap tr { border-bottom:1px solid #e2e8f0; }
1403
+ .feature-table-wrap td { padding:8px; }
1404
+ .feature-code { color:#2563eb; font-family:monospace; font-size:13px; }
1405
+ .feature-desc { color:#374151; font-size:12px; }
1406
+ .feature-num { color:#9ca3af; font-size:12px; }
1407
+
1408
+ /* ── Model status card ──────────────────────────────────────────────── */
1409
+ .status-card { background:#ffffff; border:1px solid #e2e8f0; border-radius:12px; padding:20px; font-family:'Segoe UI',system-ui,sans-serif; }
1410
+
1411
+ /* ── Footer ─────────────────────────────────────────────────────────── */
1412
+ .app-footer { text-align:center; color:#9ca3af; font-size:11px; margin-top:24px; padding:16px; border-top:1px solid #e2e8f0; }
1413
+ .app-footer a { color:#2563eb; text-decoration:none; }
1414
+
1415
+ /* ── Responsive — Tablet (≤ 768 px) ────────────────────────────────── */
1416
+ @media (max-width: 768px) {
1417
+ .gradio-container { padding: 8px !important; }
1418
+ .header-banner { padding: 16px 20px !important; margin-bottom: 12px !important; }
1419
+ .header-status-badge { display: none !important; }
1420
+ .stat-grid-3 { grid-template-columns: 1fr !important; }
1421
+ .stat-grid-2 { grid-template-columns: 1fr !important; }
1422
+ .stage-cards-grid { grid-template-columns: 1fr !important; }
1423
+ }
1424
+
1425
+ /* ── Responsive — Mobile (≤ 480 px) ────────────────────────────────── */
1426
+ @media (max-width: 480px) {
1427
+ .header-banner h1 { font-size: 18px !important; }
1428
+ .result-card { padding: 16px !important; }
1429
+ .edu-card { padding: 16px !important; }
1430
+ .setup-card { padding: 14px !important; }
1431
+ }
1432
+ """
1433
+
1434
+ HEADER_HTML = """
1435
+ <div class="header-banner">
1436
+ <div style="display:flex;align-items:center;gap:16px;flex-wrap:wrap">
1437
+ <div style="font-size:48px;flex-shrink:0">🌸</div>
1438
+ <div style="flex:1;min-width:200px">
1439
+ <h1 style="margin:0;font-size:26px;font-weight:800;
1440
+ background:linear-gradient(135deg,#7c3aed,#db2777);
1441
+ -webkit-background-clip:text;-webkit-text-fill-color:transparent">
1442
+ SWAN Menopause Prediction
1443
+ </h1>
1444
+ <p style="margin:4px 0 0;color:#6b7280;font-size:13px">
1445
+ AI-powered menopausal stage prediction &amp; symptom forecasting ·
1446
+ Based on the SWAN dataset
1447
+ </p>
1448
+ </div>
1449
+ <div class="header-status-badge" style="text-align:right;flex-shrink:0">
1450
+ <div style="background:#ffffff;border:1px solid #e2e8f0;border-radius:8px;
1451
+ padding:8px 16px;display:inline-block;box-shadow:0 1px 3px rgba(0,0,0,0.06)">
1452
+ <div style="color:#9ca3af;font-size:10px;text-transform:uppercase;letter-spacing:1px">
1453
+ Status
1454
+ </div>
1455
+ <div style="color:{color};font-size:13px;font-weight:600">{status}</div>
1456
+ </div>
1457
+ </div>
1458
+ </div>
1459
+ </div>
1460
+ """.format(
1461
+ color = "#059669" if _MODEL_OK else "#dc2626",
1462
+ status = "Models Ready ✅" if _MODEL_OK else "Models Needed ⚠️",
1463
+ )
1464
+
1465
+
1466
+ # ── App builder ───────────────────────────────────────────────────────────────
1467
+
1468
def build_app():
    """Construct and return the Gradio Blocks UI for the SWAN app.

    Wires six tabs (single stage prediction, batch stage prediction,
    symptom forecast + batch forecast, education, feature reference,
    model status) to module-level callbacks. Assumes ``gr``,
    ``CUSTOM_CSS``, ``HEADER_HTML``, ``EDUCATION_HTML``,
    ``get_feature_reference``, ``get_model_status`` and the
    ``predict_*`` callbacks are defined earlier in this file
    (not visible in this chunk).

    Returns:
        gr.Blocks: the assembled (but not yet launched) application.
    """
    with gr.Blocks(
        css = CUSTOM_CSS,
        title = "SWAN Menopause Prediction",
        theme = gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
    ) as app:

        gr.HTML(HEADER_HTML)

        with gr.Tabs():

            # ── TAB 1: Single Stage Prediction ────────────────────────────────
            with gr.Tab("🔮 Stage Prediction"):
                gr.HTML("""
                <div class="info-box">
                Fill in the fields below to predict menopausal stage for a single individual.
                All fields are optional — the pipeline handles missing values automatically.
                A timestamped output folder is created in
                <code>swan_ml_output/</code> for every run.
                </div>""")

                with gr.Row():
                    # ── Input column ──────────────────────────────────────────
                    with gr.Column(scale=2):

                        with gr.Group():
                            gr.HTML('<div class="section-label">Demographics</div>')
                            with gr.Row():
                                age = gr.Slider(
                                    minimum=35, maximum=75, value=48, step=1,
                                    label="Age (AGE7)",
                                )
                                race = gr.Dropdown(
                                    choices=[1, 2, 3, 4, 5], value=1,
                                    label="Race (RACE)",
                                    info="1=White, 2=Black, 3=Chinese, 4=Japanese, 5=Hispanic",
                                )
                                langint = gr.Dropdown(
                                    choices=[1, 2, 3], value=1,
                                    label="Interview Language (LANGINT7)",
                                    info="1=English, 2=Spanish, 3=Other",
                                )

                        with gr.Group():
                            gr.HTML('<div class="section-label">Vasomotor Symptoms</div>')
                            with gr.Row():
                                hot_flash = gr.Slider(
                                    minimum=1, maximum=5, value=1, step=1,
                                    label="Hot Flash Severity (HOTFLAS7)",
                                    info="1=None, 5=Very severe",
                                )
                                num_hot_flash = gr.Slider(
                                    minimum=0, maximum=15, value=0, step=1,
                                    label="# Hot Flashes/Week (NUMHOTF7)",
                                )
                                bothersome_hf = gr.Slider(
                                    minimum=1, maximum=4, value=1, step=1,
                                    label="How Bothersome (BOTHOTF7)",
                                    info="1=Not at all, 4=Extremely",
                                )

                        with gr.Group():
                            gr.HTML('<div class="section-label">Sleep &amp; Mood</div>')
                            with gr.Row():
                                sleep_quality = gr.Slider(
                                    minimum=1, maximum=5, value=2, step=1,
                                    label="Sleep Quality (SLEEPQL7)",
                                    info="1=Very good, 5=Very poor",
                                )
                                depression = gr.Slider(
                                    minimum=0, maximum=4, value=0, step=1,
                                    label="Depression Indicator (DEPRESS7)",
                                    info="0=No, higher=more severe",
                                )
                            with gr.Row():
                                mood_change = gr.Slider(
                                    minimum=1, maximum=5, value=1, step=1,
                                    label="Mood Changes (MOODCHG7)",
                                    info="1=None, 5=Severe",
                                )
                                irritability = gr.Slider(
                                    minimum=1, maximum=5, value=1, step=1,
                                    label="Irritability (IRRITAB7)",
                                )

                        with gr.Group():
                            gr.HTML('<div class="section-label">Physical &amp; Gynaecological</div>')
                            with gr.Row():
                                pain = gr.Slider(
                                    minimum=0, maximum=5, value=0, step=1,
                                    label="Pain Indicator (PAIN17)",
                                )
                                abbleed = gr.Dropdown(
                                    choices=[0, 1, 2], value=0,
                                    label="Abnormal Bleeding (ABBLEED7)",
                                    info="0=No, 1=Yes, 2=Unsure",
                                )
                            with gr.Row():
                                vaginal_dryness = gr.Slider(
                                    minimum=0, maximum=5, value=0, step=1,
                                    label="Vaginal Dryness (VAGINDR7)",
                                )
                                lmp_day = gr.Number(
                                    value=None,
                                    label="LMP Day (LMPDAY7)",
                                    info="Day of last menstrual period (optional)",
                                )

                        model_choice = gr.Radio(
                            choices=["RandomForest", "LogisticRegression"],
                            value="RandomForest",
                            label="Model",
                            info="RandomForest: higher accuracy | "
                                 "LogisticRegression: more interpretable",
                        )
                        predict_btn = gr.Button(
                            "🔮 Predict Stage", variant="primary", size="lg"
                        )

                    # ── Output column ─────────────────────────────────────────
                    with gr.Column(scale=3):
                        result_html = gr.HTML(
                            '<div class="placeholder-msg">Fill in the form and click Predict Stage</div>'
                        )
                        result_chart = gr.Plot(label="Stage Probabilities")
                        confidence_note = gr.Textbox(
                            label="Confidence Note", interactive=False, lines=2
                        )
                        compare_html = gr.HTML()
                        stage_download = gr.File(
                            label="Download Prediction CSV", interactive=False
                        )

                # NOTE: input order must match predict_single_stage's signature.
                predict_btn.click(
                    fn = predict_single_stage,
                    inputs = [
                        age, race, langint,
                        hot_flash, num_hot_flash, bothersome_hf,
                        sleep_quality, depression, mood_change, irritability,
                        pain, abbleed, vaginal_dryness, lmp_day,
                        model_choice,
                    ],
                    outputs = [
                        result_html, result_chart, confidence_note,
                        compare_html, stage_download,
                    ],
                )

            # ── TAB 2: Batch Stage Prediction ─────────────────────────────────
            with gr.Tab("📁 Batch Stage Prediction"):
                gr.HTML("""
                <div class="info-box">
                Upload a CSV file with individual feature values for batch prediction.
                Results + charts + a summary report are saved to a timestamped folder
                inside <code>swan_ml_output/</code>.
                </div>""")

                with gr.Row():
                    with gr.Column(scale=1):
                        batch_file = gr.File(
                            label="Upload stage_input.csv",
                            file_types=[".csv"],
                        )
                        batch_model = gr.Radio(
                            choices=["RandomForest", "LogisticRegression"],
                            value="RandomForest",
                            label="Model",
                        )
                        gr.HTML("""
                        <div class="format-hint">
                        <div class="format-hint-title">Expected CSV Format</div>
                        <pre>individual,AGE7,RACE,HOTFLAS7,...
Person_001,48,1,2,...
Person_002,52,2,1,...</pre>
                        <div class="format-hint-note">
                        See the test-csv/ folder for an approved example.
                        </div>
                        </div>""")
                        batch_predict_btn = gr.Button(
                            "🚀 Run Batch Prediction", variant="primary"
                        )

                    with gr.Column(scale=2):
                        batch_summary_html = gr.HTML(
                            '<div class="placeholder-msg">Upload a CSV to begin</div>'
                        )
                        batch_download = gr.File(
                            label="Download Predictions CSV", interactive=False
                        )
                        batch_results_df = gr.DataFrame(
                            label="Results Preview (first 20 rows)",
                            interactive=False,
                        )

                batch_predict_btn.click(
                    fn = predict_batch_stage,
                    inputs = [batch_file, batch_model],
                    outputs = [batch_download, batch_summary_html, batch_results_df],
                )

            # ── TAB 3: Symptom Forecast ───────────────────────────────────────
            with gr.Tab("🌊 Symptom Forecast"):
                gr.HTML("""
                <div class="info-box">
                Predict hot flash and mood change probability based on cycle day
                (calculated from Last Menstrual Period date).
                All outputs are saved to a timestamped folder inside
                <code>swan_ml_output/</code>.
                </div>""")

                with gr.Row():
                    with gr.Column(scale=1):
                        sym_individual = gr.Textbox(
                            label="Individual ID (optional)",
                            placeholder="e.g., Patient_001",
                        )
                        sym_lmp = gr.Textbox(
                            label="Last Menstrual Period (LMP)",
                            placeholder="2026-01-15 or 15 (day of month)",
                            info="Full date (YYYY-MM-DD) or day-of-month integer",
                        )
                        sym_date = gr.Textbox(
                            label="Target Date (optional)",
                            placeholder="2026-02-27 (defaults to today)",
                            info="Date to forecast for (YYYY-MM-DD)",
                        )
                        sym_cycle = gr.Slider(
                            minimum=21, maximum=40, value=28, step=1,
                            label="Cycle Length (days)",
                        )
                        sym_predict_btn = gr.Button(
                            "🌊 Forecast Symptoms", variant="primary"
                        )

                    with gr.Column(scale=2):
                        sym_result_html = gr.HTML(
                            '<div class="placeholder-msg">Enter LMP date and click Forecast</div>'
                        )
                        sym_chart = gr.Plot(label="Cycle Position")
                        sym_download = gr.File(
                            label="Download Forecast CSV", interactive=False
                        )

                sym_predict_btn.click(
                    fn = predict_symptoms,
                    inputs = [sym_individual, sym_lmp, sym_date, sym_cycle],
                    outputs = [sym_result_html, sym_chart, sym_download],
                )

                gr.HTML('<hr class="section-divider">')
                gr.HTML('<div class="batch-section-label">📁 Batch Symptom Forecasting</div>')

                with gr.Row():
                    with gr.Column(scale=1):
                        sym_batch_file = gr.File(
                            label="Upload symptoms_input.csv",
                            file_types=[".csv"],
                        )
                        sym_lmp_col = gr.Textbox(
                            label="LMP Column Name", value="LMP"
                        )
                        sym_date_col = gr.Textbox(
                            label="Date Column Name (optional)", value="date"
                        )
                        sym_cycle_batch = gr.Slider(
                            minimum=21, maximum=40, value=28, step=1,
                            label="Default Cycle Length",
                        )
                        sym_batch_btn = gr.Button(
                            "🌊 Run Batch Forecast", variant="primary"
                        )

                    with gr.Column(scale=2):
                        sym_batch_summary = gr.HTML(
                            '<div class="placeholder-msg">Upload a CSV to begin</div>'
                        )
                        sym_batch_download = gr.File(
                            label="Download Symptom Forecast CSV", interactive=False
                        )
                        sym_batch_df = gr.DataFrame(
                            label="Results Preview",
                            interactive=False,
                        )

                sym_batch_btn.click(
                    fn = predict_symptoms_batch,
                    inputs = [
                        sym_batch_file, sym_lmp_col,
                        sym_date_col, sym_cycle_batch,
                    ],
                    outputs = [sym_batch_download, sym_batch_summary, sym_batch_df],
                )

            # ── TAB 4: Education ──────────────────────────────────────────────
            with gr.Tab("📚 Menopause Education"):
                gr.HTML(EDUCATION_HTML)

            # ── TAB 5: Feature Reference ──────────────────────────────────────
            with gr.Tab("🔬 Feature Reference"):
                gr.HTML("""
                <div class="info-box">
                Canonical list of features used by the trained models
                (from <code>forecast_metadata.json</code>).
                For batch CSV uploads, column names must match these feature names.
                </div>""")
                gr.HTML(get_feature_reference())

            # ── TAB 6: Model Status ───────────────────────────────────────────
            with gr.Tab("⚙️ Model Status"):
                gr.HTML(get_model_status())
                gr.HTML("""
                <div class="setup-card">
                <div class="setup-title">🚀 Setup Instructions</div>
                <div class="setup-step">
                <p><strong>Step 1 — Train models:</strong></p>
                <pre class="code-block">python menopause.py</pre>
                <p><strong>Step 2 — Verify artifacts:</strong></p>
                <pre class="code-block">ls swan_ml_output/
# rf_pipeline.pkl lr_pipeline.pkl forecast_metadata.json</pre>
                <p><strong>Step 3 — Run this app:</strong></p>
                <pre class="code-block">python app.py</pre>
                <p><strong>Step 4 — Deploy on Hugging Face Spaces:</strong></p>
                <pre class="code-block">git lfs install
git lfs track "*.pkl"
git add .
git commit -m "SWAN menopause prediction app"
git push</pre>
                <p><strong>Output folder structure (per run):</strong></p>
                <pre class="code-block">swan_ml_output/
  &lt;YYYYMMDD_HHMMSS&gt;/
    charts/ &larr; PNG visualizations
    predictions/ &larr; CSV result files
    reports/ &larr; TXT summary reports</pre>
                </div>
                </div>
                """)

        gr.HTML("""
        <div class="app-footer">
        SWAN Menopause Prediction App · Built with Gradio ·
        For research &amp; educational use only · Not for clinical diagnosis ·
        <a href="https://www.swanstudy.org/" target="_blank">SWAN Study</a>
        </div>""")

    return app
1813
+
1814
+
1815
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Bind to all interfaces; honor a PORT env var (Spaces/containers) with
    # Gradio's conventional 7860 as the fallback.
    port = int(os.environ.get("PORT", 7860))
    build_app().launch(
        server_name = "0.0.0.0",
        server_port = port,
        share = False,
        show_error = True,
    )
menopause.py ADDED
@@ -0,0 +1,1383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SWAN Menopause Stage Prediction (pre / peri / post) using self-reported features
3
+ Uses only the uploaded SWAN TSV file (no synthetic data, no external datasets).
4
+
5
+ Outputs:
6
+ - saved artifacts in ./swan_ml_output/
7
+ - documentation.md summarizing steps and results
8
+ - optional CSV outputs for stage predictions and symptom predictions (separate files)
9
+
10
+ Notes:
11
+ - The script attempts to locate a menopause-stage column heuristically (common names like MENOSTAT,
12
+ MENO, MENOSYM, MENOP etc.). Please verify the chosen stage column against the codebook.
13
+ - Self-reported features are identified using name-pattern heuristics (VMS/HOT/SLEEP/CESD/STRESS/MOOD/SMOK/ALCOH/EXER/PHYS/VAG/URINE/SEX/PAIN etc).
14
+ - Duplicate column names are tolerantly handled by renaming duplicates.
15
+ """
16
+
17
+ import os, re, sys, argparse
18
+ import numpy as np
19
+ import pandas as pd
20
+ import importlib
21
+ import sklearn
22
+ import matplotlib
23
+ # Use a non-interactive backend by default so the script can run on servers/CI
24
+ matplotlib.use('Agg')
25
+ import matplotlib.pyplot as plt
26
+ from datetime import datetime, timedelta
27
+
28
+ from sklearn.model_selection import train_test_split
29
+ from sklearn.impute import SimpleImputer
30
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
31
+ from sklearn.compose import ColumnTransformer
32
+ from sklearn.pipeline import Pipeline
33
+ from sklearn.ensemble import RandomForestClassifier
34
+ from sklearn.linear_model import LogisticRegression
35
+ from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
36
+ from sklearn.inspection import permutation_importance
37
+ from sklearn.preprocessing import label_binarize
38
+
39
# --------------------------
# Environment / CLI defaults
# --------------------------
# Defaults may be overridden by environment variables or CLI args below
DATA_PATH = os.environ.get('MENOPAUSE_DATA', "ICPSR_31901/DS0001/31901-0001-Data.tsv")
OUTPUT_DIR = os.environ.get('MENOPAUSE_OUT', "swan_ml_output")

# Parse CLI args (safe to parse here for a script; this will be ignored when imported)
parser = argparse.ArgumentParser(description='Run menopause stage prediction pipeline')
parser.add_argument('--data', '-d', default=DATA_PATH, help='Path to SWAN TSV file')
parser.add_argument('--output', '-o', default=OUTPUT_DIR, help='Output directory for artifacts')
parser.add_argument('--show', action='store_true', help='Show plots interactively (default: off)')
parser.add_argument('--stage-col', default=None, help='Override detected stage column name')
# Symptom cycle prediction CLI options
parser.add_argument('--predict-symptoms', action='store_true', help='Run symptom cycle prediction from CSV input')
parser.add_argument('--symptoms-input', default=None, help='Input CSV for symptom predictions')
parser.add_argument('--symptoms-output', default=None, help='Output CSV to write symptom predictions')
parser.add_argument('--lmp-col', default='LMP', help='Column name used as LMP (date string or day-of-month integer)')
parser.add_argument('--date-col', default=None, help='Column name for target date; if omitted, uses today or VISIT date if present')
parser.add_argument('--cycle-length', type=int, default=28, help='Average cycle length in days for symptom prediction')
# Dual prediction CLI options (separate inputs/outputs for each model)
parser.add_argument('--predict-dual', action='store_true', help='Run stage + symptom predictions using separate input/output files')
parser.add_argument('--stage-input', default=None, help='Input CSV for menopause stage predictions')
parser.add_argument('--stage-output', default=None, help='Output CSV for menopause stage predictions')
parser.add_argument('--stage-model', default='RandomForest', help='Model for stage prediction: RandomForest or LogisticRegression')
parser.add_argument('--forecast-dir', default=OUTPUT_DIR, help='Directory containing saved forecast models')
parser.add_argument('--menopause-stage-col', default=None, help='(Deprecated) Kept for backward compatibility; symptom forecasting no longer uses menopause stage')
# Parse CLI args only when script is run directly; when imported (e.g., during testing), avoid consuming external argv
if __name__ == '__main__':
    args = parser.parse_args()
else:
    # Use defaults when module is imported to avoid interfering with external CLI (pytest, etc.)
    args = parser.parse_args([])

# Re-bind module constants from parsed args so CLI overrides env/defaults.
DATA_PATH = args.data
OUTPUT_DIR = args.output
SHOW_PLOTS = bool(args.show)
STAGE_COL_OVERRIDE = args.stage_col
77
+
78
+ # If user only wants symptom-cycle predictions, provide a fast-path before loading the large TSV
79
+ # Define a light-weight cycle-based symptom forecaster and CSV helper so users can run predictions
80
+ # without training the menopause models (useful for small CSV inputs).
81
class SymptomCycleForecaster:
    """Heuristic cycle-phase symptom forecaster.

    Models hot-flash and mood-change probability as Gaussian bumps over the
    day of the menstrual cycle (day 1 = first day after LMP). No training
    data is needed, so this runs without loading the SWAN TSV.
    """

    def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4,
                 base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5):
        # Gaussian bump parameters: mu/sigma locate the peak on the cycle,
        # base is the floor probability, amp the peak height above the floor.
        self.cycle_length = cycle_length
        self.hot_mu = hot_mu
        self.hot_sigma = hot_sigma
        self.mood_mu = mood_mu
        self.mood_sigma = mood_sigma
        self.base_hot = base_hot
        self.amp_hot = amp_hot
        self.base_mood = base_mood
        self.amp_mood = amp_mood
        self.threshold = threshold  # probability cutoff for the boolean *_pred flags

    def _parse_lmp(self, lmp, reference_date=None):
        """Resolve *lmp* to a datetime, or None when unparseable.

        Integers are treated as a day-of-month anchored to the reference
        month (clamped to 1..28 to stay valid in every month); anything
        else is parsed as a date string.
        """
        if pd.isna(lmp):
            return None
        try:
            day_of_month = int(lmp)
            if reference_date is None:
                anchor = pd.Timestamp(datetime.today()).to_pydatetime()
            else:
                parsed = pd.to_datetime(reference_date, errors='coerce')
                if pd.isna(parsed):
                    anchor = pd.Timestamp(datetime.today()).to_pydatetime()
                else:
                    anchor = parsed.to_pydatetime()
            return datetime(anchor.year, anchor.month, max(1, min(day_of_month, 28)))
        except Exception:
            # Not an integer: fall back to generic date parsing.
            try:
                return pd.to_datetime(lmp, errors='coerce').to_pydatetime()
            except Exception:
                return None

    def compute_cycle_day(self, lmp, target_date=None):
        """Return the 1-based cycle day at *target_date*, or None if LMP is unknown."""
        if target_date is None:
            when = datetime.today()
        else:
            parsed = pd.to_datetime(target_date, errors='coerce')
            when = datetime.today() if pd.isna(parsed) else parsed.to_pydatetime()
        start = self._parse_lmp(lmp, reference_date=when)
        if start is None:
            return None
        elapsed = (when - start).days
        if elapsed < 0:
            # LMP lands after the target date; assume it belongs to the
            # previous cycle and shift back one full cycle length.
            start = start - timedelta(days=self.cycle_length)
            elapsed = (when - start).days
        return int((elapsed % self.cycle_length) + 1)

    def _gauss_prob(self, day, mu, sigma, base, amp):
        """Gaussian-bump probability for *day*; NaN when the day is unknown."""
        if day is None:
            return np.nan
        bump = amp * np.exp(-0.5 * ((day - mu) / float(sigma)) ** 2)
        return float(min(max(base + bump, 0.0), 1.0))

    def predict_single(self, lmp, target_date=None):
        """Forecast both symptoms for one individual; returns a plain dict."""
        day = self.compute_cycle_day(lmp, target_date=target_date)
        out = {'cycle_day': day}
        specs = (
            ('hotflash', self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot),
            ('mood', self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood),
        )
        for name, mu, sigma, base, amp in specs:
            prob = self._gauss_prob(day, mu, sigma, base, amp)
            out[f'{name}_prob'] = prob
            out[f'{name}_pred'] = None if np.isnan(prob) else prob >= self.threshold
        return out

    def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None):
        """Forecast row-wise over a DataFrame; appends prediction columns.

        ``menopause_stage_col`` is accepted for backward compatibility but
        is not used by the forecaster.
        """
        frame = df.copy()

        def _forecast_row(row):
            target = row.get(date_col) if date_col is not None else None
            return pd.Series(self.predict_single(lmp=row.get(lmp_col), target_date=target))

        predictions = frame.apply(_forecast_row, axis=1)
        return pd.concat(
            [frame.reset_index(drop=True), predictions.reset_index(drop=True)],
            axis=1,
        )
163
+
164
+
165
def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None,
                              menopause_stage_col=None, cycle_length=28, **kwargs):
    """Run the cycle-based symptom forecaster over a CSV and write results.

    Reads *input_csv*, appends cycle-day / probability / boolean prediction
    columns, writes *output_csv*, and prints a short preview.
    ``menopause_stage_col`` is passed through for backward compatibility only.
    """
    frame = pd.read_csv(input_csv)
    forecaster = SymptomCycleForecaster(cycle_length=cycle_length)
    predictions = forecaster.predict_df(
        frame, lmp_col=lmp_col, date_col=date_col, menopause_stage_col=menopause_stage_col
    )
    predictions.to_csv(output_csv, index=False)
    print(f"Wrote symptom predictions for {predictions.shape[0]} rows to {output_csv}")
    print("Sample predictions (first 5 rows):")
    preview = [lmp_col] + ['cycle_day','hotflash_prob','hotflash_pred','mood_prob','mood_pred']
    print(predictions[preview].head().to_string())
174
+
175
# If the user requested only symptom predictions from a CSV, run fast-path and exit
# (this avoids loading the large SWAN TSV or training any models).
if args.predict_symptoms:
    if not args.symptoms_input or not args.symptoms_output:
        print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set")
        sys.exit(1)
    else:
        predict_symptoms_from_csv(
            input_csv=args.symptoms_input,
            output_csv=args.symptoms_output,
            lmp_col=args.lmp_col,
            date_col=args.date_col,
            menopause_stage_col=None,  # deprecated: forecasting ignores stage
            cycle_length=args.cycle_length
        )
        sys.exit(0)
190
+
191
# Fast-path for dual predictions (separate stage + symptoms) without loading large TSV
if args.predict_dual:
    if not args.stage_input or not args.stage_output or not args.symptoms_input or not args.symptoms_output:
        print("Error: --stage-input, --stage-output, --symptoms-input, and --symptoms-output are required when --predict-dual is set")
        sys.exit(1)

    # Load saved pipeline directly via joblib to avoid initializing full training pipeline
    import joblib
    model_file = os.path.join(args.forecast_dir, 'rf_pipeline.pkl' if args.stage_model == 'RandomForest' else 'lr_pipeline.pkl')
    try:
        pipeline = joblib.load(model_file)
    except Exception as e:
        print(f"ERROR: Could not load model file '{model_file}': {e}")
        print("Please train the models first (run the script without --predict-dual) or provide correct --forecast-dir")
        sys.exit(1)

    # Stage predictions
    try:
        stage_data = pd.read_csv(args.stage_input)
    except Exception as e:
        print(f"ERROR: Could not read stage input CSV '{args.stage_input}': {e}")
        sys.exit(1)

    # Columns treated as row identifiers (passed through, never fed to the model)
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in stage_data.columns if c not in id_cols]

    # Attempt to load feature metadata so we can reindex inputs to expected features
    import json
    metadata_path = os.path.join(args.forecast_dir, 'forecast_metadata.json')
    try:
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        expected_features = metadata.get('feature_names', feature_cols)
    except Exception:
        # Metadata missing/unreadable: fall back to whatever columns the CSV has
        expected_features = feature_cols

    # Reindex so the pipeline sees the training-time feature set (absent columns -> NaN)
    X = stage_data.reindex(columns=expected_features, fill_value=np.nan)
    preds = pd.DataFrame({'predicted_stage': pipeline.predict(X), 'model': args.stage_model})
    try:
        proba = pipeline.predict_proba(X)
        # Last named step of the sklearn Pipeline holds the fitted classifier (for classes_)
        final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
        preds['confidence'] = np.max(proba, axis=1)
        for i, cls in enumerate(final_est.classes_):
            preds[f'prob_{cls}'] = proba[:, i]
    except Exception:
        # Estimator may not support predict_proba; still emit class predictions
        preds['confidence'] = np.nan

    id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None
    if id_data is not None:
        stage_results = pd.concat([id_data.reset_index(drop=True), preds.reset_index(drop=True)], axis=1)
    else:
        # No identifier columns supplied: synthesize a 1-based 'individual' index
        stage_results = preds.reset_index(drop=True)
        stage_results.insert(0, 'individual', range(1, len(stage_results) + 1))

    stage_results.to_csv(args.stage_output, index=False)
    print(f"Wrote stage predictions for {stage_results.shape[0]} rows to {args.stage_output}")

    # Symptom predictions (independent input/output)
    try:
        symptom_data = pd.read_csv(args.symptoms_input)
    except Exception as e:
        print(f"ERROR: Could not read symptom input CSV '{args.symptoms_input}': {e}")
        sys.exit(1)

    # Prefer an explicit --date-col; otherwise use a 'date' column when present
    date_col = args.date_col if args.date_col else ('date' if 'date' in symptom_data.columns else None)
    fore = SymptomCycleForecaster(cycle_length=args.cycle_length)
    symptom_results = fore.predict_df(symptom_data, lmp_col=args.lmp_col, date_col=date_col)
    symptom_results.to_csv(args.symptoms_output, index=False)
    print(f"Wrote symptom predictions for {symptom_results.shape[0]} rows to {args.symptoms_output}")
    sys.exit(0)
261
+
262
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
263
+
264
+ # --------------------------
265
+ # Utility: make column names unique (pandas allows duplicates)
266
+ # --------------------------
267
def make_unique_columns(cols):
    """Return *cols* with duplicate names disambiguated.

    The first occurrence keeps its name; later duplicates become
    ``name__dup1``, ``name__dup2``, ... in order of appearance.
    """
    seen = {}
    unique = []
    for name in cols:
        if name in seen:
            seen[name] += 1
            unique.append(f"{name}__dup{seen[name]}")
        else:
            seen[name] = 0
            unique.append(name)
    return unique
278
+
279
+ # --------------------------
280
+ # 1. Load data
281
+ # --------------------------
282
+ # Guard: only run training and heavy data loading when script is executed directly
283
+ if __name__ == '__main__' and os.path.exists(DATA_PATH):
284
+ print("Loading data from:", DATA_PATH)
285
+ df = pd.read_csv(DATA_PATH, sep='\t', low_memory=False)
286
+ print("Original shape:", df.shape)
287
+
288
+ # make column names unique for robust selection (duplicates -> __dup1, __dup2)
289
+ df.columns = make_unique_columns(df.columns.tolist())
290
+
291
+ # Show a few columns (first 40) so user can inspect if running interactively
292
+ print("First 40 column names (for inspection):")
293
+ print(df.columns[:40].tolist())
294
+
295
+ # --------------------------
296
+ # 2. Identify candidate self-reported features and menopause-stage variable
297
+ # --------------------------
298
+ # Heuristic patterns for self-report variables (adjust if you'd like to include additional columns)
299
+ selfreport_patterns = [
300
+ r'VMS', r'HOT', r'HOTFL', r'NIGHTSW', r'SLEEP', r'CESD', r'STRESS', r'MOOD',
301
+ r'SMOK', r'ALCOH', r'ALCO', r'EXER', r'PHYS', r'ACTIV', r'VAG', r'URINE', r'SEX', r'PAIN',
302
+ r'FATIG', r'IRRIT', r'ANXI', r'DEPRESS', r'BLEED', r'MENSE', r'PERIOD', r'LMP',
303
+ r'HOTSW', r'QOL', r'DRY'
304
+ ]
305
+ # Exclude laboratory/biomarker variable name patterns
306
+ biomarker_exclude = r'E2|FSH|GLUCOSE|CHOLESTEROL|HDL|TRIG|SHBG|DHEAS|INSULIN|BMD|BP|HEIGHT|WEIGHT'
307
+
308
+ upper_cols = {c: c.upper() for c in df.columns}
309
+
310
+ selfreport_cols = []
311
+ for orig, up in upper_cols.items():
312
+ for pat in selfreport_patterns:
313
+ if re.search(pat, up):
314
+ # skip biomarkers that match both symptom patterns and biomarker patterns
315
+ if re.search(biomarker_exclude, up):
316
+ continue
317
+ selfreport_cols.append(orig)
318
+ break
319
+
320
+ # Also include basic self-report demographics commonly present (AGE, RACE)
321
+ for dem in ['AGE7','AGE','RACE','LANGINT7','LANGINT']:
322
+ if dem in df.columns and dem not in selfreport_cols:
323
+ selfreport_cols.append(dem)
324
+
325
+ # Deduplicate preserving order
326
+ seen=set()
327
+ selfreport_cols = [x for x in selfreport_cols if not (x in seen or seen.add(x))]
328
+
329
+ print(f"Found {len(selfreport_cols)} candidate self-reported columns (first 50 shown):")
330
+ print(selfreport_cols[:50])
331
+
332
+ # Identify menopause-stage variable heuristically
333
+ stage_cand_patterns = [r'MENOSTAT', r'MENOSYM', r'MENO', r'MENOP', r'MENST', r'MENSE', r'STATUS']
334
+ stage_candidates = [c for c in df.columns if any(re.search(p, c, flags=re.I) for p in stage_cand_patterns)]
335
+ print("Stage-like candidate columns (found):", stage_candidates[:10])
336
+
337
+ # If user provided an override for stage column via CLI, honor it (if present in data)
338
+ if STAGE_COL_OVERRIDE:
339
+ if STAGE_COL_OVERRIDE in df.columns:
340
+ print(f"Using overridden stage column: {STAGE_COL_OVERRIDE}")
341
+ stage_candidates = [STAGE_COL_OVERRIDE]
342
+ else:
343
+ print(f"Warning: requested stage column '{STAGE_COL_OVERRIDE}' not present in data; proceeding with heuristic detection")
344
+
345
+ # If multiple candidates choose one with few unique values (likely coded categories)
346
+ stage_col = None
347
+ for c in stage_candidates:
348
+ nunique = df[c].nunique(dropna=True)
349
+ # prefer small discrete sets (e.g., 2-6 categories)
350
+ if 1 < nunique <= 20:
351
+ stage_col = c
352
+ break
353
+
354
+ if stage_col is None and stage_candidates:
355
+ # fallback to first candidate
356
+ stage_col = stage_candidates[0]
357
+
358
+ if stage_col is None:
359
+ raise RuntimeError("No menopause-stage-like column found automatically. Inspect df.columns and pick the proper variable (e.g., MENOSTAT).")
360
+
361
+ print("Selected stage column:", stage_col, " unique values:", df[stage_col].nunique(dropna=True))
362
+ print("Sample raw counts:")
363
+ print(df[stage_col].value_counts(dropna=False).head(20))
364
+
365
+ # --------------------------
366
+ # 3. Create working dataframe with self-report features + stage
367
+ # --------------------------
368
+ use_cols = [stage_col] + [c for c in selfreport_cols if c in df.columns and c != stage_col]
369
+ data = df[use_cols].copy()
370
+
371
+ # Replace common SWAN missing codes with NaN
372
+ missing_values = [-9, -8, -7, -1, '.', 'NA', 'N/A', '999', 9999]
373
+ data.replace(missing_values, np.nan, inplace=True)
374
+
375
+ # Try convert object columns to numeric when appropriate
376
+ for col in data.columns:
377
+ if data[col].dtype == object:
378
+ coerced = pd.to_numeric(data[col].astype(str).str.strip(), errors='coerce')
379
+ # If many values become numeric, use numeric version; else leave as categorical string
380
+ if coerced.notna().sum() > len(coerced) * 0.5:
381
+ data[col] = coerced
382
+ else:
383
+ # replace blank/'nan' strings with np.nan
384
+ data[col] = data[col].astype(str).str.strip().replace({'nan': np.nan, '': np.nan})
385
+
386
+ # --------------------------
387
+ # 4. Map stage variable to standardized labels {pre, peri, post}
388
+ # *Important*: this is heuristic. Verify using the codebook and adjust mapping if needed.
389
+ # --------------------------
390
def map_stage_to_labels(series):
    """Heuristically standardize a raw stage column to {'pre','peri','post'}.

    Tries textual labels first; otherwise maps numeric codes by ordering
    (min -> 'pre', middle -> 'peri', max -> 'post'; two levels -> pre/post).
    Returns an all-NaN series when no mapping applies. Verify against the
    SWAN codebook before trusting this mapping.
    """
    values = series.copy()
    try:
        observed = {str(v).lower() for v in values.dropna().unique()}
    except Exception:
        observed = set()

    # Textual mapping path: any recognizable "pre" spelling triggers it.
    if observed & {'pre', 'premenopausal', 'premenopause', 'pre-menopausal'}:
        lowered = values.astype(str).str.lower()
        lowered = lowered.replace({
            'premenopausal': 'pre', 'pre-menopausal': 'pre', 'pre-menopause': 'pre', 'pre': 'pre',
            'perimenopausal': 'peri', 'peri-menopausal': 'peri', 'peri': 'peri',
            'postmenopausal': 'post', 'post-menopausal': 'post', 'post': 'post',
        })
        return lowered.map({'pre': 'pre', 'peri': 'peri', 'post': 'post'})

    # Numeric mapping heuristic: ordered codes -> pre / peri / post.
    numeric = pd.to_numeric(values, errors='coerce')
    levels = sorted(numeric.dropna().unique().tolist())
    if len(levels) >= 3:
        coding = {levels[0]: 'pre', levels[len(levels) // 2]: 'peri', levels[-1]: 'post'}
        return numeric.map(coding)
    if len(levels) == 2:
        # Two-level fallback: assume lower code is pre, higher is post.
        return numeric.map({levels[0]: 'pre', levels[1]: 'post'})
    # Nothing mappable: all-NaN result with the original index.
    return pd.Series([np.nan] * len(values), index=values.index)
415
+
416
+ mapped_stage = map_stage_to_labels(data[stage_col])
417
+ # If mapping failed (too many NaNs), attempt a simple bleed-based heuristic (last menstrual period)
418
+ if mapped_stage.isna().mean() > 0.9:
419
+ bleed_candidates = [c for c in data.columns if re.search(r'LMP|BLEED|PERIOD|MENSTR', c, flags=re.I)]
420
+ if len(bleed_candidates) > 0:
421
+ lcol = bleed_candidates[0]
422
+ lnum = pd.to_numeric(data[lcol], errors='coerce')
423
+ mapped_stage = pd.Series(index=data.index, dtype=object)
424
+ mapped_stage[lnum.isna()] = 'post'
425
+ mapped_stage[lnum.notna()] = 'pre'
426
+ else:
427
+ raise RuntimeError("Failed to map stage variable to pre/peri/post and no bleed/LMP variable found.")
428
+
429
+ data['_menopause_stage'] = mapped_stage
430
+ print("Mapped stage counts (after heuristic mapping):")
431
+ print(data['_menopause_stage'].value_counts(dropna=False))
432
+
433
+ # Drop rows with no mapped stage
434
+ data = data[~data['_menopause_stage'].isna()].copy()
435
+ print("Rows available for modeling:", data.shape[0])
436
+
437
+ # --------------------------
438
+ # 5. Feature selection for modeling
439
+ # Keep only self-report fields with enough non-missing values and >1 unique value
440
+ # --------------------------
441
+ feature_candidates = [c for c in use_cols if c != stage_col]
442
+ selected_features = []
443
+ for c in feature_candidates:
444
+ non_null = data[c].notna().sum()
445
+ # require at least 2% nonmissing or minimum 50 observations
446
+ if non_null < max(50, len(data) * 0.02):
447
+ continue
448
+ if data[c].nunique(dropna=True) <= 1:
449
+ continue
450
+ selected_features.append(c)
451
+
452
+ print("Number of features selected for modeling:", len(selected_features))
453
+ print("First 40 features (if many):", selected_features[:40])
454
+
455
+ # --------------------------
456
+ # 6. Preprocessing pipeline
457
+ # Numeric features: impute mean
458
+ # Categorical features: impute most frequent + one-hot encode
459
+ # Normalization: only added for logistic regression pipeline (tree-based RF doesn't need scaling)
460
+ # --------------------------
461
+ numeric_feats = [c for c in selected_features if pd.api.types.is_numeric_dtype(data[c])]
462
+ cat_feats = [c for c in selected_features if c not in numeric_feats]
463
+
464
+ from sklearn.pipeline import Pipeline
465
+ from sklearn.compose import ColumnTransformer
466
+
467
+ numeric_transformer = Pipeline(steps=[
468
+ ('imputer', SimpleImputer(strategy='mean'))
469
+ ])
470
+
471
+ # Construct OneHotEncoder in a sklearn-version compatible way
472
+ try:
473
+ ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
474
+ except TypeError:
475
+ # older sklearn versions use `sparse` kwarg
476
+ ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
477
+
478
+ categorical_transformer = Pipeline(steps=[
479
+ ('imputer', SimpleImputer(strategy='most_frequent')),
480
+ ('onehot', ohe)
481
+ ])
482
+
483
+ preprocessor = ColumnTransformer(transformers=[
484
+ ('num', numeric_transformer, numeric_feats),
485
+ ('cat', categorical_transformer, cat_feats)
486
+ ], remainder='drop')
487
+
488
# Two pipelines: RandomForest (no scaling) and LogisticRegression (scaling)
# NOTE(review): both pipelines share the SAME `preprocessor` object; Pipeline.fit
# fits steps in place, so fitting the second pipeline refits it (on the same
# data here, so the result is unchanged) — confirm this is intended.
rf_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

lr_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=1000))
])

# --------------------------
# 7. Prepare data, train/test split
# --------------------------
X = data[selected_features].copy()
y = data['_menopause_stage'].copy().astype(str)  # values: 'pre','peri','post' (hopefully)

print("Target class distribution:")
print(y.value_counts())

# Stratified split preserves class proportions across train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
print("Train / test sizes:", X_train.shape[0], X_test.shape[0])

# --------------------------
# 8. Train models
# --------------------------
print("Training RandomForest...")
rf_pipeline.fit(X_train, y_train)
print("RandomForest trained.")

print("Training LogisticRegression (multinomial)...")
lr_pipeline.fit(X_train, y_train)
print("LogisticRegression trained.")
523
+
524
+ # --------------------------
525
+ # 9. Predictions and assessment
526
+ # --------------------------
527
+ def evaluate_model(pipeline, X_test, y_test, model_name, output_dir=OUTPUT_DIR):
528
+ y_pred = pipeline.predict(X_test)
529
+ report = classification_report(y_test, y_pred)
530
+ print(f"\n=== {model_name} Classification Report ===\n{report}")
531
+ # confusion matrix
532
+ labels = sorted(y_test.unique())
533
+ cm = confusion_matrix(y_test, y_pred, labels=labels)
534
+ print(f"{model_name} Confusion Matrix (rows=true, cols=pred):\nLabels: {labels}\n{cm}")
535
+ # Save classification report
536
+ with open(os.path.join(output_dir, f"classification_report_{model_name.replace(' ','_')}.txt"), "w") as f:
537
+ f.write(report)
538
+ # Plot confusion matrix with matplotlib
539
+ fig, ax = plt.subplots(figsize=(5,4))
540
+ im = ax.imshow(cm, interpolation='nearest')
541
+ ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=45)
542
+ ax.set_yticks(range(len(labels))); ax.set_yticklabels(labels)
543
+ ax.set_title(f"{model_name} Confusion Matrix")
544
+ for i in range(cm.shape[0]):
545
+ for j in range(cm.shape[1]):
546
+ ax.text(j, i, format(cm[i, j], 'd'), ha="center", va="center")
547
+ plt.tight_layout()
548
+ plt.savefig(os.path.join(output_dir, f"{model_name.replace(' ','_')}_confusion_matrix.png"))
549
+ # Show plots only when requested; otherwise close to free resources (non-interactive default)
550
+ if SHOW_PLOTS:
551
+ plt.show()
552
+ else:
553
+ plt.close('all')
554
+ return y_pred, cm
555
+
556
rf_pred, rf_cm = evaluate_model(rf_pipeline, X_test, y_test, "RandomForest")
lr_pred, lr_cm = evaluate_model(lr_pipeline, X_test, y_test, "LogisticRegression")

# 10. Feature importance
# Extract feature names after preprocessing (numerics stay same; categorical one-hot create names)
pre = rf_pipeline.named_steps['pre']
# Get numeric feature names
feature_names = []
if len(numeric_feats) > 0:
    feature_names.extend(numeric_feats)
if len(cat_feats) > 0:
    # Get onehot output names
    ohe = pre.named_transformers_['cat'].named_steps['onehot']
    try:
        cat_onehot_names = list(ohe.get_feature_names_out(cat_feats))
    except Exception:
        try:
            # sklearn < 1.0 spelling of the same accessor
            cat_onehot_names = list(ohe.get_feature_names(cat_feats))
        except Exception:
            # Last resort: rebuild "<column>_<category>" names from the fitted
            # encoder. An empty fallback here would leave feature_names shorter
            # than feature_importances_ and make the DataFrame constructor
            # below fail on mismatched lengths.
            cat_onehot_names = [
                f"{col}_{val}"
                for col, cats in zip(cat_feats, ohe.categories_)
                for val in cats
            ]
    feature_names.extend(cat_onehot_names)
# Feature importances from RandomForest
rf_model = rf_pipeline.named_steps['rf']
importances = rf_model.feature_importances_
imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)
imp_df.to_csv(os.path.join(OUTPUT_DIR, "rf_feature_importances.csv"), index=False)
print("\nTop 20 RF feature importances:")
print(imp_df.head(20).to_string(index=False))
582
+
583
# Permutation importance (robust)
print("Computing permutation importance (this can take some time)...")
perm = permutation_importance(rf_pipeline, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
# argsort()[::-1] orders features from most to least important
perm_idx = perm.importances_mean.argsort()[::-1]
perm_df = pd.DataFrame({
    'feature': np.array(feature_names)[perm_idx],
    'importance_mean': perm.importances_mean[perm_idx],
    'importance_std': perm.importances_std[perm_idx]
})
perm_df.to_csv(os.path.join(OUTPUT_DIR, "rf_permutation_importances.csv"), index=False)
print("Top 20 permutation importances:")
print(perm_df.head(20).to_string(index=False))

# Plot RF top features
topn = min(20, imp_df.shape[0])
fig, ax = plt.subplots(figsize=(8,6))
# [::-1] reverses the order so the largest bar is drawn at the top
ax.barh(imp_df['feature'].head(topn)[::-1], imp_df['importance'].head(topn)[::-1])
ax.set_title("RandomForest: Top feature importances")
ax.set_xlabel("Importance")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "rf_top_feature_importances.png"))
if SHOW_PLOTS:
    plt.show()
else:
    plt.close('all')
608
+
609
# 11. ROC curves (one-vs-rest) if predict_proba available
def plot_multiclass_roc(pipeline, X_test, y_test, model_name):
    """Plot and save one-vs-rest ROC curves, one figure per class.

    Skips with a message when the pipeline cannot produce probabilities.
    Figures are saved to OUTPUT_DIR and shown only when SHOW_PLOTS is set.
    """
    if not hasattr(pipeline, "predict_proba"):
        print(f"{model_name} has no predict_proba; skipping ROC plot.")
        return
    # Must use same class order as pipeline's final estimator
    final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
    classes = final_est.classes_
    y_test_bin = label_binarize(y_test, classes=classes)
    y_score = pipeline.predict_proba(X_test)
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.3f}")
        # Dashed diagonal = chance-level classifier
        plt.plot([0,1],[0,1], linestyle='--')
        plt.title(f"{model_name} ROC for class {cls}")
        plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
        plt.legend(loc='lower right')
        plt.savefig(os.path.join(OUTPUT_DIR, f"{model_name.replace(' ','_')}_ROC_{cls}.png"))
        if SHOW_PLOTS:
            plt.show()
        else:
            plt.close('all')
633
+
634
print("Plotting ROC curves for RandomForest and LogisticRegression (if available)...")
# Only plot when running as a script AND training actually happened in this
# process (the globals() check guards against import-time execution paths).
if __name__ == '__main__' and 'rf_pipeline' in globals():
    plot_multiclass_roc(rf_pipeline, X_test, y_test, "RandomForest")
    plot_multiclass_roc(lr_pipeline, X_test, y_test, "LogisticRegression")
638
+
639
+ # ==========================================================================================
640
+ # 12. FORECASTING MODULE: Predict menopausal stage for new individuals
641
+ # ==========================================================================================
642
+ class MenopauseForecast:
643
+ """
644
+ Forecasting module for predicting menopausal stage (pre/peri/post) given self-reported features.
645
+
646
+ This class encapsulates the trained models and preprocessing pipeline to make predictions
647
+ on new data with the same features used during training.
648
+ """
649
+
650
+ def __init__(self, rf_pipeline, lr_pipeline, feature_names, stage_classes):
651
+ """
652
+ Initialize the forecaster with trained pipelines.
653
+
654
+ Parameters:
655
+ -----------
656
+ rf_pipeline : sklearn Pipeline
657
+ Trained RandomForest pipeline
658
+ lr_pipeline : sklearn Pipeline
659
+ Trained LogisticRegression pipeline
660
+ feature_names : list
661
+ List of feature column names used for training
662
+ stage_classes : list
663
+ List of possible menopause stage classes (e.g., ['pre', 'peri', 'post'])
664
+ """
665
+ self.rf_pipeline = rf_pipeline
666
+ self.lr_pipeline = lr_pipeline
667
+ self.feature_names = feature_names
668
+ self.stage_classes = stage_classes
669
+ self.models = {
670
+ 'RandomForest': rf_pipeline,
671
+ 'LogisticRegression': lr_pipeline
672
+ }
673
+
674
+ def predict_single(self, feature_dict, model='RandomForest', return_proba=True):
675
+ """
676
+ Predict menopausal stage for a single individual.
677
+
678
+ Parameters:
679
+ -----------
680
+ feature_dict : dict
681
+ Dictionary with feature names as keys and values for prediction.
682
+ Example: {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...}
683
+ model : str
684
+ Which model to use for prediction: 'RandomForest' or 'LogisticRegression'
685
+ return_proba : bool
686
+ If True, return prediction probabilities; otherwise just the class label
687
+
688
+ Returns:
689
+ --------
690
+ dict : Contains 'stage', 'confidence', and optionally 'probabilities'
691
+ """
692
+ if model not in self.models:
693
+ raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}")
694
+
695
+ # Create DataFrame with single row, reindex to match training features
696
+ X = pd.DataFrame([feature_dict]).reindex(columns=self.feature_names, fill_value=np.nan)
697
+
698
+ pipeline = self.models[model]
699
+ prediction = pipeline.predict(X)[0]
700
+
701
+ result = {
702
+ 'stage': prediction,
703
+ 'model': model,
704
+ 'confidence': None,
705
+ 'probabilities': None
706
+ }
707
+
708
+ if return_proba:
709
+ try:
710
+ proba = pipeline.predict_proba(X)[0]
711
+ result['confidence'] = float(np.max(proba))
712
+ result['probabilities'] = {
713
+ cls: float(prob)
714
+ for cls, prob in zip(pipeline.named_steps[list(pipeline.named_steps.keys())[-1]].classes_, proba)
715
+ }
716
+ except Exception as e:
717
+ print(f"Warning: Could not compute probabilities: {e}")
718
+
719
+ return result
720
+
721
+ def predict_batch(self, df, model='RandomForest', return_proba=True):
722
+ """
723
+ Predict menopausal stage for multiple individuals (batch prediction).
724
+
725
+ Parameters:
726
+ -----------
727
+ df : pd.DataFrame
728
+ DataFrame with feature columns matching training features.
729
+ Missing values will be handled by the preprocessing pipeline.
730
+ model : str
731
+ Which model to use: 'RandomForest' or 'LogisticRegression'
732
+ return_proba : bool
733
+ If True, return prediction probabilities
734
+
735
+ Returns:
736
+ --------
737
+ pd.DataFrame : Contains 'predicted_stage', 'confidence', and probability columns
738
+ """
739
+ if model not in self.models:
740
+ raise ValueError(f"Model '{model}' not found. Available: {list(self.models.keys())}")
741
+
742
+ # Reindex to match training features
743
+ X = df.reindex(columns=self.feature_names, fill_value=np.nan)
744
+
745
+ pipeline = self.models[model]
746
+ predictions = pipeline.predict(X)
747
+
748
+ result_df = pd.DataFrame({
749
+ 'predicted_stage': predictions,
750
+ 'model': model
751
+ })
752
+
753
+ if return_proba:
754
+ try:
755
+ proba = pipeline.predict_proba(X)
756
+ final_est = pipeline.named_steps[list(pipeline.named_steps.keys())[-1]]
757
+ result_df['confidence'] = np.max(proba, axis=1)
758
+
759
+ # Add probability column for each class
760
+ for i, cls in enumerate(final_est.classes_):
761
+ result_df[f'prob_{cls}'] = proba[:, i]
762
+ except Exception as e:
763
+ print(f"Warning: Could not compute probabilities: {e}")
764
+
765
+ return result_df
766
+
767
+ def compare_models(self, feature_dict):
768
+ """
769
+ Compare predictions from both RandomForest and LogisticRegression models.
770
+
771
+ Parameters:
772
+ -----------
773
+ feature_dict : dict
774
+ Feature values for the individual
775
+
776
+ Returns:
777
+ --------
778
+ dict : Predictions and probabilities from both models
779
+ """
780
+ rf_result = self.predict_single(feature_dict, model='RandomForest', return_proba=True)
781
+ lr_result = self.predict_single(feature_dict, model='LogisticRegression', return_proba=True)
782
+
783
+ return {
784
+ 'RandomForest': rf_result,
785
+ 'LogisticRegression': lr_result
786
+ }
787
+
788
+ def get_feature_info(self):
789
+ """Return information about required features."""
790
+ return {
791
+ 'num_features': len(self.feature_names),
792
+ 'feature_names': self.feature_names,
793
+ 'stage_classes': self.stage_classes
794
+ }
795
+
796
+
797
def create_forecast_example():
    """
    Create an example forecast instance and demonstrate usage.

    This function is robust: if the training artifacts (`rf_pipeline`, `lr_pipeline`,
    `selected_features`, `X_train`, `X_test`) are not available in memory (e.g., when
    the module is imported in another process), it attempts to load saved pipelines
    from `OUTPUT_DIR` via `load_forecast_model()` and uses placeholder inputs.

    Returns:
    --------
    MenopauseForecast : the initialized (and demonstrated) forecaster

    Raises:
    -------
    RuntimeError : when neither in-memory artifacts nor saved models are available
    """
    print("\n" + "="*80)
    print("FORECASTING MODULE EXAMPLE: Predicting Menopausal Stage")
    print("="*80)

    # Determine pipelines and feature metadata (use in-memory if available, else load from disk)
    try:
        _rf = rf_pipeline
        _lr = lr_pipeline
        _features = selected_features
        _stage_classes = sorted(y.unique().tolist())
        has_training = True
    except NameError:
        # Module-level training globals are missing: fall back to artifacts on disk.
        print("Training artifacts not present in memory; attempting to load from disk...")
        try:
            _loaded = load_forecast_model(OUTPUT_DIR)
            _rf = _loaded.rf_pipeline
            _lr = _loaded.lr_pipeline
            _features = _loaded.feature_names
            _stage_classes = _loaded.stage_classes
            has_training = False
        except Exception as e:
            raise RuntimeError(f"Failed to initialize forecaster from disk: {e}")

    forecast = MenopauseForecast(
        rf_pipeline=_rf,
        lr_pipeline=_lr,
        feature_names=_features,
        stage_classes=_stage_classes
    )

    print(f"\nForecaster initialized with {len(_features)} features")
    print(f"Predicting stages: {_stage_classes}")

    # Example 1: Single individual prediction
    print("\n--- Example 1: Predict for a single individual ---")
    example_individual = {}
    n_example_feats = min(10, len(_features))

    if has_training:
        # Use per-feature medians from the training split as a plausible individual.
        for feat in _features[:n_example_feats]:
            try:
                example_individual[feat] = float(pd.to_numeric(X_train[feat], errors='coerce').median())
            except Exception:
                # Fallback to mode or NaN
                try:
                    example_individual[feat] = X_train[feat].mode().iloc[0]
                except Exception:
                    example_individual[feat] = np.nan
    else:
        # No training DF available; provide NaN placeholders to let pipeline impute
        for feat in _features[:n_example_feats]:
            example_individual[feat] = np.nan

    result = forecast.predict_single(example_individual, model='RandomForest', return_proba=True)
    print(f"Predicted stage: {result.get('stage')}")
    print(f"Confidence: {result.get('confidence'):.3f}" if result.get('confidence') is not None else "Confidence: None")
    if result.get('probabilities'):
        print("Stage probabilities:")
        for stage, prob in sorted(result['probabilities'].items()):
            print(f"  {stage}: {prob:.3f}")

    # Example 2: Compare models
    print("\n--- Example 2: Compare RandomForest vs LogisticRegression ---")
    comparison = forecast.compare_models(example_individual)
    for model_name, cres in comparison.items():
        print(f"\n{model_name}:")
        print(f"  Predicted stage: {cres.get('stage')}")
        print(f"  Confidence: {cres.get('confidence'):.3f}" if cres.get('confidence') is not None else "  Confidence: None")

    # Example 3: Batch prediction on a small sample (either X_test if available or placeholder rows)
    print("\n--- Example 3: Batch prediction (small sample) ---")
    if has_training:
        try:
            test_sample = X_test.iloc[:5].copy()
            batch_results = forecast.predict_batch(test_sample, model='RandomForest', return_proba=True)
            print(batch_results.to_string())
        except Exception as e:
            print(f"Batch prediction failed on training sample: {e}")
    else:
        # Create a small placeholder DataFrame with feature columns filled with NaN
        placeholder = pd.DataFrame([{f: np.nan for f in _features[:n_example_feats]}])
        batch_results = forecast.predict_batch(placeholder, model='RandomForest', return_proba=True)
        print(batch_results.to_string())

    return forecast
891
+
892
+
893
def save_forecast_model(forecast_instance, output_dir=OUTPUT_DIR):
    """
    Save the forecast model instance for later use (optional: can use joblib for production).

    Saves feature/class metadata as JSON plus both trained pipelines via joblib,
    so the forecaster can be fully reconstructed by `load_forecast_model()`.

    Parameters:
    -----------
    forecast_instance : MenopauseForecast
        The forecaster to save
    output_dir : str
        Directory to save metadata; created if it does not already exist
    """
    import json
    import joblib

    # `output_dir` is caller-supplied: make sure it exists before writing,
    # otherwise open()/joblib.dump below fail with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    metadata = {
        'feature_names': forecast_instance.feature_names,
        'stage_classes': forecast_instance.stage_classes,
        'num_features': len(forecast_instance.feature_names)
    }

    # Save metadata as JSON
    with open(os.path.join(output_dir, 'forecast_metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=2)

    # Save trained pipelines using joblib (allows full reuse)
    joblib.dump(forecast_instance.rf_pipeline, os.path.join(output_dir, 'rf_pipeline.pkl'))
    joblib.dump(forecast_instance.lr_pipeline, os.path.join(output_dir, 'lr_pipeline.pkl'))

    print(f"Forecast model saved to {output_dir}")
    print("  - forecast_metadata.json")
    print("  - rf_pipeline.pkl")
    print("  - lr_pipeline.pkl")
928
+
929
+
930
def load_forecast_model(output_dir=OUTPUT_DIR):
    """
    Load a previously saved forecast model.

    Counterpart of `save_forecast_model()`: reads the JSON metadata and the
    two joblib-pickled pipelines and rebuilds a MenopauseForecast.

    Parameters:
    -----------
    output_dir : str
        Directory containing saved models

    Returns:
    --------
    MenopauseForecast : The loaded forecaster

    Raises:
    -------
    FileNotFoundError : when the metadata or pickle files are missing
    """
    import json
    import joblib

    # Load metadata (feature names and stage classes)
    with open(os.path.join(output_dir, 'forecast_metadata.json'), 'r') as f:
        metadata = json.load(f)

    # Load pipelines
    rf_pipeline_loaded = joblib.load(os.path.join(output_dir, 'rf_pipeline.pkl'))
    lr_pipeline_loaded = joblib.load(os.path.join(output_dir, 'lr_pipeline.pkl'))

    # Recreate forecaster
    forecast = MenopauseForecast(
        rf_pipeline=rf_pipeline_loaded,
        lr_pipeline=lr_pipeline_loaded,
        feature_names=metadata['feature_names'],
        stage_classes=metadata['stage_classes']
    )

    print(f"Forecast model loaded from {output_dir}")
    return forecast
964
+
965
+
966
+ # Initialize and demonstrate the forecasting module
967
+
968
+ # Symptom cycle forecasting (defined earlier near CLI args)
969
class SymptomCycleForecaster:
    """
    Predicts the probability of hot flashes and mood changes within a menstrual cycle
    based on last menstrual period (LMP) date and target date.
    """
    def __init__(self, cycle_length=28, hot_mu=14, hot_sigma=5, mood_mu=26, mood_sigma=4,
                 base_hot=0.1, amp_hot=0.4, base_mood=0.1, amp_mood=0.45, threshold=0.5):
        # Cycle geometry
        self.cycle_length = cycle_length
        # Gaussian bump parameters per symptom: peak day (mu) and spread (sigma)
        self.hot_mu, self.hot_sigma = hot_mu, hot_sigma
        self.mood_mu, self.mood_sigma = mood_mu, mood_sigma
        # Probability floor (base) and bump height (amp) per symptom
        self.base_hot, self.amp_hot = base_hot, amp_hot
        self.base_mood, self.amp_mood = base_mood, amp_mood
        # Cut-off applied to probabilities for the boolean *_pred outputs
        self.threshold = threshold

    def _parse_lmp(self, lmp, reference_date=None):
        """Parse LMP input which may be a full date string or an integer day-of-month."""
        if pd.isna(lmp):
            return None
        try:
            # Integer day-of-month path: anchor it in the reference date's month.
            day_of_month = int(lmp)
            if reference_date is None:
                anchor = pd.Timestamp(datetime.today()).to_pydatetime()
            else:
                anchor = pd.to_datetime(reference_date, errors='coerce')
                if pd.isna(anchor):
                    anchor = pd.Timestamp(datetime.today()).to_pydatetime()
                else:
                    anchor = anchor.to_pydatetime()
            # Clamp the day into [1, 28] so it is valid in any month
            return datetime(anchor.year, anchor.month, max(1, min(day_of_month, 28)))
        except Exception:
            # Not int-like: try to parse as a full date string instead.
            try:
                return pd.to_datetime(lmp, errors='coerce').to_pydatetime()
            except Exception:
                return None

    def compute_cycle_day(self, lmp, target_date=None):
        """Return 1-based cycle day (1..cycle_length) or None if cannot compute."""
        if target_date is None:
            when = datetime.today()
        else:
            when = pd.to_datetime(target_date, errors='coerce')
            when = datetime.today() if pd.isna(when) else when.to_pydatetime()
        start = self._parse_lmp(lmp, reference_date=when)
        if start is None:
            return None
        elapsed = (when - start).days
        if elapsed < 0:
            # LMP in the future: treat it as belonging to the previous cycle.
            start = start - timedelta(days=self.cycle_length)
            elapsed = (when - start).days
        return int((elapsed % self.cycle_length) + 1)

    def _gauss_prob(self, day, mu, sigma, base, amp):
        """Baseline plus a Gaussian bump centred on ``mu``, clipped to [0, 1]."""
        if day is None:
            return np.nan
        z = (day - mu) / float(sigma)
        raw = base + amp * np.exp(-0.5 * z * z)
        return float(min(max(raw, 0.0), 1.0))

    def predict_single(self, lmp, target_date=None):
        """Predict hot-flash and mood probabilities for one LMP + target date."""
        cycle_day = self.compute_cycle_day(lmp, target_date=target_date)
        p_hot = self._gauss_prob(cycle_day, self.hot_mu, self.hot_sigma, self.base_hot, self.amp_hot)
        p_mood = self._gauss_prob(cycle_day, self.mood_mu, self.mood_sigma, self.base_mood, self.amp_mood)
        return {
            'cycle_day': cycle_day,
            'hotflash_prob': p_hot,
            'hotflash_pred': None if np.isnan(p_hot) else p_hot >= self.threshold,
            'mood_prob': p_mood,
            'mood_pred': None if np.isnan(p_mood) else p_mood >= self.threshold
        }

    def predict_df(self, df, lmp_col='LMP', date_col=None, menopause_stage_col=None):
        """Apply predict_single row-wise and append the result columns."""
        frame = df.copy()
        per_row = frame.apply(
            lambda row: pd.Series(self.predict_single(
                lmp=row.get(lmp_col),
                target_date=(row.get(date_col) if date_col is not None else None)
            )), axis=1
        )
        return pd.concat([frame.reset_index(drop=True), per_row.reset_index(drop=True)], axis=1)
1061
+
1062
+
1063
def predict_symptoms_from_csv(input_csv, output_csv, lmp_col='LMP', date_col=None,
                              menopause_stage_col=None, cycle_length=28, **kwargs):
    """Read input CSV, predict hot flashes/mood by cycle day, and write output CSV."""
    frame = pd.read_csv(input_csv)
    forecaster = SymptomCycleForecaster(cycle_length=cycle_length)
    predictions = forecaster.predict_df(frame, lmp_col=lmp_col, date_col=date_col,
                                        menopause_stage_col=menopause_stage_col)
    predictions.to_csv(output_csv, index=False)
    # Print a brief summary
    print(f"Wrote symptom predictions for {predictions.shape[0]} rows to {output_csv}")
    print("Sample predictions (first 5 rows):")
    summary_cols = [lmp_col] + ['cycle_day', 'hotflash_prob', 'hotflash_pred', 'mood_prob', 'mood_pred']
    print(predictions[summary_cols].head().to_string())
1074
+
1075
# CLI integration: run symptom prediction if requested
if __name__ == '__main__':
    # If symptom prediction requested via CLI, run fast-path and exit
    # (avoids the expensive model-training path above for this mode).
    if args.predict_symptoms:
        if not args.symptoms_input or not args.symptoms_output:
            print("Error: --symptoms-input and --symptoms-output are required when --predict-symptoms is set")
            sys.exit(1)
        else:
            predict_symptoms_from_csv(
                input_csv=args.symptoms_input,
                output_csv=args.symptoms_output,
                lmp_col=args.lmp_col,
                date_col=args.date_col,
                cycle_length=args.cycle_length
            )
            sys.exit(0)
1091
+
1092
    # Dual predictions are handled in the early fast-path above to avoid training.

    # Default behavior: create demo forecaster, save trained models and show summary
    forecast_model = create_forecast_example()
    save_forecast_model(forecast_model)

    print("\n" + "="*80)
    print("FORECASTING MODULE SUMMARY")
    print("="*80)
    print("""
    The MenopauseForecast class provides three main methods for predictions:

    1. predict_single(feature_dict, model='RandomForest', return_proba=True)
       - Predict stage for one individual given feature values
       - Returns predicted stage and confidence scores

    2. predict_batch(df, model='RandomForest', return_proba=True)
       - Predict stages for multiple individuals
       - Returns DataFrame with predictions and probabilities for each stage

    3. compare_models(feature_dict)
       - Compare predictions from both RandomForest and LogisticRegression
       - Useful for validating model agreement

    Usage in your own code:
        from menopause import load_forecast_model

        # Load the trained forecaster
        forecast = load_forecast_model('swan_ml_output')

        # Predict for an individual
        features = {'HOT7': 1, 'SLEEP7': 2, 'CESD': 10, ...}
        result = forecast.predict_single(features, model='RandomForest')

        # Predict for multiple individuals
        results_df = forecast.predict_batch(your_dataframe, model='RandomForest')
    """)
1129
+
1130
+
1131
+ # ==========================================================================================
1132
+ # 13. CSV INPUT/OUTPUT FUNCTIONALITY: Batch prediction from CSV files
1133
+ # ==========================================================================================
1134
+
1135
+ def predict_from_csv(input_csv, forecast_instance, output_csv=None, model='RandomForest', output_dir=OUTPUT_DIR):
1136
+ """
1137
+ Read individual data from CSV, make predictions, and save results.
1138
+
1139
+ Parameters:
1140
+ -----------
1141
+ input_csv : str
1142
+ Path to input CSV file with feature columns for individuals
1143
+ CSV should have columns matching training features (or subset)
1144
+ forecast_instance : MenopauseForecast
1145
+ The trained forecaster instance
1146
+ output_csv : str
1147
+ Path to output CSV file (default: input_csv with '_predictions' appended)
1148
+ model : str
1149
+ Which model to use ('RandomForest' or 'LogisticRegression')
1150
+ output_dir : str
1151
+ Directory to save results (for metadata)
1152
+
1153
+ Returns:
1154
+ --------
1155
+ pd.DataFrame : Results with predictions and confidence scores
1156
+
1157
+ Example:
1158
+ --------
1159
+ forecast = load_forecast_model('swan_ml_output')
1160
+ results = predict_from_csv('individuals.csv', forecast)
1161
+ # Results saved to 'individuals_predictions.csv'
1162
+ """
1163
+ import os
1164
+
1165
+ # Read input CSV
1166
+ print(f"Reading input data from: {input_csv}")
1167
+ try:
1168
+ data = pd.read_csv(input_csv)
1169
+ except FileNotFoundError:
1170
+ print(f"ERROR: File not found: {input_csv}")
1171
+ return None
1172
+
1173
+ n_samples = len(data)
1174
+ print(f"Loaded {n_samples} individuals")
1175
+
1176
+ # Identify feature columns (exclude ID columns)
1177
+ id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
1178
+ feature_cols = [c for c in data.columns if c not in id_cols]
1179
+
1180
+ # Separate ID columns from features
1181
+ id_data = data[[c for c in id_cols if c in data.columns]] if any(c in data.columns for c in id_cols) else None
1182
+
1183
+ # Make predictions
1184
+ print(f"Making predictions using {model}...")
1185
+ predictions = forecast_instance.predict_batch(
1186
+ data[feature_cols],
1187
+ model=model,
1188
+ return_proba=True
1189
+ )
1190
+
1191
+ # Combine with original data
1192
+ if id_data is not None:
1193
+ results = pd.concat([id_data.reset_index(drop=True), predictions.reset_index(drop=True)], axis=1)
1194
+ else:
1195
+ results = predictions.reset_index(drop=True)
1196
+
1197
+ # Add individual index if no ID column
1198
+ if id_data is None:
1199
+ results.insert(0, 'individual', range(1, n_samples + 1))
1200
+
1201
+ # Set output file path
1202
+ if output_csv is None:
1203
+ base, ext = os.path.splitext(input_csv)
1204
+ output_csv = f"{base}_predictions{ext}"
1205
+
1206
+ # Save results
1207
+ print(f"Saving predictions to: {output_csv}")
1208
+ results.to_csv(output_csv, index=False)
1209
+ return results
1210
+
1211
+
1212
def predict_dual_from_csv(stage_input_csv, stage_output_csv, symptoms_input_csv, symptoms_output_csv,
                          forecast_dir=OUTPUT_DIR, model='RandomForest', lmp_col='LMP',
                          date_col=None, cycle_length=28):
    """Run menopause stage prediction and symptom-cycle prediction using separate
    input and output files for each model.

    Parameters:
    -----------
    stage_input_csv : str
        CSV with feature columns for the menopause-stage model.
    stage_output_csv : str or None
        Where to write stage predictions (None -> '<stage_input>_stage_predictions<ext>').
    symptoms_input_csv : str
        CSV with an LMP column (and optionally a date column) for the symptom model.
    symptoms_output_csv : str or None
        Where to write symptom predictions (None -> '<symptoms_input>_symptom_predictions<ext>').
    forecast_dir : str
        Directory containing the trained stage-model artifacts.
    model : str
        Which stage model to use ('RandomForest' or 'LogisticRegression').
    lmp_col : str
        Name of the last-menstrual-period column in the symptom input.
    date_col : str or None
        Reference-date column; a column literally named 'date' is auto-detected when None.
    cycle_length : int
        Assumed menstrual cycle length in days for the symptom forecaster.

    Returns:
    --------
    dict : {'stage': stage_results_df, 'symptoms': symptom_results_df}
        None when an input file is missing or the stage model cannot be loaded.
    """
    import os  # local import for self-containment, matching predict_from_csv

    print(f"Reading stage input data from: {stage_input_csv}")
    try:
        stage_data = pd.read_csv(stage_input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {stage_input_csv}")
        return None

    # Load forecast model; abort early when artifacts are missing or incompatible.
    try:
        forecast = load_forecast_model(output_dir=forecast_dir)
    except Exception as e:
        print(f"ERROR: Could not load forecast model from '{forecast_dir}': {e}")
        return None

    # Identify id and feature columns
    id_cols = ['ID', 'id', 'SWANID', 'individual', 'Individual', 'subject', 'Subject']
    feature_cols = [c for c in stage_data.columns if c not in id_cols]

    # Make stage predictions
    print(f"Making menopause stage predictions using {model}...")
    stage_preds = forecast.predict_batch(stage_data[feature_cols], model=model, return_proba=True)

    # Carry through any recognized ID columns; otherwise add a 1-based index.
    id_data = stage_data[[c for c in id_cols if c in stage_data.columns]] if any(c in stage_data.columns for c in id_cols) else None
    if id_data is not None:
        stage_results = pd.concat([id_data.reset_index(drop=True), stage_preds.reset_index(drop=True)], axis=1)
    else:
        stage_results = stage_preds.reset_index(drop=True)
        stage_results.insert(0, 'individual', range(1, len(stage_results) + 1))

    # Default stage output path if not provided
    if stage_output_csv is None:
        base, ext = os.path.splitext(stage_input_csv)
        stage_output_csv = f"{base}_stage_predictions{ext}"

    print(f"Saving stage predictions to: {stage_output_csv}")
    stage_results.to_csv(stage_output_csv, index=False)

    # Symptom predictions (independent of the stage model)
    print(f"Reading symptom input data from: {symptoms_input_csv}")
    try:
        symptom_data = pd.read_csv(symptoms_input_csv)
    except FileNotFoundError:
        print(f"ERROR: File not found: {symptoms_input_csv}")
        return None

    # Auto-detect a 'date' column when the caller did not name one.
    if date_col is None and 'date' in symptom_data.columns:
        date_col = 'date'

    fore = SymptomCycleForecaster(cycle_length=cycle_length)
    symptom_results = fore.predict_df(symptom_data, lmp_col=lmp_col, date_col=date_col)

    # Default symptom output path if not provided
    if symptoms_output_csv is None:
        base, ext = os.path.splitext(symptoms_input_csv)
        symptoms_output_csv = f"{base}_symptom_predictions{ext}"

    print(f"Saving symptom predictions to: {symptoms_output_csv}")
    symptom_results.to_csv(symptoms_output_csv, index=False)

    return {'stage': stage_results, 'symptoms': symptom_results}
1282
+
1283
+
1284
def predict_combined_from_csv(*args, **kwargs):
    """Deprecated: combined predictions are removed in favor of separate input/output files."""
    # Accepts (and ignores) any legacy call signature, then fails loudly.
    message = (
        "Combined predictions are deprecated. Use predict_dual_from_csv() with separate stage and symptom input/output files."
    )
    raise ValueError(message)
1289
+
1290
+
1291
def create_demo_csv(forecast_instance, num_individuals=5, output_file='demo_individuals.csv', output_dir=OUTPUT_DIR):
    """
    Write a demo CSV of sample individuals for testing predictions.

    Feature values are drawn uniformly from 1-5 (a typical Likert range
    for SWAN questionnaire items) with a fixed seed, so repeated calls
    produce identical files.

    Parameters:
    -----------
    forecast_instance : MenopauseForecast
        The trained forecaster (supplies the feature names)
    num_individuals : int
        Number of demo individuals to generate
    output_file : str
        Name of the CSV file to create
    output_dir : str
        Directory in which the demo file is written

    Returns:
    --------
    str : Path to the created CSV file
    """

    feature_names = forecast_instance.feature_names

    # Fixed seed keeps the demo file reproducible across runs.
    np.random.seed(42)

    # First column identifies the row; remaining columns are random features.
    columns = {'individual': [f"Individual_{i+1}" for i in range(num_individuals)]}
    for name in feature_names:
        columns[name] = np.random.randint(1, 6, size=num_individuals)

    demo_df = pd.DataFrame(columns)

    full_path = os.path.join(output_dir, output_file)
    os.makedirs(output_dir, exist_ok=True)
    demo_df.to_csv(full_path, index=False)

    print(f"✅ Demo CSV created: {full_path}")
    print(f"   Individuals: {num_individuals}")
    print(f"   Features: {len(feature_names)}")
    print(f"   File shape: {demo_df.shape}")

    return full_path
1345
+
1346
+
1347
def add_performance_metrics_to_csv(results_df, y_test=None, model_name='RandomForest'):
    """
    Compute performance metrics for a predictions dataframe.

    When true labels are supplied, computes accuracy and weighted
    precision/recall/F1 for the 'predicted_stage' column and formats them
    as '#'-prefixed comment lines suitable for appending to a predictions
    CSV. The dataframe itself is returned unchanged.

    Parameters:
    -----------
    results_df : pd.DataFrame
        Results dataframe containing a 'predicted_stage' column
    y_test : array-like, optional
        True labels; when None, no metrics are computed
    model_name : str
        Name of the model used (for labeling the metrics block)

    Returns:
    --------
    tuple : (results_df, metrics_text)
        metrics_text is a string of comment lines, or None when y_test
        was not provided. (Docstring fix: the previous version claimed a
        bare DataFrame was returned, but both paths return a 2-tuple.)
    """

    if y_test is not None:
        # Local import keeps sklearn optional when metrics are not requested.
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

        acc = accuracy_score(y_test, results_df['predicted_stage'])
        prec = precision_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0)
        recall = recall_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0)
        f1 = f1_score(y_test, results_df['predicted_stage'], average='weighted', zero_division=0)

        # Formatted as CSV comment lines so callers can append them after the data rows.
        metrics_text = f"\n# Performance Metrics ({model_name})\n"
        metrics_text += f"# Accuracy: {acc:.3f}\n"
        metrics_text += f"# Precision (weighted): {prec:.3f}\n"
        metrics_text += f"# Recall (weighted): {recall:.3f}\n"
        metrics_text += f"# F1-Score (weighted): {f1:.3f}\n"

        return results_df, metrics_text

    return results_df, None
predict_csv.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ CSV Prediction Script for SWAN Menopause Stage Forecasting
4
+
5
+ This script demonstrates how to use the trained forecasting module to make predictions
6
+ on a batch of individuals from a CSV file and save results with confidence scores
7
+ and performance metrics.
8
+
9
+ Usage:
10
+ python predict_csv.py --input demo_individuals.csv --model RandomForest
11
+ python predict_csv.py --input individuals.csv --output results.csv --model LogisticRegression
12
+
13
+ The script will:
14
+ 1. Read input CSV with individual feature values
15
+ 2. Make predictions using trained model
16
+ 3. Save results with predicted stage, confidence, and probabilities
17
+ 4. Display summary statistics
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import argparse
23
+ import pandas as pd
24
+ import numpy as np
25
+ from pathlib import Path
26
+
27
+
28
def main():
    """Parse CLI arguments, run batch predictions from a CSV, and print a summary.

    Exits with status 1 when the input file, the trained models, or the
    menopause module are missing, or when prediction fails.
    """

    parser = argparse.ArgumentParser(
        description='Make menopause stage predictions from CSV file'
    )
    parser.add_argument(
        '--input', '-i',
        required=True,
        help='Path to input CSV file with individual feature values'
    )
    parser.add_argument(
        '--output', '-o',
        default=None,
        help='Path to output CSV file (default: input_predictions.csv)'
    )
    parser.add_argument(
        '--model', '-m',
        choices=['RandomForest', 'LogisticRegression'],
        default='RandomForest',
        help='Which model to use for predictions'
    )
    parser.add_argument(
        '--forecast-dir',
        default='swan_ml_output',
        help='Directory containing trained forecast models'
    )

    args = parser.parse_args()

    # Import after parsing args so `--help` still works without the module.
    try:
        from menopause import load_forecast_model, predict_from_csv
    except ImportError:
        print("ERROR: Could not import menopause module.")
        print("Make sure you're in the correct directory and menopause.py is available.")
        sys.exit(1)

    # Check if input file exists
    if not os.path.exists(args.input):
        print(f"ERROR: Input file not found: {args.input}")
        sys.exit(1)

    # Check if forecast models exist
    forecast_dir = args.forecast_dir
    if not os.path.exists(os.path.join(forecast_dir, 'rf_pipeline.pkl')):
        print(f"ERROR: Forecast models not found in {forecast_dir}")
        print("Please run 'python menopause.py' first to train models.")
        sys.exit(1)

    print("="*80)
    print("MENOPAUSE STAGE PREDICTION FROM CSV")
    print("="*80)

    # Load forecaster
    print(f"\nLoading forecaster from {forecast_dir}...")
    forecast = load_forecast_model(forecast_dir)

    # Make predictions
    print(f"\nUsing model: {args.model}")
    results = predict_from_csv(
        args.input,
        forecast,
        output_csv=args.output,
        model=args.model,
        output_dir='.'
    )

    if results is not None:
        print("\n" + "="*80)
        print("PREDICTION RESULTS")
        print("="*80)

        # Display results table
        print("\nDetailed Results:")
        print(results.to_string(index=False))

        # Display performance metrics
        print("\n" + "="*80)
        print("PERFORMANCE SUMMARY")
        print("="*80)

        print(f"\nTotal Individuals: {len(results)}")
        print(f"\nStage Distribution:")
        for stage, count in results['predicted_stage'].value_counts().items():
            pct = count / len(results) * 100
            print(f"  {stage}: {count} ({pct:.1f}%)")

        print(f"\nConfidence Scores:")
        print(f"  Mean: {results['confidence'].mean():.3f}")
        print(f"  Min: {results['confidence'].min():.3f}")
        print(f"  Max: {results['confidence'].max():.3f}")
        print(f"  Std Dev: {results['confidence'].std():.3f}")

        # Confidence distribution
        high_conf = (results['confidence'] > 0.8).sum()
        med_conf = ((results['confidence'] > 0.6) & (results['confidence'] <= 0.8)).sum()
        low_conf = (results['confidence'] <= 0.6).sum()

        print(f"\nConfidence Distribution:")
        print(f"  High (>0.80): {high_conf}/{len(results)} ({high_conf/len(results)*100:.1f}%)")
        print(f"  Medium (0.60-0.80): {med_conf}/{len(results)} ({med_conf/len(results)*100:.1f}%)")
        print(f"  Low (≤0.60): {low_conf}/{len(results)} ({low_conf/len(results)*100:.1f}%)")

        # Output file confirmation.
        # BUGFIX: mirror predict_from_csv's default naming via os.path.splitext.
        # The previous Path(args.input).stem dropped the input's directory, so the
        # printed path disagreed with the actual save location for nested inputs.
        base, ext = os.path.splitext(args.input)
        output_path = args.output if args.output else f"{base}_predictions{ext}"
        print(f"\n✅ Results saved to: {output_path}")
    else:
        print("ERROR: Prediction failed.")
        sys.exit(1)

    print("\n" + "="*80)
140
+
141
+
142
+ if __name__ == '__main__':
143
+ main()
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SWAN Menopause Prediction - Gradio App
2
+ # Python 3.10+ recommended
3
+
4
+ # ── UI Framework ─────────────────────────────────────────────────────────────
5
+ gradio>=4.0.0
6
+
7
+ # ── Data & ML ────────────────────────────────────────────────────────────────
8
+ pandas>=1.3.0
9
+ numpy>=1.20.0
10
+ scikit-learn==1.7.2 # Must match version used to train saved .pkl artifacts
11
+ joblib>=1.0.0
12
+ python-dateutil>=2.8.0
13
+
14
+ # ── Visualization ────────────────────────────────────────────────────────────
15
+ matplotlib>=3.3.0
16
+ seaborn>=0.11.0
17
+
18
+ # ── Notes ─────────────────────────────────────────────────────────────────────
19
+ # scikit-learn version is pinned because the .pkl pipelines (rf_pipeline.pkl,
20
+ # lr_pipeline.pkl) were serialized with scikit-learn 1.7.2. Using a different
21
+ # version may cause pickle incompatibility errors.
22
+ #
23
+ # To install locally:
24
+ # python -m venv .venv
25
+ # source .venv/bin/activate # or .venv\Scripts\activate on Windows
26
+ # pip install -r requirements.txt
swan_ml_output/forecast_metadata.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_names": [
3
+ "PAIN17",
4
+ "PAINTW17",
5
+ "PAIN27",
6
+ "PAINTW27",
7
+ "SLEEP17",
8
+ "SLEEP27",
9
+ "BCOHOTH7",
10
+ "EXERCIS7",
11
+ "EXERHAR7",
12
+ "EXEROST7",
13
+ "EXERMEN7",
14
+ "EXERLOO7",
15
+ "EXERMEM7",
16
+ "EXERPER7",
17
+ "EXERGEN7",
18
+ "EXERWGH7",
19
+ "EXERADV7",
20
+ "EXEROTH7",
21
+ "EXERSPE7",
22
+ "ABBLEED7",
23
+ "BLEEDNG7",
24
+ "LMPDAY7",
25
+ "DEPRESS7",
26
+ "SEX17",
27
+ "SEX27",
28
+ "SEX37",
29
+ "SEX47",
30
+ "SEX57",
31
+ "SEX67",
32
+ "SEX77",
33
+ "SEX87",
34
+ "SEX97",
35
+ "SEX107",
36
+ "SEX117",
37
+ "SEX127",
38
+ "SMOKERE7",
39
+ "HOTFLAS7",
40
+ "NUMHOTF7",
41
+ "BOTHOTF7",
42
+ "IRRITAB7",
43
+ "VAGINDR7",
44
+ "MOODCHG7",
45
+ "SLEEPQL7",
46
+ "PHYSILL7",
47
+ "HOTHEAD7",
48
+ "EXER12H7",
49
+ "ALCO24H7",
50
+ "AGE7",
51
+ "RACE",
52
+ "LANGINT7"
53
+ ],
54
+ "stage_classes": [
55
+ "peri",
56
+ "post",
57
+ "pre"
58
+ ],
59
+ "num_features": 50
60
+ }
swan_ml_output/lr_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d9a1f99d0fc278ba57c7d21f0de5a0d4f2d88e7a79e4647c5d6f9b0cb925f9e
3
+ size 61178
swan_ml_output/rf_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e8a2e356ca8e17972073da38902d5cefe824693bba6b3206316956dafbd64a7
3
+ size 4274787