"""Train and score the baseline-surrogate baseline surrogate matrix on a Parquet dataset.

Single canonical entry point for the baseline-surrogate §6 step-4 acceptance run:
fit Ridge / RF / XGBoost per target, the joint MLP across all primary
targets, and LogReg / XGBoost feasibility classifiers; then score them
on the held-out test split (with a per-scenario-family breakdown) and
run the registry-rover Layer-1 sanity check.

Outputs (under ``--out-dir``):

- ``metrics_long.parquet`` — tidy long-format frame
  ``(algorithm, target, split, scenario_family, metric, value)``.
- ``acceptance_gate.csv`` — one row per ``(algorithm, target)`` with
  the plan's threshold, observed value, and pass/fail.
- ``registry_sanity.csv`` — predictions for Pragyan / Yutu-2 /
  MoonRanger / Rashid-1 vs. the deterministic evaluator (Layer-1 truth).
  Pragyan and Yutu-2 are flown rovers; MoonRanger and Rashid-1 are
  design-target lunar micro-rovers (never deployed) included for
  Layer-1 OOD coverage of the surrogate's input space.
  Each row carries an ``is_primary`` flag. ``True`` rows
  (``total_mass_kg``, ``slope_capability_deg``, ``stalled``)
  are the design-axis Layer-1 acceptance set; ``False`` rows
  (``range_km``, ``energy_margin_raw_pct``) are scenario-OOD
  diagnostics — see ``roverdevkit.surrogate.baselines``
  ``LAYER1_PRIMARY_TARGETS`` / ``LAYER1_DIAGNOSTIC_TARGETS``.
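- ``fit_seconds.csv`` — per-fit wall-clock for the writeup.
- ``test_summary.csv`` — test-split, all-family metrics pivoted to one
  row per ``(algorithm, target)`` with one column per metric.
- ``test_per_family.csv`` — test-split ``r2`` / ``auc`` pivoted to one
  row per ``(algorithm, target, metric)`` with one column per scenario
  family.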

Examples
--------
::

    # Full 40k acceptance run (current canonical dataset, analytical Bekker-Wong)
    python scripts/run_baselines.py \\
        --dataset data/analytical/lhs_v9.parquet \\
        --out-dir reports/baselines_v9

    # Fast pilot smoke (skip MLP, smaller forest)
    python scripts/run_baselines.py \\
        --dataset data/analytical/lhs_pilot.parquet \\
        --out-dir reports/baselines_pilot \\
        --no-mlp
"""

from __future__ import annotations

import argparse
import logging
import sys
import time
from pathlib import Path

import pandas as pd

from roverdevkit.surrogate.baselines import (
    acceptance_gate,
    evaluate_baselines,
    fit_baselines,
    predict_for_registry_rovers,
)
from roverdevkit.surrogate.dataset import read_parquet
from roverdevkit.surrogate.features import FEASIBILITY_COLUMN


def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    p.add_argument(
        "--dataset",
        type=Path,
        required=True,
        help="Path to the Parquet dataset produced by scripts/build_dataset.py.",
    )
    p.add_argument(
        "--out-dir",
        type=Path,
        required=True,
        help="Directory for the output reports (created if missing).",
    )
    p.add_argument("--seed", type=int, default=42, help="Estimator random_state.")
    p.add_argument(
        "--n-jobs",
        type=int,
        default=-1,
        help="Plumbed through to RF / XGBoost. -1 uses all cores.",
    )
    p.add_argument(
        "--no-mlp",
        action="store_true",
        help="Skip fitting the joint MLP. Useful for fast smokes.",
    )
    p.add_argument(
        "--no-registry-check",
        action="store_true",
        help="Skip the registry-rover Layer-1 sanity check.",
    )
    p.add_argument(
        "--log-level",
        default="INFO",
        choices=("DEBUG", "INFO", "WARNING", "ERROR"),
    )
    return p.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Run the fit / score / report pipeline and return a process exit code.

    Steps, in order: parse the command line, load the Parquet dataset,
    partition it on its ``split`` column, fit the baselines on the train
    rows, score the train / val / test rows, write the report files under
    ``--out-dir`` and, unless ``--no-registry-check`` is given, run the
    registry-rover sanity check.

    Files written: ``metrics_long.parquet``, ``acceptance_gate.csv``,
    ``test_summary.csv``, ``test_per_family.csv``, ``fit_seconds.csv``
    and, when the sanity check succeeds, ``registry_sanity.csv``.

    Parameters
    ----------
    argv
        Command-line arguments without the program name, forwarded to
        ``_parse_args``.

    Returns
    -------
    int
        ``0`` when the pipeline ran to completion, ``2`` when the dataset
        has no ``split`` column. The acceptance-gate outcome is printed
        and written to disk but does not influence the exit code, and a
        failing registry sanity check is logged as a warning only.
    """
    args = _parse_args(argv)
    logging.basicConfig(
        level=args.log_level,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    log = logging.getLogger("run_baselines")

    args.out_dir.mkdir(parents=True, exist_ok=True)
    log.info("loading dataset from %s", args.dataset)
    df = read_parquet(args.dataset)

    # Everything below partitions on ``split``; report its absence clearly
    # here instead of failing with a bare KeyError a few lines further on.
    if "split" not in df.columns:
        log.error(
            "dataset %s (%d rows x %d cols) has no 'split' column; "
            "cannot form the train/val/test partitions",
            args.dataset,
            len(df),
            len(df.columns),
        )
        return 2

    log.info(
        "loaded %d rows x %d cols; splits: %s",
        len(df),
        len(df.columns),
        df["split"].value_counts().to_dict(),
    )

    df_train = df[df["split"] == "train"]
    df_val = df[df["split"] == "val"]
    df_test = df[df["split"] == "test"]
    log.info("train=%d val=%d test=%d", len(df_train), len(df_val), len(df_test))

    # ----- fit ---------------------------------------------------------------
    t_fit = time.perf_counter()
    fitted = fit_baselines(
        df_train,
        fit_mlp=not args.no_mlp,
        n_jobs=args.n_jobs,
        random_state=args.seed,
        verbose=True,
    )
    fit_elapsed = time.perf_counter() - t_fit
    log.info("fit complete in %.1f s", fit_elapsed)

    # ----- evaluate (train + val + test, with per-scenario-family breakdown) -
    log.info("scoring train, val and test splits...")
    t_eval = time.perf_counter()
    val_metrics = evaluate_baselines(fitted, df_val, split_label="val")
    test_metrics = evaluate_baselines(fitted, df_test, split_label="test")
    train_metrics = evaluate_baselines(fitted, df_train, split_label="train")
    metrics = pd.concat([train_metrics, val_metrics, test_metrics], ignore_index=True)
    log.info("scoring done in %.1f s; %d metric rows", time.perf_counter() - t_eval, len(metrics))

    metrics_path = args.out_dir / "metrics_long.parquet"
    metrics.to_parquet(metrics_path, index=False)
    log.info("wrote %s (%d rows)", metrics_path, len(metrics))

    # ----- acceptance gate (test, overall) ----------------------------------
    gate = acceptance_gate(metrics, split="test", family="__all__")
    gate_path = args.out_dir / "acceptance_gate.csv"
    gate.to_csv(gate_path, index=False)
    log.info("wrote %s; passing rows: %d/%d", gate_path, int(gate["passes"].sum()), len(gate))
    print("\n=== Acceptance gate (test split, all families) ===", flush=True)
    with pd.option_context("display.max_columns", None, "display.width", 160):
        print(gate.to_string(index=False))

    # ----- compact summary table per (algorithm, target) on test ------------
    test_overall = metrics.query("split == 'test' and scenario_family == '__all__'")
    pivot = (
        test_overall.pivot_table(
            index=["algorithm", "target"],
            columns="metric",
            values="value",
            aggfunc="first",
        )
        .reset_index()
        .sort_values(["target", "algorithm"])
    )
    pivot_path = args.out_dir / "test_summary.csv"
    pivot.to_csv(pivot_path, index=False)
    log.info("wrote %s", pivot_path)
    print("\n=== Per-(algorithm, target) test metrics ===", flush=True)
    with pd.option_context("display.max_columns", None, "display.width", 160):
        print(pivot.to_string(index=False))

    # ----- per-scenario breakdown on the primary metrics --------------------
    fam_rows = metrics.query(
        "split == 'test' and scenario_family != '__all__' and metric in ('r2', 'auc')"
    )
    fam_pivot = (
        fam_rows.pivot_table(
            index=["algorithm", "target", "metric"],
            columns="scenario_family",
            values="value",
            aggfunc="first",
        )
        .reset_index()
        .sort_values(["target", "metric", "algorithm"])
    )
    fam_pivot_path = args.out_dir / "test_per_family.csv"
    fam_pivot.to_csv(fam_pivot_path, index=False)
    log.info("wrote %s", fam_pivot_path)

    # ----- fit-time table ---------------------------------------------------
    fit_rows = [
        {"algorithm": k[0], "target": k[1], "fit_seconds": v} for k, v in fitted.fit_seconds.items()
    ]
    # Pin the columns so an empty ``fit_seconds`` mapping still yields a
    # sortable (empty) frame instead of a KeyError in sort_values.
    fit_df = pd.DataFrame(fit_rows, columns=["algorithm", "target", "fit_seconds"]).sort_values(
        ["algorithm", "target"]
    )
    fit_path = args.out_dir / "fit_seconds.csv"
    fit_df.to_csv(fit_path, index=False)
    log.info("wrote %s (%.1f s wall-clock total fit)", fit_path, fit_elapsed)

    # ----- registry rover Layer-1 sanity ------------------------------------
    if not args.no_registry_check:
        log.info("running registry-rover sanity check...")
        try:
            sanity = predict_for_registry_rovers(fitted)
            sanity_path = args.out_dir / "registry_sanity.csv"
            sanity.to_csv(sanity_path, index=False)
            log.info("wrote %s (%d rows)", sanity_path, len(sanity))
            _print_registry_sanity_summary(sanity)
        except Exception as exc:  # pragma: no cover — diagnostic, not fatal
            # Deliberately best-effort: the reports above are already on
            # disk. Keep the traceback so the failure stays debuggable.
            log.warning("registry-rover sanity check failed: %s", exc, exc_info=True)

    return 0


def _print_registry_sanity_summary(sanity: pd.DataFrame) -> None:
    """Print the registry-rover Layer-1 check as a primary and a diagnostic section.

    Rows with a true ``is_primary`` flag make up the acceptance section.
    Its regression targets are summarised as the median absolute
    relative error (in percent) per rover and target; its feasibility
    target as the share of rows per rover whose prediction, thresholded
    at 0.5, equals the evaluator value. All remaining rows go to the
    diagnostic section, summarised with the same median table.

    Range / energy_margin sit in the diagnostic section because the
    registry's published mission distances are 100-1000x smaller than
    the LHS family budgets: a *scenario*-OOD effect, not a
    surrogate-calibration failure. See
    ``roverdevkit.surrogate.baselines.LAYER1_PRIMARY_TARGETS`` for the
    rationale behind the split.
    """

    def wide_display():
        # Fresh context per table: show every column, 160 characters wide.
        return pd.option_context("display.max_columns", None, "display.width", 160)

    def median_abs_pct(rows):
        # rover x target table of the median |rel_error| expressed in percent.
        with_pct = rows.assign(abs_pct=100 * rows["rel_error"].abs())
        per_cell = with_pct.groupby(["rover", "target"])["abs_pct"].median()
        return per_cell.unstack("target")

    primary_mask = sanity["is_primary"]
    primary_rows = sanity[primary_mask].copy()
    diagnostic_rows = sanity[~primary_mask].copy()

    # ----- primary (acceptance) section --------------------------------------
    print(
        "\n=== Registry-rover Layer-1 sanity (PRIMARY: design-axis targets) ===",
        flush=True,
    )
    print(
        "Acceptance set: total_mass_kg, slope_capability_deg, stalled.",
        flush=True,
    )

    feasibility_mask = primary_rows["target"] == FEASIBILITY_COLUMN
    regression_rows = primary_rows[~feasibility_mask]
    classifier_rows = primary_rows[feasibility_mask]

    if len(regression_rows.index) > 0:
        regression_table = median_abs_pct(regression_rows)
        with wide_display():
            print("Median |relative error| (%) across algorithms (regression):")
            print(regression_table.round(2).to_string())

    if len(classifier_rows.index) > 0:
        predicted_label = (classifier_rows["predicted"] >= 0.5).astype(int)
        evaluator_label = classifier_rows["evaluator"].astype(int)
        scored = classifier_rows.assign(hit=predicted_label == evaluator_label)
        accuracy_table = (
            scored.groupby("rover")["hit"].mean().rename("classifier_accuracy").to_frame()
        )
        with wide_display():
            print("\nClassifier accuracy across algorithms (stalled):")
            print(accuracy_table.round(3).to_string())

    # ----- diagnostic (informational) section --------------------------------
    print(
        "\n=== Registry-rover Layer-1 diagnostic (SCENARIO-OOD; not part of acceptance) ===",
        flush=True,
    )
    print(
        "These targets are reported for transparency only. The registry's published mission\n"
        "distances are 100-1000x smaller than the LHS family budgets, so the relative errors\n"
        "below reflect that scale mismatch rather than physical model accuracy. "
        "See SCHEMA.md v4 entry.",
        flush=True,
    )
    if len(diagnostic_rows.index) > 0:
        diagnostic_table = median_abs_pct(diagnostic_rows)
        with wide_display():
            print("Median |relative error| (%) across algorithms:")
            print(diagnostic_table.round(2).to_string())

# Script entry point: run the pipeline and pass main()'s return value to
# sys.exit as the process exit status.
if __name__ == "__main__":
    sys.exit(main())