"""Train and score the baseline-surrogate baseline surrogate matrix on a Parquet dataset. Single canonical entry point for the baseline-surrogate §6 step-4 acceptance run: fit Ridge / RF / XGBoost per target, the joint MLP across all primary targets, and LogReg / XGBoost feasibility classifiers; then score them on the held-out test split (with a per-scenario-family breakdown) and run the registry-rover Layer-1 sanity check. Outputs (under ``--out-dir``): - ``metrics_long.parquet`` — tidy long-format frame ``(algorithm, target, split, scenario_family, metric, value)``. - ``acceptance_gate.csv`` — one row per ``(algorithm, target)`` with the plan's threshold, observed value, and pass/fail. - ``registry_sanity.csv`` — predictions for Pragyan / Yutu-2 / MoonRanger / Rashid-1 vs. the deterministic evaluator (Layer-1 truth). Pragyan and Yutu-2 are flown rovers; MoonRanger and Rashid-1 are design-target lunar micro-rovers (never deployed) included for Layer-1 OOD coverage of the surrogate's input space. Each row carries an ``is_primary`` flag. ``True`` rows (``total_mass_kg``, ``slope_capability_deg``, ``stalled``) are the design-axis Layer-1 acceptance set; ``False`` rows (``range_km``, ``energy_margin_raw_pct``) are scenario-OOD diagnostics — see ``roverdevkit.surrogate.baselines`` ``LAYER1_PRIMARY_TARGETS`` / ``LAYER1_DIAGNOSTIC_TARGETS``. - ``fit_seconds.csv`` — per-fit wall-clock for the writeup. Examples -------- :: # Full 40k acceptance run (current canonical dataset, analytical Bekker-Wong) python scripts/run_baselines.py \\ --dataset data/analytical/lhs_v9.parquet \\ --out-dir reports/baselines_v9 # Fast pilot smoke (skip MLP, smaller forest) python scripts/run_baselines.py \\ --dataset data/analytical/lhs_pilot.parquet \\ --out-dir reports/baselines_pilot \\ --no-mlp """ from __future__ import annotations import argparse import logging import sys import time from pathlib import Path import pandas as pd from roverdevkit.surrogate.baselines import ( acceptance_gate, evaluate_baselines, fit_baselines, predict_for_registry_rovers, ) from roverdevkit.surrogate.dataset import read_parquet from roverdevkit.surrogate.features import FEASIBILITY_COLUMN def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: p = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter ) p.add_argument( "--dataset", type=Path, required=True, help="Path to the Parquet dataset produced by scripts/build_dataset.py.", ) p.add_argument( "--out-dir", type=Path, required=True, help="Directory for the output reports (created if missing).", ) p.add_argument("--seed", type=int, default=42, help="Estimator random_state.") p.add_argument( "--n-jobs", type=int, default=-1, help="Plumbed through to RF / XGBoost. -1 uses all cores.", ) p.add_argument( "--no-mlp", action="store_true", help="Skip fitting the joint MLP. Useful for fast smokes.", ) p.add_argument( "--no-registry-check", action="store_true", help="Skip the registry-rover Layer-1 sanity check.", ) p.add_argument( "--log-level", default="INFO", choices=("DEBUG", "INFO", "WARNING", "ERROR"), ) return p.parse_args(argv) def main(argv: list[str] | None = None) -> int: args = _parse_args(argv) logging.basicConfig( level=args.log_level, format="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S", ) log = logging.getLogger("run_baselines") args.out_dir.mkdir(parents=True, exist_ok=True) log.info("loading dataset from %s", args.dataset) df = read_parquet(args.dataset) log.info( "loaded %d rows x %d cols; splits: %s", len(df), len(df.columns), df["split"].value_counts().to_dict() if "split" in df.columns else {}, ) df_train = df[df["split"] == "train"] df_val = df[df["split"] == "val"] df_test = df[df["split"] == "test"] log.info("train=%d val=%d test=%d", len(df_train), len(df_val), len(df_test)) # ----- fit --------------------------------------------------------------- t_fit = time.perf_counter() fitted = fit_baselines( df_train, fit_mlp=not args.no_mlp, n_jobs=args.n_jobs, random_state=args.seed, verbose=True, ) fit_elapsed = time.perf_counter() - t_fit log.info("fit complete in %.1f s", fit_elapsed) # ----- evaluate (val + test, with per-scenario-family breakdown) -------- log.info("scoring val and test splits...") t_eval = time.perf_counter() val_metrics = evaluate_baselines(fitted, df_val, split_label="val") test_metrics = evaluate_baselines(fitted, df_test, split_label="test") train_metrics = evaluate_baselines(fitted, df_train, split_label="train") metrics = pd.concat([train_metrics, val_metrics, test_metrics], ignore_index=True) log.info("scoring done in %.1f s; %d metric rows", time.perf_counter() - t_eval, len(metrics)) metrics_path = args.out_dir / "metrics_long.parquet" metrics.to_parquet(metrics_path, index=False) log.info("wrote %s (%d rows)", metrics_path, len(metrics)) # ----- acceptance gate (test, overall) ---------------------------------- gate = acceptance_gate(metrics, split="test", family="__all__") gate_path = args.out_dir / "acceptance_gate.csv" gate.to_csv(gate_path, index=False) log.info("wrote %s; passing rows: %d/%d", gate_path, int(gate["passes"].sum()), len(gate)) print("\n=== Acceptance gate (test split, all families) ===", flush=True) with pd.option_context("display.max_columns", None, "display.width", 160): print(gate.to_string(index=False)) # ----- compact summary table per (algorithm, target) on test ------------ test_overall = metrics.query("split == 'test' and scenario_family == '__all__'") pivot = ( test_overall.pivot_table( index=["algorithm", "target"], columns="metric", values="value", aggfunc="first", ) .reset_index() .sort_values(["target", "algorithm"]) ) pivot_path = args.out_dir / "test_summary.csv" pivot.to_csv(pivot_path, index=False) log.info("wrote %s", pivot_path) print("\n=== Per-(algorithm, target) test metrics ===", flush=True) with pd.option_context("display.max_columns", None, "display.width", 160): print(pivot.to_string(index=False)) # ----- per-scenario breakdown on the primary metrics -------------------- fam_rows = metrics.query( "split == 'test' and scenario_family != '__all__' and metric in ('r2', 'auc')" ) fam_pivot = ( fam_rows.pivot_table( index=["algorithm", "target", "metric"], columns="scenario_family", values="value", aggfunc="first", ) .reset_index() .sort_values(["target", "metric", "algorithm"]) ) fam_pivot_path = args.out_dir / "test_per_family.csv" fam_pivot.to_csv(fam_pivot_path, index=False) log.info("wrote %s", fam_pivot_path) # ----- fit-time table --------------------------------------------------- fit_rows = [ {"algorithm": k[0], "target": k[1], "fit_seconds": v} for k, v in fitted.fit_seconds.items() ] fit_df = pd.DataFrame(fit_rows).sort_values(["algorithm", "target"]) fit_path = args.out_dir / "fit_seconds.csv" fit_df.to_csv(fit_path, index=False) log.info("wrote %s (%.1f s wall-clock total fit)", fit_path, fit_elapsed) # ----- registry rover Layer-1 sanity ------------------------------------ if not args.no_registry_check: log.info("running registry-rover sanity check...") try: sanity = predict_for_registry_rovers(fitted) sanity_path = args.out_dir / "registry_sanity.csv" sanity.to_csv(sanity_path, index=False) log.info("wrote %s (%d rows)", sanity_path, len(sanity)) _print_registry_sanity_summary(sanity) except Exception as exc: # pragma: no cover — diagnostic, not fatal log.warning("registry-rover sanity check failed: %s", exc) return 0 def _print_registry_sanity_summary(sanity: pd.DataFrame) -> None: """Print Layer-1 sanity in two tables: design-axis primary + scenario-OOD diagnostic. See ``roverdevkit.surrogate.baselines.LAYER1_PRIMARY_TARGETS`` for the rationale for the split. Range / energy_margin live in the diagnostic block because the registry's published mission distances are 100-1000x smaller than the LHS family budgets, which is a *scenario*-OOD effect rather than a surrogate-calibration failure. """ primary = sanity[sanity["is_primary"]].copy() diagnostic = sanity[~sanity["is_primary"]].copy() print( "\n=== Registry-rover Layer-1 sanity (PRIMARY: design-axis targets) ===", flush=True, ) print( "Acceptance set: total_mass_kg, slope_capability_deg, stalled.", flush=True, ) regressor_primary = primary[primary["target"] != FEASIBILITY_COLUMN] if not regressor_primary.empty: primary_summary = ( regressor_primary.assign(abs_pct=lambda d: 100 * d["rel_error"].abs()) .groupby(["rover", "target"])["abs_pct"] .median() .unstack("target") ) with pd.option_context("display.max_columns", None, "display.width", 160): print("Median |relative error| (%) across algorithms (regression):") print(primary_summary.round(2).to_string()) classifier_primary = primary[primary["target"] == FEASIBILITY_COLUMN] if not classifier_primary.empty: clf_summary = ( classifier_primary.assign( hit=lambda d: (d["predicted"] >= 0.5).astype(int) == d["evaluator"].astype(int) ) .groupby("rover")["hit"] .mean() .rename("classifier_accuracy") .to_frame() ) with pd.option_context("display.max_columns", None, "display.width", 160): print("\nClassifier accuracy across algorithms (stalled):") print(clf_summary.round(3).to_string()) print( "\n=== Registry-rover Layer-1 diagnostic (SCENARIO-OOD; not part of acceptance) ===", flush=True, ) print( "These targets are reported for transparency only. The registry's " "published mission\ndistances are 100-1000x smaller than the LHS " "family budgets, so the relative errors\nbelow reflect that scale " "mismatch rather than physical model accuracy. See SCHEMA.md " "v4 entry.", flush=True, ) if not diagnostic.empty: diagnostic_summary = ( diagnostic.assign(abs_pct=lambda d: 100 * d["rel_error"].abs()) .groupby(["rover", "target"])["abs_pct"] .median() .unstack("target") ) with pd.option_context("display.max_columns", None, "display.width", 160): print("Median |relative error| (%) across algorithms:") print(diagnostic_summary.round(2).to_string()) if __name__ == "__main__": sys.exit(main())