roverdevkit / scripts /run_baselines.py
jjreif's picture
Deploy roverdevkit @ 2676a67
b3d14e3
Raw
History Blame Contribute Delete
11.5 kB
"""Train and score the baseline surrogate matrix on a Parquet dataset.
Single canonical entry point for the baseline-surrogate §6 step-4 acceptance run:
fit Ridge / RF / XGBoost per target, the joint MLP across all primary
targets, and LogReg / XGBoost feasibility classifiers; then score them
on the held-out test split (with a per-scenario-family breakdown) and
run the registry-rover Layer-1 sanity check.
Outputs (under ``--out-dir``):
- ``metrics_long.parquet`` — tidy long-format frame
``(algorithm, target, split, scenario_family, metric, value)``.
- ``acceptance_gate.csv`` — one row per ``(algorithm, target)`` with
the plan's threshold, observed value, and pass/fail.
- ``registry_sanity.csv`` — predictions for Pragyan / Yutu-2 /
MoonRanger / Rashid-1 vs. the deterministic evaluator (Layer-1 truth).
Pragyan and Yutu-2 are flown rovers; MoonRanger and Rashid-1 are
design-target lunar micro-rovers (never deployed) included for
Layer-1 OOD coverage of the surrogate's input space.
Each row carries an ``is_primary`` flag. ``True`` rows
(``total_mass_kg``, ``slope_capability_deg``, ``stalled``)
are the design-axis Layer-1 acceptance set; ``False`` rows
(``range_km``, ``energy_margin_raw_pct``) are scenario-OOD
diagnostics — see ``roverdevkit.surrogate.baselines``
``LAYER1_PRIMARY_TARGETS`` / ``LAYER1_DIAGNOSTIC_TARGETS``.
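- ``fit_seconds.csv`` — per-fit wall-clock for the writeup.
- ``test_summary.csv`` — one row per ``(algorithm, target)`` with every
  test-split metric (all scenario families pooled) as a column.
- ``test_per_family.csv`` — test-split ``r2`` / ``auc`` per
  ``(algorithm, target, metric)``, one column per scenario family.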
Examples
--------
::
# Full 40k acceptance run (current canonical dataset, analytical Bekker-Wong)
python scripts/run_baselines.py \\
--dataset data/analytical/lhs_v9.parquet \\
--out-dir reports/baselines_v9
# Fast pilot smoke (skip MLP, smaller forest)
python scripts/run_baselines.py \\
--dataset data/analytical/lhs_pilot.parquet \\
--out-dir reports/baselines_pilot \\
--no-mlp
"""
from __future__ import annotations
import argparse
import logging
import sys
import time
from pathlib import Path
import pandas as pd
from roverdevkit.surrogate.baselines import (
acceptance_gate,
evaluate_baselines,
fit_baselines,
predict_for_registry_rovers,
)
from roverdevkit.surrogate.dataset import read_parquet
from roverdevkit.surrogate.features import FEASIBILITY_COLUMN
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser for the baseline run and parse ``argv``.

    The module docstring doubles as the ``--help`` description and is shown
    verbatim (raw formatter), so its example block keeps its layout.

    Args:
        argv: Argument list to parse. ``None`` is handed straight to
            ``ArgumentParser.parse_args``.

    Returns:
        Namespace with ``dataset`` and ``out_dir`` (both ``Path``), ``seed``,
        ``n_jobs``, ``no_mlp``, ``no_registry_check`` and ``log_level``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )

    # Mandatory filesystem locations; both are parsed into Path objects.
    required_paths = (
        ("--dataset", "Path to the Parquet dataset produced by scripts/build_dataset.py."),
        ("--out-dir", "Directory for the output reports (created if missing)."),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, type=Path, required=True, help=help_text)

    # Integer knobs forwarded to the estimators.
    parser.add_argument("--seed", type=int, default=42, help="Estimator random_state.")
    parser.add_argument(
        "--n-jobs",
        type=int,
        default=-1,
        help="Plumbed through to RF / XGBoost. -1 uses all cores.",
    )

    # Opt-out switches; each defaults to False and flips to True when given.
    skip_switches = (
        ("--no-mlp", "Skip fitting the joint MLP. Useful for fast smokes."),
        ("--no-registry-check", "Skip the registry-rover Layer-1 sanity check."),
    )
    for flag, help_text in skip_switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument(
        "--log-level",
        choices=("DEBUG", "INFO", "WARNING", "ERROR"),
        default="INFO",
    )

    return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> int:
    """Fit, score and report the baseline surrogates for one dataset.

    Pipeline, in order:

    1. Parse arguments, configure logging, create ``--out-dir``.
    2. Load the Parquet dataset and slice it by the ``split`` column into
       train / val / test.
    3. Fit the baselines on the train split.
    4. Score train, val and test and write ``metrics_long.parquet``.
    5. Write and print ``acceptance_gate.csv`` (test split, ``__all__``
       family), ``test_summary.csv`` and ``test_per_family.csv``.
    6. Write ``fit_seconds.csv``.
    7. Unless ``--no-registry-check`` is given, run the registry-rover
       sanity check and write ``registry_sanity.csv``.

    Args:
        argv: Command-line arguments, forwarded to ``_parse_args``.

    Returns:
        ``0`` once the pipeline has run. A failing registry sanity check is
        logged (with traceback) but does not change the return value.

    Raises:
        KeyError: If the loaded dataset has no ``split`` column.
    """
    args = _parse_args(argv)
    logging.basicConfig(
        level=args.log_level,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    log = logging.getLogger("run_baselines")

    args.out_dir.mkdir(parents=True, exist_ok=True)

    log.info("loading dataset from %s", args.dataset)
    df = read_parquet(args.dataset)
    # Every later step slices on this column, so fail here with an explicit
    # message instead of a bare KeyError('split') from the first selection.
    if "split" not in df.columns:
        raise KeyError(
            f"dataset {args.dataset} has no 'split' column; "
            "cannot select the train/val/test rows"
        )
    log.info(
        "loaded %d rows x %d cols; splits: %s",
        len(df),
        len(df.columns),
        df["split"].value_counts().to_dict(),
    )

    df_train = df[df["split"] == "train"]
    df_val = df[df["split"] == "val"]
    df_test = df[df["split"] == "test"]
    log.info("train=%d val=%d test=%d", len(df_train), len(df_val), len(df_test))

    # ----- fit ---------------------------------------------------------------
    t_fit = time.perf_counter()
    fitted = fit_baselines(
        df_train,
        fit_mlp=not args.no_mlp,
        n_jobs=args.n_jobs,
        random_state=args.seed,
        verbose=True,
    )
    fit_elapsed = time.perf_counter() - t_fit
    log.info("fit complete in %.1f s", fit_elapsed)

    # ----- evaluate (train + val + test, with per-scenario-family breakdown) -
    log.info("scoring train, val and test splits...")
    t_eval = time.perf_counter()
    val_metrics = evaluate_baselines(fitted, df_val, split_label="val")
    test_metrics = evaluate_baselines(fitted, df_test, split_label="test")
    train_metrics = evaluate_baselines(fitted, df_train, split_label="train")
    metrics = pd.concat([train_metrics, val_metrics, test_metrics], ignore_index=True)
    log.info("scoring done in %.1f s; %d metric rows", time.perf_counter() - t_eval, len(metrics))

    metrics_path = args.out_dir / "metrics_long.parquet"
    metrics.to_parquet(metrics_path, index=False)
    log.info("wrote %s (%d rows)", metrics_path, len(metrics))

    # ----- acceptance gate (test, overall) ----------------------------------
    gate = acceptance_gate(metrics, split="test", family="__all__")
    gate_path = args.out_dir / "acceptance_gate.csv"
    gate.to_csv(gate_path, index=False)
    log.info("wrote %s; passing rows: %d/%d", gate_path, int(gate["passes"].sum()), len(gate))
    print("\n=== Acceptance gate (test split, all families) ===", flush=True)
    with pd.option_context("display.max_columns", None, "display.width", 160):
        print(gate.to_string(index=False))

    # ----- compact summary table per (algorithm, target) on test ------------
    test_overall = metrics.query("split == 'test' and scenario_family == '__all__'")
    pivot = (
        test_overall.pivot_table(
            index=["algorithm", "target"],
            columns="metric",
            values="value",
            aggfunc="first",
        )
        .reset_index()
        .sort_values(["target", "algorithm"])
    )
    pivot_path = args.out_dir / "test_summary.csv"
    pivot.to_csv(pivot_path, index=False)
    log.info("wrote %s", pivot_path)
    print("\n=== Per-(algorithm, target) test metrics ===", flush=True)
    with pd.option_context("display.max_columns", None, "display.width", 160):
        print(pivot.to_string(index=False))

    # ----- per-scenario breakdown on the primary metrics --------------------
    fam_rows = metrics.query(
        "split == 'test' and scenario_family != '__all__' and metric in ('r2', 'auc')"
    )
    fam_pivot = (
        fam_rows.pivot_table(
            index=["algorithm", "target", "metric"],
            columns="scenario_family",
            values="value",
            aggfunc="first",
        )
        .reset_index()
        .sort_values(["target", "metric", "algorithm"])
    )
    fam_pivot_path = args.out_dir / "test_per_family.csv"
    fam_pivot.to_csv(fam_pivot_path, index=False)
    log.info("wrote %s", fam_pivot_path)

    # ----- fit-time table ---------------------------------------------------
    fit_rows = [
        {"algorithm": k[0], "target": k[1], "fit_seconds": v} for k, v in fitted.fit_seconds.items()
    ]
    # Explicit columns keep the sort (and the CSV header) valid even when no
    # fit timings were recorded; a column-less frame would raise in sort_values.
    fit_df = pd.DataFrame(
        fit_rows, columns=["algorithm", "target", "fit_seconds"]
    ).sort_values(["algorithm", "target"])
    fit_path = args.out_dir / "fit_seconds.csv"
    fit_df.to_csv(fit_path, index=False)
    log.info("wrote %s (%.1f s wall-clock total fit)", fit_path, fit_elapsed)

    # ----- registry rover Layer-1 sanity ------------------------------------
    if not args.no_registry_check:
        log.info("running registry-rover sanity check...")
        try:
            sanity = predict_for_registry_rovers(fitted)
            sanity_path = args.out_dir / "registry_sanity.csv"
            sanity.to_csv(sanity_path, index=False)
            log.info("wrote %s (%d rows)", sanity_path, len(sanity))
            _print_registry_sanity_summary(sanity)
        except Exception as exc:  # pragma: no cover — diagnostic, not fatal
            # Deliberately best-effort: the reports above are already on disk.
            # exc_info keeps the traceback so the failure is still debuggable.
            log.warning("registry-rover sanity check failed: %s", exc, exc_info=True)

    return 0
def _print_registry_sanity_summary(sanity: pd.DataFrame) -> None:
    """Print the registry-rover Layer-1 check as a primary and a diagnostic report.

    Rows flagged ``is_primary`` form the design-axis block: regression
    targets are summarised as the median ``|rel_error|`` (percent) per rover
    and target, and ``FEASIBILITY_COLUMN`` rows as the per-rover mean of
    "thresholded prediction (``>= 0.5``) equals the evaluator label".
    The remaining rows form the scenario-OOD diagnostic block and get the
    same median-error table.

    The primary / diagnostic split is motivated in
    ``roverdevkit.surrogate.baselines.LAYER1_PRIMARY_TARGETS``. Range and
    energy margin sit in the diagnostic block because the registry's
    published mission distances are 100-1000x smaller than the LHS family
    budgets: a *scenario*-OOD effect, not a surrogate-calibration failure.
    """
    wide_display = ("display.max_columns", None, "display.width", 160)

    def _median_abs_pct(rows: pd.DataFrame) -> pd.DataFrame:
        """Median absolute relative error in percent: rovers down, targets across."""
        with_pct = rows.assign(abs_pct=lambda d: 100 * d["rel_error"].abs())
        per_pair = with_pct.groupby(["rover", "target"])["abs_pct"].median()
        return per_pair.unstack("target")

    primary_flag = sanity["is_primary"]
    primary_rows = sanity[primary_flag].copy()
    diagnostic_rows = sanity[~primary_flag].copy()

    # ----- primary block: design-axis acceptance targets --------------------
    print(
        "\n=== Registry-rover Layer-1 sanity (PRIMARY: design-axis targets) ===",
        flush=True,
    )
    print(
        "Acceptance set: total_mass_kg, slope_capability_deg, stalled.",
        flush=True,
    )

    is_feasibility = primary_rows["target"] == FEASIBILITY_COLUMN
    regression_rows = primary_rows[~is_feasibility]
    feasibility_rows = primary_rows[is_feasibility]

    if not regression_rows.empty:
        regression_table = _median_abs_pct(regression_rows)
        with pd.option_context(*wide_display):
            print("Median |relative error| (%) across algorithms (regression):")
            print(regression_table.round(2).to_string())

    if not feasibility_rows.empty:
        # A hit is a predicted probability thresholded at 0.5 that matches
        # the evaluator's 0/1 label.
        predicted_label = (feasibility_rows["predicted"] >= 0.5).astype(int)
        hit = predicted_label == feasibility_rows["evaluator"].astype(int)
        accuracy = feasibility_rows.assign(hit=hit).groupby("rover")["hit"].mean()
        accuracy_table = accuracy.rename("classifier_accuracy").to_frame()
        with pd.option_context(*wide_display):
            print("\nClassifier accuracy across algorithms (stalled):")
            print(accuracy_table.round(3).to_string())

    # ----- diagnostic block: scenario-OOD targets ---------------------------
    print(
        "\n=== Registry-rover Layer-1 diagnostic (SCENARIO-OOD; not part of acceptance) ===",
        flush=True,
    )
    print(
        "These targets are reported for transparency only. The registry's "
        "published mission\ndistances are 100-1000x smaller than the LHS "
        "family budgets, so the relative errors\nbelow reflect that scale "
        "mismatch rather than physical model accuracy. See SCHEMA.md "
        "v4 entry.",
        flush=True,
    )

    if diagnostic_rows.empty:
        return

    diagnostic_table = _median_abs_pct(diagnostic_rows)
    with pd.option_context(*wide_display):
        print("Median |relative error| (%) across algorithms:")
        print(diagnostic_table.round(2).to_string())
# Script entry point: run the pipeline and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)