Spaces:

jjreif
/

roverdevkit

Running

App Files Files Community

roverdevkit / scripts /run_baselines.py

jjreif

Deploy roverdevkit @ 2676a67

b3d14e3 10 days ago

Raw

History Blame Contribute Delete

11.5 kB

	"""Train and score the baseline surrogate matrix on a Parquet dataset.

	Single canonical entry point for the baseline-surrogate §6 step-4 acceptance run:
	fit Ridge / RF / XGBoost per target, the joint MLP across all primary
	targets, and LogReg / XGBoost feasibility classifiers; then score them
	on the held-out test split (with a per-scenario-family breakdown) and
	run the registry-rover Layer-1 sanity check.

	Outputs (under ``--out-dir``):

	- ``metrics_long.parquet`` — tidy long-format frame
	``(algorithm, target, split, scenario_family, metric, value)``.
	- ``acceptance_gate.csv`` — one row per ``(algorithm, target)`` with
	the plan's threshold, observed value, and pass/fail.
	- ``registry_sanity.csv`` — predictions for Pragyan / Yutu-2 /
	MoonRanger / Rashid-1 vs. the deterministic evaluator (Layer-1 truth).
	Pragyan and Yutu-2 are flown rovers; MoonRanger and Rashid-1 are
	design-target lunar micro-rovers (never deployed) included for
	Layer-1 OOD coverage of the surrogate's input space.
	Each row carries an ``is_primary`` flag. ``True`` rows
	(``total_mass_kg``, ``slope_capability_deg``, ``stalled``)
	are the design-axis Layer-1 acceptance set; ``False`` rows
	(``range_km``, ``energy_margin_raw_pct``) are scenario-OOD
	diagnostics — see ``roverdevkit.surrogate.baselines``
	``LAYER1_PRIMARY_TARGETS`` / ``LAYER1_DIAGNOSTIC_TARGETS``.
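	- ``fit_seconds.csv`` — per-fit wall-clock for the writeup.
	- ``test_summary.csv`` — one row per ``(algorithm, target)`` with every
	test-split metric aggregated over all scenario families.
	- ``test_per_family.csv`` — per-scenario-family ``r2`` / ``auc`` on the
	test split.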

	Examples
	--------
	::

	# Full 40k acceptance run (current canonical dataset, analytical Bekker-Wong)
	python scripts/run_baselines.py \\
	--dataset data/analytical/lhs_v9.parquet \\
	--out-dir reports/baselines_v9

	# Fast pilot smoke (skip MLP, smaller forest)
	python scripts/run_baselines.py \\
	--dataset data/analytical/lhs_pilot.parquet \\
	--out-dir reports/baselines_pilot \\
	--no-mlp
	"""

	from __future__ import annotations

	import argparse
	import logging
	import sys
	import time
	from pathlib import Path

	import pandas as pd

	from roverdevkit.surrogate.baselines import (
	acceptance_gate,
	evaluate_baselines,
	fit_baselines,
	predict_for_registry_rovers,
	)
	from roverdevkit.surrogate.dataset import read_parquet
	from roverdevkit.surrogate.features import FEASIBILITY_COLUMN


	def _parse_args(argv: list[str] \| None = None) -> argparse.Namespace:
	p = argparse.ArgumentParser(
	description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
	)
	p.add_argument(
	"--dataset",
	type=Path,
	required=True,
	help="Path to the Parquet dataset produced by scripts/build_dataset.py.",
	)
	p.add_argument(
	"--out-dir",
	type=Path,
	required=True,
	help="Directory for the output reports (created if missing).",
	)
	p.add_argument("--seed", type=int, default=42, help="Estimator random_state.")
	p.add_argument(
	"--n-jobs",
	type=int,
	default=-1,
	help="Plumbed through to RF / XGBoost. -1 uses all cores.",
	)
	p.add_argument(
	"--no-mlp",
	action="store_true",
	help="Skip fitting the joint MLP. Useful for fast smokes.",
	)
	p.add_argument(
	"--no-registry-check",
	action="store_true",
	help="Skip the registry-rover Layer-1 sanity check.",
	)
	p.add_argument(
	"--log-level",
	default="INFO",
	choices=("DEBUG", "INFO", "WARNING", "ERROR"),
	)
	return p.parse_args(argv)


	def main(argv: list[str] | None = None) -> int:
	    """Fit the baselines, score every split and write the report files.

	    Steps, in order: load the dataset, fit on the ``train`` split, score the
	    ``train`` / ``val`` / ``test`` splits, write ``metrics_long.parquet``,
	    ``acceptance_gate.csv``, ``test_summary.csv``, ``test_per_family.csv``
	    and ``fit_seconds.csv`` under ``--out-dir``, then (unless
	    ``--no-registry-check`` is given) run the registry-rover sanity check
	    and write ``registry_sanity.csv``.

	    Parameters
	    ----------
	    argv
	        Command-line arguments; ``None`` lets argparse read ``sys.argv``.

	    Returns
	    -------
	    int
	        Process exit code; ``0`` when the run completes.

	    Raises
	    ------
	    KeyError
	        If the loaded dataset has no ``split`` column.
	    """
	    args = _parse_args(argv)
	    logging.basicConfig(
	        level=args.log_level,
	        format="%(asctime)s %(levelname)s %(message)s",
	        datefmt="%H:%M:%S",
	    )
	    log = logging.getLogger("run_baselines")

	    args.out_dir.mkdir(parents=True, exist_ok=True)
	    log.info("loading dataset from %s", args.dataset)
	    df = read_parquet(args.dataset)
	    log.info(
	        "loaded %d rows x %d cols; splits: %s",
	        len(df),
	        len(df.columns),
	        df["split"].value_counts().to_dict() if "split" in df.columns else {},
	    )

	    # Everything below partitions on "split"; fail with an actionable message
	    # instead of a bare KeyError('split') from the first indexing expression.
	    if "split" not in df.columns:
	        raise KeyError(
	            f"dataset {args.dataset} has no 'split' column; "
	            "cannot select the train/val/test partitions"
	        )

	    df_train = df[df["split"] == "train"]
	    df_val = df[df["split"] == "val"]
	    df_test = df[df["split"] == "test"]
	    log.info("train=%d val=%d test=%d", len(df_train), len(df_val), len(df_test))

	    # ----- fit ---------------------------------------------------------------
	    t_fit = time.perf_counter()
	    fitted = fit_baselines(
	        df_train,
	        fit_mlp=not args.no_mlp,
	        n_jobs=args.n_jobs,
	        random_state=args.seed,
	        verbose=True,
	    )
	    fit_elapsed = time.perf_counter() - t_fit
	    log.info("fit complete in %.1f s", fit_elapsed)

	    # ----- evaluate (train + val + test, with per-scenario-family breakdown) -
	    log.info("scoring val and test splits...")
	    t_eval = time.perf_counter()
	    val_metrics = evaluate_baselines(fitted, df_val, split_label="val")
	    test_metrics = evaluate_baselines(fitted, df_test, split_label="test")
	    train_metrics = evaluate_baselines(fitted, df_train, split_label="train")
	    metrics = pd.concat([train_metrics, val_metrics, test_metrics], ignore_index=True)
	    log.info("scoring done in %.1f s; %d metric rows", time.perf_counter() - t_eval, len(metrics))

	    metrics_path = args.out_dir / "metrics_long.parquet"
	    metrics.to_parquet(metrics_path, index=False)
	    log.info("wrote %s (%d rows)", metrics_path, len(metrics))

	    # ----- acceptance gate (test, overall) ----------------------------------
	    gate = acceptance_gate(metrics, split="test", family="__all__")
	    gate_path = args.out_dir / "acceptance_gate.csv"
	    gate.to_csv(gate_path, index=False)
	    log.info("wrote %s; passing rows: %d/%d", gate_path, int(gate["passes"].sum()), len(gate))
	    print("\n=== Acceptance gate (test split, all families) ===", flush=True)
	    with pd.option_context("display.max_columns", None, "display.width", 160):
	        print(gate.to_string(index=False))

	    # ----- compact summary table per (algorithm, target) on test ------------
	    test_overall = metrics.query("split == 'test' and scenario_family == '__all__'")
	    pivot = (
	        test_overall.pivot_table(
	            index=["algorithm", "target"],
	            columns="metric",
	            values="value",
	            aggfunc="first",
	        )
	        .reset_index()
	        .sort_values(["target", "algorithm"])
	    )
	    pivot_path = args.out_dir / "test_summary.csv"
	    pivot.to_csv(pivot_path, index=False)
	    log.info("wrote %s", pivot_path)
	    print("\n=== Per-(algorithm, target) test metrics ===", flush=True)
	    with pd.option_context("display.max_columns", None, "display.width", 160):
	        print(pivot.to_string(index=False))

	    # ----- per-scenario breakdown on the primary metrics --------------------
	    fam_rows = metrics.query(
	        "split == 'test' and scenario_family != '__all__' and metric in ('r2', 'auc')"
	    )
	    fam_pivot = (
	        fam_rows.pivot_table(
	            index=["algorithm", "target", "metric"],
	            columns="scenario_family",
	            values="value",
	            aggfunc="first",
	        )
	        .reset_index()
	        .sort_values(["target", "metric", "algorithm"])
	    )
	    fam_pivot_path = args.out_dir / "test_per_family.csv"
	    fam_pivot.to_csv(fam_pivot_path, index=False)
	    log.info("wrote %s", fam_pivot_path)

	    # ----- fit-time table ---------------------------------------------------
	    # fit_seconds is keyed by (algorithm, target) pairs.
	    fit_rows = [
	        {"algorithm": k[0], "target": k[1], "fit_seconds": v}
	        for k, v in fitted.fit_seconds.items()
	    ]
	    # Pin the columns so an empty fit_seconds mapping still yields a sortable
	    # (empty) frame rather than a KeyError from sort_values.
	    fit_df = pd.DataFrame(
	        fit_rows, columns=["algorithm", "target", "fit_seconds"]
	    ).sort_values(["algorithm", "target"])
	    fit_path = args.out_dir / "fit_seconds.csv"
	    fit_df.to_csv(fit_path, index=False)
	    log.info("wrote %s (%.1f s wall-clock total fit)", fit_path, fit_elapsed)

	    # ----- registry rover Layer-1 sanity ------------------------------------
	    if not args.no_registry_check:
	        log.info("running registry-rover sanity check...")
	        try:
	            sanity = predict_for_registry_rovers(fitted)
	            sanity_path = args.out_dir / "registry_sanity.csv"
	            sanity.to_csv(sanity_path, index=False)
	            log.info("wrote %s (%d rows)", sanity_path, len(sanity))
	            _print_registry_sanity_summary(sanity)
	        except Exception as exc:  # pragma: no cover — diagnostic, not fatal
	            # Deliberately best-effort: the reports above are already on disk.
	            # Keep the traceback so the failure can still be diagnosed.
	            log.warning("registry-rover sanity check failed: %s", exc, exc_info=True)

	    return 0


	def _median_abs_pct_error(rows: pd.DataFrame) -> pd.DataFrame:
	    """Return median ``|rel_error|`` in percent as a rover x target table."""
	    return (
	        rows.assign(abs_pct=lambda d: 100 * d["rel_error"].abs())
	        .groupby(["rover", "target"])["abs_pct"]
	        .median()
	        .unstack("target")
	    )


	def _print_registry_sanity_summary(sanity: pd.DataFrame) -> None:
	    """Print Layer-1 sanity in two tables: design-axis primary + scenario-OOD diagnostic.

	    See ``roverdevkit.surrogate.baselines.LAYER1_PRIMARY_TARGETS`` for
	    the rationale for the split. Range / energy_margin live in the
	    diagnostic block because the registry's published mission distances
	    are 100-1000x smaller than the LHS family budgets, which is a
	    scenario-OOD effect rather than a surrogate-calibration failure.

	    Parameters
	    ----------
	    sanity
	        Frame with one row per prediction. The columns read here are
	        ``rover``, ``target``, ``predicted``, ``evaluator``, ``rel_error``
	        and the boolean ``is_primary`` flag.

	    Returns
	    -------
	    None
	        Output goes to stdout only.
	    """
	    # Coerce the flag so ``~`` is a logical NOT even if the column arrives
	    # with object dtype (bitwise NOT on Python bools would give -1 / -2).
	    is_primary = sanity["is_primary"].astype(bool)
	    primary = sanity[is_primary].copy()
	    diagnostic = sanity[~is_primary].copy()

	    print(
	        "\n=== Registry-rover Layer-1 sanity (PRIMARY: design-axis targets) ===",
	        flush=True,
	    )
	    print(
	        "Acceptance set: total_mass_kg, slope_capability_deg, stalled.",
	        flush=True,
	    )

	    # Regression targets: everything in the primary set except feasibility.
	    regressor_primary = primary[primary["target"] != FEASIBILITY_COLUMN]
	    if not regressor_primary.empty:
	        primary_summary = _median_abs_pct_error(regressor_primary)
	        with pd.option_context("display.max_columns", None, "display.width", 160):
	            print("Median |relative error| (%) across algorithms (regression):")
	            print(primary_summary.round(2).to_string())

	    # Feasibility classifier: a prediction counts as a hit when thresholding
	    # it at 0.5 reproduces the evaluator's label.
	    classifier_primary = primary[primary["target"] == FEASIBILITY_COLUMN]
	    if not classifier_primary.empty:
	        clf_summary = (
	            classifier_primary.assign(
	                hit=lambda d: (d["predicted"] >= 0.5).astype(int) == d["evaluator"].astype(int)
	            )
	            .groupby("rover")["hit"]
	            .mean()
	            .rename("classifier_accuracy")
	            .to_frame()
	        )
	        with pd.option_context("display.max_columns", None, "display.width", 160):
	            print("\nClassifier accuracy across algorithms (stalled):")
	            print(clf_summary.round(3).to_string())

	    print(
	        "\n=== Registry-rover Layer-1 diagnostic (SCENARIO-OOD; not part of acceptance) ===",
	        flush=True,
	    )
	    print(
	        "These targets are reported for transparency only. The registry's "
	        "published mission\ndistances are 100-1000x smaller than the LHS "
	        "family budgets, so the relative errors\nbelow reflect that scale "
	        "mismatch rather than physical model accuracy. See SCHEMA.md "
	        "v4 entry.",
	        flush=True,
	    )
	    if not diagnostic.empty:
	        diagnostic_summary = _median_abs_pct_error(diagnostic)
	        with pd.option_context("display.max_columns", None, "display.width", 160):
	            print("Median |relative error| (%) across algorithms:")
	            print(diagnostic_summary.round(2).to_string())


	# Script entry point: propagate main()'s return value as the process exit code.
	if __name__ == "__main__":
	    sys.exit(main())