Spaces:

jjreif
/

roverdevkit

Running

App Files Files Community

roverdevkit / webapp /backend /tests /test_evaluate.py

jjreif

Deploy roverdevkit @ 2676a67

b3d14e3 11 days ago

Raw

History Blame Contribute Delete

12.4 kB

	"""Smoke tests for ``POST /evaluate``.

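	These run the analytical mission evaluator end-to-end. Unlike the
	predict tests, they do not depend on the quantile-calibration artifact;
	the one exception is the evaluate-vs-predict agreement test, which
	bails out when the surrogate bundles are unavailable.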
	"""

	from __future__ import annotations

	import math

	import pytest
	from fastapi.testclient import TestClient

	# Exact set of metric ``target`` names the ``/evaluate`` tests expect to
	# find in ``body["metrics"]``.
	PRIMARY_TARGETS: set[str] = {
	    "total_mass_kg",
	    "range_km",
	    "slope_capability_deg",
	    "energy_margin_raw_pct",
	}


	def test_evaluate_returns_all_primary_targets(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Check the overall shape of a successful ``POST /evaluate`` response.

	    Posts the sample design against ``equatorial_mare_traverse`` and
	    verifies the metric targets, the ``thermal`` / ``stall`` /
	    ``architecture`` sub-objects, and the top-level runtime fields.
	    """
	    thermal_keys = (
	        "survives",
	        "peak_sun_temp_c",
	        "lunar_night_temp_c",
	        "min_operating_temp_c",
	        "max_operating_temp_c",
	        "rhu_power_w",
	        "hibernation_power_w",
	        "surface_area_m2",
	        "hot_case_ok",
	        "cold_case_ok",
	    )
	    stall_keys = (
	        "stalled",
	        "peak_torque_demand_nm",
	        "peak_torque_capacity_nm",
	    )
	    architecture_keys = (
	        "mobility_architecture",
	        "obstacle_capability_m",
	        "required_obstacle_height_m",
	        "obstacle_margin_m",
	        "obstacle_requirement_met",
	        "architecture_mass_kg",
	    )

	    response = client.post(
	        "/evaluate",
	        json={"design": sample_design, "scenario_name": "equatorial_mare_traverse"},
	    )
	    assert response.status_code == 200, response.text
	    body = response.json()

	    assert body["scenario_name"] == "equatorial_mare_traverse"
	    metrics = body["metrics"]
	    assert {entry["target"] for entry in metrics} == PRIMARY_TARGETS
	    assert all(isinstance(entry["value"], (int, float)) for entry in metrics)

	    thermal = body["thermal"]
	    missing_thermal = [name for name in thermal_keys if name not in thermal]
	    assert not missing_thermal, missing_thermal
	    # The default architecture has a -30/+50 °C envelope; these are the
	    # limits the survival flag is judged against.
	    assert thermal["min_operating_temp_c"] == -30.0
	    assert thermal["max_operating_temp_c"] == 50.0

	    # Schema v6: the per-evaluation drivetrain diagnostic was renamed from
	    # ``motor_torque`` to ``stall`` and exposes explicit slip / capacity
	    # headroom instead of the v5 OK/NOT-OK composite. See
	    # ``StallDiagnosticOut`` in webapp.backend.schemas.
	    stall = body["stall"]
	    missing_stall = [name for name in stall_keys if name not in stall]
	    assert not missing_stall, missing_stall
	    assert stall["peak_torque_demand_nm"] >= 0.0
	    assert stall["peak_torque_capacity_nm"] > 0.0

	    arch = body["architecture"]
	    missing_arch = [name for name in architecture_keys if name not in arch]
	    assert not missing_arch, missing_arch

	    # Schema v6 also surfaces the runtime-derived effective duty cycle and
	    # cruise speed at the top level, so the frontend can show what the
	    # evaluator actually used (vs. the design's δ_des).
	    for top_level_field in ("effective_duty_cycle", "cruise_speed_mps"):
	        assert top_level_field in body
	    assert 0.0 <= body["effective_duty_cycle"] <= 0.6
	    assert body["cruise_speed_mps"] >= 0.0

	    assert body["elapsed_ms"] > 0


	def test_evaluate_thermal_cold_case_drives_failure_for_no_rhu_design(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Pin the cold case as the failing thermal case for the 0 W RHU design.

	    The default architecture carries no RHU. With 2 W of hibernation
	    power, a 0.2-ish m² enclosure radiates to ~133 K (well below the
	    −30 °C limit), while the hot case sits comfortably under +50 °C at
	    any latitude. The dialog relies on this distinction to explain why
	    survival fails, so it is pinned here.
	    """
	    response = client.post(
	        "/evaluate",
	        json={"design": sample_design, "scenario_name": "equatorial_mare_traverse"},
	    )
	    assert response.status_code == 200
	    thermal = response.json()["thermal"]

	    # A non-surviving design must be failing on the cold case.
	    assert thermal["survives"] or not thermal["cold_case_ok"]
	    # The hot case should never be the failure for this sample design at
	    # equatorial latitude (sanity guard against a regression that silently
	    # flips the model).
	    assert thermal["hot_case_ok"]


	def test_evaluate_payload_override_increases_total_mass(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Schema v9: ``payload_mass_kg`` is a top-level mass line item.

	    Evaluates the same design with 0 kg and 10 kg payload overrides and
	    checks that ``total_mass_kg`` rises by the payload, one-for-one.
	    """

	    def total_mass(resp) -> float:
	        # Index the metrics list by target name, then pick the mass entry.
	        by_target = {m["target"]: m["value"] for m in resp.json()["metrics"]}
	        return by_target["total_mass_kg"]

	    request = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
	    empty_resp = client.post("/evaluate", json={**request, "payload_mass_kg": 0.0})
	    loaded_resp = client.post("/evaluate", json={**request, "payload_mass_kg": 10.0})
	    assert empty_resp.status_code == 200, empty_resp.text
	    assert loaded_resp.status_code == 200, loaded_resp.text

	    empty_mass = total_mass(empty_resp)
	    loaded_mass = total_mass(loaded_resp)
	    # Payload sits outside the dry-mass growth margin, so the delta is the
	    # payload itself (no extra margin applied on top).
	    assert loaded_mass == pytest.approx(empty_mass + 10.0, abs=1e-6)


	def test_evaluate_payload_power_override_reduces_range(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Schema v9: ``payload_power_w`` adds to the continuous ops-time load.

	    A 25 W override must therefore never yield more range than the 0 W
	    baseline (it typically yields less); a 1e-9 allowance absorbs float
	    noise.
	    """

	    def range_km(resp) -> float:
	        # Index the metrics list by target name, then pick the range entry.
	        by_target = {m["target"]: m["value"] for m in resp.json()["metrics"]}
	        return by_target["range_km"]

	    request = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
	    unpowered = client.post("/evaluate", json={**request, "payload_power_w": 0.0})
	    powered = client.post("/evaluate", json={**request, "payload_power_w": 25.0})
	    assert unpowered.status_code == 200, unpowered.text
	    assert powered.status_code == 200, powered.text

	    baseline_range = range_km(unpowered)
	    loaded_range = range_km(powered)
	    assert loaded_range <= baseline_range + 1e-9


	def test_evaluate_rejects_out_of_bounds_payload(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Payload overrides are bounded ``[0, 30]`` at the HTTP boundary.

	    A 999 kg ``payload_mass_kg`` must be rejected with HTTP 422.
	    """
	    oversized_request = {
	        "design": sample_design,
	        "scenario_name": "equatorial_mare_traverse",
	        "payload_mass_kg": 999.0,
	    }
	    response = client.post("/evaluate", json=oversized_request)
	    assert response.status_code == 422


	def test_evaluate_mission_duration_override_increases_range(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """A longer ``mission_duration_earth_days`` must not shorten the range.

	    A longer duration extends the simulation window, so ``range_km`` for
	    the 28-day override should be at least the 7-day value on the mare
	    traverse. The comparison is deliberately non-strict (with a 1e-9
	    float allowance, like the payload-power test) so the test does not
	    fail spuriously if the range saturates, e.g. should the traverse
	    become cap-limited rather than energy-limited.
	    """

	    def range_km(resp) -> float:
	        # Index the metrics list by target name, then pick the range entry.
	        by_target = {m["target"]: m["value"] for m in resp.json()["metrics"]}
	        return by_target["range_km"]

	    base = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
	    short_resp = client.post(
	        "/evaluate", json={**base, "mission_duration_earth_days": 7.0}
	    )
	    long_resp = client.post(
	        "/evaluate", json={**base, "mission_duration_earth_days": 28.0}
	    )
	    assert short_resp.status_code == 200, short_resp.text
	    assert long_resp.status_code == 200, long_resp.text

	    short_range = range_km(short_resp)
	    long_range = range_km(long_resp)
	    # Non-decreasing, matching the documented contract; the diagnostic
	    # tuple shows both values on failure.
	    assert long_range >= short_range - 1e-9, (short_range, long_range)


	def test_evaluate_rejects_out_of_bounds_mission_duration(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Mission-duration overrides are bounded ``[0.5, 90]`` at the HTTP boundary.

	    A 0.1-day ``mission_duration_earth_days`` must be rejected with HTTP 422.
	    """
	    too_short_request = {
	        "design": sample_design,
	        "scenario_name": "equatorial_mare_traverse",
	        "mission_duration_earth_days": 0.1,
	    }
	    response = client.post("/evaluate", json=too_short_request)
	    assert response.status_code == 422


	def test_evaluate_unknown_scenario_returns_404(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """An unrecognised ``scenario_name`` must produce HTTP 404."""
	    response = client.post(
	        "/evaluate",
	        json={"design": sample_design, "scenario_name": "no_such_scenario"},
	    )
	    assert response.status_code == 404


	def test_evaluate_rejects_out_of_bounds_design(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """A design with ``wheel_radius_m`` set to 5.0 must be rejected with HTTP 422."""
	    # Copy the fixture with one field overridden so the shared dict is
	    # left untouched.
	    oversized_wheel_design = {**sample_design, "wheel_radius_m": 5.0}
	    request = {
	        "design": oversized_wheel_design,
	        "scenario_name": "equatorial_mare_traverse",
	    }
	    response = client.post("/evaluate", json=request)
	    assert response.status_code == 422


	def test_evaluate_values_match_primary_metrics_shape(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	) -> None:
	    """Sanity-check the projection of ``MissionMetrics`` onto the four primary targets.

	    Evaluated on ``polar_prospecting``: total mass must be strictly
	    positive, range non-negative, and slope within ``[0°, 90°]``. Energy
	    margin is unbounded but must be a finite number. This is a coarse
	    "no NaN / inf / null snuck through" guard.
	    """
	    payload = {"design": sample_design, "scenario_name": "polar_prospecting"}
	    response = client.post("/evaluate", json=payload)
	    assert response.status_code == 200
	    body = response.json()
	    by_target = {m["target"]: m["value"] for m in body["metrics"]}

	    assert by_target["total_mass_kg"] > 0
	    assert by_target["range_km"] >= 0
	    assert 0 <= by_target["slope_capability_deg"] <= 90

	    margin = by_target["energy_margin_raw_pct"]
	    # ``x == x`` only rejects NaN; it lets ±inf through, and also a JSON
	    # ``null`` (``None == None``). Require a real, finite number instead.
	    assert isinstance(margin, (int, float)), margin
	    assert math.isfinite(margin), margin


	def test_evaluate_and_predict_agree_within_surrogate_noise_floor(
	    client: TestClient,
	    sample_design: dict[str, float | int],
	    surrogate_v7_1_compatible: bool,
	) -> None:
	    """The surrogate's median should track the evaluator within its noise floor.

	    Runs ``/evaluate`` and ``/predict`` on the canonical equatorial-mare
	    scenario for the Yutu-2-ish sample design and compares the evaluator
	    value with the surrogate's ``q50`` for every primary target. Each
	    target gets a generous tolerance rather than exact equality, so the
	    test does not flake on XGBoost-version churn or harmless
	    quantile-head retrains.

	    The test is skipped when the schema-v7_1 quantile bundles are
	    unavailable, or when ``/predict`` answers 503.
	    """
	    if not surrogate_v7_1_compatible:
	        pytest.skip(
	            "schema-v7_1 quantile_bundles.joblib not on disk; pre-v7_1 "
	            "bundles lack scenario_operational_duty_cycle and KeyError "
	            "on the v7_1 feature row."
	        )
	    payload = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
	    eval_resp = client.post("/evaluate", json=payload)
	    pred_resp = client.post("/predict", json=payload)
	    assert eval_resp.status_code == 200, eval_resp.text
	    if pred_resp.status_code == 503:
	        # Quantile bundles missing (mirrors the predict-test skip path).
	        # Report a skip rather than returning, so the run does not show a
	        # pass when nothing was compared.
	        pytest.skip("POST /predict returned 503; quantile bundles unavailable.")
	    assert pred_resp.status_code == 200, pred_resp.text

	    evaluator = {m["target"]: m["value"] for m in eval_resp.json()["metrics"]}
	    surrogate = {p["target"]: p["q50"] for p in pred_resp.json()["predictions"]}

	    # Per-target relative tolerance on the median. Energy margin runs
	    # large positive on equatorial-mare, so it uses an absolute tolerance
	    # instead (a 5 pp gap on a 600 % margin is still <1 % relative error).
	    #
	    # slope_capability_deg: the tolerance is ~2x the v9 surrogate's overall
	    # test RMSE (0.930 deg) divided by a typical equatorial-mare
	    # sample-design slope capability (~22 deg), i.e. tight enough to catch
	    # wiring bugs but loose enough not to flake on a single-point tail
	    # residual at the surrogate noise floor. Widened from 0.08 (v6) to
	    # 0.10 (v9) because the v9 median head is marginally noisier on slope
	    # after the payload-feature retrain (test R² 0.978). See
	    # ``reports/surrogate_v9/median_sanity.csv``.
	    #
	    # total_mass_kg: a near-analytic function of design + payload, so the
	    # median head learns it to high precision (test R² 0.999, RMSE
	    # 0.567 kg). The 0.04 rel tol (~1.8 kg at this ~44 kg sample design)
	    # is ~3x RMSE: a single-point tail allowance now that payload is an
	    # extra LHS input adding a little variance, still tight enough to
	    # catch a units / wiring regression.
	    rel_tol = {
	        "range_km": 0.10,
	        "slope_capability_deg": 0.10,
	        "total_mass_kg": 0.04,
	    }
	    for tgt, tol in rel_tol.items():
	        e = evaluator[tgt]
	        s = surrogate[tgt]
	        # The 1e-3 floor keeps the bound meaningful when the evaluator
	        # value is at or near zero.
	        assert abs(e - s) <= max(tol * abs(e), 1e-3), (tgt, e, s)

	    # Energy margin: tolerate a 50-pp gap in absolute terms.
	    margin_gap = abs(
	        evaluator["energy_margin_raw_pct"] - surrogate["energy_margin_raw_pct"]
	    )
	    assert margin_gap <= 50, (
	        "energy_margin_raw_pct",
	        evaluator["energy_margin_raw_pct"],
	        surrogate["energy_margin_raw_pct"],
	    )