roverdevkit / webapp /backend /tests /test_evaluate.py
jjreif's picture
Deploy roverdevkit @ 2676a67
b3d14e3
Raw
History Blame Contribute Delete
12.4 kB
"""Smoke tests for ``POST /evaluate``.

These run the analytical mission evaluator end-to-end. Unlike the
predict tests, they do *not* depend on the quantile-calibration artifact.
"""
from __future__ import annotations

import math

import pytest
from fastapi.testclient import TestClient
# Metric targets that the ``/evaluate`` tests below expect to find in the
# response's ``metrics`` list (listed alphabetically; order is irrelevant
# for a set).
PRIMARY_TARGETS: set[str] = {
    "energy_margin_raw_pct",
    "range_km",
    "slope_capability_deg",
    "total_mass_kg",
}
def test_evaluate_returns_all_primary_targets(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Check the overall shape of a successful ``POST /evaluate`` response.

    Posts the sample design against ``equatorial_mare_traverse`` and
    verifies the metric targets, the ``thermal`` / ``stall`` /
    ``architecture`` diagnostic blocks, and the top-level runtime fields.
    """
    response = client.post(
        "/evaluate",
        json={"design": sample_design, "scenario_name": "equatorial_mare_traverse"},
    )
    assert response.status_code == 200, response.text

    body = response.json()
    assert body["scenario_name"] == "equatorial_mare_traverse"

    # Exactly the primary targets, each carrying a numeric value.
    metrics = body["metrics"]
    assert {entry["target"] for entry in metrics} == PRIMARY_TARGETS
    assert all(isinstance(entry["value"], (int, float)) for entry in metrics)

    # --- thermal block -------------------------------------------------
    thermal = body["thermal"]
    thermal_fields = (
        "survives",
        "peak_sun_temp_c",
        "lunar_night_temp_c",
        "min_operating_temp_c",
        "max_operating_temp_c",
        "rhu_power_w",
        "hibernation_power_w",
        "surface_area_m2",
        "hot_case_ok",
        "cold_case_ok",
    )
    missing_thermal = [name for name in thermal_fields if name not in thermal]
    assert not missing_thermal
    # The default architecture has a -30/+50 °C envelope, and these are
    # the limits the survival flag is judged against.
    assert thermal["min_operating_temp_c"] == -30.0
    assert thermal["max_operating_temp_c"] == 50.0

    # --- stall block ---------------------------------------------------
    # Schema v6: the per-evaluation drivetrain diagnostic was renamed from
    # ``motor_torque`` to ``stall`` and exposes the explicit slip /
    # capacity headroom rather than the v5 OK/NOT-OK composite. See
    # ``StallDiagnosticOut`` in webapp.backend.schemas.
    stall = body["stall"]
    stall_fields = (
        "stalled",
        "peak_torque_demand_nm",
        "peak_torque_capacity_nm",
    )
    missing_stall = [name for name in stall_fields if name not in stall]
    assert not missing_stall
    assert stall["peak_torque_demand_nm"] >= 0.0
    assert stall["peak_torque_capacity_nm"] > 0.0

    # --- architecture block --------------------------------------------
    arch = body["architecture"]
    arch_fields = (
        "mobility_architecture",
        "obstacle_capability_m",
        "required_obstacle_height_m",
        "obstacle_margin_m",
        "obstacle_requirement_met",
        "architecture_mass_kg",
    )
    missing_arch = [name for name in arch_fields if name not in arch]
    assert not missing_arch

    # --- top-level runtime fields --------------------------------------
    # Schema v6 also surfaces the runtime-derived effective duty cycle and
    # cruise speed at the top level so the frontend can show what the
    # evaluator actually used (vs. the design's δ_des).
    assert "effective_duty_cycle" in body
    assert 0.0 <= body["effective_duty_cycle"] <= 0.6
    assert "cruise_speed_mps" in body
    assert body["cruise_speed_mps"] >= 0.0
    assert body["elapsed_ms"] > 0
def test_evaluate_thermal_cold_case_drives_failure_for_no_rhu_design(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Pin *which* thermal case fails for the sample (no-RHU) design.

    The default architecture has 0 W RHU. With no RHU and 2 W of
    hibernation power, a 0.2-ish m² enclosure radiates to ~133 K (well
    below the −30 °C limit), while the hot case sits comfortably under
    +50 °C at any latitude. The dialog leans on this distinction to
    explain *why* survival fails, so a survival failure must come from
    the cold case and never from the hot case.
    """
    request_body = {
        "design": sample_design,
        "scenario_name": "equatorial_mare_traverse",
    }
    result = client.post("/evaluate", json=request_body)
    assert result.status_code == 200

    thermal_report = result.json()["thermal"]
    # Either the design survives, or the cold case is the one that failed.
    assert thermal_report["survives"] or not thermal_report["cold_case_ok"]
    # Hot case should never be the failure for this sample design at
    # equatorial latitude (sanity guard against a regression that
    # silently flips the model).
    assert thermal_report["hot_case_ok"]
def test_evaluate_payload_override_increases_total_mass(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Schema v9: a ``payload_mass_kg`` override adds ~one-for-one to total mass.

    Evaluates the same design and scenario with 0 kg and 10 kg of payload
    and checks that ``total_mass_kg`` differs by the payload itself.
    """

    def total_mass_of(resp):
        # Index the metrics list by target name, then pick out total mass.
        values = {item["target"]: item["value"] for item in resp.json()["metrics"]}
        return values["total_mass_kg"]

    request = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
    empty_resp = client.post("/evaluate", json=dict(request, payload_mass_kg=0.0))
    loaded_resp = client.post("/evaluate", json=dict(request, payload_mass_kg=10.0))
    assert empty_resp.status_code == 200, empty_resp.text
    assert loaded_resp.status_code == 200, loaded_resp.text

    empty_mass = total_mass_of(empty_resp)
    loaded_mass = total_mass_of(loaded_resp)
    # Payload sits outside the dry-mass growth margin, so the delta is the
    # payload itself (no extra margin applied on top).
    assert loaded_mass == pytest.approx(empty_mass + 10.0, abs=1e-6)
def test_evaluate_payload_power_override_reduces_range(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Schema v9: ``payload_power_w`` adds to the continuous ops-time load.

    A non-zero override must therefore never increase ``range_km`` (it
    typically shrinks it); equality within 1e-9 km is tolerated.
    """

    def range_of(resp):
        # Index the metrics list by target name, then pick out the range.
        values = {item["target"]: item["value"] for item in resp.json()["metrics"]}
        return values["range_km"]

    request = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
    quiet = client.post("/evaluate", json=dict(request, payload_power_w=0.0))
    noisy = client.post("/evaluate", json=dict(request, payload_power_w=25.0))
    assert quiet.status_code == 200, quiet.text
    assert noisy.status_code == 200, noisy.text

    range_without_load = range_of(quiet)
    range_with_load = range_of(noisy)
    assert range_with_load <= range_without_load + 1e-9
def test_evaluate_rejects_out_of_bounds_payload(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Payload overrides are bounded ``[0, 30]`` at the HTTP boundary.

    A 999 kg payload must be rejected with a 422 validation error.
    """
    oversized_request = {
        "design": sample_design,
        "scenario_name": "equatorial_mare_traverse",
        "payload_mass_kg": 999.0,
    }
    rejection = client.post("/evaluate", json=oversized_request)
    assert rejection.status_code == 422
def test_evaluate_mission_duration_override_increases_range(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """A longer ``mission_duration_earth_days`` must yield a longer range.

    Extending the simulation window from 7 to 28 Earth days on the mare
    traverse is expected to strictly increase ``range_km`` (by more than
    1e-9 km); the original rationale is that this scenario is
    energy-limited rather than cap-limited.
    """

    def range_of(resp):
        # Index the metrics list by target name, then pick out the range.
        values = {item["target"]: item["value"] for item in resp.json()["metrics"]}
        return values["range_km"]

    request = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
    short_resp = client.post(
        "/evaluate",
        json=dict(request, mission_duration_earth_days=7.0),
    )
    long_resp = client.post(
        "/evaluate",
        json=dict(request, mission_duration_earth_days=28.0),
    )
    assert short_resp.status_code == 200, short_resp.text
    assert long_resp.status_code == 200, long_resp.text

    short_range = range_of(short_resp)
    long_range = range_of(long_resp)
    assert long_range > short_range + 1e-9
def test_evaluate_rejects_out_of_bounds_mission_duration(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Mission-duration overrides are bounded ``[0.5, 90]`` at the HTTP boundary.

    A 0.1-day duration must be rejected with a 422 validation error.
    """
    too_short_request = {
        "design": sample_design,
        "scenario_name": "equatorial_mare_traverse",
        "mission_duration_earth_days": 0.1,
    }
    rejection = client.post("/evaluate", json=too_short_request)
    assert rejection.status_code == 422
def test_evaluate_unknown_scenario_returns_404(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """A scenario name the backend does not know must yield HTTP 404."""
    not_found = client.post(
        "/evaluate",
        json={"design": sample_design, "scenario_name": "no_such_scenario"},
    )
    assert not_found.status_code == 404
def test_evaluate_rejects_out_of_bounds_design(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """A design with a 5 m wheel radius must be rejected with HTTP 422."""
    # Build a modified copy so the shared fixture dict is left untouched.
    oversized_wheel = {**sample_design, "wheel_radius_m": 5.0}
    request_body = {
        "design": oversized_wheel,
        "scenario_name": "equatorial_mare_traverse",
    }
    rejection = client.post("/evaluate", json=request_body)
    assert rejection.status_code == 422
def test_evaluate_values_match_primary_metrics_shape(
    client: TestClient,
    sample_design: dict[str, float | int],
) -> None:
    """Sanity-check the projection of ``MissionMetrics`` onto the four primary targets.

    For the ``polar_prospecting`` scenario: total mass is strictly
    positive, range is non-negative, slope capability lies in
    ``[0°, 90°]``, and energy margin is unbounded but must be finite.
    This is a coarse "no NaN / inf snuck through" guard.
    """
    payload = {"design": sample_design, "scenario_name": "polar_prospecting"}
    response = client.post("/evaluate", json=payload)
    assert response.status_code == 200
    body = response.json()
    by_target = {m["target"]: m["value"] for m in body["metrics"]}
    assert by_target["total_mass_kg"] > 0
    assert by_target["range_km"] >= 0
    assert 0 <= by_target["slope_capability_deg"] <= 90
    # ``math.isfinite`` rejects NaN *and* ±inf; the previous ``x == x``
    # self-comparison only caught NaN even though the contract is "finite".
    assert math.isfinite(by_target["energy_margin_raw_pct"])
def test_evaluate_and_predict_agree_within_surrogate_noise_floor(
    client: TestClient,
    sample_design: dict[str, float | int],
    surrogate_v7_1_compatible: bool,
) -> None:
    """The surrogate's median should track the evaluator within R²-noise.

    On the canonical equatorial-mare scenario for the Yutu-2-ish
    sample design, the tuned median has R² ≥ 0.99 on every
    primary target. We pick a generous tolerance per target rather
    than assert exact equality so this test does not flake on
    XGBoost-version churn or harmless quantile-head retrains.

    The test is skipped (not passed) when the surrogate artifacts are
    unavailable: either the fixture reports an incompatible bundle or
    ``POST /predict`` answers 503.
    """
    if not surrogate_v7_1_compatible:
        pytest.skip(
            "schema-v7_1 quantile_bundles.joblib not on disk; pre-v7_1 "
            "bundles lack scenario_operational_duty_cycle and KeyError "
            "on the v7_1 feature row."
        )
    payload = {"design": sample_design, "scenario_name": "equatorial_mare_traverse"}
    eval_resp = client.post("/evaluate", json=payload)
    pred_resp = client.post("/predict", json=payload)
    assert eval_resp.status_code == 200
    if pred_resp.status_code == 503:
        # Quantile bundles missing (mirrors the predict-test skip path).
        # Report this as a skip: a bare ``return`` would show up as a pass
        # even though no evaluator/surrogate comparison was performed.
        pytest.skip("POST /predict returned 503; quantile bundles unavailable.")
    assert pred_resp.status_code == 200
    evaluator = {m["target"]: m["value"] for m in eval_resp.json()["metrics"]}
    surrogate = {p["target"]: p["q50"] for p in pred_resp.json()["predictions"]}
    # Per-target relative tolerance on the median. Energy margin runs
    # large positive on equatorial-mare so it is checked separately below
    # with an absolute tolerance (even a 5 pp gap on a 600 % margin is
    # still <1 % relative error).
    # The slope tolerance is set to ~2x the v9 surrogate's overall test
    # RMSE (0.930 deg) divided by a typical equatorial-mare sample-design
    # slope_capability (~22 deg) — i.e. tight enough to catch wiring bugs
    # but loose enough not to flake on a single-point tail residual at
    # the surrogate noise floor. Widened from 0.08 (v6) to 0.10 (v9)
    # because the v9 median head is marginally noisier on slope after the
    # payload-feature retrain (test R² 0.978). See
    # ``reports/surrogate_v9/median_sanity.csv``.
    # total_mass is a near-analytic function of design + payload, so the
    # median head learns it to high precision (test R² 0.999, RMSE
    # 0.567 kg). The 0.05 rel tol (~2.2 kg at this ~44 kg sample design)
    # is ~4x RMSE — a single-point tail allowance now that payload is an
    # extra LHS input adding a little variance, still tight enough to
    # catch a units / wiring regression.
    # NOTE(review): this comment previously quoted 0.04 (~1.8 kg, ~3x RMSE)
    # while the code used 0.05; the code value is kept — confirm which
    # tolerance is intended.
    rel_tol = {
        "range_km": 0.10,
        "slope_capability_deg": 0.10,
        "total_mass_kg": 0.05,
    }
    for tgt, tol in rel_tol.items():
        e = evaluator[tgt]
        s = surrogate[tgt]
        # The 1e-3 floor keeps the check meaningful when the evaluator
        # value is at or near zero.
        assert abs(e - s) <= max(tol * abs(e), 1e-3), (tgt, e, s)
    # Energy margin: tolerate a 50-pp gap in absolute terms.
    assert abs(evaluator["energy_margin_raw_pct"] - surrogate["energy_margin_raw_pct"]) <= 50