Spaces:
Sleeping
Sleeping
feat: add baseline scores JSON, inference script, and update Dockerfile for improved project structure
"""
GridMind-RL Grader Validation Tests (pytest)
Run with: pytest tests/test_graders.py -v
"""
| import json | |
| import time | |
| import pytest | |
| import requests | |
| ENV_URL = "http://localhost:7860" | |
| BASE = ENV_URL | |
def wait_for_server(url: str, timeout: int = 15):
    """Poll ``{url}/health`` until it answers with HTTP 200.

    Args:
        url: Base URL of the server (no trailing slash).
        timeout: Maximum number of polling attempts. Attempts are spaced one
            second apart and each request may itself block for up to two
            seconds, so the worst-case wall time exceeds ``timeout`` seconds.

    Returns:
        True as soon as one health check returns status 200; False when every
        attempt fails or ``timeout`` is not positive.
    """
    for attempt in range(timeout):
        try:
            r = requests.get(f"{url}/health", timeout=2)
        except requests.RequestException:
            # Server is not reachable yet (connection refused, timed out, ...);
            # anything that is not a transport error is a real bug and should
            # propagate instead of being silently swallowed.
            pass
        else:
            if r.status_code == 200:
                return True
        # Sleeping after the final attempt would only delay the failure.
        if attempt < timeout - 1:
            time.sleep(1)
    return False
def server_running():
    """Skip the calling test when the server at ``ENV_URL`` is not healthy.

    Delegates the health polling to ``wait_for_server`` and calls
    ``pytest.skip`` if it reports failure; otherwise returns ``None``.
    """
    # NOTE(review): no fixture decorator or caller is visible in this file,
    # so as shown this helper only has an effect when invoked explicitly --
    # confirm whether it was meant to be an autouse fixture.
    if wait_for_server(ENV_URL):
        return
    pytest.skip("GridMind-RL server not running at " + ENV_URL)
def reset(task_id=1, seed=42, num_buildings=1, timeout=10):
    """Start a new episode by POSTing to ``{BASE}/reset``.

    Args:
        task_id: Task identifier sent as ``task_id`` in the JSON payload.
        seed: Seed sent as ``seed`` in the JSON payload.
        num_buildings: Value sent as ``num_buildings``; defaults to 1, the
            value this helper previously hard-coded.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever and hang the test run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    payload = {"task_id": task_id, "seed": seed, "num_buildings": num_buildings}
    r = requests.post(f"{BASE}/reset", json=payload, timeout=timeout)
    r.raise_for_status()
    return r.json()
def step(action: dict, timeout: float = 10) -> dict:
    """Advance the environment by POSTing ``action`` to ``{BASE}/step``.

    Args:
        action: JSON-serialisable action payload, sent unchanged.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever and hang the test run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.post(f"{BASE}/step", json=action, timeout=timeout)
    r.raise_for_status()
    return r.json()
def grade(timeout: float = 10) -> dict:
    """Fetch the grader result via GET ``{BASE}/grade``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever and hang the test run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(f"{BASE}/grade", timeout=timeout)
    r.raise_for_status()
    return r.json()
def run_full_episode(task_id: int, seed: int, hvac: float = 0.5, max_steps: int = 10_000) -> dict:
    """Run one episode with a constant action and return its grade.

    Resets the environment, then repeats the same action (only the HVAC power
    level is configurable; every other field is 0) until a step response
    reports ``done``, and finally fetches the grade.

    Args:
        task_id: Task identifier passed to ``reset``.
        seed: Seed passed to ``reset``.
        hvac: Value used for ``hvac_power_level`` in every step.
        max_steps: Upper bound on the number of steps. Guards against a
            server that never reports ``done``, which would otherwise hang
            the test run forever.

    Returns:
        The dict returned by ``grade()`` once the episode is finished.

    Raises:
        RuntimeError: If the episode is not done after ``max_steps`` steps.
    """
    reset(task_id=task_id, seed=seed)
    action = {
        "hvac_power_level": hvac,
        "thermal_charge_rate": 0,
        "batch_job_slot": 0,
        "load_shed_fraction": 0,
    }
    for _ in range(max_steps):
        if step(action).get("done"):
            return grade()
    raise RuntimeError(
        f"Episode for task {task_id} (seed {seed}) did not finish within {max_steps} steps"
    )
| # ── Task 1 ────────────────────────────────────────────────────────────────── | |
class TestTask1:
    """Grader checks for task 1, driven by constant-action episodes."""

    def test_score_in_range(self):
        """The overall score lies in [0, 1]."""
        score = run_full_episode(task_id=1, seed=1)["score"]
        assert 0.0 <= score <= 1.0, f"Score {score} out of [0,1]"

    def test_score_not_always_zero(self):
        """An episode at HVAC level 0.2 scores above 0.01."""
        result = run_full_episode(task_id=1, seed=2, hvac=0.2)
        assert result["score"] > 0.01, "Low HVAC policy should score > 0"

    def test_score_not_always_one(self):
        """An episode at HVAC level 1.0 scores below 0.999."""
        result = run_full_episode(task_id=1, seed=3, hvac=1.0)
        assert result["score"] < 0.999, "Always-on policy should not score 1.0"

    def test_deterministic(self):
        """Two episodes with the same task and seed give the same score."""
        first = run_full_episode(task_id=1, seed=42)
        second = run_full_episode(task_id=1, seed=42)
        delta = abs(first["score"] - second["score"])
        assert delta < 1e-6, "Grader not deterministic with same seed"

    def test_sub_scores_present(self):
        """The grade exposes a 'cost' entry under 'sub_scores'."""
        sub_scores = run_full_episode(task_id=1, seed=5)["sub_scores"]
        assert "cost" in sub_scores, "Task 1 grade missing 'cost' sub-score"

    def test_exploit_shedding_penalized(self):
        """Always shedding 50% should be detected and penalized."""
        reset(task_id=1, seed=10)
        shed_action = {
            "hvac_power_level": 0.5,
            "thermal_charge_rate": 0,
            "batch_job_slot": 0,
            "load_shed_fraction": 0.5,
        }
        # Keep stepping with the same action until the server reports done.
        while not step(shed_action).get("done"):
            pass
        report = grade()
        # Either the exploit flag is set or the score stays below 0.9.
        assert report["exploit_detected"] or report["score"] < 0.9
| # ── Task 2 ────────────────────────────────────────────────────────────────── | |
class TestTask2:
    """Grader checks for task 2 (cost and temperature sub-scores)."""

    def test_score_in_range(self):
        """The overall score lies in [0, 1]."""
        result = run_full_episode(task_id=2, seed=20)
        assert 0.0 <= result["score"] <= 1.0

    def test_has_temp_sub_score(self):
        """The grade exposes a 'temperature' sub-score."""
        sub_scores = run_full_episode(task_id=2, seed=21)["sub_scores"]
        assert "temperature" in sub_scores

    def test_temp_score_range(self):
        """The temperature sub-score lies in [0, 1]."""
        sub_scores = run_full_episode(task_id=2, seed=22)["sub_scores"]
        # A missing key falls back to -1, which fails the range check below.
        ts = sub_scores.get("temperature", -1)
        assert 0.0 <= ts <= 1.0, f"Temperature sub-score {ts} out of [0,1]"

    def test_weights_sum_correct(self):
        """Task 2 score = 0.6*cost + 0.4*temp."""
        result = run_full_episode(task_id=2, seed=23)
        parts = result["sub_scores"]
        expected = parts["cost"] * 0.6 + parts["temperature"] * 0.4
        # The weighted-sum check is waived when the grader flags an exploit.
        assert abs(result["score"] - expected) < 0.01 or result["exploit_detected"]

    def test_score_varies_with_policy(self):
        """Different HVAC levels on the same seed give different scores."""
        low = run_full_episode(task_id=2, seed=24, hvac=0.1)
        high = run_full_episode(task_id=2, seed=24, hvac=0.9)
        # Scores should differ (policy matters)
        gap = abs(low["score"] - high["score"])
        assert gap > 0.001
| # ── Task 3 ────────────────────────────────────────────────────────────────── | |
class TestTask3:
    """Grader checks for task 3 (five weighted sub-scores)."""

    def test_score_in_range(self):
        """The overall score lies in [0, 1]."""
        result = run_full_episode(task_id=3, seed=30)
        assert 0.0 <= result["score"] <= 1.0

    def test_has_all_sub_scores(self):
        """Every expected sub-score key is present in the grade."""
        sub_scores = run_full_episode(task_id=3, seed=31)["sub_scores"]
        required = ("cost", "temperature", "grid_response", "batch_deadline", "carbon")
        for key in required:
            assert key in sub_scores, f"Missing sub-score: {key}"

    def test_all_sub_scores_in_range(self):
        """Each reported sub-score lies in [0, 1]."""
        sub_scores = run_full_episode(task_id=3, seed=32)["sub_scores"]
        for key, val in sub_scores.items():
            assert 0.0 <= val <= 1.0, f"Sub-score '{key}' = {val} out of [0,1]"

    def test_weights_sum_correct(self):
        """The overall score equals the weighted sum of the sub-scores."""
        result = run_full_episode(task_id=3, seed=33)
        ss = result["sub_scores"]
        # Insertion order matters: it fixes both the summation order and
        # which missing key would be reported first.
        weights = {
            "cost": 0.28,
            "temperature": 0.20,
            "grid_response": 0.20,
            "batch_deadline": 0.12,
            "carbon": 0.20,
        }
        expected = sum(ss[name] * weight for name, weight in weights.items())
        # The weighted-sum check is waived when the grader flags an exploit.
        assert abs(result["score"] - expected) < 0.01 or result["exploit_detected"]

    def test_grid_response_sub_score(self):
        """The grid_response sub-score lies in [0, 1]."""
        sub_scores = run_full_episode(task_id=3, seed=34)["sub_scores"]
        # A missing key falls back to -1, which fails the range check below.
        gs = sub_scores.get("grid_response", -1)
        assert 0.0 <= gs <= 1.0, f"grid_response={gs} out of [0,1]"

    def test_batch_deadline_sub_score(self):
        """The batch_deadline sub-score lies in [0, 1]."""
        sub_scores = run_full_episode(task_id=3, seed=35)["sub_scores"]
        # A missing key falls back to -1, which fails the range check below.
        deadline_score = sub_scores.get("batch_deadline", -1)
        assert 0.0 <= deadline_score <= 1.0
| # ── Multi-building grading ──────────────────────────────────────────────────── | |
class TestMultiBuilding:
    """Grader check for an episode reset with two buildings."""

    def test_2_building_grade(self):
        """A two-building episode finishes and its score lies in [0, 1].

        The action is a list with one entry per building; the step response
        is indexed as a list and its first element is read for ``done``.
        """
        request_timeout = 10  # seconds; keeps a stalled server from hanging the run
        max_steps = 10_000  # bound for a server that never reports done

        requests.post(
            f"{BASE}/reset",
            json={"task_id": 1, "seed": 50, "num_buildings": 2},
            timeout=request_timeout,
        ).raise_for_status()
        action = [
            {"hvac_power_level": 0.4, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 0},
            {"hvac_power_level": 0.6, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 1},
        ]
        for _ in range(max_steps):
            r = requests.post(f"{BASE}/step", json=action, timeout=request_timeout)
            # Surface HTTP errors directly instead of failing later on an
            # unexpected JSON shape (matches what the step() helper does).
            r.raise_for_status()
            if r.json()[0].get("done"):
                break
        else:
            pytest.fail(f"2-building episode did not finish within {max_steps} steps")
        g = grade()
        assert 0.0 <= g["score"] <= 1.0