# Gridmind/tests/test_graders.py
# ShreeshantXD's picture
# feat: add baseline scores JSON, inference script, and update Dockerfile for improved project structure
# 6d74982
"""
GridMind-RL Grader Validation Tests (pytest)
Run with: pytest tests/test_graders.py -v
"""
import json
import time
import pytest
import requests
# Root URL of the server under test; its /health endpoint is polled before the
# test session starts, and the URL is echoed in the skip message.
ENV_URL = "http://localhost:7860"
# Alias of ENV_URL used as the prefix for the /reset, /step and /grade requests.
BASE = ENV_URL
def wait_for_server(url: str, timeout: int = 15) -> bool:
    """Poll ``{url}/health`` until it answers HTTP 200 or the time budget runs out.

    Args:
        url: Base URL of the server, without a trailing slash.
        timeout: Approximate wall-clock budget for the whole wait, in seconds.
            A single in-flight health request may overrun it by up to its own
            2-second request timeout.

    Returns:
        True as soon as a health check returns status 200, False if the
        budget is exhausted first (or if ``timeout`` is not positive).
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{url}/health", timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            # Server not reachable yet (connection refused, request timed out, ...):
            # keep polling. Anything that is not a requests error is a real bug
            # and is allowed to propagate.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Pause between attempts, but never sleep past the deadline.
        time.sleep(min(1.0, remaining))
    return False
@pytest.fixture(scope="session", autouse=True)
def server_running():
    """Session-scoped autouse guard: call ``pytest.skip`` when the health check fails."""
    server_is_up = wait_for_server(ENV_URL)
    if server_is_up:
        return
    pytest.skip("GridMind-RL server not running at " + ENV_URL)
def reset(task_id=1, seed=42, num_buildings=1, timeout=30):
    """POST a reset request to ``/reset`` and return the decoded JSON response.

    Args:
        task_id: Value sent as ``task_id`` in the request body.
        seed: Value sent as ``seed`` in the request body.
        num_buildings: Value sent as ``num_buildings``; defaults to 1, the
            value this helper always sent before the parameter existed.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive server fails the test instead of hanging it.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    payload = {"task_id": task_id, "seed": seed, "num_buildings": num_buildings}
    r = requests.post(f"{BASE}/reset", json=payload, timeout=timeout)
    r.raise_for_status()
    return r.json()
def step(action: dict, timeout: float = 30) -> dict:
    """POST ``action`` as JSON to ``/step`` and return the decoded JSON response.

    Args:
        action: Action payload, serialised as the request body.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive server fails the test instead of hanging it.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.post(f"{BASE}/step", json=action, timeout=timeout)
    r.raise_for_status()
    return r.json()
def grade(timeout: float = 30) -> dict:
    """GET ``/grade`` and return the decoded JSON response.

    Args:
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive server fails the test instead of hanging it.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(f"{BASE}/grade", timeout=timeout)
    r.raise_for_status()
    return r.json()
def run_full_episode(task_id: int, seed: int, hvac: float = 0.5, max_steps: int = 10000) -> dict:
    """Reset the environment, repeat one fixed action until ``done``, then grade.

    Args:
        task_id: Task identifier forwarded to ``reset``.
        seed: Seed forwarded to ``reset``.
        hvac: Value sent as ``hvac_power_level`` on every step; the other
            action fields are always 0.
        max_steps: Upper bound on the number of step requests. Guards against
            spinning forever if the server never reports ``done``.

    Returns:
        The response of ``grade()`` requested right after the episode ends.

    Raises:
        RuntimeError: If no step response reports ``done`` within ``max_steps``.
    """
    reset(task_id=task_id, seed=seed)
    action = {
        "hvac_power_level": hvac,
        "thermal_charge_rate": 0,
        "batch_job_slot": 0,
        "load_shed_fraction": 0,
    }
    for _ in range(max_steps):
        if step(action).get("done"):
            return grade()
    raise RuntimeError(f"Episode did not report done within {max_steps} steps")
# ── Task 1 ──────────────────────────────────────────────────────────────────
class TestTask1:
    """Grader checks for episodes run with ``task_id=1``."""

    def test_score_in_range(self):
        """The overall score lies within [0, 1]."""
        result = run_full_episode(task_id=1, seed=1)
        score = result["score"]
        assert 0.0 <= score <= 1.0, f"Score {score} out of [0,1]"

    def test_score_not_always_zero(self):
        """A constant 0.2 HVAC level earns more than a near-zero score."""
        result = run_full_episode(task_id=1, seed=2, hvac=0.2)
        assert result["score"] > 0.01, "Low HVAC policy should score > 0"

    def test_score_not_always_one(self):
        """A constant 1.0 HVAC level does not earn a (near-)perfect score."""
        result = run_full_episode(task_id=1, seed=3, hvac=1.0)
        assert result["score"] < 0.999, "Always-on policy should not score 1.0"

    def test_deterministic(self):
        """Two episodes with the same seed yield the same score."""
        first = run_full_episode(task_id=1, seed=42)
        second = run_full_episode(task_id=1, seed=42)
        delta = abs(first["score"] - second["score"])
        assert delta < 1e-6, "Grader not deterministic with same seed"

    def test_sub_scores_present(self):
        """The grade carries a 'cost' entry among its sub-scores."""
        result = run_full_episode(task_id=1, seed=5)
        sub_scores = result["sub_scores"]
        assert "cost" in sub_scores, "Task 1 grade missing 'cost' sub-score"

    def test_exploit_shedding_penalized(self):
        """Shedding 50% of load on every step is either flagged or scored below 0.9."""
        reset(task_id=1, seed=10)
        shed_action = {
            "hvac_power_level": 0.5,
            "thermal_charge_rate": 0,
            "batch_job_slot": 0,
            "load_shed_fraction": 0.5,
        }
        # Repeat the same action until the server reports the episode is done.
        while not step(shed_action).get("done"):
            pass
        outcome = grade()
        # Either the exploit flag is set or the score is visibly reduced.
        assert outcome["exploit_detected"] or outcome["score"] < 0.9
# ── Task 2 ──────────────────────────────────────────────────────────────────
class TestTask2:
    """Grader checks for episodes run with ``task_id=2``."""

    def test_score_in_range(self):
        """The overall score lies within [0, 1]."""
        result = run_full_episode(task_id=2, seed=20)
        assert 0.0 <= result["score"] <= 1.0

    def test_has_temp_sub_score(self):
        """The grade carries a 'temperature' entry among its sub-scores."""
        result = run_full_episode(task_id=2, seed=21)
        sub_scores = result["sub_scores"]
        assert "temperature" in sub_scores

    def test_temp_score_range(self):
        """The 'temperature' sub-score lies within [0, 1]."""
        result = run_full_episode(task_id=2, seed=22)
        # A missing key falls back to -1, which fails the range check below.
        temp_score = result["sub_scores"].get("temperature", -1)
        assert 0.0 <= temp_score <= 1.0, f"Temperature sub-score {temp_score} out of [0,1]"

    def test_weights_sum_correct(self):
        """Score equals 0.6*cost + 0.4*temperature unless an exploit was flagged."""
        result = run_full_episode(task_id=2, seed=23)
        sub_scores = result["sub_scores"]
        weighted_cost = sub_scores["cost"] * 0.6
        weighted_temp = sub_scores["temperature"] * 0.4
        expected = weighted_cost + weighted_temp
        assert abs(result["score"] - expected) < 0.01 or result["exploit_detected"]

    def test_score_varies_with_policy(self):
        """Different constant HVAC levels on the same seed give different scores."""
        low_hvac = run_full_episode(task_id=2, seed=24, hvac=0.1)
        high_hvac = run_full_episode(task_id=2, seed=24, hvac=0.9)
        # The policy must have a measurable effect on the score.
        gap = abs(low_hvac["score"] - high_hvac["score"])
        assert gap > 0.001
# ── Task 3 ──────────────────────────────────────────────────────────────────
class TestTask3:
    """Grader checks for episodes run with ``task_id=3``."""

    def test_score_in_range(self):
        """The overall score lies within [0, 1]."""
        result = run_full_episode(task_id=3, seed=30)
        assert 0.0 <= result["score"] <= 1.0

    def test_has_all_sub_scores(self):
        """Every expected sub-score key is present in the grade."""
        result = run_full_episode(task_id=3, seed=31)
        required = ("cost", "temperature", "grid_response", "batch_deadline", "carbon")
        for key in required:
            assert key in result["sub_scores"], f"Missing sub-score: {key}"

    def test_all_sub_scores_in_range(self):
        """Every reported sub-score lies within [0, 1]."""
        result = run_full_episode(task_id=3, seed=32)
        for key, val in result["sub_scores"].items():
            assert 0.0 <= val <= 1.0, f"Sub-score '{key}' = {val} out of [0,1]"

    def test_weights_sum_correct(self):
        """Score equals the weighted sum of the sub-scores unless an exploit was flagged."""
        result = run_full_episode(task_id=3, seed=33)
        sub_scores = result["sub_scores"]
        weights = {
            "cost": 0.28,
            "temperature": 0.20,
            "grid_response": 0.20,
            "batch_deadline": 0.12,
            "carbon": 0.20,
        }
        # Accumulate left to right, in the order the weights are listed.
        expected = 0.0
        for key, weight in weights.items():
            expected += sub_scores[key] * weight
        assert abs(result["score"] - expected) < 0.01 or result["exploit_detected"]

    def test_grid_response_sub_score(self):
        """The 'grid_response' sub-score lies within [0, 1]."""
        result = run_full_episode(task_id=3, seed=34)
        # A missing key falls back to -1, which fails the range check below.
        grid_score = result["sub_scores"].get("grid_response", -1)
        assert 0.0 <= grid_score <= 1.0, f"grid_response={grid_score} out of [0,1]"

    def test_batch_deadline_sub_score(self):
        """The 'batch_deadline' sub-score lies within [0, 1]."""
        result = run_full_episode(task_id=3, seed=35)
        # A missing key falls back to -1, which fails the range check below.
        deadline_score = result["sub_scores"].get("batch_deadline", -1)
        assert 0.0 <= deadline_score <= 1.0
# ── Multi-building grading ────────────────────────────────────────────────────
class TestMultiBuilding:
    """Grading check for an episode reset with ``num_buildings=2``."""

    def test_2_building_grade(self):
        """A two-building episode run to completion is graded within [0, 1]."""
        request_timeout = 30  # seconds; keeps an unresponsive server from hanging the run
        max_steps = 10000  # guard against a server that never reports done

        reset_resp = requests.post(
            f"{BASE}/reset",
            json={"task_id": 1, "seed": 50, "num_buildings": 2},
            timeout=request_timeout,
        )
        reset_resp.raise_for_status()

        # One action per building, sent together as a JSON list on every step.
        action = [
            {"hvac_power_level": 0.4, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 0},
            {"hvac_power_level": 0.6, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 1},
        ]
        for _ in range(max_steps):
            r = requests.post(f"{BASE}/step", json=action, timeout=request_timeout)
            # Surface HTTP errors directly rather than as an indexing error on
            # an unexpected error body, matching what step() does.
            r.raise_for_status()
            # The response is indexed as a list; the first entry carries the done flag.
            if r.json()[0].get("done"):
                break
        else:
            pytest.fail(f"Episode did not report done within {max_steps} steps")

        g = grade()
        assert 0.0 <= g["score"] <= 1.0, f"Score {g['score']} out of [0,1]"