Spaces:

Prajwal782007
/

Gridmind

Sleeping

App Files Files Community

Gridmind / tests /test_graders.py

ShreeshantXD

feat: add baseline scores JSON, inference script, and update Dockerfile for improved project structure

6d74982 about 2 months ago

raw

history blame contribute delete

6.95 kB

	"""
	GridMind-RL Grader Validation Tests (pytest)
	Run with: pytest tests/test_graders.py -v
	"""

	import json
	import time
	import pytest
	import requests

	ENV_URL = "http://localhost:7860"
	BASE = ENV_URL


	def wait_for_server(url: str, timeout: int = 15):
	    """Poll ``{url}/health`` until it answers HTTP 200 or the time budget ends.

	    Args:
	        url: Base URL of the server; ``/health`` is appended to it.
	        timeout: Overall wall-clock budget in seconds. With a value <= 0 no
	            request is made and the function returns False immediately.

	    Returns:
	        True as soon as one health probe returns status 200, False if the
	        budget is exhausted first.
	    """
	    deadline = time.monotonic() + timeout
	    while time.monotonic() < deadline:
	        try:
	            if requests.get(f"{url}/health", timeout=2).status_code == 200:
	                return True
	        except requests.RequestException:
	            # Connection refused / timed out: treat as "not up yet" and retry.
	            # Anything else (e.g. a programming error) is allowed to surface.
	            pass
	        # Pause between probes, but never sleep past the deadline.
	        time.sleep(max(0.0, min(1.0, deadline - time.monotonic())))
	    return False


	@pytest.fixture(scope="session", autouse=True)
	def server_running():
	    """Session-wide autouse guard: skip when the server health check fails.

	    Calls ``wait_for_server(ENV_URL)`` once per session and issues
	    ``pytest.skip`` if it returns a falsy value.
	    """
	    if wait_for_server(ENV_URL):
	        return
	    reason = "GridMind-RL server not running at " + ENV_URL
	    pytest.skip(reason)


	def reset(task_id=1, seed=42, num_buildings=1, timeout=30):
	    """POST the reset parameters to ``{BASE}/reset`` and return the decoded JSON.

	    Args:
	        task_id: Sent as ``task_id`` in the JSON body.
	        seed: Sent as ``seed`` in the JSON body.
	        num_buildings: Sent as ``num_buildings`` in the JSON body
	            (defaults to 1, the value that was previously hard-coded).
	        timeout: Seconds passed to ``requests.post`` so an unresponsive
	            server fails the test instead of hanging it.

	    Returns:
	        The response body parsed with ``Response.json()``.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.Timeout: If the server does not answer within ``timeout``.
	    """
	    payload = {"task_id": task_id, "seed": seed, "num_buildings": num_buildings}
	    r = requests.post(f"{BASE}/reset", json=payload, timeout=timeout)
	    r.raise_for_status()
	    return r.json()


	def step(action: dict, timeout: float = 30) -> dict:
	    """POST ``action`` as JSON to ``{BASE}/step`` and return the decoded reply.

	    Args:
	        action: JSON-serialisable action payload, sent unchanged.
	        timeout: Seconds passed to ``requests.post`` so an unresponsive
	            server fails the test instead of hanging it.

	    Returns:
	        The response body parsed with ``Response.json()``.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.Timeout: If the server does not answer within ``timeout``.
	    """
	    r = requests.post(f"{BASE}/step", json=action, timeout=timeout)
	    r.raise_for_status()
	    return r.json()


	def grade(timeout: float = 30) -> dict:
	    """GET ``{BASE}/grade`` and return the decoded JSON reply.

	    Args:
	        timeout: Seconds passed to ``requests.get`` so an unresponsive
	            server fails the test instead of hanging it.

	    Returns:
	        The response body parsed with ``Response.json()``.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.Timeout: If the server does not answer within ``timeout``.
	    """
	    r = requests.get(f"{BASE}/grade", timeout=timeout)
	    r.raise_for_status()
	    return r.json()


	def run_full_episode(task_id: int, seed: int, hvac: float = 0.5, max_steps: int = 10000) -> dict:
	    """Reset, repeat one fixed action until a step reports ``done``, then grade.

	    Args:
	        task_id: Forwarded to ``reset``.
	        seed: Forwarded to ``reset``.
	        hvac: Value sent as ``hvac_power_level`` on every step; the other
	            action fields are always 0.
	        max_steps: Upper bound on the number of ``step`` calls, so a server
	            that never reports ``done`` fails the test instead of looping
	            forever.

	    Returns:
	        Whatever ``grade()`` returns once a step response has a truthy
	        ``done`` entry.

	    Raises:
	        RuntimeError: If no step reports ``done`` within ``max_steps`` calls.
	    """
	    reset(task_id=task_id, seed=seed)
	    action = {"hvac_power_level": hvac, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0}
	    for _ in range(max_steps):
	        if step(action).get("done"):
	            return grade()
	    raise RuntimeError(
	        f"Episode (task_id={task_id}, seed={seed}) did not report done within {max_steps} steps"
	    )


	# ── Task 1 ──────────────────────────────────────────────────────────────────

	class TestTask1:
	    """Grader checks run against task_id=1."""

	    def test_score_in_range(self):
	        """The graded score lies in the closed interval [0, 1]."""
	        result = run_full_episode(task_id=1, seed=1)
	        score = result["score"]
	        assert 0.0 <= score <= 1.0, f"Score {score} out of [0,1]"

	    def test_score_not_always_zero(self):
	        """A constant hvac=0.2 episode must score above 0.01."""
	        result = run_full_episode(task_id=1, seed=2, hvac=0.2)
	        assert result["score"] > 0.01, "Low HVAC policy should score > 0"

	    def test_score_not_always_one(self):
	        """A constant hvac=1.0 episode must score below 0.999."""
	        result = run_full_episode(task_id=1, seed=3, hvac=1.0)
	        assert result["score"] < 0.999, "Always-on policy should not score 1.0"

	    def test_deterministic(self):
	        """Two episodes with the same seed give scores within 1e-6."""
	        first = run_full_episode(task_id=1, seed=42)
	        second = run_full_episode(task_id=1, seed=42)
	        gap = abs(first["score"] - second["score"])
	        assert gap < 1e-6, "Grader not deterministic with same seed"

	    def test_sub_scores_present(self):
	        """The grade exposes a 'cost' entry under sub_scores."""
	        result = run_full_episode(task_id=1, seed=5)
	        assert "cost" in result["sub_scores"], "Task 1 grade missing 'cost' sub-score"

	    def test_exploit_shedding_penalized(self):
	        """Shedding 50% on every step is either flagged or scores below 0.9."""
	        reset(task_id=1, seed=10)
	        shed_half = {"hvac_power_level": 0.5, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0.5}
	        # Keep stepping with the same action until a response reports done.
	        while not step(shed_half).get("done"):
	            pass
	        outcome = grade()
	        # Either the exploit flag is set or the score has been reduced.
	        assert outcome["exploit_detected"] or outcome["score"] < 0.9


	# ── Task 2 ──────────────────────────────────────────────────────────────────

	class TestTask2:
	    """Grader checks run against task_id=2."""

	    def test_score_in_range(self):
	        """The graded score lies in the closed interval [0, 1]."""
	        result = run_full_episode(task_id=2, seed=20)
	        assert 0.0 <= result["score"] <= 1.0

	    def test_has_temp_sub_score(self):
	        """The grade exposes a 'temperature' entry under sub_scores."""
	        result = run_full_episode(task_id=2, seed=21)
	        assert "temperature" in result["sub_scores"]

	    def test_temp_score_range(self):
	        """The temperature sub-score lies in [0, 1]; a missing key reads as -1 and fails."""
	        result = run_full_episode(task_id=2, seed=22)
	        temp_score = result["sub_scores"].get("temperature", -1)
	        assert 0.0 <= temp_score <= 1.0, f"Temperature sub-score {temp_score} out of [0,1]"

	    def test_weights_sum_correct(self):
	        """Score equals 0.6 * cost + 0.4 * temperature (within 0.01) unless an exploit is flagged."""
	        result = run_full_episode(task_id=2, seed=23)
	        parts = result["sub_scores"]
	        expected = parts["cost"] * 0.6 + parts["temperature"] * 0.4
	        assert abs(result["score"] - expected) < 0.01 or result["exploit_detected"]

	    def test_score_varies_with_policy(self):
	        """Same seed with hvac=0.1 vs hvac=0.9 must give scores more than 0.001 apart."""
	        # Unpacking the generator runs both episodes, low first, before any comparison.
	        low, high = (run_full_episode(task_id=2, seed=24, hvac=level) for level in (0.1, 0.9))
	        assert abs(low["score"] - high["score"]) > 0.001


	# ── Task 3 ──────────────────────────────────────────────────────────────────

	class TestTask3:
	    """Grader checks run against task_id=3."""

	    def test_score_in_range(self):
	        """The graded score lies in the closed interval [0, 1]."""
	        result = run_full_episode(task_id=3, seed=30)
	        assert 0.0 <= result["score"] <= 1.0

	    def test_has_all_sub_scores(self):
	        """All five expected sub-score keys are present in the grade."""
	        result = run_full_episode(task_id=3, seed=31)
	        required = ("cost", "temperature", "grid_response", "batch_deadline", "carbon")
	        for key in required:
	            assert key in result["sub_scores"], f"Missing sub-score: {key}"

	    def test_all_sub_scores_in_range(self):
	        """Every sub-score returned by the grader lies in [0, 1]."""
	        result = run_full_episode(task_id=3, seed=32)
	        for key, val in result["sub_scores"].items():
	            assert 0.0 <= val <= 1.0, f"Sub-score '{key}' = {val} out of [0,1]"

	    def test_weights_sum_correct(self):
	        """Score equals the weighted sub-score sum (within 0.01) unless an exploit is flagged."""
	        result = run_full_episode(task_id=3, seed=33)
	        parts = result["sub_scores"]
	        weights = (
	            ("cost", 0.28),
	            ("temperature", 0.20),
	            ("grid_response", 0.20),
	            ("batch_deadline", 0.12),
	            ("carbon", 0.20),
	        )
	        # Accumulate left to right in the listed order.
	        expected = 0.0
	        for key, weight in weights:
	            expected = expected + parts[key] * weight
	        assert abs(result["score"] - expected) < 0.01 or result["exploit_detected"]

	    def test_grid_response_sub_score(self):
	        """The grid_response sub-score lies in [0, 1]; a missing key reads as -1 and fails."""
	        result = run_full_episode(task_id=3, seed=34)
	        grid_score = result["sub_scores"].get("grid_response", -1)
	        assert 0.0 <= grid_score <= 1.0, f"grid_response={grid_score} out of [0,1]"

	    def test_batch_deadline_sub_score(self):
	        """The batch_deadline sub-score lies in [0, 1]; a missing key reads as -1 and fails."""
	        result = run_full_episode(task_id=3, seed=35)
	        deadline_score = result["sub_scores"].get("batch_deadline", -1)
	        assert 0.0 <= deadline_score <= 1.0


	# ── Multi-building grading ────────────────────────────────────────────────────

	class TestMultiBuilding:
	    """Grader check for an episode reset with num_buildings=2."""

	    def test_2_building_grade(self):
	        """Run a two-building episode with fixed actions and check the score is in [0, 1]."""
	        # Posted directly because this request needs num_buildings=2.
	        reset_resp = requests.post(
	            f"{BASE}/reset",
	            json={"task_id": 1, "seed": 50, "num_buildings": 2},
	            timeout=30,
	        )
	        reset_resp.raise_for_status()
	        action = [
	            {"hvac_power_level": 0.4, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 0},
	            {"hvac_power_level": 0.6, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 1},
	        ]
	        done = False
	        while not done:
	            r = requests.post(f"{BASE}/step", json=action, timeout=30)
	            # Surface HTTP errors here instead of as an index/key error on the body.
	            r.raise_for_status()
	            # The reply is indexed as a list; the first entry's "done" flag ends the loop.
	            done = bool(r.json()[0].get("done"))
	        g = grade()
	        assert 0.0 <= g["score"] <= 1.0