"""
GridMind-RL Grader Validation Tests (pytest)
Run with: pytest tests/test_graders.py -v
"""

import json
import time
import pytest
import requests

ENV_URL = "http://localhost:7860"
BASE = ENV_URL


def wait_for_server(url: str, timeout: int = 15):
    for _ in range(timeout):
        try:
            r = requests.get(f"{url}/health", timeout=2)
            if r.status_code == 200:
                return True
        except Exception:
            pass
        time.sleep(1)
    return False


@pytest.fixture(scope="session", autouse=True)
def server_running():
    if not wait_for_server(ENV_URL):
        pytest.skip("GridMind-RL server not running at " + ENV_URL)


def reset(task_id=1, seed=42):
    r = requests.post(f"{BASE}/reset", json={"task_id": task_id, "seed": seed, "num_buildings": 1})
    r.raise_for_status()
    return r.json()


def step(action: dict) -> dict:
    r = requests.post(f"{BASE}/step", json=action)
    r.raise_for_status()
    return r.json()


def grade() -> dict:
    r = requests.get(f"{BASE}/grade")
    r.raise_for_status()
    return r.json()


def run_full_episode(task_id: int, seed: int, hvac: float = 0.5) -> dict:
    reset(task_id=task_id, seed=seed)
    action = {"hvac_power_level": hvac, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0}
    done = False
    while not done:
        resp = step(action)
        if resp.get("done"):
            done = True
    return grade()


# ── Task 1 ──────────────────────────────────────────────────────────────────

class TestTask1:
    def test_score_in_range(self):
        g = run_full_episode(task_id=1, seed=1)
        assert 0.0 <= g["score"] <= 1.0, f"Score {g['score']} out of [0,1]"

    def test_score_not_always_zero(self):
        g = run_full_episode(task_id=1, seed=2, hvac=0.2)
        assert g["score"] > 0.01, "Low HVAC policy should score > 0"

    def test_score_not_always_one(self):
        g = run_full_episode(task_id=1, seed=3, hvac=1.0)
        assert g["score"] < 0.999, "Always-on policy should not score 1.0"

    def test_deterministic(self):
        g1 = run_full_episode(task_id=1, seed=42)
        g2 = run_full_episode(task_id=1, seed=42)
        assert abs(g1["score"] - g2["score"]) < 1e-6, "Grader not deterministic with same seed"

    def test_sub_scores_present(self):
        g = run_full_episode(task_id=1, seed=5)
        assert "cost" in g["sub_scores"], "Task 1 grade missing 'cost' sub-score"

    def test_exploit_shedding_penalized(self):
        """Always shedding 50% should be detected and penalized."""
        reset(task_id=1, seed=10)
        action = {"hvac_power_level": 0.5, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0.5}
        done = False
        while not done:
            resp = step(action)
            if resp.get("done"):
                done = True
        g = grade()
        # Score should be reduced OR exploit flagged
        assert g["exploit_detected"] or g["score"] < 0.9


# ── Task 2 ──────────────────────────────────────────────────────────────────

class TestTask2:
    def test_score_in_range(self):
        g = run_full_episode(task_id=2, seed=20)
        assert 0.0 <= g["score"] <= 1.0

    def test_has_temp_sub_score(self):
        g = run_full_episode(task_id=2, seed=21)
        assert "temperature" in g["sub_scores"]

    def test_temp_score_range(self):
        g = run_full_episode(task_id=2, seed=22)
        ts = g["sub_scores"].get("temperature", -1)
        assert 0.0 <= ts <= 1.0, f"Temperature sub-score {ts} out of [0,1]"

    def test_weights_sum_correct(self):
        """Task 2 score = 0.6*cost + 0.4*temp."""
        g = run_full_episode(task_id=2, seed=23)
        expected = g["sub_scores"]["cost"] * 0.6 + g["sub_scores"]["temperature"] * 0.4
        assert abs(g["score"] - expected) < 0.01 or g["exploit_detected"]

    def test_score_varies_with_policy(self):
        g_low  = run_full_episode(task_id=2, seed=24, hvac=0.1)
        g_high = run_full_episode(task_id=2, seed=24, hvac=0.9)
        # Scores should differ (policy matters)
        assert abs(g_low["score"] - g_high["score"]) > 0.001


# ── Task 3 ──────────────────────────────────────────────────────────────────

class TestTask3:
    def test_score_in_range(self):
        g = run_full_episode(task_id=3, seed=30)
        assert 0.0 <= g["score"] <= 1.0

    def test_has_all_sub_scores(self):
        g = run_full_episode(task_id=3, seed=31)
        for key in ["cost", "temperature", "grid_response", "batch_deadline", "carbon"]:
            assert key in g["sub_scores"], f"Missing sub-score: {key}"

    def test_all_sub_scores_in_range(self):
        g = run_full_episode(task_id=3, seed=32)
        for key, val in g["sub_scores"].items():
            assert 0.0 <= val <= 1.0, f"Sub-score '{key}' = {val} out of [0,1]"

    def test_weights_sum_correct(self):
        g = run_full_episode(task_id=3, seed=33)
        ss = g["sub_scores"]
        expected = (
            ss["cost"] * 0.28
            + ss["temperature"] * 0.20
            + ss["grid_response"] * 0.20
            + ss["batch_deadline"] * 0.12
            + ss["carbon"] * 0.20
        )
        assert abs(g["score"] - expected) < 0.01 or g["exploit_detected"]

    def test_grid_response_sub_score(self):
        g = run_full_episode(task_id=3, seed=34)
        gs = g["sub_scores"].get("grid_response", -1)
        assert 0.0 <= gs <= 1.0, f"grid_response={gs} out of [0,1]"

    def test_batch_deadline_sub_score(self):
        g = run_full_episode(task_id=3, seed=35)
        bd = g["sub_scores"].get("batch_deadline", -1)
        assert 0.0 <= bd <= 1.0


# ── Multi-building grading ────────────────────────────────────────────────────

class TestMultiBuilding:
    def test_2_building_grade(self):
        requests.post(f"{BASE}/reset", json={"task_id": 1, "seed": 50, "num_buildings": 2}).raise_for_status()
        action = [
            {"hvac_power_level": 0.4, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 0},
            {"hvac_power_level": 0.6, "thermal_charge_rate": 0, "batch_job_slot": 0, "load_shed_fraction": 0, "building_id": 1},
        ]
        done = False
        while not done:
            r = requests.post(f"{BASE}/step", json=action)
            if r.json()[0].get("done"):
                done = True
        g = grade()
        assert 0.0 <= g["score"] <= 1.0