import random
import sys
import os
import re
import ast
import pathlib


sys.path.insert(0, os.path.dirname(__file__))
from src.tasks import EasyTask, MediumTask, HardTask
from src.agent import DeterministicAgent
from src.models import State

# Audit scoreboard: maps each test's display name to its status string
# ("PASS", "FAIL" or "MARGINAL"). Filled in by the test sections below and
# printed in the final summary.
results = {}

def run_single(seed=None):
    """Run the deterministic agent once on the Easy, Medium and Hard tasks.

    Args:
        seed: Base seed for the run. When ``None``, a seed in
            ``[1000, 99999]`` is drawn from the OS entropy source so that the
            choice does not depend on earlier ``random.seed`` calls.

    Returns:
        A ``(seed, scores, metrics)`` tuple. ``scores`` maps each level name,
        plus ``"Overall"`` (the mean of the three level scores), to the value
        of ``task.evaluate()``. ``metrics`` maps each level name to a dict
        with ``"cleared"``, ``"avg_wait"`` and ``"emg"`` (read from the last
        step's ``info``) and ``"reward"`` (summed step rewards, rounded to
        two decimals).
    """
    if seed is None:
        # The global RNG is re-seeded on every call (just below), so drawing
        # the seed from it would replay the same "random" seeds after any
        # seeded run. SystemRandom is independent of that global state.
        seed = random.SystemRandom().randint(1000, 99999)
    random.seed(seed)

    agent = DeterministicAgent()
    tasks = {"Easy": EasyTask(), "Medium": MediumTask(), "Hard": HardTask()}
    scores = {}
    metrics = {}

    for position, (level, task) in enumerate(tasks.items()):
        # Each level gets its own seed, derived from the base seed.
        task_seed = seed + position * 999
        state = task.reset(seed=task_seed)
        done = False
        steps = 0
        total_reward = 0.0
        while not done:
            action_idx = agent.get_action(state)
            result = task.step(action_idx)
            state = result.state
            total_reward += result.reward
            done = result.done
            steps += 1
            # Safety guard: stop even if the task never reports done.
            if steps > 500:
                break
        scores[level] = task.evaluate()
        metrics[level] = {
            "cleared": result.info["total_cleared"],
            "avg_wait": result.info["avg_waiting_time"],
            "emg": result.info["emergencies_handled"],
            "reward": round(total_reward, 2),
        }

    overall = sum(scores.values()) / len(scores)
    scores["Overall"] = overall
    return seed, scores, metrics


def sep(title):
    """Print a section banner: blank line, 55-char ``=`` rule, title, rule."""
    rule = "=" * 55
    banner_lines = (f"\n{rule}", f"  {title}", rule)
    print("\n".join(banner_lines))


sep("TEST 1: SEED REPRODUCIBILITY")
# Two runs with the same fixed seed must yield identical scores and metrics.
seed = 42
first_run = run_single(seed)
second_run = run_single(seed)
s1, sc1, m1 = first_run
s2, sc2, m2 = second_run

print(f"Seed: {seed}")
print(f"\nRun 1: Easy={sc1['Easy']:.4f} | Medium={sc1['Medium']:.4f} | Hard={sc1['Hard']:.4f} | Overall={sc1['Overall']:.4f}")
print(f"       Cleared: {m1['Easy']['cleared']} / {m1['Medium']['cleared']} / {m1['Hard']['cleared']}")
print(f"\nRun 2: Easy={sc2['Easy']:.4f} | Medium={sc2['Medium']:.4f} | Hard={sc2['Hard']:.4f} | Overall={sc2['Overall']:.4f}")
print(f"       Cleared: {m2['Easy']['cleared']} / {m2['Medium']['cleared']} / {m2['Hard']['cleared']}")

runs_identical = sc1 == sc2 and m1 == m2
if not runs_identical:
    print("\n❌ TEST 1: SEED REPRODUCIBILITY → FAIL (outputs differ with same seed)")
    results["Seed Reproducibility"] = "FAIL"
else:
    print("\n✅ TEST 1: SEED REPRODUCIBILITY → PASS")
    results["Seed Reproducibility"] = "PASS"


sep("TEST 2: STOCHASTIC VARIABILITY (3 random runs)")
# Three runs with no explicit seed: each should pick its own seed and the
# per-level scores should not all be the same.
runs = []
for _ in range(3):
    runs.append(run_single())

print(f"\n{'Run':<5} {'Seed':<8} {'Easy':<8} {'Medium':<9} {'Hard':<8} {'Overall':<10} {'Hard Cleared'}")
print("-" * 65)
for run_no, (run_seed, run_scores, run_metrics) in enumerate(runs, 1):
    print(f"  {run_no:<4} {run_seed:<8} {run_scores['Easy']:<8.4f} {run_scores['Medium']:<9.4f} {run_scores['Hard']:<8.4f} {run_scores['Overall']:<10.4f} {run_metrics['Hard']['cleared']}")


# Distinct seeds, and distinct (Easy, Medium, Hard) score triples compared by
# their string form.
distinct_seeds = {entry[0] for entry in runs}
score_signatures = {
    str((entry[1]['Easy'], entry[1]['Medium'], entry[1]['Hard'])) for entry in runs
}
unique_seeds = len(distinct_seeds) == 3
any_diff = len(score_signatures) > 1

if not (unique_seeds and any_diff):
    print("\n❌ TEST 2: STOCHASTIC VARIABILITY → FAIL (same outputs == possible hardcoding)")
    results["Stochastic Variability"] = "FAIL"
else:
    print("\n✅ TEST 2: STOCHASTIC VARIABILITY → PASS (unique seeds + varying outputs)")
    results["Stochastic Variability"] = "PASS"


sep("TEST 3: NO HARDCODED VALUES SCAN")
# Regexes for score-like literals that would suggest canned results.
SUSPECT_PATTERNS = [
    r'\[0\.\d+,\s*0\.\d+,\s*0\.\d+',  # list opening with three "0.x" literals
    r'return\s+0\.\d+\b',  # return of a bare "0.x" literal
]
# Everything from the first "if __name__" onward is left out of the scan.
IGNORE_BLOCK_MARKER = '__name__'
# Resolve targets against this script's directory (the directory that is put
# on sys.path for the ``src`` imports) so the scan does not depend on the
# current working directory.
AUDIT_ROOT = pathlib.Path(__file__).resolve().parent
files = sorted((AUDIT_ROOT / 'src').glob('*.py')) + [
    AUDIT_ROOT / 'evaluate.py', AUDIT_ROOT / 'visualize.py'
]
all_ok = True
for fp in files:
    shown = fp.relative_to(AUDIT_ROOT)
    try:
        content = fp.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError) as err:
        # A file that cannot be read cannot be certified clean, so report it
        # as a failure instead of aborting the whole audit.
        print(f"  ⚠️  WARNING: {shown}  ← could not be read ({err})")
        all_ok = False
        continue

    audit_content = content.partition(f'if {IGNORE_BLOCK_MARKER}')[0]
    matched_patterns = [
        pat for pat in SUSPECT_PATTERNS if re.search(pat, audit_content)
    ]
    if matched_patterns:
        print(f"  ⚠️  WARNING: {shown}  ← suspicious pattern found")
        for pat in matched_patterns:
            print(f"       pattern: {pat}")
        all_ok = False
    else:
        print(f"  ✅ OK: {shown}")

if all_ok:
    print("\n✅ TEST 3: NO HARDCODED VALUES → PASS")
    results["No Hardcoding"] = "PASS"
else:
    print("\n❌ TEST 3: NO HARDCODED VALUES → FAIL")
    results["No Hardcoding"] = "FAIL"


sep("TEST 4: METRIC CONSISTENCY (Low vs High Traffic)")


def _run_easy_at_rate(arrival_rate, seed, max_steps=500):
    """Run DeterministicAgent on a fresh EasyTask with a given arrival rate.

    Args:
        arrival_rate: Value assigned to ``task.env.arrival_rate_base`` before
            the task is reset.
        seed: Seed passed to ``task.reset``.
        max_steps: Step guard; the loop stops once more than this many steps
            have run, matching the guard used in ``run_single``.

    Returns:
        ``(score, total_cleared, avg_waiting_time)``: the value of
        ``task.evaluate()`` and two entries of the last step's ``info``.
    """
    task = EasyTask()
    task.env.arrival_rate_base = arrival_rate
    state = task.reset(seed=seed)
    agent = DeterministicAgent()
    steps = 0
    while True:
        r = task.step(agent.get_action(state))
        state = r.state
        steps += 1
        # Stop on completion, or on the guard if the task never reports done.
        if r.done or steps > max_steps:
            break
    return task.evaluate(), r.info["total_cleared"], r.info["avg_waiting_time"]


# Same seed for both rollouts so that only the arrival rate differs.
low_score, low_cleared, low_wait = _run_easy_at_rate(0.2, seed=100)
high_score, high_cleared, high_wait = _run_easy_at_rate(6.0, seed=100)

print(f"\n  Low Traffic:  Score={low_score:.4f} | Cleared={low_cleared} | Avg Wait={low_wait:.2f}")
print(f"  High Traffic: Score={high_score:.4f} | Cleared={high_cleared} | Avg Wait={high_wait:.2f}")
print(f"\n  Score Delta: {low_score - high_score:+.4f} | Wait Delta: {high_wait - low_wait:+.2f}")

if low_wait <= high_wait and low_score >= high_score:
    print("\n✅ TEST 4: METRIC CONSISTENCY → PASS (low traffic scores better than high traffic)")
    results["Metric Logic"] = "PASS"
else:
    print("\n❌ TEST 4: METRIC CONSISTENCY → FAIL (higher traffic should not beat lower)")
    results["Metric Logic"] = "FAIL"


sep("TEST 5: AGENT IMPACT (Real vs Random Policy)")


# Baseline: overall score of the deterministic agent at a fixed seed.
_, real_scores, _ = run_single(seed=999)
real_overall = real_scores.__getitem__("Overall")


class RandomAgent:
    """Baseline policy that ignores the state and picks action 1 or 2 at random."""

    # Candidate actions, sampled with the module-level ``random`` generator.
    _ACTIONS = (1, 2)

    def get_action(self, state):
        """Return a randomly chosen action; ``state`` is not consulted."""
        return random.choice(self._ACTIONS)

def run_with_agent(agent_obj, seed=999):
    """Return the mean ``evaluate()`` score of ``agent_obj`` over the three tasks.

    The global RNG is seeded with ``seed``. Each task is reset with
    ``seed + position * 999`` and stepped until it reports done or 501 steps
    have run.
    """
    random.seed(seed)
    tasks = {"Easy": EasyTask(), "Medium": MediumTask(), "Hard": HardTask()}
    max_steps = 501  # same bound as breaking once the step count exceeds 500
    total_score = 0
    for position, task in enumerate(tasks.values()):
        state = task.reset(seed=seed + position * 999)
        for _ in range(max_steps):
            outcome = task.step(agent_obj.get_action(state))
            state = outcome.state
            if outcome.done:
                break
        total_score += task.evaluate()
    return total_score / len(tasks)

# Compare the baseline against a random policy run at the same seed.
random_overall = run_with_agent(RandomAgent(), seed=999)
delta = real_overall - random_overall

print(f"\n  Real Agent Score:   {real_overall:.4f}")
print(f"  Random Agent Score: {random_overall:.4f}")
print(f"  Improvement Delta:  {delta:+.4f} ({delta*100:.1f}%)")

# Above 0.02 is a pass, 0 to 0.02 is marginal, anything else is a failure.
if delta > 0.02:
    agent_impact = (
        "PASS",
        "\n✅ TEST 5: AGENT IMPACT → PASS (Real agent significantly outperforms random)",
    )
elif delta >= 0:
    agent_impact = (
        "MARGINAL",
        "\n⚠️  TEST 5: AGENT IMPACT → MARGINAL (small gap, agent barely helps)",
    )
else:
    agent_impact = (
        "FAIL",
        "\n❌ TEST 5: AGENT IMPACT → FAIL (random policy beats real agent — logic error)",
    )
print(agent_impact[1])
results["Agent Impact"] = agent_impact[0]


sep("TEST 6: EXTREME SCENARIOS")


def _run_extreme_episode(task, seed, arrival_rate=None, max_steps=500):
    """Run DeterministicAgent on ``task`` and return ``(score, last_info)``.

    Args:
        task: A freshly constructed task instance.
        seed: Seed passed to ``task.reset``.
        arrival_rate: When given, assigned to ``task.env.arrival_rate_base``
            before the reset; when ``None`` the task is left as constructed.
        max_steps: Step guard; the loop stops once more than this many steps
            have run, matching the guard used in ``run_single``.

    Returns:
        The value of ``task.evaluate()`` and the ``info`` of the last step.
    """
    if arrival_rate is not None:
        task.env.arrival_rate_base = arrival_rate
    state = task.reset(seed=seed)
    agent = DeterministicAgent()
    steps = 0
    while True:
        r = task.step(agent.get_action(state))
        state = r.state
        steps += 1
        # Stop on completion, or on the guard if the task never reports done.
        if r.done or steps > max_steps:
            break
    return task.evaluate(), r.info


# Case A: no arrivals at all.
zero_score, _ = _run_extreme_episode(EasyTask(), seed=7, arrival_rate=0.0)

# Case B: very high arrival rate on the same task type and seed.
cong_score, _ = _run_extreme_episode(EasyTask(), seed=7, arrival_rate=10.0)

# Case C: the hard task as constructed; only the emergency count is checked.
emg_score, emg_info = _run_extreme_episode(HardTask(), seed=77)
emg_handled = emg_info["emergencies_handled"]

print(f"\n  Case A (Zero traffic):     Score = {zero_score:.4f} (expected ≈ 1.0)")
print(f"  Case B (Extreme traffic):  Score = {cong_score:.4f} (expected < zero_score)")
print(f"  Case C (Emergency task):   Score = {emg_score:.4f} | Emergencies Handled = {emg_handled}")

case_a = zero_score >= 0.85
case_b = cong_score < zero_score
case_c = emg_handled > 0

if case_a and case_b and case_c:
    print("\n✅ TEST 6: EXTREME SCENARIOS → PASS")
    results["Extreme Cases"] = "PASS"
else:
    issues = []
    if not case_a:
        issues.append(f"Zero-traffic score {zero_score:.3f} unexpectedly low")
    if not case_b:
        issues.append(f"Congested score {cong_score:.3f} ≥ zero-traffic score {zero_score:.3f}")
    if not case_c:
        issues.append("No emergencies handled in hard task")
    print(f"\n❌ TEST 6: EXTREME SCENARIOS → FAIL: {'; '.join(issues)}")
    results["Extreme Cases"] = "FAIL"


sep("TEST 7: GRAPH VALIDATION (Score ↔ Graph Consistency)")
from visualize import generate_graph

_, sc_a, _ = run_single(seed=111)
_, sc_b, _ = run_single(seed=222)

out_a = "audit_graph_A.png"
out_b = "audit_graph_B.png"
try:
    generate_graph(sc_a, 111, output_path=out_a)
    generate_graph(sc_b, 222, output_path=out_b)

    # Check existence BEFORE asking for sizes: os.path.getsize raises on a
    # missing file, which would abort the audit instead of recording a FAIL.
    exists_a = os.path.exists(out_a)
    exists_b = os.path.exists(out_b)
    graph_files_exist = exists_a and exists_b
    size_a = os.path.getsize(out_a) if exists_a else 0
    size_b = os.path.getsize(out_b) if exists_b else 0
finally:
    # Always remove the temporary graphs, even if generation raised part-way.
    for f in [out_a, out_b]:
        if os.path.exists(f):
            os.remove(f)

# True when the two seeds produced different Easy or Medium scores.
# NOTE(review): Hard is printed below but is not part of this comparison;
# confirm whether that is intentional.
scores_differ = (
    abs(sc_a['Easy'] - sc_b['Easy']) > 0.0001 or
    abs(sc_a['Medium'] - sc_b['Medium']) > 0.0001
)

print(f"\n  Seed 111: Easy={sc_a['Easy']:.4f} Medium={sc_a['Medium']:.4f} Hard={sc_a['Hard']:.4f}")
print(f"  Seed 222: Easy={sc_b['Easy']:.4f} Medium={sc_b['Medium']:.4f} Hard={sc_b['Hard']:.4f}")
print(f"  Graph A size: {size_a} bytes | Graph B size: {size_b} bytes")
print(f"  Scores differ across seeds: {scores_differ}")
print(f"  Graph files generated: {graph_files_exist}")

if graph_files_exist and scores_differ:
    print("\n✅ TEST 7: GRAPH VALIDATION → PASS (graphs generated from live scores, vary with seed)")
    results["Graph Accuracy"] = "PASS"
else:
    print("\n❌ TEST 7: GRAPH VALIDATION → FAIL")
    results["Graph Accuracy"] = "FAIL"


sep("FINAL AUDIT SUMMARY")
# Status string -> marker printed in front of each summary line.
icon = {"PASS": "✅", "FAIL": "❌", "MARGINAL": "⚠️ "}
for test, status in results.items():
    marker = icon.get(status, '?')
    print(f"  {marker} {test}: {status}")

recorded_statuses = list(results.values())
any_fail = "FAIL" in recorded_statuses
any_marginal = "MARGINAL" in recorded_statuses

# A single FAIL outranks any MARGINAL; with neither, the system is trusted.
if any_fail:
    verdict_line = "  FINAL VERDICT: NEEDS FIXES ❌"
elif any_marginal:
    verdict_line = "  FINAL VERDICT: MOSTLY TRUSTED ⚠️  (minor issues detected)"
else:
    verdict_line = "  FINAL VERDICT: TRUSTED SYSTEM ✅"

print(f"\n{'='*55}")
print(verdict_line)
print(f"{'='*55}\n")