import sys
sys.path.insert(0, ".")

# Direct imports — bypass openenv-core's __init__ to avoid gradio hang
import importlib.util, types

def load(path, name):
    """Execute the source file at *path* and register it as module *name*.

    The module object is placed in ``sys.modules`` under *name* before its
    code runs, so code executed during loading can already look it up there.

    Args:
        path: Filesystem path of the Python source file to execute.
        name: Dotted module name to register the result under.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no loader can be determined for *path* (for example
            the file does not have a recognised Python suffix).
        Exception: Whatever the module's own top-level code raises; in that
            case the module is removed from ``sys.modules`` again.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None when it cannot pick a loader;
        # fail with a clear message instead of an AttributeError further down.
        raise ImportError(
            "cannot load module %r from %r" % (name, path), name=name, path=path
        )
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind: a later import of
        # *name* would otherwise silently pick up the broken object.
        sys.modules.pop(name, None)
        raise
    return mod

# Load chain in dependency order: each entry is (source file, module name)
# and the entries are handed to load() one by one, top to bottom.
_LOAD_CHAIN = (
    ("models/models.py", "models.models"),
    ("skills/ambiguity_detection.py", "skills.ambiguity_detection"),
    ("skills/conversation_memory.py", "skills.conversation_memory"),
    ("skills/reward_system.py", "skills.reward_system"),
    ("env/env.py", "env.env"),
    ("tasks/tasks.py", "tasks.tasks"),
    ("grader/grader.py", "grader.grader"),
)
for _module_path, _module_name in _LOAD_CHAIN:
    load(_module_path, _module_name)

from models.models import Action
from env.env      import AmbiguityEnv
from grader.grader import evaluate_episode
from tasks.tasks   import TASKS

print("=== STEP 4 VALIDATION CHECKLIST ===\n")

# ── Tasks ──────────────────────────────────────────────────────────────
# Exactly four tasks are expected; each one is listed with its 1-based
# position, name, ambiguity flag and missing fields.
print("[CHECK] 4 tasks defined")
assert len(TASKS) == 4
for number, task in enumerate(TASKS, start=1):
    row = (number, task["name"], task["ambiguity"], task["missing_fields"])
    print("  Task %d: %-32s ambiguity=%-5s missing=%s" % row)

# Only the two endpoints are asserted: the first task must not be ambiguous,
# the fourth must be ambiguous with more than one missing field.
assert TASKS[0]["ambiguity"] == False
assert TASKS[3]["ambiguity"] == True
assert len(TASKS[3]["missing_fields"]) > 1
print("[OK] difficulty progression: easy → medium → medium → hard\n")

# ── helper ─────────────────────────────────────────────────────────────
def run(task, actions):
    """Replay *actions* against a fresh environment and return its state.

    A new ``AmbiguityEnv`` is created, reset with *task*, and stepped once
    per action in the order given.  The value returned is whatever
    ``state()`` reports after the last step.
    """
    episode = AmbiguityEnv()
    episode.reset(task)
    for action in actions:
        episode.step(action)
    final_state = episode.state()
    return final_state

# ── Case 1: 2 good questions + correct execute → 1.0 ──────────────────
# Two questions followed by an execute on the fourth task; the grader is
# expected to return exactly 1.0 for this episode.
print("Case 1: 2 targeted questions + correct execute (hard task)")
case1_task = TASKS[3]
case1_plan = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
]
s = run(case1_task, case1_plan)
score, reason = evaluate_episode(s)
case1_summary = "  score=%.2f  reason=%s" % (score, reason)
print(case1_summary)
assert score == 1.0, "Expected 1.0 got %.2f" % score
print("  [PASS]\n")

# ── Case 2: 5 questions → penalty ──────────────────────────────────────
# Five questions before the same execute as in Case 1; the only requirement
# checked here is that the resulting score is strictly below 1.0.
print("Case 2: 5 questions (spam) → penalty")
case2_task = TASKS[3]
case2_plan = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="ask", question="Where is it?"),
    Action(type="ask", question="What is the agenda?"),
    Action(type="ask", question="How long will it take?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
]
s2 = run(case2_task, case2_plan)
score2, reason2 = evaluate_episode(s2)
case2_summary = "  score=%.2f  reason=%s" % (score2, reason2)
print(case2_summary)
assert score2 < 1.0, "Expected penalty"
print("  [PASS] penalised to %.2f\n" % score2)

# ── Case 3: 0 questions + wrong execute → 0.0 ─────────────────────────
# A single execute with no preceding questions; the grader is expected to
# return exactly 0.0.
print("Case 3: no questions + wrong execute → 0.0")
case3_task = TASKS[3]
case3_plan = [
    Action(type="execute", proposed_time="2 PM", proposed_participants=["Wrong"]),
]
s3 = run(case3_task, case3_plan)
score3, reason3 = evaluate_episode(s3)
case3_summary = "  score=%.2f  reason=%s" % (score3, reason3)
print(case3_summary)
assert score3 == 0.0, "Expected 0.0 got %.2f" % score3
print("  [PASS]\n")

# ── Case 4: partial correctness (time only) → 0.5 ─────────────────────
# One question about the time, then an execute whose time matches Case 1
# but whose participants do not; the expected score is exactly 0.5.
print("Case 4: collect time only, execute with correct time but wrong people → 0.5")
case4_task = TASKS[3]
case4_plan = [
    Action(type="ask", question="When should this happen?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Wrong"]),
]
s4 = run(case4_task, case4_plan)
score4, reason4 = evaluate_episode(s4)
case4_summary = "  score=%.2f  reason=%s" % (score4, reason4)
print(case4_summary)
assert score4 == 0.5, "Expected 0.5 got %.2f" % score4
print("  [PASS]\n")

# ── Case 5: determinism ────────────────────────────────────────────────
# The same Action objects are replayed twice against the second task, each
# time in a fresh environment; both episodes must be graded identically.
print("Case 5: determinism — same input × 2")
actions = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
]
first_state = run(TASKS[1], actions)
sa, _ = evaluate_episode(first_state)
second_state = run(TASKS[1], actions)
sb, _ = evaluate_episode(second_state)
assert sa == sb
determinism_report = "  [PASS] %.2f == %.2f\n" % (sa, sb)
print(determinism_report)

print("=== ALL CHECKS PASSED ===")