"""Step 4 validation checklist for the ambiguity environment.

Loads the project modules straight from their source files, then runs a
fixed series of checks against ``TASKS``, ``AmbiguityEnv`` and
``evaluate_episode``.  Every check is a plain ``assert``; the script
prints progress as it goes and finishes with ``ALL CHECKS PASSED`` when
nothing failed.

All file paths are relative, so the script is expected to be started from
the directory that contains ``models/``, ``skills/``, ``env/``,
``tasks/`` and ``grader/``.
"""
import importlib.util
import sys
import types  # not referenced below; kept in case other code relies on it

sys.path.insert(0, ".")


# Direct imports — bypass openenv-core's __init__ to avoid gradio hang
def load(path, name):
    """Load the source file at *path* and register it as module *name*.

    The module is placed in ``sys.modules`` before it is executed, so
    later ``import``/``from ... import`` statements for *name* resolve to
    this object without going through the regular package machinery.

    Args:
        path: Location of the Python source file.
        name: Fully qualified module name to register it under.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec/loader can be created for *path*.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        # Without this guard the failure shows up as an opaque
        # AttributeError inside module_from_spec / exec_module.
        raise ImportError(
            "cannot load module %r from %r" % (name, path),
            name=name,
            path=str(path),
        )
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind for later imports.
        sys.modules.pop(name, None)
        raise
    return mod


# Load chain in dependency order
load("models/models.py", "models.models")
load("skills/ambiguity_detection.py", "skills.ambiguity_detection")
load("skills/conversation_memory.py", "skills.conversation_memory")
load("skills/reward_system.py", "skills.reward_system")
load("env/env.py", "env.env")
load("tasks/tasks.py", "tasks.tasks")
load("grader/grader.py", "grader.grader")

# These must come after the load() calls: they resolve against the
# sys.modules entries registered above.
from models.models import Action
from env.env import AmbiguityEnv
from grader.grader import evaluate_episode
from tasks.tasks import TASKS

print("=== STEP 4 VALIDATION CHECKLIST ===\n")

# ── Tasks ──────────────────────────────────────────────────────────────
print("[CHECK] 4 tasks defined")
assert len(TASKS) == 4, "Expected 4 tasks got %d" % len(TASKS)
for i, t in enumerate(TASKS):
    print(" Task %d: %-32s ambiguity=%-5s missing=%s"
          % (i + 1, t["name"], t["ambiguity"], t["missing_fields"]))
# Identity checks: the flags are expected to be real booleans.
assert TASKS[0]["ambiguity"] is False, "First task must not be ambiguous"
assert TASKS[3]["ambiguity"] is True, "Last task must be ambiguous"
assert len(TASKS[3]["missing_fields"]) > 1, \
    "Last task must have more than one missing field"
print("[OK] difficulty progression: easy → medium → medium → hard\n")


# ── helper ─────────────────────────────────────────────────────────────
def run(task, actions):
    """Play *actions* against a fresh environment and return its state.

    Args:
        task: Task passed to ``AmbiguityEnv.reset``.
        actions: Iterable of actions, each passed to ``env.step`` in order.

    Returns:
        Whatever ``env.state()`` returns after the last step.
    """
    env = AmbiguityEnv()
    env.reset(task)
    for a in actions:
        env.step(a)
    return env.state()


# ── Case 1: 2 good questions + correct execute → 1.0 ──────────────────
print("Case 1: 2 targeted questions + correct execute (hard task)")
s = run(TASKS[3], [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="execute", proposed_time="10 AM",
           proposed_participants=["Team A"]),
])
score, reason = evaluate_episode(s)
print(" score=%.2f reason=%s" % (score, reason))
assert score == 1.0, "Expected 1.0 got %.2f" % score
print(" [PASS]\n")

# ── Case 2: 5 questions → penalty ──────────────────────────────────────
print("Case 2: 5 questions (spam) → penalty")
s2 = run(TASKS[3], [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="ask", question="Where is it?"),
    Action(type="ask", question="What is the agenda?"),
    Action(type="ask", question="How long will it take?"),
    Action(type="execute", proposed_time="10 AM",
           proposed_participants=["Team A"]),
])
score2, reason2 = evaluate_episode(s2)
print(" score=%.2f reason=%s" % (score2, reason2))
assert score2 < 1.0, "Expected penalty"
print(" [PASS] penalised to %.2f\n" % score2)

# ── Case 3: 0 questions + wrong execute → 0.0 ─────────────────────────
print("Case 3: no questions + wrong execute → 0.0")
s3 = run(TASKS[3], [
    Action(type="execute", proposed_time="2 PM",
           proposed_participants=["Wrong"]),
])
score3, reason3 = evaluate_episode(s3)
print(" score=%.2f reason=%s" % (score3, reason3))
assert score3 == 0.0, "Expected 0.0 got %.2f" % score3
print(" [PASS]\n")

# ── Case 4: partial correctness (time only) → 0.5 ─────────────────────
print("Case 4: collect time only, execute with correct time but wrong people → 0.5")
s4 = run(TASKS[3], [
    Action(type="ask", question="When should this happen?"),
    Action(type="execute", proposed_time="10 AM",
           proposed_participants=["Wrong"]),
])
score4, reason4 = evaluate_episode(s4)
print(" score=%.2f reason=%s" % (score4, reason4))
assert score4 == 0.5, "Expected 0.5 got %.2f" % score4
print(" [PASS]\n")

# ── Case 5: determinism ────────────────────────────────────────────────
print("Case 5: determinism — same input × 2")
actions = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="execute", proposed_time="10 AM",
           proposed_participants=["Team A"]),
]
# The same action objects are replayed against two fresh environments.
sa, _ = evaluate_episode(run(TASKS[1], actions))
sb, _ = evaluate_episode(run(TASKS[1], actions))
assert sa == sb, "Expected identical scores got %.2f and %.2f" % (sa, sb)
print(" [PASS] %.2f == %.2f\n" % (sa, sb))

print("=== ALL CHECKS PASSED ===")