Spaces:
Sleeping
Sleeping
| import sys | |
| sys.path.insert(0, ".") | |
| # Direct imports — bypass openenv-core's __init__ to avoid gradio hang | |
| import importlib.util, types | |
def load(path, name):
    """Execute the source file at *path* and register it as module *name*.

    The module object is placed in ``sys.modules`` under *name* before its
    code runs, so imports of *name* performed afterwards resolve to this
    object instead of going through the regular import machinery.

    Args:
        path: Filesystem path of the Python source file to load.
        name: Dotted module name to register the module under.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for *path*
            (for example, an unrecognised file extension).
        Exception: Whatever the module body itself raises; in that case the
            partially initialised module is removed from ``sys.modules``.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(
            "cannot load module %r from %r" % (name, path), name=name
        )
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Mirror the regular import system: never leave a half-initialised
        # module behind for later imports to pick up.
        sys.modules.pop(name, None)
        raise
    return mod
# Load the modules one by one, in dependency order.
for _path, _name in (
    ("models/models.py", "models.models"),
    ("skills/ambiguity_detection.py", "skills.ambiguity_detection"),
    ("skills/conversation_memory.py", "skills.conversation_memory"),
    ("skills/reward_system.py", "skills.reward_system"),
    ("env/env.py", "env.env"),
    ("tasks/tasks.py", "tasks.tasks"),
    ("grader/grader.py", "grader.grader"),
):
    load(_path, _name)
| from models.models import Action | |
| from env.env import AmbiguityEnv | |
| from grader.grader import evaluate_episode | |
| from tasks.tasks import TASKS | |
print("=== STEP 4 VALIDATION CHECKLIST ===\n")

# ── Tasks ──────────────────────────────────────────────────────────────
# Exactly four tasks are expected; each one is listed with its ambiguity
# flag and its missing fields.
print("[CHECK] 4 tasks defined")
assert len(TASKS) == 4
for i, t in enumerate(TASKS):
    print(" Task %d: %-32s ambiguity=%-5s missing=%s"
          % (i + 1, t["name"], t["ambiguity"], t["missing_fields"]))

# The first task must be unambiguous; the last must be ambiguous and have
# more than one missing field.
assert TASKS[0]["ambiguity"] == False
assert TASKS[3]["ambiguity"] == True
assert len(TASKS[3]["missing_fields"]) > 1
print("[OK] difficulty progression: easy → medium → medium → hard\n")
# ── helper ─────────────────────────────────────────────────────────────
def run(task, actions):
    """Play *actions* against a fresh environment and return its state.

    A new ``AmbiguityEnv`` is created and reset with *task*; each action is
    then passed to ``step`` in the given order, and the result of
    ``state()`` is returned.
    """
    episode = AmbiguityEnv()
    episode.reset(task)
    for action in actions:
        episode.step(action)
    return episode.state()
# ── Case 1: 2 good questions + correct execute → 1.0 ───────────────────
# Two ask actions followed by an execute on the last task; the episode is
# asserted to score exactly 1.0.
print("Case 1: 2 targeted questions + correct execute (hard task)")
s = run(
    TASKS[3],
    [
        Action(type="ask", question="When should this happen?"),
        Action(type="ask", question="Who are the participants?"),
        Action(type="execute", proposed_time="10 AM",
               proposed_participants=["Team A"]),
    ],
)
score, reason = evaluate_episode(s)
print(f" score={score:.2f} reason={reason}")
assert score == 1.0, f"Expected 1.0 got {score:.2f}"
print(" [PASS]\n")
# ── Case 2: 5 questions → penalty ──────────────────────────────────────
# Five ask actions before the execute; the episode is asserted to score
# strictly below 1.0.
print("Case 2: 5 questions (spam) → penalty")
s2 = run(
    TASKS[3],
    [
        Action(type="ask", question="When should this happen?"),
        Action(type="ask", question="Who are the participants?"),
        Action(type="ask", question="Where is it?"),
        Action(type="ask", question="What is the agenda?"),
        Action(type="ask", question="How long will it take?"),
        Action(type="execute", proposed_time="10 AM",
               proposed_participants=["Team A"]),
    ],
)
score2, reason2 = evaluate_episode(s2)
print(f" score={score2:.2f} reason={reason2}")
assert score2 < 1.0, "Expected penalty"
print(f" [PASS] penalised to {score2:.2f}\n")
# ── Case 3: 0 questions + wrong execute → 0.0 ──────────────────────────
# A single execute with no prior ask actions; the episode is asserted to
# score exactly 0.0.
print("Case 3: no questions + wrong execute → 0.0")
s3 = run(
    TASKS[3],
    [
        Action(type="execute", proposed_time="2 PM",
               proposed_participants=["Wrong"]),
    ],
)
score3, reason3 = evaluate_episode(s3)
print(f" score={score3:.2f} reason={reason3}")
assert score3 == 0.0, f"Expected 0.0 got {score3:.2f}"
print(" [PASS]\n")
# ── Case 4: partial correctness (time only) → 0.5 ──────────────────────
# One ask about the time, then an execute whose participants differ from
# the other cases; the episode is asserted to score exactly 0.5.
print("Case 4: collect time only, execute with correct time but wrong people → 0.5")
s4 = run(
    TASKS[3],
    [
        Action(type="ask", question="When should this happen?"),
        Action(type="execute", proposed_time="10 AM",
               proposed_participants=["Wrong"]),
    ],
)
score4, reason4 = evaluate_episode(s4)
print(f" score={score4:.2f} reason={reason4}")
assert score4 == 0.5, f"Expected 0.5 got {score4:.2f}"
print(" [PASS]\n")
# ── Case 5: determinism ────────────────────────────────────────────────
# The same action list is run twice against TASKS[1]; both scores must be
# equal.
print("Case 5: determinism — same input × 2")
actions = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="execute", proposed_time="10 AM",
           proposed_participants=["Team A"]),
]
sa, _ = evaluate_episode(run(TASKS[1], actions))
sb, _ = evaluate_episode(run(TASKS[1], actions))
assert sa == sb
print(f" [PASS] {sa:.2f} == {sb:.2f}\n")

print("=== ALL CHECKS PASSED ===")