# ambiguity-env / test_step4.py
# Yaser77's picture
# feat: ambiguity resolution environment v1.0 - OpenEnv Hackathon
# c06cf60
import sys
sys.path.insert(0, ".")
# Direct imports — bypass openenv-core's __init__ to avoid gradio hang
import importlib.util, types
def load(path, name):
    """Execute the source file at *path* and register it as module *name*.

    The module object is stored in ``sys.modules`` under *name* before its
    code runs, so later ``import``/``from ... import`` statements for that
    name resolve to the module loaded here.

    Args:
        path: Filesystem path of the Python source file to execute.
        name: Name under which the module is registered in ``sys.modules``.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for *path*
            (for example, the path has no recognised module suffix).
        Exception: Anything raised by the module's own top-level code; the
            partially initialised module is removed from ``sys.modules``
            before the exception propagates.
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        # Without this guard the failure shows up as an opaque
        # AttributeError on ``None`` a couple of lines further down.
        raise ImportError(
            "Cannot load module %r from %r" % (name, path),
            name=name,
            path=str(path),
        )
    mod = importlib.util.module_from_spec(spec)
    # Register first so the module is importable by name while it executes.
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind for later imports.
        sys.modules.pop(name, None)
        raise
    return mod
# Load chain in dependency order: (source path, module name) pairs,
# processed top to bottom.
for _src, _qualname in (
    ("models/models.py", "models.models"),
    ("skills/ambiguity_detection.py", "skills.ambiguity_detection"),
    ("skills/conversation_memory.py", "skills.conversation_memory"),
    ("skills/reward_system.py", "skills.reward_system"),
    ("env/env.py", "env.env"),
    ("tasks/tasks.py", "tasks.tasks"),
    ("grader/grader.py", "grader.grader"),
):
    load(_src, _qualname)
from models.models import Action
from env.env import AmbiguityEnv
from grader.grader import evaluate_episode
from tasks.tasks import TASKS
print("=== STEP 4 VALIDATION CHECKLIST ===\n")
# ── Tasks ──────────────────────────────────────────────────────────────
# Sanity-check the task list before running any episode: there must be
# exactly four tasks, the first unambiguous and the last ambiguous with
# more than one missing field.
print("[CHECK] 4 tasks defined")
assert len(TASKS) == 4, "Expected 4 tasks got %d" % len(TASKS)
for i, t in enumerate(TASKS, start=1):
    print(" Task %d: %-32s ambiguity=%-5s missing=%s"
          % (i, t["name"], t["ambiguity"], t["missing_fields"]))
assert not TASKS[0]["ambiguity"], (
    "Expected first task to be unambiguous got %r" % (TASKS[0]["ambiguity"],)
)
assert TASKS[3]["ambiguity"], (
    "Expected last task to be ambiguous got %r" % (TASKS[3]["ambiguity"],)
)
assert len(TASKS[3]["missing_fields"]) > 1, (
    "Expected >1 missing field on last task got %r"
    % (TASKS[3]["missing_fields"],)
)
print("[OK] difficulty progression: easy → medium → medium → hard\n")
# ── helper ─────────────────────────────────────────────────────────────
def run(task, actions):
    """Play *actions* against a fresh environment and return its final state.

    A new ``AmbiguityEnv`` is created and reset with *task*; every item of
    *actions* is then passed to ``step`` in order (the step results are
    discarded) and the value of ``state()`` afterwards is returned.
    """
    episode = AmbiguityEnv()
    episode.reset(task)
    for action in actions:
        episode.step(action)
    return episode.state()
# ── Case 1: 2 good questions + correct execute → 1.0 ──────────────────
# Two questions followed by an execute on the hard task (TASKS[3]);
# the grader is expected to award the full score.
print("Case 1: 2 targeted questions + correct execute (hard task)")
case1_actions = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(
        type="execute",
        proposed_time="10 AM",
        proposed_participants=["Team A"],
    ),
]
s = run(TASKS[3], case1_actions)
score, reason = evaluate_episode(s)
print(" score=%.2f reason=%s" % (score, reason))
assert score == 1.0, "Expected 1.0 got %.2f" % score
print(" [PASS]\n")
# ── Case 2: 5 questions → penalty ──────────────────────────────────────
# Same execute as Case 1, but preceded by five questions; the grader is
# expected to score this strictly below 1.0.
print("Case 2: 5 questions (spam) → penalty")
s2 = run(TASKS[3], [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="ask", question="Where is it?"),
    Action(type="ask", question="What is the agenda?"),
    Action(type="ask", question="How long will it take?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
])
score2, reason2 = evaluate_episode(s2)
print(" score=%.2f reason=%s" % (score2, reason2))
assert score2 < 1.0, "Expected penalty got %.2f" % score2
print(" [PASS] penalised to %.2f\n" % score2)
# ── Case 3: 0 questions + wrong execute → 0.0 ─────────────────────────
# Execute immediately on the hard task with a time and participants that
# differ from the ones used in Case 1; the expected score is 0.0.
print("Case 3: no questions + wrong execute → 0.0")
s3 = run(TASKS[3], [
    Action(type="execute", proposed_time="2 PM", proposed_participants=["Wrong"]),
])
score3, reason3 = evaluate_episode(s3)
print(" score=%.2f reason=%s" % (score3, reason3))
assert score3 == 0.0, "Expected 0.0 got %.2f" % score3
print(" [PASS]\n")
# ── Case 4: partial correctness (time only) → 0.5 ─────────────────────
# Ask only about the time, then execute with the Case 1 time but different
# participants; the expected score is 0.5.
print("Case 4: collect time only, execute with correct time but wrong people → 0.5")
s4 = run(TASKS[3], [
    Action(type="ask", question="When should this happen?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Wrong"]),
])
score4, reason4 = evaluate_episode(s4)
print(" score=%.2f reason=%s" % (score4, reason4))
assert score4 == 0.5, "Expected 0.5 got %.2f" % score4
print(" [PASS]\n")
# ── Case 5: determinism ────────────────────────────────────────────────
# Run the same action list twice against TASKS[1], each time in a fresh
# environment, and require both episodes to be graded identically.
print("Case 5: determinism — same input × 2")
actions = [
    Action(type="ask", question="When should this happen?"),
    Action(type="ask", question="Who are the participants?"),
    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
]
sa, _ = evaluate_episode(run(TASKS[1], actions))
sb, _ = evaluate_episode(run(TASKS[1], actions))
assert sa == sb, "Expected identical scores got %.2f and %.2f" % (sa, sb)
print(" [PASS] %.2f == %.2f\n" % (sa, sb))
print("=== ALL CHECKS PASSED ===")