Spaces:

Yaser77
/

ambiguity-env

Sleeping

App Files Files Community

ambiguity-env / test_step4.py

Yaser77

feat: ambiguity resolution environment v1.0 - OpenEnv Hackathon

c06cf60 about 2 months ago

raw

history blame contribute delete

4.88 kB

	import sys
	sys.path.insert(0, ".")

	# Direct imports — bypass openenv-core's __init__ to avoid gradio hang
	import importlib.util, types

	def load(path, name):
	    """Load the source file at ``path`` as module ``name`` and register it.

	    The module is stored in ``sys.modules`` under ``name`` before it is
	    executed, so later ``import`` statements for ``name`` resolve to it
	    without going through any package ``__init__``.

	    Args:
	        path: Location of the file to execute as a module.
	        name: Dotted name under which the module is registered.

	    Returns:
	        The executed module object.

	    Raises:
	        ImportError: If no import spec or loader can be built for ``path``.
	    """
	    spec = importlib.util.spec_from_file_location(name, path)
	    # spec_from_file_location() returns None when no loader matches the path;
	    # fail with a clear message instead of an AttributeError on None.
	    if spec is None or spec.loader is None:
	        raise ImportError(
	            "Cannot load module %r from %r" % (name, path), name=name, path=path
	        )
	    mod = importlib.util.module_from_spec(spec)
	    sys.modules[name] = mod
	    try:
	        spec.loader.exec_module(mod)
	    except BaseException:
	        # Do not leave a half-initialised module registered after a failure.
	        sys.modules.pop(name, None)
	        raise
	    return mod

	# Load chain in dependency order: each entry is (file path, module name),
	# and entries are loaded top to bottom.
	for _module_path, _module_name in (
	    ("models/models.py", "models.models"),
	    ("skills/ambiguity_detection.py", "skills.ambiguity_detection"),
	    ("skills/conversation_memory.py", "skills.conversation_memory"),
	    ("skills/reward_system.py", "skills.reward_system"),
	    ("env/env.py", "env.env"),
	    ("tasks/tasks.py", "tasks.tasks"),
	    ("grader/grader.py", "grader.grader"),
	):
	    load(_module_path, _module_name)

	from models.models import Action
	from env.env import AmbiguityEnv
	from grader.grader import evaluate_episode
	from tasks.tasks import TASKS

	print("=== STEP 4 VALIDATION CHECKLIST ===\n")

	# ── Tasks ──────────────────────────────────────────────────────────────
	print("[CHECK] 4 tasks defined")
	assert len(TASKS) == 4, "Expected 4 tasks got %d" % len(TASKS)
	for index, task_def in enumerate(TASKS, start=1):
	    print(" Task %d: %-32s ambiguity=%-5s missing=%s"
	          % (index, task_def["name"], task_def["ambiguity"],
	             task_def["missing_fields"]))

	# The first task must carry no ambiguity; the last one must be ambiguous
	# and miss more than one field. Truthiness is used instead of comparing
	# against True/False with ``==``.
	assert not TASKS[0]["ambiguity"], "First task should not be ambiguous"
	assert TASKS[3]["ambiguity"], "Last task should be ambiguous"
	assert len(TASKS[3]["missing_fields"]) > 1, (
	    "Last task should have more than one missing field"
	)
	print("[OK] difficulty progression: easy → medium → medium → hard\n")

	# ── helper ─────────────────────────────────────────────────────────────
	def run(task, actions):
	    """Play ``actions`` against a fresh environment and return its state.

	    A new ``AmbiguityEnv`` is created, reset with ``task``, and stepped once
	    per item of ``actions`` in order. The value of ``state()`` after the
	    last step is returned.
	    """
	    episode = AmbiguityEnv()
	    episode.reset(task)
	    for action in actions:
	        episode.step(action)
	    return episode.state()

	# ── Case 1: 2 good questions + correct execute → 1.0 ──────────────────
	# Ask for both the time and the participants, then execute with the
	# values the test expects to be correct for the last task.
	print("Case 1: 2 targeted questions + correct execute (hard task)")
	case1_actions = [
	    Action(type="ask", question="When should this happen?"),
	    Action(type="ask", question="Who are the participants?"),
	    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
	]
	case1_state = run(TASKS[3], case1_actions)
	score1, reason1 = evaluate_episode(case1_state)
	print(" score=%.2f reason=%s" % (score1, reason1))
	assert score1 == 1.0, "Expected 1.0 got %.2f" % score1
	print(" [PASS]\n")

	# ── Case 2: 5 questions → penalty ──────────────────────────────────────
	# Five questions are asked before a correct execute; the score is
	# expected to drop below 1.0.
	print("Case 2: 5 questions (spam) → penalty")
	case2_questions = [
	    "When should this happen?",
	    "Who are the participants?",
	    "Where is it?",
	    "What is the agenda?",
	    "How long will it take?",
	]
	case2_actions = [Action(type="ask", question=text) for text in case2_questions]
	case2_actions.append(
	    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"])
	)
	case2_state = run(TASKS[3], case2_actions)
	score2, reason2 = evaluate_episode(case2_state)
	print(" score=%.2f reason=%s" % (score2, reason2))
	assert score2 < 1.0, "Expected penalty"
	print(" [PASS] penalised to %.2f\n" % score2)

	# ── Case 3: 0 questions + wrong execute → 0.0 ─────────────────────────
	# Execute immediately, without asking anything, using values the test
	# treats as wrong.
	print("Case 3: no questions + wrong execute → 0.0")
	case3_execute = Action(
	    type="execute", proposed_time="2 PM", proposed_participants=["Wrong"]
	)
	case3_state = run(TASKS[3], [case3_execute])
	score3, reason3 = evaluate_episode(case3_state)
	print(" score=%.2f reason=%s" % (score3, reason3))
	assert score3 == 0.0, "Expected 0.0 got %.2f" % score3
	print(" [PASS]\n")

	# ── Case 4: partial correctness (time only) → 0.5 ─────────────────────
	# Only the time is asked about; the execute then carries the expected
	# time but the wrong participants.
	print("Case 4: collect time only, execute with correct time but wrong people → 0.5")
	case4_actions = [
	    Action(type="ask", question="When should this happen?"),
	    Action(type="execute", proposed_time="10 AM", proposed_participants=["Wrong"]),
	]
	case4_state = run(TASKS[3], case4_actions)
	score4, reason4 = evaluate_episode(case4_state)
	print(" score=%.2f reason=%s" % (score4, reason4))
	assert score4 == 0.5, "Expected 0.5 got %.2f" % score4
	print(" [PASS]\n")

	# ── Case 5: determinism ────────────────────────────────────────────────
	# The same action list is replayed twice on the second task; both
	# evaluations must yield the same score.
	print("Case 5: determinism — same input × 2")
	repeat_actions = [
	    Action(type="ask", question="When should this happen?"),
	    Action(type="ask", question="Who are the participants?"),
	    Action(type="execute", proposed_time="10 AM", proposed_participants=["Team A"]),
	]
	first_score, _ = evaluate_episode(run(TASKS[1], repeat_actions))
	second_score, _ = evaluate_episode(run(TASKS[1], repeat_actions))
	assert first_score == second_score
	print(" [PASS] %.2f == %.2f\n" % (first_score, second_score))

	print("=== ALL CHECKS PASSED ===")