"""Small evaluation harness that executes the expected action sequence for each task
and prints a JSON summary of grader scores. Use this to reproduce Round-1 evaluation outputs.
"""
import json
from env.environment import SupportTicketEnv
from env.models import Action
# Scripted "golden" action sequences, keyed by task id.  Each sequence is
# replayed verbatim by run_sequence() against a fresh SupportTicketEnv.
# NOTE(review): the specific action_type strings and parameter schemas are
# defined by env.models.Action / the environment's grader — confirm there
# before editing these sequences.
EXPECTED_ACTIONS = {
    # Consult policy, then issue a full refund and close the ticket.
    "task_easy_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="issue_refund", parameters={"amount": "full"}),
        Action(action_type="close_ticket", parameters={"resolution": "refunded"}),
    ],
    # Consult policy, explain it to the customer instead of refunding, close.
    "task_medium_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="reply_to_customer", parameters={"message": "Policy explained - no refund"}),
        Action(action_type="close_ticket", parameters={"resolution": "policy_explained"}),
    ],
    # Look up the user, notify them, then escalate to billing tier 2.
    "task_hard_1": [
        Action(action_type="fetch_user_data", parameters={"user_id": "USR-C3"}),
        Action(action_type="reply_to_customer", parameters={"message": "We're escalating this to billing tier 2 and will follow up."}),
        Action(action_type="escalate", parameters={"reason": "billing_tier2"}),
    ],
}
def run_sequence(task_id: str, actions) -> float:
    """Replay *actions* against a fresh environment for *task_id*.

    Creates a ``SupportTicketEnv`` for the task, resets it, then steps
    through the actions in order, stopping early if the environment
    reports it is done.

    Returns:
        The most recent grader score seen in ``info["current_reward"]``,
        or 0.0 if the environment never reported one.
    """
    env = SupportTicketEnv(task_id=task_id)
    env.reset()
    final_reward = 0.0
    for action in actions:
        # Only `done` and the grader score in `info` matter here; the
        # observation and per-step reward are intentionally unused.
        _obs, _reward, done, info = env.step(action)
        final_reward = info.get("current_reward", final_reward)
        if done:
            # Environment terminated (e.g. ticket closed) — stop replaying.
            break
    return final_reward
def main():
    """Replay every expected action sequence and print a JSON score summary."""
    results = {
        task_id: {"score": run_sequence(task_id, actions)}
        for task_id, actions in EXPECTED_ACTIONS.items()
    }
    print(json.dumps({"results": results}, indent=2))
# Script entry point: run the harness only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()