"""Evaluation harness that replays the expected action sequence for each task.

Prints a JSON summary of the grader scores. Use this to reproduce the
Round-1 evaluation outputs.
"""
| import json |
| from env.environment import SupportTicketEnv |
| from env.models import Action |
|
|
|
|
# Scripted steps per task id, as (action_type, parameters) pairs listed in the
# order they are replayed.
_EXPECTED_STEPS = {
    "task_easy_1": (
        ("check_policy", {}),
        ("issue_refund", {"amount": "full"}),
        ("close_ticket", {"resolution": "refunded"}),
    ),
    "task_medium_1": (
        ("check_policy", {}),
        ("reply_to_customer", {"message": "Policy explained - no refund"}),
        ("close_ticket", {"resolution": "policy_explained"}),
    ),
    "task_hard_1": (
        ("fetch_user_data", {"user_id": "USR-C3"}),
        (
            "reply_to_customer",
            {"message": "We're escalating this to billing tier 2 and will follow up."},
        ),
        ("escalate", {"reason": "billing_tier2"}),
    ),
}

# Task id -> ordered list of Action objects built from the step table above.
EXPECTED_ACTIONS = {
    task_id: [
        Action(action_type=action_type, parameters=parameters)
        for action_type, parameters in steps
    ]
    for task_id, steps in _EXPECTED_STEPS.items()
}
|
|
|
|
def run_sequence(task_id: str, actions):
    """Replay *actions* against a fresh environment and return the last score.

    A new ``SupportTicketEnv`` is created for ``task_id`` and reset, then each
    action is passed to ``env.step`` in order. After every step the score is
    read from the ``"current_reward"`` entry of the step's ``info`` mapping;
    when that key is absent the previously recorded score is kept. Replay
    stops early as soon as a step reports ``done``.

    Args:
        task_id: Identifier handed to ``SupportTicketEnv``.
        actions: Iterable of actions to execute, in order.

    Returns:
        The most recent ``"current_reward"`` value seen, or ``0.0`` if no step
        supplied one (including when ``actions`` is empty).
    """
    env = SupportTicketEnv(task_id=task_id)
    env.reset()
    final_reward = 0.0
    for action in actions:
        # Only `done` and `info` are consumed here; the observation and the
        # per-step reward returned by step() are intentionally ignored.
        _obs, _reward, done, info = env.step(action)
        final_reward = info.get("current_reward", final_reward)
        if done:
            break
    return final_reward
|
|
|
|
def main():
    """Run every scripted task and print the scores as indented JSON.

    Each entry of ``EXPECTED_ACTIONS`` is replayed with ``run_sequence``; the
    printed document has the shape ``{"results": {task_id: {"score": ...}}}``.
    """
    results = {
        task_id: {"score": run_sequence(task_id, actions)}
        for task_id, actions in EXPECTED_ACTIONS.items()
    }
    print(json.dumps({"results": results}, indent=2))


if __name__ == "__main__":
    main()
|
|