File size: 1,755 Bytes
aa4f7bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba2722e
aa4f7bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""Small evaluation harness that executes the expected action sequence for each task
and prints a JSON summary of grader scores. Use this to reproduce Round-1 evaluation outputs.
"""
import json
from env.environment import SupportTicketEnv
from env.models import Action


# Scripted action sequence to replay for each task, keyed by task id.
# main() walks this mapping in insertion order, so the tasks appear in the
# printed JSON in the order they are written here.
EXPECTED_ACTIONS = {
    # Policy check, full refund, then close the ticket as refunded.
    "task_easy_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="issue_refund", parameters={"amount": "full"}),
        Action(action_type="close_ticket", parameters={"resolution": "refunded"}),
    ],
    # Policy check, reply explaining that no refund applies, then close.
    "task_medium_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="reply_to_customer", parameters={"message": "Policy explained - no refund"}),
        Action(action_type="close_ticket", parameters={"resolution": "policy_explained"}),
    ],
    # Fetch the user's data, tell the customer about the escalation, then
    # escalate with reason "billing_tier2" (this sequence has no close_ticket).
    "task_hard_1": [
        Action(action_type="fetch_user_data", parameters={"user_id": "USR-C3"}),
        Action(action_type="reply_to_customer", parameters={"message": "We're escalating this to billing tier 2 and will follow up."}),
        Action(action_type="escalate", parameters={"reason": "billing_tier2"}),
    ],
}


def run_sequence(task_id: str, actions):
    """Replay ``actions`` on a newly built environment and return the last score.

    A ``SupportTicketEnv`` is constructed for ``task_id`` and reset, then each
    action is passed to ``env.step`` in order. Replay stops as soon as a step
    reports ``done``; any remaining actions are not executed.

    Args:
        task_id: Identifier passed to ``SupportTicketEnv(task_id=...)``.
        actions: Iterable of actions to feed to ``env.step`` one at a time.

    Returns:
        The most recent ``"current_reward"`` value found in a step's info
        mapping, or ``0.0`` if no executed step supplied one (including when
        ``actions`` is empty).
    """
    env = SupportTicketEnv(task_id=task_id)
    env.reset()

    latest_score = 0.0
    for action in actions:
        # Only the done flag and the info mapping are used; the observation
        # and per-step reward returned by step() are ignored here.
        _observation, _step_reward, finished, details = env.step(action)
        # Fall back to the previous score when this step's info has no
        # "current_reward" entry.
        latest_score = details.get("current_reward", latest_score)
        if finished:
            break
    return latest_score


def main():
    """Score every task in ``EXPECTED_ACTIONS`` and print the results as JSON.

    Each task's action sequence is replayed through ``run_sequence``. The
    output written to stdout has the shape
    ``{"results": {<task_id>: {"score": <value>}, ...}}`` with a two-space
    indent.
    """
    results = {
        task_id: {"score": run_sequence(task_id, actions)}
        for task_id, actions in EXPECTED_ACTIONS.items()
    }
    summary = json.dumps({"results": results}, indent=2)
    print(summary)


if __name__ == "__main__":
    main()