openenv-rl-environment / evaluate.py
Sid8421's picture
Fix final OpenEnv validator compliance: inference stdout format, Dockerfile deps, API vars, and grader bounds
ba2722e
"""Small evaluation harness that executes the expected action sequence for each task
and prints a JSON summary of grader scores. Use this to reproduce Round-1 evaluation outputs.
"""
import json
from env.environment import SupportTicketEnv
from env.models import Action
# Scripted action sequences, keyed by task id. Each list is replayed in order
# against a fresh environment created for that task id.
EXPECTED_ACTIONS = {
    # Easy task: check the policy, issue a full refund, then close the ticket
    # with resolution "refunded".
    "task_easy_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="issue_refund", parameters={"amount": "full"}),
        Action(action_type="close_ticket", parameters={"resolution": "refunded"}),
    ],
    # Medium task: check the policy, reply to the customer that no refund
    # applies, then close the ticket with resolution "policy_explained".
    "task_medium_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="reply_to_customer", parameters={"message": "Policy explained - no refund"}),
        Action(action_type="close_ticket", parameters={"resolution": "policy_explained"}),
    ],
    # Hard task: fetch data for user "USR-C3", tell the customer the issue is
    # being escalated, then escalate with reason "billing_tier2".
    "task_hard_1": [
        Action(action_type="fetch_user_data", parameters={"user_id": "USR-C3"}),
        Action(action_type="reply_to_customer", parameters={"message": "We're escalating this to billing tier 2 and will follow up."}),
        Action(action_type="escalate", parameters={"reason": "billing_tier2"}),
    ],
}
def run_sequence(task_id: str, actions):
    """Replay *actions* against a fresh environment and return the last score.

    A new ``SupportTicketEnv`` is created for ``task_id`` and reset, then each
    action is stepped in order. After every step the score is refreshed from
    the ``"current_reward"`` entry of the step's info mapping; if that entry
    is absent the previous value is kept. Replay stops as soon as the
    environment reports the episode as done.

    Args:
        task_id: Identifier handed to ``SupportTicketEnv``.
        actions: Iterable of actions to step through, in order.

    Returns:
        The most recent ``"current_reward"`` value seen, or ``0.0`` if no
        step reported one (including when ``actions`` is empty).
    """
    environment = SupportTicketEnv(task_id=task_id)
    environment.reset()

    latest_score = 0.0
    for action in actions:
        # Only the done flag and the info mapping are used; the observation
        # and per-step reward are intentionally ignored.
        _observation, _step_reward, episode_over, step_info = environment.step(action)
        latest_score = step_info.get("current_reward", latest_score)
        if episode_over:
            break

    return latest_score
def main():
    """Run every scripted task and print the scores as indented JSON.

    Each entry of ``EXPECTED_ACTIONS`` is replayed through ``run_sequence``.
    The printed document has the shape
    ``{"results": {<task_id>: {"score": <value>}, ...}}``.
    """
    # The comprehension visits tasks in the mapping's iteration order.
    per_task = {
        name: {"score": run_sequence(name, sequence)}
        for name, sequence in EXPECTED_ACTIONS.items()
    }
    report = json.dumps({"results": per_task}, indent=2)
    print(report)


# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()