"""Reproducible zero-temperature baseline over every SupportOpsEnv task.

For each task, the script asks an OpenAI model for one JSON action per
step, applies it in the environment, and writes per-task transcripts and
an average score to a JSON file.
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List

# Make the repository root importable when this script is run directly,
# so the `support_ops_env` package resolves without an install.
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from openai import OpenAI

from support_ops_env.env import SupportOpsEnv
from support_ops_env.models import Action, BaselineResult
from support_ops_env.tasks import list_task_ids
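
# The prompt pins the model to exactly one JSON action per turn so replies
# stay machine-parseable and runs stay directly comparable across models.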
SYSTEM_PROMPT = """You are evaluating a support operations environment.
Return exactly one JSON object with keys: action_type, target, value.
Choose one action_type from:
- inspect_ticket
- request_context
- set_priority
- set_route
- set_resolution
- escalate
- rank_queue
- finalize
Be concise and deterministic. Only use ticket ids that appear in the observation.
When enough evidence is gathered, finalize."""


def main() -> None:
    parser = argparse.ArgumentParser(description="Run a reproducible baseline over all SupportOpsEnv tasks.")
    parser.add_argument("--model", default="gpt-4.1-mini", help="OpenAI model name")
    parser.add_argument("--output", default="baseline_results.json", help="Path to write JSON results")
    args = parser.parse_args()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("OPENAI_API_KEY is required.")

    client = OpenAI(api_key=api_key)
    results: List[BaselineResult] = []

    for task_id in list_task_ids():
        env = SupportOpsEnv(task_id=task_id)
        observation = env.reset()
        done = False
        transcript: List[Dict[str, object]] = []
        last_info: Dict[str, object] = {}
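
        # Roll out one episode: show the model the latest observation as
        # JSON, parse the single action it returns, and step the
        # environment until the task reports done.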
        while not done:
            response = client.responses.create(
                model=args.model,
                temperature=0,  # greedy decoding keeps the baseline reproducible
                input=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {
                        "role": "user",
                        "content": json.dumps(observation.model_dump(), indent=2, sort_keys=True),
                    },
                ],
            )
            raw = response.output_text.strip()
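            # Models sometimes wrap JSON in Markdown fences despite the
            # prompt; strip them defensively before parsing. This guard is
            # an assumption about model behaviour, not part of the
            # environment contract.
            if raw.startswith("```"):
                raw = raw.strip("`").strip()
                if raw.startswith("json"):
                    raw = raw[len("json"):].strip()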
            payload = json.loads(raw)
            action = Action.model_validate(payload)
            observation, reward, done, info = env.step(action)
            transcript.append(
                {
                    "action": action.model_dump(),
                    "reward": reward.model_dump(),
                    "task_score": info["task_score"],
                    "done": done,
                }
            )
            last_info = info

        results.append(
            BaselineResult(
                task_id=task_id,
                # Difficulty is read off the final observation; assumed
                # stable across an episode for a given task.
                difficulty=observation.difficulty,
                score=float(last_info.get("task_score", 0.0)),
                steps=int(last_info.get("step_count", 0)),
                transcript=transcript,
            )
        )
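
    # Guard the average below: an empty task list would otherwise divide
    # by zero.
    if not results:
        raise SystemExit("list_task_ids() returned no tasks; nothing to score.")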

    output_path = Path(args.output)
    payload = {
        "model": args.model,
        "average_score": round(sum(item.score for item in results) / len(results), 4),
        "results": [item.model_dump() for item in results],
    }
    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(json.dumps(payload, indent=2))


if __name__ == "__main__":
    main()
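
# Example invocation (the script's filename and location in the repo are
# assumed here, not taken from the source):
#   OPENAI_API_KEY=... python run_baseline.py --model gpt-4.1-mini --output baseline_results.json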