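"""Run a reproducible baseline over all SupportOpsEnv tasks and write the results to JSON."""
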
from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List
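
# Make the repository root importable so the support_ops_env package can be
# found when this script is run directly.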
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from openai import OpenAI

from support_ops_env.env import SupportOpsEnv
from support_ops_env.models import Action, BaselineResult
from support_ops_env.tasks import list_task_ids
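
# Instructions sent to the model on every step; the reply must be a single
# JSON object describing one action.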
SYSTEM_PROMPT = """You are evaluating a support operations environment.
Return exactly one JSON object with keys: action_type, target, value.
Choose from action_type:
- inspect_ticket
- request_context
- set_priority
- set_route
- set_resolution
- escalate
- rank_queue
- finalize
Be concise and deterministic. Only use ticket ids that appear in the observation.
When enough evidence is gathered, finalize."""


def main() -> None:
    parser = argparse.ArgumentParser(description="Run a reproducible baseline over all SupportOpsEnv tasks.")
    parser.add_argument("--model", default="gpt-4.1-mini", help="OpenAI model name")
    parser.add_argument("--output", default="baseline_results.json", help="Path to write JSON results")
    args = parser.parse_args()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("OPENAI_API_KEY is required.")
    client = OpenAI(api_key=api_key)

    results: List[BaselineResult] = []
    for task_id in list_task_ids():
        env = SupportOpsEnv(task_id=task_id)
        observation = env.reset()
        done = False
        transcript: List[Dict[str, object]] = []
        last_info: Dict[str, object] = {}
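
        # Roll out one episode: send the current observation to the model,
        # parse its JSON reply into an Action, and step the environment
        # until it signals the episode is done.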
        while not done:
            response = client.responses.create(
                model=args.model,
                temperature=0,
                input=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {
                        "role": "user",
                        "content": json.dumps(observation.model_dump(), indent=2, sort_keys=True),
                    },
                ],
            )
            raw = response.output_text.strip()
            payload = json.loads(raw)
            action = Action.model_validate(payload)
            observation, reward, done, info = env.step(action)
            transcript.append(
                {
                    "action": action.model_dump(),
                    "reward": reward.model_dump(),
                    "task_score": info["task_score"],
                    "done": done,
                }
            )
            last_info = info
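
        # Summarize the episode using the final step's info dict.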
        results.append(
            BaselineResult(
                task_id=task_id,
                difficulty=observation.difficulty,
                score=float(last_info.get("task_score", 0.0)),
                steps=int(last_info.get("step_count", 0)),
                transcript=transcript,
            )
        )
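
    # Aggregate the run into a single JSON payload, write it to disk, and echo it.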
    output_path = Path(args.output)
    payload = {
        "model": args.model,
        "average_score": round(sum(item.score for item in results) / len(results), 4),
        "results": [item.model_dump() for item in results],
    }
    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(json.dumps(payload, indent=2))


if __name__ == "__main__":
    main()