# supportOpsEnv/scripts/run_baseline.py
# Uploaded by dbatcode28 -- commit "initial" (bd67155), 3.28 kB
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, List
# Resolve the directory two levels above this file and put it at the front of
# sys.path (once) so the `support_ops_env` imports below can be found when the
# script is executed directly. Presumably that directory is the project root
# containing the `support_ops_env` package -- verify against the repo layout.
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from openai import OpenAI
from support_ops_env.env import SupportOpsEnv
from support_ops_env.models import Action, BaselineResult
from support_ops_env.tasks import list_task_ids
# System message sent with every model request in main(). It instructs the
# model to answer with a single JSON object (keys: action_type, target, value),
# which main() then parses with json.loads and validates into an `Action`.
SYSTEM_PROMPT = """You are evaluating a support operations environment.
Return exactly one JSON object with keys: action_type, target, value.
Choose from action_type:
- inspect_ticket
- request_context
- set_priority
- set_route
- set_resolution
- escalate
- rank_queue
- finalize
Be concise and deterministic. Only use ticket ids that appear in the observation.
When enough evidence is gathered, finalize."""
def _strip_code_fence(text: str) -> str:
    """Return *text* stripped of whitespace and of a surrounding Markdown code fence.

    Chat models sometimes wrap JSON in a fenced block (```json ... ```) even when
    told to return a bare object; removing the fence lets ``json.loads`` succeed.
    Text that does not start with a fence is returned unchanged (apart from the
    whitespace strip).
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line, which may carry a language tag such as "json".
    _, _, remainder = stripped.partition("\n")
    remainder = remainder.rstrip()
    if remainder.endswith("```"):
        remainder = remainder[:-3]
    return remainder.strip()


def main() -> None:
    """Run every SupportOpsEnv task against an OpenAI model and record the scores.

    Command-line options:
        --model   OpenAI model name (default ``gpt-4.1-mini``).
        --output  Path of the JSON results file (default ``baseline_results.json``).

    For each task id from ``list_task_ids()`` the environment is reset and the
    model is queried once per step with the JSON-serialised observation until
    the environment reports ``done``. A per-step transcript and the final
    ``task_score`` / ``step_count`` from the last ``info`` dict are collected
    into a ``BaselineResult``. The summary (model, average score, per-task
    results) is written to ``--output`` and echoed to stdout.

    Raises:
        SystemExit: if the ``OPENAI_API_KEY`` environment variable is unset or empty.
        ValueError: if a model reply cannot be parsed as JSON (the message
            includes the task id and the offending reply).
    """
    parser = argparse.ArgumentParser(description="Run a reproducible baseline over all SupportOpsEnv tasks.")
    parser.add_argument("--model", default="gpt-4.1-mini", help="OpenAI model name")
    parser.add_argument("--output", default="baseline_results.json", help="Path to write JSON results")
    args = parser.parse_args()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("OPENAI_API_KEY is required.")

    client = OpenAI(api_key=api_key)
    results: List[BaselineResult] = []

    for task_id in list_task_ids():
        env = SupportOpsEnv(task_id=task_id)
        observation = env.reset()
        done = False
        transcript: List[Dict[str, object]] = []
        last_info: Dict[str, object] = {}

        while not done:
            response = client.responses.create(
                model=args.model,
                temperature=0,
                input=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {
                        "role": "user",
                        # sort_keys keeps the prompt text stable across runs.
                        "content": json.dumps(observation.model_dump(), indent=2, sort_keys=True),
                    },
                ],
            )
            raw = _strip_code_fence(response.output_text)
            try:
                action_payload = json.loads(raw)
            except json.JSONDecodeError as err:
                # Surface which task failed and what the model actually said;
                # JSONDecodeError is a ValueError, so existing handlers still match.
                raise ValueError(
                    f"Model returned invalid JSON for task {task_id!r}: {raw!r}"
                ) from err
            action = Action.model_validate(action_payload)
            observation, reward, done, info = env.step(action)
            transcript.append(
                {
                    "action": action.model_dump(),
                    "reward": reward.model_dump(),
                    "task_score": info["task_score"],
                    "done": done,
                }
            )
            last_info = info

        results.append(
            BaselineResult(
                task_id=task_id,
                difficulty=observation.difficulty,
                score=float(last_info.get("task_score", 0.0)),
                steps=int(last_info.get("step_count", 0)),
                transcript=transcript,
            )
        )

    # Guard against an empty task list, which would otherwise divide by zero.
    if results:
        average_score = round(sum(item.score for item in results) / len(results), 4)
    else:
        average_score = 0.0

    summary = {
        "model": args.model,
        "average_score": average_score,
        "results": [item.model_dump() for item in results],
    }
    rendered = json.dumps(summary, indent=2)

    output_path = Path(args.output)
    # Allow --output to point into a directory that does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(rendered, encoding="utf-8")
    print(rendered)
# Run the baseline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()