from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Dict, Tuple
from openai import OpenAI
# Directory two levels above this file (the parent of the script's own
# directory). It is put at the front of sys.path, if not already present,
# before the project imports that follow -- presumably so that the
# `openenv_support_triage` package resolves when this file is run directly.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from openenv_support_triage.environment import SupportTriageEnv
from openenv_support_triage.graders import grade_state
from openenv_support_triage.models import ActionModel, ObservationModel
from openenv_support_triage.tasks import TASKS
# Model name used when neither --model nor the OPENAI_MODEL environment
# variable supplies one.
DEFAULT_MODEL: str = "gpt-4.1-mini"

# Default for the --seed command-line option.
DEFAULT_SEED: int = 7

# Margin kept between a reported score and the extremes 0.0 / 1.0 when
# scores are clamped by strict_score().
SCORE_EPS: float = 0.1
def strict_score(value: float) -> float:
    """Clamp *value* into the closed range [SCORE_EPS, 1.0 - SCORE_EPS]."""
    lower_bound = SCORE_EPS
    upper_bound = 1.0 - SCORE_EPS
    # Raise to the floor first, then cap at the ceiling; argument order is
    # kept so ties and non-comparable values resolve exactly as before.
    floored = max(lower_bound, value)
    return min(upper_bound, floored)
def one_decimal_score(value: float) -> float:
    """Return *value* clamped by strict_score() and rounded to one decimal."""
    clamped = strict_score(value)
    return round(clamped, 1)
def heuristic_action(observation: ObservationModel) -> ActionModel:
    """Choose the next action with fixed keyword rules (no model call).

    The first applicable step wins:

    1. Classify the first ticket whose priority or team is still unset,
       using substring matches on its lower-cased subject and message.
    2. Draft a canned reply for the first unresolved ticket with no draft.
    3. Resolve the first ticket that is not yet resolved.
    4. Otherwise return a ``noop`` action.
    """
    # Step 1: classification.
    pending = next(
        (t for t in observation.tickets if t.priority is None or t.team is None),
        None,
    )
    if pending is not None:
        text = f"{pending.subject} {pending.customer_message}".lower()

        def mentions(*keywords: str) -> bool:
            # True when any keyword occurs as a substring of the ticket text.
            return any(keyword in text for keyword in keywords)

        # Rule order matters: "chargeback" must hit the risk rule before the
        # billing rule's broader "charge" substring can match it.
        if mentions("fraud", "unknown purchase", "chargeback"):
            priority, team = "urgent", "risk"
        elif mentions("refund", "invoice", "prorated", "charge"):
            if pending.customer_tier in {"premium", "enterprise"}:
                priority = "high"
            else:
                priority = "medium"
            team = "billing"
        elif mentions("api", "500", "log in", "password"):
            priority = "urgent" if mentions("down", "500") else "high"
            team = "technical"
        else:
            priority, team = "medium", "support"
        return ActionModel(
            action_type="classify_ticket",
            ticket_id=pending.ticket_id,
            priority=priority,
            team=team,
        )

    # Step 2: draft a reply.
    needs_reply = next(
        (
            t
            for t in observation.tickets
            if not t.drafted_reply and t.status != "resolved"
        ),
        None,
    )
    if needs_reply is not None:
        canned_reply = (
            "Thanks for contacting us. We will verify details, provide an update, "
            "and follow support policy."
        )
        return ActionModel(
            action_type="draft_reply",
            ticket_id=needs_reply.ticket_id,
            reply_text=canned_reply,
        )

    # Step 3: resolve.
    still_open = next(
        (t for t in observation.tickets if t.status != "resolved"),
        None,
    )
    if still_open is not None:
        return ActionModel(
            action_type="resolve_ticket",
            ticket_id=still_open.ticket_id,
            resolution_note="Issue triaged, response drafted, and routed to correct team.",
        )

    # Step 4: nothing left to do.
    return ActionModel(action_type="noop")
def llm_action(client: OpenAI, model: str, observation: ObservationModel, seed: int) -> ActionModel:
    """Ask a chat model for the next action and validate its JSON reply.

    The observation (objective, step counters, dumped tickets) is sent as a
    JSON-encoded user message together with a hint describing the expected
    output fields. The request uses ``temperature=0``, the given ``seed`` and
    a JSON-object response format.

    Args:
        client: OpenAI client used to issue the chat-completions request.
        model: Model name passed to the API.
        observation: Current environment observation.
        seed: Seed forwarded to the API request.

    Returns:
        The ``ActionModel`` validated from the reply. An empty or missing
        reply is validated as an empty dict.

    Raises:
        Whatever the API call, ``json.loads`` or ``ActionModel.model_validate``
        raise; nothing is caught here.
    """
    output_schema = {
        "action_type": "classify_ticket|draft_reply|resolve_ticket|noop",
        "ticket_id": "string or null",
        "priority": "low|medium|high|urgent or null",
        "team": "support|billing|technical|risk or null",
        "reply_text": "string or null",
        "resolution_note": "string or null",
    }
    # Key order is kept as-is because it determines the serialized prompt text.
    request_payload = {
        "objective": observation.objective,
        "step_index": observation.step_index,
        "max_steps": observation.max_steps,
        "tickets": [ticket.model_dump() for ticket in observation.tickets],
        "output_schema": output_schema,
        "instruction": (
            "Return only one JSON object. Choose a single best next action. "
            "Avoid noop unless everything is resolved."
        ),
    }

    system_message = {
        "role": "system",
        "content": "You are an operations agent that performs customer support triage precisely.",
    }
    user_message = {
        "role": "user",
        "content": json.dumps(request_payload),
    }

    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        seed=seed,
        response_format={"type": "json_object"},
        messages=[system_message, user_message],
    )

    raw_reply = completion.choices[0].message.content
    if not raw_reply:
        # No content: validate an empty mapping, exactly as a blank reply.
        return ActionModel.model_validate({})
    return ActionModel.model_validate(json.loads(raw_reply))
def run_task(task_id: str, model: str, seed: int, use_heuristic_only: bool = False) -> Tuple[float, Dict[str, float], float]:
    """Run one task episode to completion and grade the final state.

    Each step takes its action from ``llm_action`` unless
    ``use_heuristic_only`` is set, in which case ``heuristic_action`` is used
    and no OpenAI client is created. If the LLM path raises, that step falls
    back to the heuristic and the failure is reported on stderr, so a run
    labelled with a model name is never silently heuristic-driven.

    Args:
        task_id: Identifier of the task to run.
        model: Model name forwarded to ``llm_action``.
        seed: Seed forwarded to ``llm_action``.
        use_heuristic_only: Skip the LLM entirely when true.

    Returns:
        ``(task_score, components, running_score)``: the first two come from
        ``grade_state(final_state)``, the third is
        ``final_state.running_score``.
    """
    env = SupportTriageEnv(task_id=task_id)
    observation = env.reset(task_id=task_id)
    client = None if use_heuristic_only else OpenAI()

    done = False
    while not done:
        if client is None:
            action = heuristic_action(observation)
        else:
            try:
                action = llm_action(client=client, model=model, observation=observation, seed=seed)
            except Exception as exc:
                # Deliberate best-effort: any failure (API error, invalid
                # JSON, validation error) degrades to the heuristic for this
                # step. Report it on stderr; stdout is reserved for the JSON
                # payload printed by main().
                print(
                    f"[run_task:{task_id}] LLM action failed "
                    f"({type(exc).__name__}: {exc}); using heuristic fallback",
                    file=sys.stderr,
                )
                action = heuristic_action(observation)
        # Only the next observation and the done flag are needed here.
        observation, _reward, done, _info = env.step(action)

    final_state = env.state()
    task_score, components = grade_state(final_state)
    return task_score, components, final_state.running_score
def main() -> None:
    """Parse CLI options, run every task in TASKS, and print a JSON report.

    Options: ``--model`` (defaults to the OPENAI_MODEL environment variable,
    then DEFAULT_MODEL), ``--seed`` and ``--heuristic-only``. The report on
    stdout holds per-task scores, grade components and trajectory reward
    plus the aggregate score; scores are passed through one_decimal_score().

    Raises:
        EnvironmentError: If OPENAI_API_KEY is unset or empty and
            ``--heuristic-only`` was not given.
    """
    cli = argparse.ArgumentParser(description="Run reproducible OpenEnv baseline inference")
    cli.add_argument("--model", default=os.getenv("OPENAI_MODEL", DEFAULT_MODEL))
    cli.add_argument("--seed", type=int, default=DEFAULT_SEED)
    cli.add_argument("--heuristic-only", action="store_true")
    options = cli.parse_args()

    llm_requested = not options.heuristic_only
    if llm_requested and not os.getenv("OPENAI_API_KEY"):
        raise EnvironmentError("OPENAI_API_KEY is required unless --heuristic-only is set")

    per_task = {}
    raw_scores = []
    # Sorted task ids give a stable run and report order.
    for task_id in sorted(TASKS.keys()):
        task_score, grade_components, trajectory_reward = run_task(
            task_id=task_id,
            model=options.model,
            seed=options.seed,
            use_heuristic_only=options.heuristic_only,
        )
        raw_scores.append(task_score)
        per_task[task_id] = {
            "task_score": one_decimal_score(task_score),
            "grade_components": grade_components,
            "trajectory_reward": one_decimal_score(trajectory_reward),
        }

    # The aggregate is the mean of the unrounded task scores; it is clamped
    # and rounded only when written into the report.
    if raw_scores:
        aggregate = sum(raw_scores) / len(raw_scores)
    else:
        aggregate = 0.0

    report = {
        "model": options.model,
        "seed": options.seed,
        "heuristic_only": options.heuristic_only,
        "aggregate_score": one_decimal_score(aggregate),
        "tasks": per_task,
    }
    print(json.dumps(report, indent=2))
# Run the baseline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()