"""HTTP surface for the helpdesk ticket routing environment.

Builds the OpenEnv application via ``create_app`` and attaches a few helper
routes on top of it: a landing page (``/web``), a task manifest (``/tasks``),
a scripted baseline rollout (``/baseline``) and a single-action grading
preview (``/grader``).
"""

import sys
from html import escape
from pathlib import Path
from typing import Any

# Ensure repo root is on sys.path so `models` and `server` are importable.
# This must run before the project imports below.
_repo_root = str(Path(__file__).resolve().parent.parent)
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)

from fastapi import HTTPException  # noqa: E402
from pydantic import BaseModel, ValidationError  # noqa: E402
from fastapi.responses import HTMLResponse, RedirectResponse  # noqa: E402

from openenv.core.env_server import create_app  # noqa: E402

from models import HelpdeskTicketAction, HelpdeskTicketObservation  # noqa: E402
from server.environment import HelpdeskTicketRoutingEnvironment  # noqa: E402
from server.grader import grade_action  # noqa: E402
from server.tasks import TASKS, load_dataset  # noqa: E402
from vocabulary import APP_ENV_NAME, PROJECT_TITLE, TEAM_NAME  # noqa: E402

app = create_app(
    HelpdeskTicketRoutingEnvironment,
    HelpdeskTicketAction,
    HelpdeskTicketObservation,
    env_name=APP_ENV_NAME,
)

# Display names shown on the landing page for each task difficulty value.
# Difficulties missing from this map are displayed as-is.
_DIFFICULTY_LABELS = {
    "easy": "Guided",
    "medium": "Contextual",
    "hard": "Adaptive",
}


class GraderRequest(BaseModel):
    """Request body for ``POST /grader``."""

    # Task identifier passed through to ``grade_action``.
    task_id: int
    # Identifier of the dataset ticket to grade against.
    ticket_id: str
    # Raw action payload; validated into ``HelpdeskTicketAction`` by the route.
    action: dict[str, Any]


@app.get("/", include_in_schema=False)
def root_redirect():
    """Redirect the bare root URL to the landing page at ``/web``."""
    return RedirectResponse(url="/web", status_code=307)


@app.get("/tasks")
def list_tasks():
    """Return the task manifest: one summary entry per task in ``TASKS``."""
    exposed_keys = ("id", "name", "difficulty", "instructions", "allowed_fields")
    return {
        "tasks": [
            {key: task[key] for key in exposed_keys} for task in TASKS.values()
        ]
    }


def _render_task_card(task: dict[str, Any]) -> str:
    """Render the landing-page text for a single task definition.

    Every string taken from the task is HTML-escaped before interpolation.
    """
    difficulty = task["difficulty"]
    badge = escape(_DIFFICULTY_LABELS.get(difficulty, difficulty).upper())
    fields = "".join(escape(field) for field in task["allowed_fields"])
    return f"""
Task {task['id']} {badge}

{escape(task['name'])}

{escape(task['instructions'])}

{fields}
"""


@app.get("/web", response_class=HTMLResponse)
def web_ui():
    """Serve the landing page with dataset statistics and the task ladder."""
    dataset = load_dataset()
    dataset_size = len(dataset)
    alternate_route_count = sum(
        1 for ticket in dataset if ticket.alternate_route_score_multiplier > 0.0
    )
    clustered_case_count = sum(1 for ticket in dataset if ticket.service_cluster_id)
    # A ticket counts as "hidden context" when any of these fields is truthy.
    hidden_context_case_count = sum(
        1
        for ticket in dataset
        if ticket.ambiguity_note
        or ticket.related_ticket_id
        or ticket.planning_note
        or ticket.customer_update_note
    )
    task_cards = "".join(_render_task_card(task) for task in TASKS.values())

    html = f""" {escape(APP_ENV_NAME)}

OpenEnv Environment

{escape(PROJECT_TITLE)}

{escape(APP_ENV_NAME)}

Queue decisions that actually carry forward.

A sleek benchmark surface for sequential helpdesk routing: hidden context, cluster-aware follow-ons, incident handling, deferrals, and a terminal rubric that rewards queue strategy instead of isolated classification alone.

Task family: easy to hard Closed-form grader Queue-level terminal objective
{dataset_size} Tickets in the grounded dataset Curated records plus queue mutation mechanics create repeatable but non-trivial episodes.
{alternate_route_count} Capacity-aware alternate routes The grader can reward declared fallback routes instead of collapsing to all-or-nothing exact match.
{clustered_case_count} Cluster-linked or coordinated cases Handling one ticket can stabilize or destabilize the downstream tickets in the same workstream.
{hidden_context_case_count} Hidden-context routing cases Investigation tools matter because key evidence does not appear in the initial observation by default.

Task Ladder

One benchmark family, not three disconnected demos

The difficulty ladder keeps the same full-routing output while progressively changing observability, queue dependencies, and operational pressure.

{task_cards}

Environment Signals

What the agent is balancing

The benchmark is designed so strong policy choices change later tickets, incident coverage, and terminal queue quality instead of just nudging shaped reward.

Hidden context retrieval

Related-ticket previews, requester history, internal routing notes, queue cluster summaries, and capacity forecasts are revealed through explicit tool use.

investigate request_info cluster summary

Operational actions with consequences

Deferrals can raise later urgency, incident handling can reduce downstream debt, and weak handling can spawn or worsen follow-up work.

defer open_incident follow-up spawning

Queue-level terminal rubric

Final scoring blends routing trajectory quality with queue management quality so agents are rewarded for coherent episode strategy, not just isolated ticket matches.

terminal rubric queue quality planning-aware

Quick Routes

Fast ways to demo the environment

Useful entry points for judges, reviewers, or anyone trying to get signal from the project quickly.

Interactive API docs

Browse the full OpenEnv-compatible surface, request models, and built-in helper endpoints.

GET /docs Open Docs

Task manifest

Inspect the easy, medium, and hard task definitions exactly as exposed by the server.

GET /tasks View Tasks

Hard-task baseline rollout

See a deterministic baseline episode over the hardest queue with the current environment logic.

GET /baseline?task_id=3&seed=42 Run Baseline

Health and deployment status

Quick check that the service is alive and ready for OpenEnv-style evaluation requests.

GET /health Check Health
"""
    return HTMLResponse(content=html)


def _build_baseline_submit_action(
    ticket: dict[str, Any], allowed_fields: list[str]
) -> HelpdeskTicketAction:
    """Build the baseline submit action for ``ticket``.

    The candidate comes from ``inference.heuristic_action`` and is then passed
    through ``inference.apply_domain_overrides``; the second element of that
    call's result is ignored.
    """
    # Imported at call time rather than at module import time.
    import inference

    candidate = inference.heuristic_action(ticket, allowed_fields)
    candidate, _ = inference.apply_domain_overrides(ticket, candidate, allowed_fields)
    return HelpdeskTicketAction(**candidate)


def _record_step(
    steps: list[dict[str, Any]],
    action: HelpdeskTicketAction,
    observation: Any,
    action_source: str,
) -> None:
    """Append one rollout log entry for ``action`` and the observation it produced."""
    steps.append(
        {
            "action": action.model_dump(exclude_none=True),
            "reward": observation.reward,
            "done": observation.done,
            "action_source": action_source,
        }
    )


@app.get("/baseline")
def baseline_rollout(task_id: int = 1, seed: int = 42):
    """Run one baseline episode and return a per-step log with the final rewards.

    For each ticket the rollout optionally takes one ``investigate`` step (when
    ``inference.should_investigate`` asks for it and budget remains) and then
    submits the baseline action.
    """
    # Imported at call time rather than at module import time.
    import inference

    env = HelpdeskTicketRoutingEnvironment()
    observation = env.reset(seed=seed, task_id=task_id)
    steps: list[dict[str, Any]] = []

    while not observation.done:
        ticket = observation.current_ticket
        if ticket is None:
            break

        investigate, tool_name = inference.should_investigate(ticket, observation.history)
        can_investigate = (
            investigate
            and tool_name is not None
            and observation.investigation_budget_remaining > 0
        )
        if can_investigate:
            investigate_action = HelpdeskTicketAction(
                action_type="investigate",
                tool_name=tool_name,
                tool_target_ticket_id=ticket.get("related_ticket_id"),
            )
            observation = env.step(investigate_action)
            _record_step(steps, investigate_action, observation, "baseline_investigate")
            if observation.done:
                break
            # Re-read the ticket from the observation returned by the
            # investigate step before building the submit action.
            ticket = observation.current_ticket
            if ticket is None:
                break

        action = _build_baseline_submit_action(
            inference.merge_ticket_context(ticket, observation),
            list(observation.allowed_fields),
        )
        observation = env.step(action)
        _record_step(steps, action, observation, "baseline_submit")

    return {
        "task_id": task_id,
        "seed": seed,
        "step_count": len(steps),
        "final_reward": observation.reward,
        "rubric_reward": observation.rubric_reward,
        "steps": steps,
    }


@app.post("/grader")
def grader_preview(request: GraderRequest):
    """Grade a single action against a dataset ticket without running an episode.

    Raises:
        HTTPException: 404 when ``ticket_id`` is not in the dataset; 422 when
            ``action`` fails ``HelpdeskTicketAction`` validation.
    """
    ticket = next(
        (record for record in load_dataset() if record.ticket_id == request.ticket_id),
        None,
    )
    if ticket is None:
        raise HTTPException(status_code=404, detail=f"Unknown ticket_id: {request.ticket_id}")

    try:
        action = HelpdeskTicketAction.model_validate(request.action)
    except ValidationError as exc:
        # Only a payload validation failure is the client's fault; any other
        # exception is left to propagate as a server error.
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    score, breakdown = grade_action(action, ticket, request.task_id)
    return {
        "task_id": request.task_id,
        "ticket_id": request.ticket_id,
        "score": score,
        "breakdown": breakdown,
        "expected": {
            "issue_type": ticket.issue_type,
            "priority": ticket.priority,
            "assignment_group": ticket.assignment_group,
            "resolution_action": ticket.resolution_action,
        },
        "submitted": action.model_dump(exclude_none=True),
    }


def main(host: str = "0.0.0.0", port: int = 7860) -> None:
    """Serve the application with uvicorn.

    Args:
        host: Interface to bind; defaults to all interfaces.
        port: TCP port to listen on.
    """
    import uvicorn

    uvicorn.run("server.app:app", host=host, port=port, reload=False)


if __name__ == "__main__":
    main()