{escape(t['name'])}
{escape(t['instructions'])}
import sys
from html import escape
from pathlib import Path
from typing import Any
# Ensure the repo root is on sys.path so `models` and `server` are importable.
# The root is taken to be the parent of the directory containing this file.
# It is prepended (index 0) only when not already present, and this has to run
# before the `models` / `server` imports that follow.
_repo_root = str(Path(__file__).resolve().parent.parent)
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)
from fastapi import HTTPException
from pydantic import BaseModel
from fastapi.responses import HTMLResponse, RedirectResponse
from openenv.core.env_server import create_app
from models import HelpdeskTicketAction, HelpdeskTicketObservation
from server.environment import HelpdeskTicketRoutingEnvironment
from server.grader import grade_action
from server.tasks import TASKS, load_dataset
from vocabulary import APP_ENV_NAME, PROJECT_TITLE, TEAM_NAME
# Build the module-level `app` by passing the environment class and its
# action/observation models to OpenEnv's `create_app`, with
# `env_name=APP_ENV_NAME`. The route handlers below attach to this object
# through `@app.get`.
# NOTE(review): `app` is presumably a FastAPI application — confirm against
# `openenv.core.env_server.create_app`.
app = create_app(
    HelpdeskTicketRoutingEnvironment,
    HelpdeskTicketAction,
    HelpdeskTicketObservation,
    env_name=APP_ENV_NAME,
)
# Pydantic model with three required fields: a task id, a ticket id, and an
# action mapping.
# NOTE(review): presumably the request body of a grading endpoint defined
# elsewhere in this file (cf. the `grade_action` import) — confirm.
class GraderRequest(BaseModel):
    task_id: int
    ticket_id: str
    # String-keyed mapping; the values are unconstrained (`Any`).
    action: dict[str, Any]
@app.get("/", include_in_schema=False)
def root_redirect():
    """Redirect requests for the site root to the `/web` page.

    Responds with a 307 redirect. The route is registered with
    `include_in_schema=False`.
    """
    landing_path = "/web"
    return RedirectResponse(landing_path, status_code=307)
@app.get("/tasks")
def list_tasks():
    # Return {"tasks": [...]} with one summary dict per entry in TASKS, in
    # TASKS iteration order. Each summary copies exactly the keys listed in
    # `exposed_keys`; a task missing any of them raises KeyError.
    # NOTE(review): kept as `#` comments rather than a docstring — FastAPI
    # would publish a docstring as this route's OpenAPI description.
    exposed_keys = ("id", "name", "difficulty", "instructions", "allowed_fields")
    summaries = []
    for task in TASKS.values():
        summaries.append({key: task[key] for key in exposed_keys})
    return {"tasks": summaries}
@app.get("/web", response_class=HTMLResponse)
def web_ui():
dataset = load_dataset()
dataset_size = len(dataset)
alternate_route_count = sum(
1 for ticket in dataset if ticket.alternate_route_score_multiplier > 0.0
)
clustered_case_count = sum(1 for ticket in dataset if ticket.service_cluster_id)
hidden_context_case_count = sum(
1
for ticket in dataset
if ticket.ambiguity_note
or ticket.related_ticket_id
or ticket.planning_note
or ticket.customer_update_note
)
incident_sensitive_count = sum(1 for ticket in dataset if ticket.incident_recommended)
difficulty_labels = {
"easy": "Guided",
"medium": "Contextual",
"hard": "Adaptive",
}
task_cards = "".join(
f"""
{escape(t['instructions'])}{escape(t['name'])}
OpenEnv Environment
{escape(APP_ENV_NAME)}
A sleek benchmark surface for sequential helpdesk routing: hidden context, cluster-aware follow-ons, incident handling, deferrals, and a terminal rubric that rewards queue strategy instead of isolated classification alone.
Task Ladder
The difficulty ladder keeps the same full-routing output while progressively changing observability, queue dependencies, and operational pressure.
Environment Signals
The benchmark is designed so strong policy choices change later tickets, incident coverage, and terminal queue quality instead of just nudging shaped reward.
Related-ticket previews, requester history, internal routing notes, queue cluster summaries, and capacity forecasts are revealed through explicit tool use.
Deferrals can raise later urgency, incident handling can reduce downstream debt, and weak handling can spawn or worsen follow-up work.
Final scoring blends routing trajectory quality with queue management quality so agents are rewarded for coherent episode strategy, not just isolated ticket matches.
Quick Routes
Useful entry points for judges, reviewers, or anyone trying to get signal from the project quickly.
Browse the full OpenEnv-compatible surface, request models, and built-in helper endpoints.
GET /docs
Open Docs
Inspect the easy, medium, and hard task definitions exactly as exposed by the server.
GET /tasks
View Tasks
See a deterministic baseline episode over the hardest queue with the current environment logic.
GET /baseline?task_id=3&seed=42
Run Baseline
Quick check that the service is alive and ready for OpenEnv-style evaluation requests.
GET /health
Check Health