Spaces:

AlgoCore
/

support-ticket-env

Sleeping

App Files Files Community

AlgoCore committed on Mar 28

Commit

a3d65ce

0 Parent(s):

Initial commit

Browse files

Files changed (28) hide show

.gitignore +3 -0
Dockerfile +27 -0
README.md +12 -0
__init__.py +11 -0
baseline.py +194 -0
client.py +33 -0
graders.py +136 -0
gradio_ui.py +211 -0
models.py +119 -0
openenv.yaml +57 -0
openenv_stub/openenv/__init__.py +0 -0
openenv_stub/openenv/core/__init__.py +3 -0
openenv_stub/openenv/core/env_client.py +12 -0
openenv_stub/openenv/core/env_server/__init__.py +0 -0
openenv_stub/openenv/core/env_server/http_server.py +13 -0
openenv_stub/openenv/core/env_server/interfaces.py +28 -0
openenv_stub/openenv/core/env_server/types.py +33 -0
pyproject.toml +26 -0
run_tests.py +276 -0
server/__init__.py +4 -0
server/app.py +28 -0
server/requirements.txt +7 -0
server/support_environment.py +281 -0
tests/__init__.py +0 -0
tests/conftest.py +6 -0
tests/test_environment.py +191 -0
tests/test_graders.py +121 -0
tickets.py +84 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+__pycache__/
+*.pyc
+*.pyo

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+# ── Dockerfile: Customer Support Ticket Resolution Environment ──
+FROM python:3.11-slim
+WORKDIR /app
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential curl && rm -rf /var/lib/apt/lists/*
+# Python deps
+COPY server/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy source
+COPY . /app/support_ticket_env
+ENV PYTHONPATH=/app
+ENV ENABLE_WEB_INTERFACE=true
+# HF Spaces uses port 7860
+EXPOSE 7860
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+CMD ["uvicorn", "support_ticket_env.server.app:app", \
+     "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: Support Ticket Env
+emoji: 🎫
+colorFrom: blue
+colorTo: green
+sdk: docker
+pinned: false
+---
+# Customer Support Ticket Resolution Environment
+A real-world OpenEnv environment for AI agent training.

__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""Customer Support Ticket Resolution — OpenEnv Environment."""
+from support_ticket_env.models import SupportAction, SupportObservation, SupportState
+from support_ticket_env.client import SupportTicketEnv
+__all__ = [
+    "SupportAction",
+    "SupportObservation",
+    "SupportState",
+    "SupportTicketEnv",
+]

baseline.py ADDED Viewed

	@@ -0,0 +1,194 @@

+#!/usr/bin/env python3
+"""
+baseline.py — Baseline inference script for the Support Ticket Environment.
+Runs an OpenAI-compatible model against all 3 tasks and reports scores.
+Usage:
+    OPENAI_API_KEY=sk-... python baseline.py --base-url http://localhost:7860
+Environment variables:
+    OPENAI_API_KEY   : required
+    OPENAI_BASE_URL  : optional override (default https://api.openai.com/v1)
+    OPENAI_MODEL     : optional model name (default gpt-4o-mini)
+"""
+import argparse
+import json
+import os
+import asyncio
+import re
+from openai import AsyncOpenAI
+from support_ticket_env.client import SupportTicketEnv
+from support_ticket_env.models import SupportAction
+# ─────────────────────────── Config ────────────────────────────
+# Closed vocabularies for ticket categories and agent actions. They repeat the
+# values spelled out in SYSTEM_PROMPT below; nothing in the visible part of
+# this script reads them.
+VALID_CATEGORIES = ["billing", "technical", "account", "general", "refund"]
+VALID_ACTIONS = ["classify", "reply", "escalate", "close"]
+# System message placed first in every episode's chat history (see run_task).
+# It describes the observation fields the agent receives and the JSON action
+# schema it must answer with.
+SYSTEM_PROMPT = """You are a customer support AI agent operating in a ticket triage environment.
+On each turn you receive a JSON observation with:
+  - ticket_text : the customer's message
+  - feedback    : what happened last step
+  - task_id     : 1=classify only, 2=classify then act, 3=full resolution
+You must respond with a JSON object (no markdown) matching this schema:
+{
+  "action_type": "classify" | "reply" | "escalate" | "close",
+  "category": "billing" | "technical" | "account" | "general" | "refund",  // only for classify
+  "reply_text": "...",  // only for reply
+  "reason": "..."       // optional
+}
+Strategy:
+- For task 1: only classify (use action_type="classify" with a category).
+- For task 2: first classify, then choose the best action.
+- For task 3: classify each ticket, then reply/escalate/close as appropriate.
+Always produce valid JSON and nothing else.
+"""
+def parse_llm_response(text: str) -> dict:
+    """Extract JSON from LLM response, stripping markdown fences if present."""
+    text = text.strip()
+    # Strip ```json ... ``` fences
+    text = re.sub(r"^```(?:json)?\s*", "", text)
+    text = re.sub(r"\s*```$", "", text)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        # Fallback: try to extract first JSON object
+        match = re.search(r"\{.*\}", text, re.DOTALL)
+        if match:
+            return json.loads(match.group())
+        raise
+async def run_task(
+    env_base_url: str,
+    llm: AsyncOpenAI,
+    model: str,
+    task_id: int,
+    seed: int = 42,
+    max_steps: int = 10,
+) -> float:
+    """Run one episode for a given task_id. Returns the total reward.
+
+    Each turn serialises the current observation as the user message, asks the
+    model for an action, parses it into a ``SupportAction`` and steps the
+    environment. The episode stops when the environment reports ``done``,
+    when ``max_steps`` turns have been taken, or as soon as the model's reply
+    cannot be parsed or validated.
+
+    Args:
+        env_base_url: Base URL handed to ``SupportTicketEnv``.
+        llm: Client used for ``chat.completions.create``.
+        model: Model name forwarded to the completion call.
+        task_id: Task passed to ``env.reset``.
+        seed: Seed passed to ``env.reset``.
+        max_steps: Upper bound on the number of turns.
+
+    Returns:
+        Sum of the per-step rewards, rounded to 4 decimals.
+    """
+    total_reward = 0.0
+    # The whole conversation is kept and re-sent on every turn.
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    async with SupportTicketEnv(base_url=env_base_url) as env:
+        # NOTE(review): the result is expected to expose .observation, .reward
+        # and .done — confirm against the EnvClient base class in use.
+        result = await env.reset(task_id=task_id, seed=seed)
+        obs = result.observation
+        for step in range(max_steps):
+            # Build user message from observation
+            obs_text = json.dumps({
+                "ticket_id": obs.ticket_id,
+                "ticket_text": obs.ticket_text,
+                "task_id": obs.task_id,
+                "current_category": obs.current_category,
+                "resolved": obs.resolved,
+                "step_count": obs.step_count,
+                "feedback": obs.feedback,
+            }, indent=2)
+            messages.append({"role": "user", "content": obs_text})
+            # Call LLM
+            response = await llm.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0.0,
+                max_tokens=256,
+            )
+            assistant_text = response.choices[0].message.content
+            messages.append({"role": "assistant", "content": assistant_text})
+            # Parse action
+            try:
+                action_dict = parse_llm_response(assistant_text)
+            except Exception as e:
+                # Any parse failure ends the episode with the reward so far.
+                print(f"  [step {step+1}] Failed to parse LLM response: {e}")
+                break
+            try:
+                action = SupportAction(**action_dict)
+            except Exception as e:
+                # Schema violations also end the episode.
+                print(f"  [step {step+1}] Invalid action schema: {e}")
+                break
+            # Step environment
+            result = await env.step(action)
+            obs = result.observation
+            # A missing (None) reward counts as 0.0.
+            reward = result.reward or 0.0
+            total_reward += reward
+            print(
+                f"  [step {step+1}] action={action.action_type}"
+                + (f"/{action.category}" if action.category else "")
+                + f"  reward={reward:.3f}  feedback={obs.feedback[:60]}"
+            )
+            if result.done:
+                break
+    return round(total_reward, 4)
+async def main(env_base_url: str, model: str, seeds: list[int]) -> None:
+    api_key = os.environ.get("OPENAI_API_KEY", "not-needed")
+    openai_base = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
+    llm = AsyncOpenAI(api_key=api_key, base_url=openai_base)
+    results = {}
+    for task_id in [1, 2, 3]:
+        task_scores = []
+        print(f"\n{'='*60}")
+        print(f"  TASK {task_id}  (seed={seeds[0]})")
+        print(f"{'='*60}")
+        for seed in seeds:
+            score = await run_task(env_base_url, llm, model, task_id, seed=seed)
+            task_scores.append(score)
+            print(f"  → total_reward for seed {seed}: {score}")
+        avg = round(sum(task_scores) / len(task_scores), 4)
+        results[f"task{task_id}"] = {"scores": task_scores, "avg": avg}
+        print(f"  ► Average: {avg}")
+    print("\n" + "="*60)
+    print("  BASELINE SUMMARY")
+    print("="*60)
+    for k, v in results.items():
+        print(f"  {k}: avg={v['avg']:.4f}  scores={v['scores']}")
+    overall = round(
+        sum(v["avg"] for v in results.values()) / len(results), 4
+    )
+    print(f"\n  Overall avg: {overall:.4f}")
+    print("="*60)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Baseline inference for support_ticket_env")
+    parser.add_argument(
+        "--base-url",
+        default=os.environ.get("ENV_BASE_URL", "http://localhost:7860"),
+        help="Base URL of the running environment server",
+    )
+    parser.add_argument(
+        "--model",
+        default=os.environ.get("OPENAI_MODEL", "gpt-4o-mini"),
+        help="OpenAI model name",
+    )
+    parser.add_argument(
+        "--seeds",
+        nargs="+",
+        type=int,
+        default=[42, 7, 123],
+        help="Random seeds for reproducibility",
+    )
+    args = parser.parse_args()
+    asyncio.run(main(args.base_url, args.model, args.seeds))

client.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""
+Client for the Customer Support Ticket Resolution Environment.
+"""
+from openenv.core.env_client import EnvClient
+from support_ticket_env.models import SupportAction, SupportObservation, SupportState
+class SupportTicketEnv(EnvClient[SupportAction, SupportObservation, SupportState]):
+    """
+    OpenEnv client for the Support Ticket Resolution environment.
+    The examples use port 7860, the port the bundled Dockerfile serves on.
+    Usage (async):
+        async with SupportTicketEnv(base_url="http://localhost:7860") as env:
+            result = await env.reset(task_id=1)
+            result = await env.step(SupportAction(action_type="classify", category="billing"))
+    Usage (sync):
+        with SupportTicketEnv(base_url="http://localhost:7860").sync() as env:
+            result = env.reset(task_id=2)
+            result = env.step(SupportAction(action_type="classify", category="technical"))
+            result = env.step(SupportAction(action_type="escalate"))
+    """
+    def _parse_action(self, action: SupportAction) -> dict:
+        # Serialise the action to a plain dict.
+        return action.model_dump()
+    def _parse_result(self, data: dict) -> SupportObservation:
+        # Accept either {"observation": {...}} or a bare observation dict.
+        # NOTE(review): baseline.py reads .observation/.reward/.done from the
+        # value returned by reset()/step(); this hook returns a bare
+        # SupportObservation — confirm the base client wraps it.
+        obs_data = data.get("observation", data)
+        return SupportObservation(**obs_data)
+    def _parse_state(self, data: dict) -> SupportState:
+        # Build the state model directly from the dict's keys.
+        return SupportState(**data)

graders.py ADDED Viewed

	@@ -0,0 +1,136 @@

+"""
+Graders for all three tasks.
+Each grader returns a float in [0.0, 1.0].
+Task 1 – Classification (easy)
+    - 1.0  : correct category
+    - 0.0  : wrong category
+Task 2 – Action Selection (medium)
+    - 1.0  : correct action
+    - 0.5  : partially correct (e.g., escalate vs reply both defensible)
+    - 0.0  : clearly wrong (e.g., close an unsolved ticket)
+Task 3 – Full Resolution (hard)
+    Combines classification + action + reply quality into a single score.
+    Rewards partial progress so the agent gets signal throughout the trajectory.
+"""
+from __future__ import annotations
+from typing import Dict, Any
+# ─────────────────────────── helpers ───────────────────────────
+# Pairs of actions that are considered "close enough" for partial credit.
+# Stored as frozensets so the order (predicted vs. correct) does not matter.
+_PARTIAL_CREDIT_PAIRS = {
+    frozenset({"reply", "escalate"}),  # borderline tickets
+}
+# Per-category keywords used by _reply_quality: each keyword found as a
+# substring of the lower-cased reply adds 0.1 to the reply score.
+_KEYWORD_REWARDS: Dict[str, list[str]] = {
+    "billing":   ["refund", "charge", "invoice", "payment", "billing"],
+    "account":   ["password", "login", "account", "cancel", "subscription"],
+    "technical": ["engineering", "escalate", "bug", "crash", "error", "fix"],
+    "refund":    ["refund", "return", "credit", "process"],
+    "general":   ["hours", "contact", "phone", "information", "help"],
+}
+def _reply_quality(reply_text: str, category: str) -> float:
+    """Return 0.0–0.5 based on how relevant the reply text is."""
+    if not reply_text:
+        return 0.0
+    text_lower = reply_text.lower()
+    keywords = _KEYWORD_REWARDS.get(category, [])
+    hits = sum(1 for kw in keywords if kw in text_lower)
+    # cap at 0.5 (the other 0.5 comes from action correctness)
+    return min(0.5, hits * 0.1)
+# ─────────────────────────── Task 1 ────────────────────────────
+def grade_task1(
+    predicted_category: str,
+    correct_category: str,
+) -> float:
+    """Binary classification reward."""
+    return 1.0 if predicted_category == correct_category else 0.0
+# ─────────────────────────── Task 2 ────────────────────────────
+def grade_task2(
+    action_type: str,
+    correct_action: str,
+    category: str | None = None,
+) -> float:
+    """
+    Action-selection reward.
+    Full credit for exact match, partial credit for defensible alternatives.
+    Penalises closing an unresolved ticket.
+    """
+    if action_type == correct_action:
+        return 1.0
+    # Partial credit for ambiguous cases
+    pair = frozenset({action_type, correct_action})
+    if pair in _PARTIAL_CREDIT_PAIRS:
+        return 0.5
+    # Closing an unresolved ticket is always wrong
+    if action_type == "close":
+        return 0.0
+    return 0.0
+# ─────────────────────────── Task 3 ────────────────────────────
+def grade_task3(
+    classified_correctly: bool,
+    action_correct: bool,
+    action_partial: bool,
+    reply_text: str | None,
+    category: str,
+    resolved: bool,
+    steps_taken: int,
+    max_steps: int = 5,
+) -> float:
+    """
+    Multi-step resolution reward with partial progress.
+    Breakdown:
+      0.20  – classification correct
+      0.40  – action correct  (0.20 if partial)
+      0.25  – reply quality   (NLP keyword overlap)
+      0.15  – efficiency bonus (fewer steps → higher bonus)
+    """
+    score = 0.0
+    if classified_correctly:
+        score += 0.20
+    if action_correct:
+        score += 0.40
+    elif action_partial:
+        score += 0.20
+    if reply_text:
+        score += _reply_quality(reply_text, category) * 0.5   # scaled to 0.25 max
+    # Efficiency: full 0.15 for 1 step, 0 for max_steps steps
+    if resolved and steps_taken <= max_steps:
+        efficiency = max(0.0, (max_steps - steps_taken) / (max_steps - 1))
+        score += 0.15 * efficiency
+    return round(min(1.0, score), 4)
+# ─────────────────────────── Penalty ───────────────────────────
+def loop_penalty(step_count: int, max_steps: int = 10) -> float:
+    """Return a negative reward if agent is stuck in a loop."""
+    if step_count > max_steps:
+        return -0.05 * (step_count - max_steps)
+    return 0.0

gradio_ui.py ADDED Viewed

	@@ -0,0 +1,211 @@

+"""
+gradio_ui.py — Interactive Gradio web interface for the Support Ticket Environment.
+Allows human exploration and debugging without writing code.
+Launched automatically when ENABLE_WEB_INTERFACE=true or run directly.
+Usage:
+    python support_ticket_env/gradio_ui.py
+"""
+import json
+import sys
+import os
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+STUB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "openenv_stub")
+sys.path.insert(0, STUB)
+sys.path.insert(0, ROOT)
+try:
+    import gradio as gr
+except ImportError:
+    print("gradio not installed. Run: pip install gradio")
+    sys.exit(1)
+from support_ticket_env.server.support_environment import SupportTicketEnvironment
+from support_ticket_env.models import SupportAction
+# ─── shared env instance ────────────────────────────────────────
+# Module-level state: one environment, one action log and one "latest
+# observation" are shared by every callback in this module.
+_env = SupportTicketEnvironment()
+_history: list[dict] = []  # one dict per step: action / reward / feedback
+_current_obs = None  # None until do_reset has been called
+# ─── helpers ────────────────────────────────────────────────────
+def _format_history() -> str:
+    if not _history:
+        return "_No actions yet._"
+    lines = []
+    for i, h in enumerate(_history, 1):
+        reward_str = f"{h['reward']:+.3f}" if h["reward"] is not None else "—"
+        lines.append(
+            f"**Step {i}** | `{h['action']}` → reward `{reward_str}`\n"
+            f"> {h['feedback']}"
+        )
+    return "\n\n".join(lines)
+def _obs_to_display(obs) -> tuple[str, str, str]:
+    """Return (ticket_box, status_box, score_box)."""
+    ticket = f"**[{obs.ticket_id}]** {obs.ticket_text}"
+    status = (
+        f"Task **{obs.task_id}** | Step **{obs.step_count}** | "
+        f"Category: `{obs.current_category or 'unknown'}` | "
+        f"Resolved: {'✅' if obs.resolved else '⬜'}"
+    )
+    score = f"Last step score: **{obs.score:.3f}** | reward: **{obs.reward or 0.0:+.3f}**"
+    return ticket, status, score
+# ─── UI callbacks ────────────────────────────────────────────────
+def do_reset(task_id: int, seed: int):
+    global _history, _current_obs
+    _history = []
+    obs = _env.reset(task_id=task_id, seed=seed)
+    _current_obs = obs
+    ticket, status, score = _obs_to_display(obs)
+    return (
+        ticket, status, score,
+        _format_history(),
+        gr.update(interactive=True),
+        obs.feedback,
+        gr.update(value=False),  # done flag
+    )
+def do_step(action_type: str, category: str, reply_text: str, reason: str):
+    global _current_obs
+    if _current_obs is None:
+        return (
+            "⚠️ Please reset the environment first.",
+            "", "", _format_history(), "", gr.update(value=False),
+        )
+    # Build action
+    kwargs = {"action_type": action_type}
+    if action_type == "classify" and category:
+        kwargs["category"] = category
+    if action_type == "reply" and reply_text:
+        kwargs["reply_text"] = reply_text
+    if reason:
+        kwargs["reason"] = reason
+    try:
+        action = SupportAction(**kwargs)
+    except Exception as e:
+        return (
+            _current_obs.ticket_text,
+            f"❌ Invalid action: {e}", "",
+            _format_history(), "", gr.update(value=False),
+        )
+    obs = _env.step(action)
+    _current_obs = obs
+    _history.append({
+        "action": f"{action_type}" + (f"/{category}" if category and action_type == "classify" else ""),
+        "reward": obs.reward,
+        "feedback": obs.feedback,
+    })
+    ticket, status, score = _obs_to_display(obs)
+    done_msg = "🏁 Episode finished!" if obs.done else ""
+    return (
+        ticket, status, score,
+        _format_history(),
+        obs.feedback,
+        gr.update(value=obs.done),
+    )
+def do_state():
+    state = _env.state
+    return json.dumps({
+        "episode_id": state.episode_id,
+        "step_count": state.step_count,
+        "task_id": state.task_id,
+        "ticket_id": state.ticket_id,
+        "correct_category": state.correct_category,
+        "correct_action": state.correct_action,
+        "classified": state.classified,
+        "resolved": state.resolved,
+        "total_reward": state.total_reward,
+        "tickets_resolved": state.tickets_resolved,
+        "tickets_total": state.tickets_total,
+    }, indent=2)
+# ─── UI layout ──────────────────────────────────────────────────
+DESCRIPTION = """
+# 🎫 Customer Support Ticket Resolution Environment
+An **OpenEnv** environment for training AI agents to handle customer support tickets.
+**Tasks:** 1 = Classify · 2 = Classify + Action · 3 = Full Queue Resolution
+"""
+# Two-column layout: episode/action controls on the left, the current
+# observation, action history and raw state on the right. The callbacks are
+# bound at the bottom of the block.
+with gr.Blocks(title="Support Ticket Env", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        # ── Left panel: controls ────────────────────────────────
+        with gr.Column(scale=1):
+            gr.Markdown("### ⚙️ Episode Setup")
+            task_slider = gr.Slider(1, 3, value=1, step=1, label="Task ID")
+            seed_input  = gr.Number(value=42, label="Seed", precision=0)
+            reset_btn   = gr.Button("🔄 Reset Episode", variant="primary")
+            gr.Markdown("### 🎬 Take Action")
+            action_type  = gr.Radio(
+                ["classify", "reply", "escalate", "close"],
+                value="classify", label="Action Type",
+            )
+            category_dd = gr.Dropdown(
+                ["billing", "technical", "account", "general", "refund"],
+                label="Category (for classify)",
+                value=None,
+            )
+            reply_box  = gr.Textbox(label="Reply Text (for reply)", lines=3)
+            reason_box = gr.Textbox(label="Reason (optional)")
+            step_btn   = gr.Button("▶️ Step", variant="secondary")
+            state_btn  = gr.Button("🔍 Show State")
+        # ── Right panel: observation ────────────────────────────
+        with gr.Column(scale=2):
+            gr.Markdown("### 📬 Current Ticket")
+            ticket_display  = gr.Markdown("_Reset to start._")
+            status_display  = gr.Markdown("")
+            score_display   = gr.Markdown("")
+            feedback_box    = gr.Textbox(label="Last Feedback", interactive=False)
+            done_checkbox   = gr.Checkbox(label="Episode Done", interactive=False)
+            gr.Markdown("### 📜 Action History")
+            history_display = gr.Markdown("_No actions yet._")
+            gr.Markdown("### 🗂️ Raw State (JSON)")
+            state_output = gr.Code(language="json", label="state()")
+    # ── wire up ─────────────────────────────────────────────────
+    # do_reset returns 7 values; the fifth one targets step_btn itself.
+    reset_btn.click(
+        do_reset,
+        inputs=[task_slider, seed_input],
+        outputs=[ticket_display, status_display, score_display,
+                 history_display, step_btn, feedback_box, done_checkbox],
+    )
+    # do_step returns 6 values (no step_btn update).
+    step_btn.click(
+        do_step,
+        inputs=[action_type, category_dd, reply_box, reason_box],
+        outputs=[ticket_display, status_display, score_display,
+                 history_display, feedback_box, done_checkbox],
+    )
+    state_btn.click(
+        do_state, inputs=[], outputs=[state_output],
+    )
+# Standalone launch: listens on all interfaces on port 7861, which is not the
+# port 7860 that the Dockerfile's uvicorn command serves on.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7861, share=False)

models.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""
+Typed models for the Customer Support Ticket Resolution Environment.
+Works with pydantic (production) or stdlib (offline/testing).
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Literal, Optional
+try:
+    from pydantic import BaseModel, ConfigDict
+    _USE_PYDANTIC = True
+except ImportError:
+    _USE_PYDANTIC = False
+# ── import base classes from openenv (or stub) ──────────────────
+from openenv.core.env_server.types import Action, Observation, State
+# ═══════════════════════════════════════════════════════════════
+# Action
+# ═══════════════════════════════════════════════════════════════
+# SupportAction is defined twice; which definition is used depends on whether
+# pydantic could be imported (_USE_PYDANTIC).
+if _USE_PYDANTIC:
+    # NOTE(review): if `Action` is a plain class with its own __init__ (as in
+    # the bundled stub) it precedes BaseModel in the MRO — confirm pydantic
+    # initialisation still runs in that configuration.
+    class SupportAction(Action, BaseModel):  # type: ignore[misc]
+        # Unknown fields are rejected (extra="forbid").
+        model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
+        metadata: Dict[str, Any] = {}
+        # What the agent wants to do with the ticket.
+        action_type: Literal["classify", "reply", "escalate", "close"]
+        # Predicted ticket category; optional at the schema level.
+        category: Optional[
+            Literal["billing", "technical", "account", "general", "refund"]
+        ] = None
+        # Free-text reply to the customer.
+        reply_text: Optional[str] = None
+        # Optional justification for the action.
+        reason: Optional[str] = None
+        def model_dump(self, **kw):
+            # Plain pass-through to the parent implementation.
+            return super().model_dump(**kw)
+else:
+    _VALID_ACTION_TYPES = {"classify", "reply", "escalate", "close"}
+    _VALID_CATEGORIES   = {"billing", "technical", "account", "general", "refund"}
+    class SupportAction(Action):  # type: ignore[no-redef]
+        def __init__(self, **kwargs):
+            # Hand-written validation of action_type and category. Unlike the
+            # pydantic variant, unknown keyword arguments are ignored here.
+            action_type = kwargs.get("action_type")
+            if action_type not in _VALID_ACTION_TYPES:
+                raise ValueError(f"Invalid action_type: {action_type!r}")
+            category = kwargs.get("category")
+            if category is not None and category not in _VALID_CATEGORIES:
+                raise ValueError(f"Invalid category: {category!r}")
+            self.action_type = action_type
+            self.category    = category
+            self.reply_text  = kwargs.get("reply_text")
+            self.reason      = kwargs.get("reason")
+            self.metadata    = kwargs.get("metadata", {})
+# ═══════════════════════════════════════════════════════════════
+# Observation
+# ═══════════════════════════════════════════════════════════════
+# SupportObservation: what the agent sees after reset()/step(). Both variants
+# expose the same attributes with the same defaults.
+if _USE_PYDANTIC:
+    class SupportObservation(Observation, BaseModel):  # type: ignore[misc]
+        # Unknown fields are rejected (extra="forbid").
+        model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")
+        done: bool = False
+        reward: Optional[float] = None
+        metadata: Dict[str, Any] = {}
+        ticket_id: str = ""
+        ticket_text: str = ""
+        task_id: int = 1
+        current_category: Optional[str] = None
+        resolved: bool = False
+        step_count: int = 0
+        feedback: str = ""
+        score: float = 0.0
+else:
+    class SupportObservation(Observation):  # type: ignore[no-redef]
+        def __init__(self, **kwargs):
+            # Known keys are popped with their defaults; any keys left in
+            # kwargs are dropped. The base-class __init__ is not called.
+            self.done             = kwargs.pop("done", False)
+            self.reward           = kwargs.pop("reward", None)
+            self.metadata         = kwargs.pop("metadata", {})
+            self.ticket_id        = kwargs.pop("ticket_id", "")
+            self.ticket_text      = kwargs.pop("ticket_text", "")
+            self.task_id          = kwargs.pop("task_id", 1)
+            self.current_category = kwargs.pop("current_category", None)
+            self.resolved         = kwargs.pop("resolved", False)
+            self.step_count       = kwargs.pop("step_count", 0)
+            self.feedback         = kwargs.pop("feedback", "")
+            self.score            = kwargs.pop("score", 0.0)
+# ═══════════════════════════════════════════════════════════════
+# State
+# ═══════════════════════════════════════════════════════════════
+# SupportState: episode state, including the ground-truth fields
+# (correct_category / correct_action) that observations do not carry.
+if _USE_PYDANTIC:
+    class SupportState(State, BaseModel):  # type: ignore[misc]
+        # Unlike the action/observation models, extra fields are allowed here.
+        model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")
+        episode_id: Optional[str] = None
+        step_count: int = 0
+        task_id: int = 1
+        ticket_id: str = ""
+        correct_category: str = ""
+        correct_action: str = ""
+        classified: bool = False
+        resolved: bool = False
+        total_reward: float = 0.0
+        tickets_resolved: int = 0
+        tickets_total: int = 1
+else:
+    class SupportState(State):  # type: ignore[no-redef]
+        def __init__(self, **kwargs):
+            # Known keys are popped with their defaults; any keys left in
+            # kwargs are dropped. The base-class __init__ is not called.
+            self.episode_id       = kwargs.pop("episode_id", None)
+            self.step_count       = kwargs.pop("step_count", 0)
+            self.task_id          = kwargs.pop("task_id", 1)
+            self.ticket_id        = kwargs.pop("ticket_id", "")
+            self.correct_category = kwargs.pop("correct_category", "")
+            self.correct_action   = kwargs.pop("correct_action", "")
+            self.classified       = kwargs.pop("classified", False)
+            self.resolved         = kwargs.pop("resolved", False)
+            self.total_reward     = kwargs.pop("total_reward", 0.0)
+            self.tickets_resolved = kwargs.pop("tickets_resolved", 0)
+            self.tickets_total    = kwargs.pop("tickets_total", 1)

openenv.yaml ADDED Viewed

	@@ -0,0 +1,57 @@

+name: support_ticket_env
+version: "1.0.0"
+description: >
+  A real-world customer support ticket triage environment.
+  An AI agent acts as a support executive: it classifies incoming tickets,
+  selects the correct action (reply / escalate / close), and resolves
+  multi-ticket queues efficiently.
+author: OpenEnv Hackathon Entry
+tags:
+  - openenv
+  - customer-support
+  - triage
+  - nlp
+  - real-world
+tasks:
+  - id: 1
+    name: Classification
+    difficulty: easy
+    description: >
+      Given a customer ticket, predict the correct category
+      (billing | technical | account | general | refund).
+    score_range: [0.0, 1.0]
+  - id: 2
+    name: Action Selection
+    difficulty: medium
+    description: >
+      First classify the ticket, then choose the best action:
+      reply, escalate, or close.
+    score_range: [0.0, 1.0]
+  - id: 3
+    name: Full Resolution
+    difficulty: hard
+    description: >
+      Handle a queue of 3 tickets. For each ticket classify it,
+      choose the right action, and (if replying) craft a relevant reply.
+      Bonus for fewer steps.
+    score_range: [0.0, 1.0]
+action_space:
+  type: SupportAction
+  fields:
+    action_type: "classify | reply | escalate | close"
+    category: "billing | technical | account | general | refund (required for classify)"
+    reply_text: "string (required for reply)"
+    reason: "optional justification"
+observation_space:
+  type: SupportObservation
+  fields:
+    ticket_id: string
+    ticket_text: string
+    task_id: integer
+    current_category: "string | null"
+    resolved: boolean
+    step_count: integer
+    feedback: string
+    score: float
+docker_image: "support-ticket-env:latest"
+hf_space: "openenv/support-ticket-env"

openenv_stub/openenv/__init__.py ADDED Viewed

File without changes

openenv_stub/openenv/core/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from openenv.core.env_client import EnvClient
+from openenv.core.env_server.types import Action, Observation, State
+from openenv.core.env_server.interfaces import Environment

openenv_stub/openenv/core/env_client.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Stub for openenv.core.env_client."""
+from abc import ABC
+from typing import Generic, TypeVar
+ActT = TypeVar("ActT")
+ObsT = TypeVar("ObsT")
+StateT = TypeVar("StateT")
+class EnvClient(ABC, Generic[ActT, ObsT, StateT]):
+    def __init__(self, base_url: str, **kwargs):
+        self.base_url = base_url

openenv_stub/openenv/core/env_server/__init__.py ADDED Viewed

File without changes

openenv_stub/openenv/core/env_server/http_server.py ADDED Viewed

	@@ -0,0 +1,13 @@

+"""Stub for openenv.core.env_server.http_server."""
+from typing import Any, Callable, Optional, Type
+from openenv.core.env_server.types import Action, Observation
+def create_app(env, action_cls, observation_cls, env_name=None, max_concurrent_envs=1, **kwargs):
+    """Stub — returns None when FastAPI is not available."""
+    try:
+        from fastapi import FastAPI
+        app = FastAPI(title=env_name or "SupportTicketEnv")
+        return app
+    except ImportError:
+        return None

openenv_stub/openenv/core/env_server/interfaces.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""Stub for openenv.core.env_server.interfaces."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+from openenv.core.env_server.types import Action, Observation, State
+class Environment(ABC):
+    """Abstract base for environments: subclasses implement reset, step and state."""
+    SUPPORTS_CONCURRENT_SESSIONS: bool = False
+    def __init__(self, transform=None, rubric=None):
+        # Only `rubric` is stored; `transform` is accepted but not kept.
+        self.rubric = rubric
+    @abstractmethod
+    def reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs) -> Observation:
+        """Start a new episode and return its first observation."""
+        ...
+    @abstractmethod
+    def step(self, action: Action, timeout_s: Optional[float] = None, **kwargs) -> Observation:
+        """Apply `action` and return the resulting observation."""
+        ...
+    @property
+    @abstractmethod
+    def state(self) -> State:
+        """Current episode state."""
+        ...
+    def close(self) -> None:
+        # No-op by default; subclasses may override.
+        pass

openenv_stub/openenv/core/env_server/types.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""
+Offline stub for openenv.core.env_server.types.
+Uses stdlib only — no pydantic required.
+"""
+from __future__ import annotations
+from typing import Any, Dict, Optional
+class Action:
+    def __init__(self, **kwargs):
+        self.metadata = kwargs.pop("metadata", {})
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+    def model_dump(self):
+        return {k: v for k, v in vars(self).items()}
+class Observation:
+    def __init__(self, **kwargs):
+        self.done   = kwargs.pop("done", False)
+        self.reward = kwargs.pop("reward", None)
+        self.metadata = kwargs.pop("metadata", {})
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+class State:
+    def __init__(self, **kwargs):
+        self.episode_id = kwargs.pop("episode_id", None)
+        self.step_count = kwargs.pop("step_count", 0)
+        for k, v in kwargs.items():
+            setattr(self, k, v)

pyproject.toml ADDED Viewed

	@@ -0,0 +1,26 @@

+[build-system]
+# PEP 517 build requirements and backend.
+requires = ["setuptools>=68", "wheel"]
+# "setuptools.build_meta" is the backend documented by setuptools; the former
+# value "setuptools.backends.legacy:build" does not name an importable backend.
+build-backend = "setuptools.build_meta"
+[project]
+name = "support-ticket-env"
+version = "1.0.0"
+description = "Customer Support Ticket Resolution — OpenEnv Environment"
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "MIT" }
+dependencies = [
+    "openenv-core>=0.2.1",
+    "fastapi>=0.104.0",
+    "uvicorn[standard]>=0.24.0",
+    "pydantic>=2.0.0",
+    "openai>=1.0.0",
+    "pyyaml>=6.0",
+]
+[project.optional-dependencies]
+dev = ["pytest>=7.0", "pytest-asyncio", "httpx"]
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["support_ticket_env*"]

run_tests.py ADDED Viewed

	@@ -0,0 +1,276 @@

+#!/usr/bin/env python3
+"""
+run_tests.py — Self-contained test runner for support_ticket_env.
+Runs all test cases using only the Python standard library.
+Usage:
+    python run_tests.py
+"""
+import sys
+import os
+import traceback
+from typing import Callable, List, Tuple
+# ─── path setup ────────────────────────────────────────────────
+ROOT = os.path.dirname(os.path.abspath(__file__))
+STUB = os.path.join(ROOT, "openenv_stub")
+sys.path.insert(0, STUB)
+sys.path.insert(0, ROOT)
+# ─── minimal test framework ────────────────────────────────────
+# Registered (qualified name, callable) pairs, in registration order; filled by @test.
+_tests: List[Tuple[str, Callable]] = []
+# Outcome counters, incremented by run_all().
+_passed = 0
+_failed = 0
+_errors = 0
+def test(fn: Callable) -> Callable:
+    _tests.append((fn.__qualname__, fn))
+    return fn
+def assert_eq(a, b, msg=""):
+    if a != b:
+        raise AssertionError(f"{msg} | expected {b!r}, got {a!r}")
+def assert_true(val, msg=""):
+    if not val:
+        raise AssertionError(msg or f"Expected truthy, got {val!r}")
+def assert_in_range(val, lo, hi, msg=""):
+    if not (lo <= val <= hi):
+        raise AssertionError(msg or f"Expected {val!r} in [{lo}, {hi}]")
+# ─────────────────────────────── imports ───────────────────────
+from support_ticket_env.graders import (
+    grade_task1, grade_task2, grade_task3, loop_penalty,
+)
+from support_ticket_env.server.support_environment import SupportTicketEnvironment
+from support_ticket_env.models import SupportAction
+def make_env():
+    """Return a new ``SupportTicketEnvironment`` instance for a single test."""
+    return SupportTicketEnvironment()
+# ════════════════════════════════════════════════════════════════
+# GRADER TESTS
+# ════════════════════════════════════════════════════════════════
+@test
+def test_grade1_correct():
+    """A prediction equal to the label is expected to score 1.0."""
+    assert_eq(grade_task1("billing", "billing"), 1.0)
+@test
+def test_grade1_wrong():
+    """A prediction that differs from the label is expected to score 0.0."""
+    assert_eq(grade_task1("technical", "billing"), 0.0)
+@test
+def test_grade1_all_categories():
+    """Each of the five listed categories is expected to score 1.0 against itself."""
+    for cat in ["billing", "technical", "account", "general", "refund"]:
+        assert_eq(grade_task1(cat, cat), 1.0, f"cat={cat}")
+@test
+def test_grade1_empty():
+    """An empty prediction is expected to score 0.0."""
+    assert_eq(grade_task1("", "billing"), 0.0)
+@test
+def test_grade2_exact_reply():
+    """Choosing "reply" when "reply" is correct is expected to score 1.0."""
+    assert_eq(grade_task2("reply", "reply"), 1.0)
+@test
+def test_grade2_exact_escalate():
+    """Choosing "escalate" when "escalate" is correct is expected to score 1.0."""
+    assert_eq(grade_task2("escalate", "escalate"), 1.0)
+@test
+def test_grade2_exact_close():
+    """Choosing "close" when "close" is correct is expected to score 1.0."""
+    assert_eq(grade_task2("close", "close"), 1.0)
+@test
+def test_grade2_partial_reply_escalate():
+    """Confusing "reply" and "escalate" (either direction) is expected to score 0.5."""
+    assert_eq(grade_task2("reply", "escalate"), 0.5)
+    assert_eq(grade_task2("escalate", "reply"), 0.5)
+@test
+def test_grade2_close_wrong():
+    """Choosing "close" when "reply" is correct is expected to score 0.0."""
+    assert_eq(grade_task2("close", "reply"), 0.0)
+@test
+def test_grade3_perfect():
+    """A correct classification, action and reply in 1 of 5 steps is expected to score at least 0.9."""
+    score = grade_task3(True, True, False,
+                        "we will process your refund billing payment",
+                        "billing", True, 1, 5)
+    assert_true(score >= 0.9, f"Expected >=0.9, got {score}")
+@test
+def test_grade3_capped_at_one():
+    """Even a keyword-heavy reply is expected not to push the score above 1.0."""
+    score = grade_task3(True, True, False,
+                        "refund billing payment account cancel subscription",
+                        "billing", True, 1, 5)
+    assert_true(score <= 1.0, f"Score exceeds 1.0: {score}")
+@test
+def test_grade3_partial_action_less_than_full():
+    """A partially correct action is expected to score below a fully correct one."""
+    s_partial = grade_task3(True, False, True, None, "technical", True, 2)
+    s_full = grade_task3(True, True, False, None, "technical", True, 2)
+    assert_true(s_partial < s_full, f"partial={s_partial} should < full={s_full}")
+@test
+def test_loop_penalty_none_within_limit():
+    """No penalty is expected at step counts 5 and 10."""
+    assert_eq(loop_penalty(5), 0.0)
+    assert_eq(loop_penalty(10), 0.0)
+@test
+def test_loop_penalty_grows():
+    """The penalty is expected to be negative at step 11 and more negative at step 12."""
+    assert_true(loop_penalty(12) < loop_penalty(11))
+    assert_true(loop_penalty(11) < 0)
+# ════════════════════════════════════════════════════════════════
+# ENVIRONMENT TESTS
+# ════════════════════════════════════════════════════════════════
+@test
+def test_env_reset_task1():
+    """reset() for Task 1 is expected to return a non-empty ticket, task_id 1 and done=False."""
+    env = make_env()
+    obs = env.reset(task_id=1, seed=42)
+    assert_true(obs.ticket_text != "", "ticket_text should not be empty")
+    assert_eq(obs.task_id, 1)
+    assert_eq(obs.done, False)
+@test
+def test_env_task1_correct_classification():
+    """Classifying with the ground-truth category is expected to end Task 1 with reward 1.0."""
+    env = make_env()
+    env.reset(task_id=1, seed=42)
+    state = env.state
+    obs = env.step(SupportAction(action_type="classify", category=state.correct_category))
+    assert_eq(obs.reward, 1.0)
+    assert_eq(obs.done, True)
+@test
+def test_env_task1_wrong_classification():
+    """Classifying with a wrong category is expected to end Task 1 with reward 0.0."""
+    env = make_env()
+    env.reset(task_id=1, seed=42)
+    state = env.state
+    # First listed category that differs from the ground truth.
+    wrong = next(c for c in ["billing","technical","account","general","refund"]
+                 if c != state.correct_category)
+    obs = env.step(SupportAction(action_type="classify", category=wrong))
+    assert_eq(obs.reward, 0.0)
+    assert_eq(obs.done, True)
+@test
+def test_env_task2_must_classify_first():
+    """In Task 2 a non-classify first action is expected to be refused with a 'classify' hint."""
+    env = make_env()
+    env.reset(task_id=2, seed=42)
+    obs = env.step(SupportAction(action_type="escalate"))
+    assert_eq(obs.done, False)
+    assert_true("classify" in obs.feedback.lower())
+@test
+def test_env_task2_full_correct_episode():
+    """Correct classification followed by the correct action is expected to finish Task 2 with reward >= 0.5."""
+    env = make_env()
+    env.reset(task_id=2, seed=42)
+    state = env.state
+    env.step(SupportAction(action_type="classify", category=state.correct_category))
+    obs = env.step(SupportAction(action_type=state.correct_action))
+    assert_eq(obs.done, True)
+    assert_true(obs.reward >= 0.5, f"reward={obs.reward}")
+@test
+def test_env_task3_three_tickets():
+    """A Task 3 episode is expected to report three tickets in total."""
+    env = make_env()
+    env.reset(task_id=3, seed=42)
+    assert_eq(env.state.tickets_total, 3)
+@test
+def test_env_task3_resolves_all():
+    """Playing the ground-truth answers is expected to finish Task 3 with all 3 tickets resolved."""
+    env = make_env()
+    env.reset(task_id=3, seed=42)
+    done = False
+    steps = 0
+    # Step cap so a regression cannot hang the runner.
+    while not done and steps < 30:
+        state = env.state
+        if not state.classified:
+            action = SupportAction(action_type="classify", category=state.correct_category)
+        else:
+            ca = state.correct_action
+            action = (SupportAction(action_type="reply",
+                                    reply_text=f"Handling your {state.correct_category} issue.")
+                      if ca == "reply" else SupportAction(action_type=ca))
+        obs = env.step(action)
+        done = obs.done
+        steps += 1
+    assert_true(done, "Episode did not finish")
+    assert_eq(env.state.tickets_resolved, 3)
+@test
+def test_env_state_step_count():
+    """step_count is expected to be 0 after reset and 1 after one step."""
+    env = make_env()
+    env.reset(task_id=1, seed=0)
+    assert_eq(env.state.step_count, 0)
+    state = env.state
+    env.step(SupportAction(action_type="classify", category=state.correct_category))
+    assert_eq(env.state.step_count, 1)
+@test
+def test_env_reward_always_in_range():
+    """The first-step reward is expected to lie in [-1.0, 1.0] for every listed seed and task."""
+    for seed in [0, 1, 2, 42, 99]:
+        for task_id in [1, 2, 3]:
+            env = make_env()
+            env.reset(task_id=task_id, seed=seed)
+            state = env.state
+            obs = env.step(SupportAction(action_type="classify", category=state.correct_category))
+            r = obs.reward or 0.0
+            assert_in_range(r, -1.0, 1.0, f"task={task_id} seed={seed} reward={r}")
+@test
+def test_env_task3_total_reward_positive():
+    """Playing the ground-truth answers in Task 3 is expected to accumulate a positive reward."""
+    env = make_env()
+    env.reset(task_id=3, seed=7)
+    total = 0.0
+    done = False
+    steps = 0
+    # Step cap so a regression cannot hang the runner.
+    while not done and steps < 20:
+        state = env.state
+        action = (SupportAction(action_type="classify", category=state.correct_category)
+                  if not state.classified
+                  else SupportAction(action_type=state.correct_action))
+        obs = env.step(action)
+        total += obs.reward or 0.0
+        done = obs.done
+        steps += 1
+    assert_true(total > 0.0, f"total_reward={total}")
+# ════════════════════════════════════════════════════════════════
+# Runner
+# ════════════════════════════════════════════════════════════════
+def run_all():
+    global _passed, _failed, _errors
+    width = max(len(name) for name, _ in _tests) + 2
+    print(f"\n{'='*60}")
+    print(f"  Running {len(_tests)} tests")
+    print(f"{'='*60}")
+    for name, fn in _tests:
+        try:
+            fn()
+            print(f"  ✅  {name}")
+            _passed += 1
+        except AssertionError as e:
+            print(f"  ❌  {name}")
+            print(f"       {e}")
+            _failed += 1
+        except Exception:
+            print(f"  💥  {name}")
+            traceback.print_exc(limit=3)
+            _errors += 1
+    total = _passed + _failed + _errors
+    print(f"\n{'='*60}")
+    print(f"  Results: {_passed}/{total} passed | {_failed} failed | {_errors} errors")
+    print(f"{'='*60}\n")
+    return _failed + _errors == 0
+if __name__ == "__main__":
+    ok = run_all()
+    sys.exit(0 if ok else 1)

server/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from support_ticket_env.server.support_environment import SupportTicketEnvironment
+from support_ticket_env.server.app import app
+__all__ = ["SupportTicketEnvironment", "app"]

server/app.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""
+FastAPI application entry point for the Support Ticket Environment.
+Serves the OpenEnv HTTP/WebSocket API and optionally the Gradio UI at /web.
+"""
+import os
+from openenv.core.env_server.http_server import create_app
+from support_ticket_env.models import SupportAction, SupportObservation
+from support_ticket_env.server.support_environment import SupportTicketEnvironment
+# Build the ASGI app at import time.  The environment *class* (not an instance)
+# is handed to create_app together with the action/observation model classes.
+app = create_app(
+    env=SupportTicketEnvironment,
+    action_cls=SupportAction,
+    observation_cls=SupportObservation,
+    env_name="support_ticket_env",
+    max_concurrent_envs=4,
+)
+# Mount Gradio UI at /web when requested
+# (ENABLE_WEB_INTERFACE defaults to "true"; the comparison is case-insensitive).
+if os.getenv("ENABLE_WEB_INTERFACE", "true").lower() == "true":
+    try:
+        import gradio as gr
+        from support_ticket_env.gradio_ui import demo
+        # NOTE(review): gradio.routes is imported but not referenced below —
+        # presumably kept for its import side effects; confirm or remove.
+        import gradio.routes
+        app = gr.mount_gradio_app(app, demo, path="/web")
+        print("Gradio UI mounted at /web")
+    except Exception as e:
+        # Best effort: on any failure ``app`` keeps the value from create_app
+        # and the reason is printed.
+        print(f"Gradio UI not mounted: {e}")

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+openenv-core>=0.2.1
+fastapi>=0.104.0
+uvicorn[standard]>=0.24.0
+pydantic>=2.0.0
+pyyaml>=6.0
+gradio>=4.0.0
+openai>=1.0.0

server/support_environment.py ADDED Viewed

	@@ -0,0 +1,281 @@

+"""
+Customer Support Ticket Resolution — OpenEnv Environment (server side).
+Implements the three tasks:
+  Task 1 (easy)   – Classify a single ticket
+  Task 2 (medium) – Choose the correct action for a classified ticket
+  Task 3 (hard)   – Fully resolve a queue of tickets with minimal steps
+"""
+from __future__ import annotations
+import random
+from typing import Optional
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import State
+from support_ticket_env.models import SupportAction, SupportObservation, SupportState
+from support_ticket_env.tickets import TICKETS, TICKET_LOOKUP
+from support_ticket_env.graders import (
+    grade_task1,
+    grade_task2,
+    grade_task3,
+    loop_penalty,
+)
+class SupportTicketEnvironment(Environment):
+    """
+    OpenEnv environment that simulates a customer-support triage desk.
+    The task_id (1, 2, or 3) is set when the environment is reset.
+    """
+    SUPPORTS_CONCURRENT_SESSIONS = True
+    def __init__(self) -> None:
+        super().__init__()
+        self._task_id: int = 1
+        self._ticket: dict = {}
+        self._classified: bool = False
+        self._resolved: bool = False
+        self._step_count: int = 0
+        self._total_reward: float = 0.0
+        self._episode_id: Optional[str] = None
+        # Task 3: queue of tickets
+        self._queue: list[dict] = []
+        self._tickets_resolved: int = 0
+        self._tickets_total: int = 1
+    # ──────────────────────── reset ────────────────────────────
+    def reset(
+        self,
+        seed: Optional[int] = None,
+        episode_id: Optional[str] = None,
+        task_id: int = 1,
+        **kwargs,
+    ) -> SupportObservation:
+        rng = random.Random(seed)
+        self._episode_id = episode_id
+        self._task_id = int(task_id)
+        self._step_count = 0
+        self._total_reward = 0.0
+        self._classified = False
+        self._resolved = False
+        if self._task_id == 3:
+            # Give the agent a queue of 3 tickets
+            self._queue = rng.sample(TICKETS, k=3)
+            self._tickets_total = len(self._queue)
+            self._tickets_resolved = 0
+            self._ticket = self._queue[0]
+        else:
+            self._ticket = rng.choice(TICKETS)
+            self._tickets_total = 1
+            self._tickets_resolved = 0
+        return self._make_obs(
+            feedback="New episode started. Read the ticket and take action.",
+            score=0.0,
+        )
+    # ──────────────────────── step ─────────────────────────────
+    def step(self, action: SupportAction, **kwargs) -> SupportObservation:  # type: ignore[override]
+        self._step_count += 1
+        penalty = loop_penalty(self._step_count)
+        if self._task_id == 1:
+            obs = self._step_task1(action)
+        elif self._task_id == 2:
+            obs = self._step_task2(action)
+        else:
+            obs = self._step_task3(action)
+        # Apply loop penalty on top of step reward
+        obs.reward = (obs.reward or 0.0) + penalty
+        obs.reward = round(max(-1.0, min(1.0, obs.reward)), 4)
+        self._total_reward += obs.reward
+        obs.step_count = self._step_count
+        return obs
+    # ──────────────────────── Task 1 ───────────────────────────
+    def _step_task1(self, action: SupportAction) -> SupportObservation:
+        if action.action_type != "classify":
+            return self._make_obs(
+                feedback="Task 1 requires a 'classify' action.",
+                score=0.0,
+                done=False,
+            )
+        score = grade_task1(
+            predicted_category=action.category or "",
+            correct_category=self._ticket["category"],
+        )
+        self._classified = score == 1.0
+        correct = self._ticket["category"]
+        if score == 1.0:
+            feedback = f"✅ Correct! Category: '{correct}'."
+            done = True
+        else:
+            feedback = (
+                f"❌ Wrong. You said '{action.category}', correct is '{correct}'."
+            )
+            done = True  # Task 1 is one-shot — agent gets one attempt
+        obs = self._make_obs(feedback=feedback, score=score, done=done)
+        if done:
+            self._resolved = True
+        return obs
+    # ──────────────────────── Task 2 ───────────────────────────
+    def _step_task2(self, action: SupportAction) -> SupportObservation:
+        # First step must be classification
+        if not self._classified:
+            if action.action_type != "classify":
+                return self._make_obs(
+                    feedback="Please classify the ticket first.",
+                    score=0.0,
+                )
+            cat_score = grade_task1(
+                action.category or "", self._ticket["category"]
+            )
+            self._classified = True
+            return self._make_obs(
+                feedback=(
+                    f"Classified as '{action.category}'. "
+                    f"{'Correct ✅' if cat_score == 1.0 else 'Incorrect ❌'} "
+                    "Now choose an action."
+                ),
+                score=cat_score * 0.3,   # partial credit toward max 1.0
+            )
+        # Second step: choose action
+        score = grade_task2(
+            action_type=action.action_type,
+            correct_action=self._ticket["correct_action"],
+            category=self._ticket["category"],
+        )
+        correct = self._ticket["correct_action"]
+        if score == 1.0:
+            feedback = f"✅ Correct action: '{correct}'."
+        elif score == 0.5:
+            feedback = (
+                f"⚠️ Partial credit. '{action.action_type}' is defensible "
+                f"but '{correct}' is preferred."
+            )
+        else:
+            feedback = f"❌ Wrong action. Correct: '{correct}'."
+        self._resolved = True
+        return self._make_obs(feedback=feedback, score=score, done=True)
+    # ──────────────────────── Task 3 ───────────────────────────
+    def _step_task3(self, action: SupportAction) -> SupportObservation:
+        MAX_STEPS = 15
+        if not self._classified:
+            # Must classify first
+            if action.action_type != "classify":
+                return self._make_obs(
+                    feedback="Classify the ticket before taking action.",
+                    score=0.0,
+                )
+            cat_score = grade_task1(
+                action.category or "", self._ticket["category"]
+            )
+            self._classified = True
+            return self._make_obs(
+                feedback=(
+                    f"Classified '{self._ticket['id']}' as '{action.category}'. "
+                    f"{'Correct ✅' if cat_score == 1.0 else 'Incorrect ❌'} "
+                    "Now resolve it."
+                ),
+                score=cat_score * 0.1,
+            )
+        # Resolve current ticket
+        action_correct = action.action_type == self._ticket["correct_action"]
+        pair = frozenset({action.action_type, self._ticket["correct_action"]})
+        action_partial = (not action_correct) and pair in {
+            frozenset({"reply", "escalate"})
+        }
+        score = grade_task3(
+            classified_correctly=self._classified,
+            action_correct=action_correct,
+            action_partial=action_partial,
+            reply_text=action.reply_text,
+            category=self._ticket["category"],
+            resolved=True,
+            steps_taken=self._step_count,
+            max_steps=MAX_STEPS,
+        )
+        self._tickets_resolved += 1
+        correct_action = self._ticket["correct_action"]
+        # Advance to next ticket in queue
+        if self._tickets_resolved < self._tickets_total:
+            self._ticket = self._queue[self._tickets_resolved]
+            self._classified = False
+            feedback = (
+                f"Ticket resolved (score {score:.2f}). "
+                f"Moving to next ticket ({self._tickets_resolved + 1}/{self._tickets_total})."
+            )
+            done = False
+        else:
+            feedback = (
+                f"All {self._tickets_total} tickets resolved! "
+                f"Episode score: {self._total_reward + score:.2f}"
+            )
+            done = True
+            self._resolved = True
+        return self._make_obs(feedback=feedback, score=score, done=done)
+    # ──────────────────────── helpers ──────────────────────────
+    def _make_obs(
+        self,
+        feedback: str,
+        score: float,
+        done: bool = False,
+    ) -> SupportObservation:
+        return SupportObservation(
+            ticket_id=self._ticket.get("id", ""),
+            ticket_text=self._ticket.get("text", ""),
+            task_id=self._task_id,
+            current_category=self._ticket.get("category") if self._classified else None,
+            resolved=self._resolved,
+            step_count=self._step_count,
+            feedback=feedback,
+            score=score,
+            reward=score,
+            done=done,
+        )
+    # ──────────────────────── state ────────────────────────────
+    @property
+    def state(self) -> SupportState:
+        return SupportState(
+            episode_id=self._episode_id,
+            step_count=self._step_count,
+            task_id=self._task_id,
+            ticket_id=self._ticket.get("id", ""),
+            correct_category=self._ticket.get("category", ""),
+            correct_action=self._ticket.get("correct_action", ""),
+            classified=self._classified,
+            resolved=self._resolved,
+            total_reward=self._total_reward,
+            tickets_resolved=self._tickets_resolved,
+            tickets_total=self._tickets_total,
+        )

tests/__init__.py ADDED Viewed

File without changes

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Pytest configuration for support_ticket_env tests."""
+import sys
+import os
+# Put the directory that contains ``tests/`` first on sys.path when running tests directly.
+# NOTE(review): the tests import ``support_ticket_env.*``; that presumably also
+# needs the directory *containing* the package to be importable — confirm the layout.
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

tests/test_environment.py ADDED Viewed

	@@ -0,0 +1,191 @@

+"""
+Tests for SupportTicketEnvironment — runs the environment directly
+(no HTTP server required).
+"""
+import pytest
+from support_ticket_env.server.support_environment import SupportTicketEnvironment
+from support_ticket_env.models import SupportAction
+# ─────────────────────────── fixtures ──────────────────────────
+@pytest.fixture
+def env():
+    """Provide a new ``SupportTicketEnvironment`` instance to each test."""
+    return SupportTicketEnvironment()
+# ─────────────────────────── Task 1 ────────────────────────────
+class TestTask1:
+    def test_reset_returns_observation(self, env):
+        obs = env.reset(task_id=1, seed=42)
+        assert obs.ticket_text
+        assert obs.task_id == 1
+        assert obs.done is False
+    def test_correct_classification(self, env):
+        obs = env.reset(task_id=1, seed=42)
+        # Find out the correct category via state
+        state = env.state
+        action = SupportAction(
+            action_type="classify",
+            category=state.correct_category,
+        )
+        obs = env.step(action)
+        assert obs.reward == 1.0
+        assert obs.done is True
+    def test_wrong_classification(self, env):
+        env.reset(task_id=1, seed=42)
+        state = env.state
+        wrong_cats = [
+            c for c in ["billing", "technical", "account", "general", "refund"]
+            if c != state.correct_category
+        ]
+        action = SupportAction(action_type="classify", category=wrong_cats[0])
+        obs = env.step(action)
+        assert obs.reward == 0.0
+        assert obs.done is True
+    def test_non_classify_action_penalised(self, env):
+        env.reset(task_id=1, seed=42)
+        obs = env.step(SupportAction(action_type="reply", reply_text="hello"))
+        # Should not crash; done might be False and reward 0
+        assert obs.reward is not None
+# ─────────────────────────── Task 2 ────────────────────────────
+class TestTask2:
+    """Task 2: classify the ticket, then choose an action."""
+    def test_full_correct_episode(self, env):
+        """Correct classification then correct action: positive reward first, then done with reward >= 0.5."""
+        env.reset(task_id=2, seed=42)
+        state = env.state
+        # Step 1: classify
+        obs = env.step(SupportAction(
+            action_type="classify",
+            category=state.correct_category,
+        ))
+        assert obs.done is False
+        assert obs.reward > 0
+        # Step 2: correct action
+        obs = env.step(SupportAction(action_type=state.correct_action))
+        assert obs.done is True
+        assert obs.reward >= 0.5
+    def test_must_classify_first(self, env):
+        """A non-classify first action keeps the episode open and mentions 'classify'."""
+        env.reset(task_id=2, seed=7)
+        obs = env.step(SupportAction(action_type="escalate"))
+        assert obs.done is False
+        assert "classify" in obs.feedback.lower()
+    def test_state_reflects_progress(self, env):
+        """state.classified flips to True and step_count becomes 1 after classifying."""
+        env.reset(task_id=2, seed=7)
+        state = env.state
+        assert state.classified is False
+        env.step(SupportAction(
+            action_type="classify",
+            category=state.correct_category,
+        ))
+        state2 = env.state
+        assert state2.classified is True
+        assert state2.step_count == 1
+# ─────────────────────────── Task 3 ────────────────────────────
+class TestTask3:
+    """Task 3: work through a queue of tickets."""
+    def test_queue_has_three_tickets(self, env):
+        """A fresh Task 3 episode reports 3 tickets in total and 0 resolved."""
+        env.reset(task_id=3, seed=42)
+        state = env.state
+        assert state.tickets_total == 3
+        assert state.tickets_resolved == 0
+    def test_resolve_all_tickets(self, env):
+        """Playing the ground-truth answers finishes the episode with 3 tickets resolved."""
+        env.reset(task_id=3, seed=42)
+        done = False
+        steps = 0
+        # Step cap so a regression cannot loop forever.
+        while not done and steps < 30:
+            state = env.state
+            if not state.classified:
+                action = SupportAction(
+                    action_type="classify",
+                    category=state.correct_category,
+                )
+            else:
+                ca = state.correct_action
+                if ca == "reply":
+                    action = SupportAction(
+                        action_type="reply",
+                        reply_text=f"We are handling your {state.correct_category} issue.",
+                    )
+                else:
+                    action = SupportAction(action_type=ca)
+            obs = env.step(action)
+            done = obs.done
+            steps += 1
+        assert done, "Episode should finish after 3 tickets"
+        final_state = env.state
+        assert final_state.tickets_resolved == 3
+    def test_total_reward_positive(self, env):
+        """Playing the ground-truth answers accumulates a positive total reward."""
+        env.reset(task_id=3, seed=123)
+        total = 0.0
+        done = False
+        steps = 0
+        # Step cap so a regression cannot loop forever.
+        while not done and steps < 20:
+            state = env.state
+            if not state.classified:
+                action = SupportAction(
+                    action_type="classify",
+                    category=state.correct_category,
+                )
+            else:
+                action = SupportAction(action_type=state.correct_action)
+            obs = env.step(action)
+            total += obs.reward or 0.0
+            done = obs.done
+            steps += 1
+        assert total > 0.0
+# ─────────────────────────── State API ─────────────────────────
+class TestStateAPI:
+    """Checks on the ``state`` property."""
+    def test_state_after_reset(self, env):
+        """After reset the state shows step 0, the requested task and a ticket id."""
+        env.reset(task_id=1, seed=0)
+        state = env.state
+        assert state.step_count == 0
+        assert state.task_id == 1
+        assert state.ticket_id != ""
+    def test_step_count_increments(self, env):
+        """One step raises step_count to 1."""
+        env.reset(task_id=1, seed=0)
+        state = env.state
+        env.step(SupportAction(action_type="classify", category=state.correct_category))
+        assert env.state.step_count == 1
+# ─────────────────────────── Reward bounds ─────────────────────
+class TestRewardBounds:
+    """Checks that step rewards stay within [-1.0, 1.0]."""
+    def test_reward_in_range(self, env):
+        """The first-step reward is within bounds for every listed seed and task."""
+        for seed in [0, 1, 2, 3, 42]:
+            for task_id in [1, 2, 3]:
+                # The same env instance is reused; reset() starts each episode.
+                env.reset(task_id=task_id, seed=seed)
+                state = env.state
+                action = SupportAction(
+                    action_type="classify",
+                    category=state.correct_category,
+                )
+                obs = env.step(action)
+                assert -1.0 <= (obs.reward or 0.0) <= 1.0, (
+                    f"Reward out of bounds: {obs.reward}"
+                )

tests/test_graders.py ADDED Viewed

	@@ -0,0 +1,121 @@

+"""Unit tests for grader functions."""
+import pytest
+from support_ticket_env.graders import (
+    grade_task1,
+    grade_task2,
+    grade_task3,
+    loop_penalty,
+)
+class TestTask1Grader:
+    """Expectations for ``grade_task1(predicted, correct)``."""
+    def test_correct_category(self):
+        """A matching category scores 1.0."""
+        assert grade_task1("billing", "billing") == 1.0
+    def test_wrong_category(self):
+        """A mismatching category scores 0.0."""
+        assert grade_task1("technical", "billing") == 0.0
+    def test_all_categories(self):
+        """Each of the five listed categories scores 1.0 against itself."""
+        for cat in ["billing", "technical", "account", "general", "refund"]:
+            assert grade_task1(cat, cat) == 1.0
+    def test_empty_prediction(self):
+        """An empty prediction scores 0.0."""
+        assert grade_task1("", "billing") == 0.0
+class TestTask2Grader:
+    """Expectations for ``grade_task2(action_type, correct_action)``."""
+    def test_exact_match(self):
+        """An action equal to the correct one scores 1.0."""
+        assert grade_task2("reply", "reply") == 1.0
+        assert grade_task2("escalate", "escalate") == 1.0
+        assert grade_task2("close", "close") == 1.0
+    def test_partial_credit_reply_escalate(self):
+        """Confusing "reply" and "escalate" scores 0.5 in both directions."""
+        score = grade_task2("reply", "escalate")
+        assert score == 0.5
+        score = grade_task2("escalate", "reply")
+        assert score == 0.5
+    def test_wrong_action_close(self):
+        """Choosing "close" when "reply" or "escalate" is correct scores 0.0."""
+        assert grade_task2("close", "reply") == 0.0
+        assert grade_task2("close", "escalate") == 0.0
+    def test_classify_when_action_expected(self):
+        """Submitting "classify" where "reply" is correct scores 0.0."""
+        assert grade_task2("classify", "reply") == 0.0
+class TestTask3Grader:
+    """Expectations for ``grade_task3``."""
+    def test_perfect_resolution(self):
+        """Correct classification, action and reply in 1 of 5 steps scores above 0.9."""
+        score = grade_task3(
+            classified_correctly=True,
+            action_correct=True,
+            action_partial=False,
+            reply_text="we will process your refund billing payment",
+            category="billing",
+            resolved=True,
+            steps_taken=1,
+            max_steps=5,
+        )
+        assert score > 0.9
+    def test_no_classification(self):
+        """Without a correct classification the score stays below 1.0."""
+        score = grade_task3(
+            classified_correctly=False,
+            action_correct=True,
+            action_partial=False,
+            reply_text="here is the refund",
+            category="billing",
+            resolved=True,
+            steps_taken=2,
+        )
+        # Should not get the 0.20 classification bonus
+        assert score < 1.0
+    def test_partial_action(self):
+        """A partially correct action scores below a fully correct one, all else equal."""
+        score_partial = grade_task3(
+            classified_correctly=True,
+            action_correct=False,
+            action_partial=True,
+            reply_text=None,
+            category="technical",
+            resolved=True,
+            steps_taken=2,
+        )
+        score_correct = grade_task3(
+            classified_correctly=True,
+            action_correct=True,
+            action_partial=False,
+            reply_text=None,
+            category="technical",
+            resolved=True,
+            steps_taken=2,
+        )
+        assert score_partial < score_correct
+    def test_score_capped_at_one(self):
+        """Even a keyword-heavy reply does not push the score above 1.0."""
+        score = grade_task3(
+            classified_correctly=True,
+            action_correct=True,
+            action_partial=False,
+            reply_text="refund billing payment account cancel subscription",
+            category="billing",
+            resolved=True,
+            steps_taken=1,
+            max_steps=5,
+        )
+        assert score <= 1.0
+class TestLoopPenalty:
+    """Expectations for ``loop_penalty(step_count)``."""
+    def test_no_penalty_within_limit(self):
+        """No penalty at step counts 5 and 10."""
+        assert loop_penalty(5) == 0.0
+        assert loop_penalty(10) == 0.0
+    def test_penalty_beyond_limit(self):
+        """The penalty is negative at step 11 and more negative at step 15."""
+        assert loop_penalty(11) < 0.0
+        assert loop_penalty(15) < loop_penalty(11)
+    def test_penalty_grows(self):
+        """The penalty at step 14 is more negative than at step 12."""
+        p1 = loop_penalty(12)
+        p2 = loop_penalty(14)
+        assert p2 < p1

tickets.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""
+Realistic support ticket dataset with ground-truth labels.
+Each ticket includes:
+  - id
+  - text   : customer message
+  - category : ground-truth category
+  - correct_action : best first action ("reply" | "escalate" | "close")
+  - resolution_hint : ideal reply / close reason (used for reward scoring)
+"""
+# Ground-truth ticket table: ten hand-written tickets, ids T001-T010.
+# Every entry is a flat dict of strings with the same five keys
+# (id, text, category, correct_action, resolution_hint).
+# Composition of the labels below:
+#   category       - billing x3, account x2, technical x2, general x2, refund x1
+#   correct_action - reply x6, escalate x3, close x1
+TICKETS = [
+    {
+        "id": "T001",
+        "text": "Hi, I was charged twice for my subscription this month. Please help!",
+        "category": "billing",
+        "correct_action": "reply",
+        "resolution_hint": "apologize and initiate refund for duplicate charge",
+    },
+    {
+        "id": "T002",
+        "text": "I cannot log into my account. The password reset email never arrives.",
+        "category": "account",
+        "correct_action": "reply",
+        "resolution_hint": "guide user to check spam folder and verify email address",
+    },
+    {
+        "id": "T003",
+        "text": "Your app crashes every time I try to upload a file larger than 10 MB.",
+        "category": "technical",
+        "correct_action": "escalate",
+        "resolution_hint": "escalate to engineering team with crash details",
+    },
+    {
+        "id": "T004",
+        "text": "I'd like a full refund. I haven't used the service at all this month.",
+        "category": "refund",
+        "correct_action": "reply",
+        "resolution_hint": "verify account activity and process refund per policy",
+    },
+    {
+        "id": "T005",
+        "text": "What are your business hours and do you have a phone number I can call?",
+        "category": "general",
+        "correct_action": "reply",
+        "resolution_hint": "provide business hours and contact information",
+    },
+    {
+        "id": "T006",
+        "text": "My invoice shows a charge for a plan I never subscribed to.",
+        "category": "billing",
+        "correct_action": "escalate",
+        "resolution_hint": "escalate potential fraudulent charge to billing team",
+    },
+    {
+        "id": "T007",
+        "text": "How do I cancel my subscription? I can't find the option anywhere.",
+        "category": "account",
+        "correct_action": "reply",
+        "resolution_hint": "guide user to account settings > subscription > cancel",
+    },
+    {
+        "id": "T008",
+        "text": "The API is returning 500 errors intermittently for the past 2 hours.",
+        "category": "technical",
+        "correct_action": "escalate",
+        "resolution_hint": "escalate to on-call engineering with timestamps",
+    },
+    {
+        "id": "T009",
+        "text": "Thank you! The issue has been resolved. You guys are awesome.",
+        "category": "general",
+        "correct_action": "close",
+        "resolution_hint": "acknowledge and close the ticket",
+    },
+    {
+        "id": "T010",
+        "text": "I need an itemised invoice for my company's accounting department.",
+        "category": "billing",
+        "correct_action": "reply",
+        "resolution_hint": "generate and send itemised invoice to customer email",
+    },
+]
+# Index of the tickets above keyed by their "id" string. The values are the
+# very same dict objects held in TICKETS (not copies), so mutating one is
+# visible through the other. If two tickets ever shared an id, the later
+# entry would silently replace the earlier one here.
+TICKET_LOOKUP = {t["id"]: t for t in TICKETS}