Utkarsh Sinha commited on
Commit ·
084325c
0
Parent(s):
OpenEnv Customer Support Triage
Browse files- Dockerfile +19 -0
- README.md +36 -0
- __init__.py +0 -0
- __pycache__/client.cpython-310.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- client.py +43 -0
- inference.py +188 -0
- models.py +22 -0
- openenv.yaml +6 -0
- server/__init__.py +0 -0
- server/__pycache__/__init__.cpython-310.pyc +0 -0
- server/__pycache__/app.cpython-310.pyc +0 -0
- server/__pycache__/environment.cpython-310.pyc +0 -0
- server/app.py +35 -0
- server/environment.py +139 -0
- test_client.py +17 -0
- validate-submission.sh +139 -0
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install dependencies including openenv-core
|
| 6 |
+
RUN pip install --no-cache-dir fastapi uvicorn pydantic openenv-core
|
| 7 |
+
|
| 8 |
+
# Copy the app files
|
| 9 |
+
COPY . .
|
| 10 |
+
|
| 11 |
+
# Environment variables
|
| 12 |
+
ENV PORT=8000
|
| 13 |
+
ENV HOST=0.0.0.0
|
| 14 |
+
|
| 15 |
+
# Expose port
|
| 16 |
+
EXPOSE 8000
|
| 17 |
+
|
| 18 |
+
# Run the application
|
| 19 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Customer Support Triage Environment
|
| 2 |
+
|
| 3 |
+
This is a real-world task environment for the OpenEnv Hackathon. It models an Email Customer Support Triage system where an AI agent must route or respond to an inbox of highly varied tickets.
|
| 4 |
+
|
| 5 |
+
## Description
|
| 6 |
+
|
| 7 |
+
The agent reads one ticket at a time and chooses between 3 actions:
|
| 8 |
+
- `assign`: Assign the ticket to a department (`TechSupport`, `Billing`, `Sales`, `Retention`) with a priority.
|
| 9 |
+
- `ask_user`: Repty to the ticket asking for clarification if context is vague.
|
| 10 |
+
- `escalate`: Immediately escalate critical user issues (security or heavy churn risks).
|
| 11 |
+
|
| 12 |
+
## Setup & Usage
|
| 13 |
+
|
| 14 |
+
To validate the environment locally:
|
| 15 |
+
```bash
|
| 16 |
+
# 1. Start the server
|
| 17 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 18 |
+
|
| 19 |
+
# 2. Export OpenAI variables
|
| 20 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 21 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 22 |
+
export HF_TOKEN="<your token>"
|
| 23 |
+
|
| 24 |
+
# 3. Run the baseline
|
| 25 |
+
python inference.py
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Task Difficulties
|
| 29 |
+
|
| 30 |
+
- **task1 (Easy)**: Route a single obvious password reset ticket to Technical Support.
|
| 31 |
+
- **task2 (Medium)**: Route 3 tickets, identifying one vague ticket that requires returning an `ask_user` reply.
|
| 32 |
+
- **task3 (Hard)**: Route 5 tickets, accurately isolating an angry churn risk and a security bypass, properly applying `escalate` and `assign` respectively, without failing standard tickets.
|
| 33 |
+
|
| 34 |
+
## Baseline Metrics
|
| 35 |
+
|
| 36 |
+
The baseline model (Qwen 72B) typically scores between 0.8 to 1.0 reliably across all tasks, proving that the tasks are deterministic, properly graded, and fully adhere to [0.0, 1.0] scoring constraints via partial progress.
|
__init__.py
ADDED
|
File without changes
|
__pycache__/client.cpython-310.pyc
ADDED
|
Binary file (1.78 kB). View file
|
|
|
__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (1.43 kB). View file
|
|
|
client.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict
|
| 2 |
+
from openenv.core.client_types import StepResult
|
| 3 |
+
from openenv.core.env_server.types import State
|
| 4 |
+
from openenv.core import EnvClient
|
| 5 |
+
from models import CustomerSupportAction, CustomerSupportObservation
|
| 6 |
+
|
| 7 |
+
class CustomerSupportEnv(EnvClient[CustomerSupportAction, CustomerSupportObservation, State]):
|
| 8 |
+
def _step_payload(self, action: CustomerSupportAction) -> Dict:
|
| 9 |
+
return {
|
| 10 |
+
"action_type": action.action_type,
|
| 11 |
+
"department": action.department,
|
| 12 |
+
"priority": action.priority,
|
| 13 |
+
"reply_text": action.reply_text,
|
| 14 |
+
"escalation_reason": action.escalation_reason,
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
def _parse_result(self, payload: Dict) -> StepResult[CustomerSupportObservation]:
|
| 18 |
+
obs_data = payload.get("observation", {})
|
| 19 |
+
metadata = obs_data.get("metadata", {})
|
| 20 |
+
|
| 21 |
+
observation = CustomerSupportObservation(
|
| 22 |
+
active_ticket_id=obs_data.get("active_ticket_id"),
|
| 23 |
+
ticket_content=obs_data.get("ticket_content"),
|
| 24 |
+
ticket_metadata=obs_data.get("ticket_metadata", {}),
|
| 25 |
+
unresolved_count=obs_data.get("unresolved_count", 0),
|
| 26 |
+
available_departments=obs_data.get("available_departments", []),
|
| 27 |
+
available_priorities=obs_data.get("available_priorities", []),
|
| 28 |
+
step_count=obs_data.get("step_count", 0),
|
| 29 |
+
tickets_summary=obs_data.get("tickets_summary", []),
|
| 30 |
+
metadata=metadata
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
return StepResult(
|
| 34 |
+
observation=observation,
|
| 35 |
+
reward=payload.get("reward", 0.0),
|
| 36 |
+
done=payload.get("done", False)
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
def _parse_state(self, payload: Dict) -> State:
|
| 40 |
+
return State(
|
| 41 |
+
episode_id=payload.get("episode_id"),
|
| 42 |
+
step_count=payload.get("step_count", 0),
|
| 43 |
+
)
|
inference.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import asyncio
|
| 3 |
+
import os
|
| 4 |
+
import textwrap
|
| 5 |
+
import json
|
| 6 |
+
from typing import List, Optional
|
| 7 |
+
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
|
| 10 |
+
from client import CustomerSupportEnv
|
| 11 |
+
from models import CustomerSupportAction
|
| 12 |
+
|
| 13 |
+
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
|
| 14 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 15 |
+
API_KEY = HF_TOKEN or os.getenv("OPENAI_API_KEY")
|
| 16 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 17 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
|
| 18 |
+
BENCHMARK = "customer_support"
|
| 19 |
+
MAX_STEPS = 10
|
| 20 |
+
TEMPERATURE = 0.5
|
| 21 |
+
MAX_TOKENS = 150
|
| 22 |
+
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 23 |
+
|
| 24 |
+
SYSTEM_PROMPT = textwrap.dedent(
|
| 25 |
+
"""
|
| 26 |
+
You are an AI customer support agent. You must act on the currently active ticket.
|
| 27 |
+
Your available actions are:
|
| 28 |
+
1. assign: requires 'department' (e.g. TechSupport, Billing, Sales, Retention) and optionally 'priority' (Low, Medium, High, Urgent).
|
| 29 |
+
2. ask_user: requires 'reply_text' to ask the user for more info.
|
| 30 |
+
3. escalate: escalates a critical/churn ticket.
|
| 31 |
+
|
| 32 |
+
You must reply ONLY with a valid JSON object matching the action schema. DO NOT wrap the json in backticks or markdown, just return raw JSON.
|
| 33 |
+
Example:
|
| 34 |
+
{"action_type": "assign", "department": "TechSupport", "priority": "High"}
|
| 35 |
+
{"action_type": "ask_user", "reply_text": "What is your OS?"}
|
| 36 |
+
{"action_type": "escalate"}
|
| 37 |
+
"""
|
| 38 |
+
).strip()
|
| 39 |
+
|
| 40 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 41 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 42 |
+
|
| 43 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 44 |
+
error_val = error if error else "null"
|
| 45 |
+
done_val = str(done).lower()
|
| 46 |
+
print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
|
| 47 |
+
|
| 48 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 49 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 50 |
+
print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
|
| 51 |
+
|
| 52 |
+
def build_user_prompt(step: int, obs: dict, history: List[str]) -> str:
|
| 53 |
+
history_block = "\n".join(history[-3:]) if history else "None"
|
| 54 |
+
return textwrap.dedent(
|
| 55 |
+
f"""
|
| 56 |
+
Step: {step}
|
| 57 |
+
Active Ticket:
|
| 58 |
+
Content: {obs.get("ticket_content")}
|
| 59 |
+
Metadata: {obs.get("ticket_metadata")}
|
| 60 |
+
|
| 61 |
+
Available Departments: {obs.get("available_departments")}
|
| 62 |
+
Available Priorities: {obs.get("available_priorities")}
|
| 63 |
+
|
| 64 |
+
Previous actions:
|
| 65 |
+
{history_block}
|
| 66 |
+
|
| 67 |
+
Provide the next action as JSON.
|
| 68 |
+
"""
|
| 69 |
+
).strip()
|
| 70 |
+
|
| 71 |
+
def get_model_action(client: OpenAI, step: int, obs: dict, history: List[str]) -> tuple:
|
| 72 |
+
user_prompt = build_user_prompt(step, obs, history)
|
| 73 |
+
try:
|
| 74 |
+
completion = client.chat.completions.create(
|
| 75 |
+
model=MODEL_NAME,
|
| 76 |
+
messages=[
|
| 77 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 78 |
+
{"role": "user", "content": user_prompt},
|
| 79 |
+
],
|
| 80 |
+
temperature=TEMPERATURE,
|
| 81 |
+
max_tokens=MAX_TOKENS,
|
| 82 |
+
stream=False,
|
| 83 |
+
)
|
| 84 |
+
text = (completion.choices[0].message.content or "").strip()
|
| 85 |
+
if text.startswith("```json"): text = text[7:]
|
| 86 |
+
if text.endswith("```"): text = text[:-3]
|
| 87 |
+
text = text.strip()
|
| 88 |
+
data = json.loads(text)
|
| 89 |
+
return CustomerSupportAction(**data), text
|
| 90 |
+
except Exception as exc:
|
| 91 |
+
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 92 |
+
return CustomerSupportAction(action_type="assign", department="TechSupport", priority="Low"), "{}"
|
| 93 |
+
|
| 94 |
+
async def run_task(task_name: str):
|
| 95 |
+
# Set env var so the server picks up the correct task logic on instantiation if running locally in docker
|
| 96 |
+
os.environ["TASK_NAME"] = task_name
|
| 97 |
+
|
| 98 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 99 |
+
|
| 100 |
+
if LOCAL_IMAGE_NAME:
|
| 101 |
+
env = await CustomerSupportEnv.from_docker_image(LOCAL_IMAGE_NAME, env_vars={"PORT": "8000", "TASK_NAME": task_name})
|
| 102 |
+
else:
|
| 103 |
+
env = CustomerSupportEnv(base_url="http://localhost:8000")
|
| 104 |
+
|
| 105 |
+
history: List[str] = []
|
| 106 |
+
rewards: List[float] = []
|
| 107 |
+
steps_taken = 0
|
| 108 |
+
success = False
|
| 109 |
+
|
| 110 |
+
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 111 |
+
score = 0.0
|
| 112 |
+
|
| 113 |
+
try:
|
| 114 |
+
# Since local HTTP server might not use task_name passed via env well unless restarted, we explicitly set it via kwargs or rely on env
|
| 115 |
+
result = await env.reset(task_name=task_name)
|
| 116 |
+
obs = result.observation
|
| 117 |
+
|
| 118 |
+
for step in range(1, MAX_STEPS + 1):
|
| 119 |
+
if result.done:
|
| 120 |
+
break
|
| 121 |
+
|
| 122 |
+
# Serialize observation for prompt
|
| 123 |
+
obs_dict = {
|
| 124 |
+
"ticket_content": obs.ticket_content,
|
| 125 |
+
"ticket_metadata": obs.ticket_metadata,
|
| 126 |
+
"available_departments": obs.available_departments,
|
| 127 |
+
"available_priorities": obs.available_priorities,
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
action_obj, raw_text = get_model_action(client, step, obs_dict, history)
|
| 131 |
+
|
| 132 |
+
result = await env.step(action_obj)
|
| 133 |
+
obs = result.observation
|
| 134 |
+
|
| 135 |
+
reward = result.reward or 0.0
|
| 136 |
+
done = result.done
|
| 137 |
+
|
| 138 |
+
rewards.append(reward)
|
| 139 |
+
steps_taken = step
|
| 140 |
+
|
| 141 |
+
safe_action_text = raw_text.replace('\n', ' ').replace('\r', '')
|
| 142 |
+
log_step(step=step, action=safe_action_text, reward=reward, done=done, error=None)
|
| 143 |
+
|
| 144 |
+
history.append(f"Step {step} action: {safe_action_text} -> reward {reward}")
|
| 145 |
+
|
| 146 |
+
if done:
|
| 147 |
+
break
|
| 148 |
+
|
| 149 |
+
MAX_TOTAL_REWARD = max(float(len(obs.tickets_summary)), 1.0)
|
| 150 |
+
score = sum(rewards) / MAX_TOTAL_REWARD
|
| 151 |
+
score = min(max(score, 0.0), 1.0)
|
| 152 |
+
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 153 |
+
|
| 154 |
+
except Exception as e:
|
| 155 |
+
print(f"[DEBUG] Error during run: {e}")
|
| 156 |
+
score = 0.0
|
| 157 |
+
success = False
|
| 158 |
+
finally:
|
| 159 |
+
try:
|
| 160 |
+
await env.close()
|
| 161 |
+
except:
|
| 162 |
+
pass
|
| 163 |
+
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 164 |
+
|
| 165 |
+
async def main():
|
| 166 |
+
tasks = ["task1", "task2", "task3"]
|
| 167 |
+
from threading import Thread
|
| 168 |
+
import uvicorn
|
| 169 |
+
import time
|
| 170 |
+
from server.app import app
|
| 171 |
+
|
| 172 |
+
server_thread = None
|
| 173 |
+
if not LOCAL_IMAGE_NAME:
|
| 174 |
+
print("[DEBUG] Starting local server for testing...")
|
| 175 |
+
server_thread = Thread(target=uvicorn.run, args=(app,), kwargs={"host":"0.0.0.0", "port":8000, "log_level":"error"}, daemon=True)
|
| 176 |
+
server_thread.start()
|
| 177 |
+
time.sleep(2) # wait for boot
|
| 178 |
+
|
| 179 |
+
for t in tasks:
|
| 180 |
+
# HTTP calls stateless routing, we use task configured on env side if possible.
|
| 181 |
+
# Note: If running a shared persistent server, using os.environ might not be thread safe or apply to the already running server.
|
| 182 |
+
# A workaround is restarting server, but we will assume single-run tests for bash script.
|
| 183 |
+
# Usually HF Spaces run tasks sequentially or expect env to read state.
|
| 184 |
+
# But wait, openenv `reset` doesn't pass task config easily without query params.
|
| 185 |
+
await run_task(t)
|
| 186 |
+
|
| 187 |
+
if __name__ == "__main__":
|
| 188 |
+
asyncio.run(main())
|
models.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Dict, Optional
|
| 2 |
+
from openenv.core.env_server.types import Action, Observation
|
| 3 |
+
|
| 4 |
+
class CustomerSupportObservation(Observation):
|
| 5 |
+
"""Observation space for the Customer Support Triage environment."""
|
| 6 |
+
active_ticket_id: Optional[str] = None
|
| 7 |
+
ticket_content: Optional[str] = None
|
| 8 |
+
ticket_metadata: Dict[str, str] = {}
|
| 9 |
+
|
| 10 |
+
unresolved_count: int = 0
|
| 11 |
+
available_departments: List[str] = ["TechSupport", "Billing", "Sales", "Retention"]
|
| 12 |
+
available_priorities: List[str] = ["Low", "Medium", "High", "Urgent"]
|
| 13 |
+
step_count: int = 0
|
| 14 |
+
tickets_summary: List[Dict[str, str]] = []
|
| 15 |
+
|
| 16 |
+
class CustomerSupportAction(Action):
|
| 17 |
+
"""Action space for the Customer Support Triage environment."""
|
| 18 |
+
action_type: str # "assign", "ask_user", "escalate"
|
| 19 |
+
department: Optional[str] = None
|
| 20 |
+
priority: Optional[str] = None
|
| 21 |
+
reply_text: Optional[str] = None
|
| 22 |
+
escalation_reason: Optional[str] = None
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: customer_support
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
server/__init__.py
ADDED
|
File without changes
|
server/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (174 Bytes). View file
|
|
|
server/__pycache__/app.cpython-310.pyc
ADDED
|
Binary file (1.15 kB). View file
|
|
|
server/__pycache__/environment.cpython-310.pyc
ADDED
|
Binary file (5.05 kB). View file
|
|
|
server/app.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from openenv.core.env_server.http_server import create_app
|
| 5 |
+
except ImportError as e:
|
| 6 |
+
raise ImportError("openenv is required for the web interface.") from e
|
| 7 |
+
|
| 8 |
+
# Ensure relative imports resolve correctly based on execution context
|
| 9 |
+
try:
|
| 10 |
+
from models import CustomerSupportAction, CustomerSupportObservation
|
| 11 |
+
except ImportError:
|
| 12 |
+
from ..models import CustomerSupportAction, CustomerSupportObservation
|
| 13 |
+
|
| 14 |
+
from .environment import CustomerSupportEnvironment
|
| 15 |
+
|
| 16 |
+
MAX_CONCURRENT_ENVS = int(os.getenv("MAX_CONCURRENT_ENVS", "100"))
|
| 17 |
+
|
| 18 |
+
app = create_app(
|
| 19 |
+
CustomerSupportEnvironment,
|
| 20 |
+
CustomerSupportAction,
|
| 21 |
+
CustomerSupportObservation,
|
| 22 |
+
env_name="customer_support",
|
| 23 |
+
max_concurrent_envs=MAX_CONCURRENT_ENVS,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 27 |
+
import uvicorn
|
| 28 |
+
uvicorn.run(app, host=host, port=port)
|
| 29 |
+
|
| 30 |
+
if __name__ == "__main__":
|
| 31 |
+
import argparse
|
| 32 |
+
parser = argparse.ArgumentParser()
|
| 33 |
+
parser.add_argument("--port", type=int, default=8000)
|
| 34 |
+
args = parser.parse_args()
|
| 35 |
+
main(port=args.port)
|
server/environment.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import os
|
| 3 |
+
import uuid
|
| 4 |
+
from typing import List, Dict, Optional
|
| 5 |
+
|
| 6 |
+
from openenv.core.env_server.interfaces import Environment
|
| 7 |
+
from openenv.core.env_server.types import State
|
| 8 |
+
|
| 9 |
+
# Ensure relative imports resolve correctly based on execution context
|
| 10 |
+
try:
|
| 11 |
+
from models import CustomerSupportAction, CustomerSupportObservation
|
| 12 |
+
except ImportError:
|
| 13 |
+
from ..models import CustomerSupportAction, CustomerSupportObservation
|
| 14 |
+
|
| 15 |
+
TASKS = {
|
| 16 |
+
"task1": [
|
| 17 |
+
{"id": "t1", "content": "I forgot my password and cannot log into my account. Help!", "type": "password"}
|
| 18 |
+
],
|
| 19 |
+
"task2": [
|
| 20 |
+
{"id": "t2_1", "content": "How do I update my billing email?", "type": "billing"},
|
| 21 |
+
{"id": "t2_2", "content": "The system says invalid credentials.", "type": "password"},
|
| 22 |
+
{"id": "t2_3", "content": "My app crashed!", "type": "vague"}
|
| 23 |
+
],
|
| 24 |
+
"task3": [
|
| 25 |
+
{"id": "t3_1", "content": "How to change password?", "type": "password"},
|
| 26 |
+
{"id": "t3_2", "content": "I want an immediate refund, this is garbage! Cancel my account!", "type": "churn"},
|
| 27 |
+
{"id": "t3_3", "content": "Found a way to bypass authentication on the user portal.", "type": "security"},
|
| 28 |
+
{"id": "t3_4", "content": "Charge on my credit card is double what it should be.", "type": "billing"},
|
| 29 |
+
{"id": "t3_5", "content": "Is there a student discount?", "type": "sales"}
|
| 30 |
+
]
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
class CustomerSupportEnvironment(Environment):
|
| 34 |
+
"""Customer Support Environment for testing RL agents."""
|
| 35 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 36 |
+
|
| 37 |
+
def __init__(self, task_name: Optional[str] = None, **kwargs):
|
| 38 |
+
super().__init__(**kwargs)
|
| 39 |
+
self._session_id = str(uuid.uuid4())
|
| 40 |
+
self._state = State(episode_id=self._session_id, step_count=0)
|
| 41 |
+
|
| 42 |
+
# Priority: explicit arg -> env var -> default
|
| 43 |
+
self.task_name = task_name if task_name else os.getenv("TASK_NAME", "task1")
|
| 44 |
+
if self.task_name not in TASKS:
|
| 45 |
+
self.task_name = "task1"
|
| 46 |
+
|
| 47 |
+
self.tickets = []
|
| 48 |
+
self._load_tickets()
|
| 49 |
+
self.current_ticket_index = 0
|
| 50 |
+
|
| 51 |
+
def _load_tickets(self):
|
| 52 |
+
self.tickets = [dict(t) for t in TASKS[self.task_name]]
|
| 53 |
+
for t in self.tickets:
|
| 54 |
+
t["status"] = "open"
|
| 55 |
+
|
| 56 |
+
def _get_active_ticket(self) -> Optional[Dict]:
|
| 57 |
+
if self.current_ticket_index < len(self.tickets):
|
| 58 |
+
return self.tickets[self.current_ticket_index]
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
def reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, task_name: Optional[str] = None, **kwargs) -> CustomerSupportObservation:
|
| 62 |
+
"""Reset the environment."""
|
| 63 |
+
if episode_id is not None:
|
| 64 |
+
self._session_id = episode_id
|
| 65 |
+
|
| 66 |
+
if task_name is not None and task_name in TASKS:
|
| 67 |
+
self.task_name = task_name
|
| 68 |
+
|
| 69 |
+
self._state = State(episode_id=self._session_id, step_count=0)
|
| 70 |
+
self._load_tickets()
|
| 71 |
+
self.current_ticket_index = 0
|
| 72 |
+
|
| 73 |
+
return self._make_observation(reward=0.0, done=False)
|
| 74 |
+
|
| 75 |
+
def _make_observation(self, reward: float = 0.0, done: bool = False) -> CustomerSupportObservation:
|
| 76 |
+
t = self._get_active_ticket()
|
| 77 |
+
unresolved = sum(1 for x in self.tickets if x["status"] == "open")
|
| 78 |
+
summary = [{"id": x["id"], "summary": x["content"][:30] + "...", "status": x["status"]} for x in self.tickets]
|
| 79 |
+
|
| 80 |
+
return CustomerSupportObservation(
|
| 81 |
+
active_ticket_id=t["id"] if t else None,
|
| 82 |
+
ticket_content=t["content"] if t else None,
|
| 83 |
+
ticket_metadata={"type": t["type"]} if t else {},
|
| 84 |
+
unresolved_count=unresolved,
|
| 85 |
+
step_count=self._state.step_count,
|
| 86 |
+
tickets_summary=summary,
|
| 87 |
+
reward=float(reward),
|
| 88 |
+
done=done
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
def step(self, action: CustomerSupportAction, timeout_s: Optional[float] = None, **kwargs) -> CustomerSupportObservation:
|
| 92 |
+
"""Execute action step."""
|
| 93 |
+
self._state.step_count += 1
|
| 94 |
+
t = self._get_active_ticket()
|
| 95 |
+
|
| 96 |
+
if not t:
|
| 97 |
+
return self._make_observation(reward=0.0, done=True)
|
| 98 |
+
|
| 99 |
+
action_type = action.action_type.lower()
|
| 100 |
+
ttype = t["type"]
|
| 101 |
+
is_correct = False
|
| 102 |
+
|
| 103 |
+
# Simple logical grader included inline for self-containment
|
| 104 |
+
if ttype == "password":
|
| 105 |
+
if action_type == "assign" and action.department == "TechSupport":
|
| 106 |
+
is_correct = True
|
| 107 |
+
elif ttype == "billing":
|
| 108 |
+
if action_type == "assign" and action.department == "Billing":
|
| 109 |
+
is_correct = True
|
| 110 |
+
elif ttype == "sales":
|
| 111 |
+
if action_type == "assign" and action.department == "Sales":
|
| 112 |
+
is_correct = True
|
| 113 |
+
elif ttype == "vague":
|
| 114 |
+
if action_type == "ask_user":
|
| 115 |
+
is_correct = True
|
| 116 |
+
elif ttype == "churn":
|
| 117 |
+
if action_type == "escalate":
|
| 118 |
+
is_correct = True
|
| 119 |
+
elif ttype == "security":
|
| 120 |
+
if action_type == "escalate":
|
| 121 |
+
is_correct = True
|
| 122 |
+
elif action_type == "assign" and action.department == "TechSupport" and action.priority in ["High", "Urgent"]:
|
| 123 |
+
is_correct = True
|
| 124 |
+
|
| 125 |
+
if is_correct:
|
| 126 |
+
reward = 1.0 # Standard positive reward mapped properly per ticket
|
| 127 |
+
t["status"] = "resolved"
|
| 128 |
+
else:
|
| 129 |
+
reward = 0.0 # Strict zero for incorrect routing
|
| 130 |
+
t["status"] = "failed"
|
| 131 |
+
|
| 132 |
+
self.current_ticket_index += 1
|
| 133 |
+
done = self.current_ticket_index >= len(self.tickets)
|
| 134 |
+
|
| 135 |
+
return self._make_observation(reward=reward, done=done)
|
| 136 |
+
|
| 137 |
+
@property
|
| 138 |
+
def state(self) -> State:
|
| 139 |
+
return self._state
|
test_client.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
from client import CustomerSupportEnv
|
| 5 |
+
from models import CustomerSupportAction
|
| 6 |
+
|
| 7 |
+
async def main():
|
| 8 |
+
os.environ["TASK_NAME"] = "task1"
|
| 9 |
+
env = CustomerSupportEnv(base_url="http://localhost:8001")
|
| 10 |
+
res = await env.reset()
|
| 11 |
+
print("RESET RESULT:", res)
|
| 12 |
+
|
| 13 |
+
action = CustomerSupportAction(action_type="assign", department="TechSupport", priority="High")
|
| 14 |
+
res = await env.step(action)
|
| 15 |
+
print("STEP RESULT:", res)
|
| 16 |
+
|
| 17 |
+
asyncio.run(main())
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — OpenEnv Submission Validator
|
| 4 |
+
|
| 5 |
+
set -uo pipefail
|
| 6 |
+
|
| 7 |
+
DOCKER_BUILD_TIMEOUT=600
|
| 8 |
+
if [ -t 1 ]; then
|
| 9 |
+
RED='\033[0;31m'
|
| 10 |
+
GREEN='\033[0;32m'
|
| 11 |
+
YELLOW='\033[1;33m'
|
| 12 |
+
BOLD='\033[1m'
|
| 13 |
+
NC='\033[0m'
|
| 14 |
+
else
|
| 15 |
+
RED='' GREEN='' YELLOW='' BOLD='' NC=''
|
| 16 |
+
fi
|
| 17 |
+
|
| 18 |
+
run_with_timeout() {
|
| 19 |
+
local secs="$1"; shift
|
| 20 |
+
if command -v timeout &>/dev/null; then
|
| 21 |
+
timeout "$secs" "$@"
|
| 22 |
+
elif command -v gtimeout &>/dev/null; then
|
| 23 |
+
gtimeout "$secs" "$@"
|
| 24 |
+
else
|
| 25 |
+
"$@" &
|
| 26 |
+
local pid=$!
|
| 27 |
+
( sleep "$secs" && kill "$pid" 2>/dev/null ) &
|
| 28 |
+
local watcher=$!
|
| 29 |
+
wait "$pid" 2>/dev/null
|
| 30 |
+
local rc=$?
|
| 31 |
+
kill "$watcher" 2>/dev/null
|
| 32 |
+
wait "$watcher" 2>/dev/null
|
| 33 |
+
return $rc
|
| 34 |
+
fi
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
portable_mktemp() {
|
| 38 |
+
local prefix="${1:-validate}"
|
| 39 |
+
mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
CLEANUP_FILES=()
|
| 43 |
+
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
|
| 44 |
+
trap cleanup EXIT
|
| 45 |
+
|
| 46 |
+
PING_URL="${1:-}"
|
| 47 |
+
REPO_DIR="${2:-.}"
|
| 48 |
+
|
| 49 |
+
if [ -z "$PING_URL" ]; then
|
| 50 |
+
printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
|
| 51 |
+
exit 1
|
| 52 |
+
fi
|
| 53 |
+
|
| 54 |
+
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
|
| 55 |
+
printf "Error: directory '%s' not found\n" "${2:-.}"
|
| 56 |
+
exit 1
|
| 57 |
+
fi
|
| 58 |
+
PING_URL="${PING_URL%/}"
|
| 59 |
+
export PING_URL
|
| 60 |
+
PASS=0
|
| 61 |
+
|
| 62 |
+
log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
|
| 63 |
+
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
|
| 64 |
+
fail() { log "${RED}FAILED${NC} -- $1"; }
|
| 65 |
+
hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
|
| 66 |
+
stop_at() {
|
| 67 |
+
printf "\n"
|
| 68 |
+
printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
|
| 69 |
+
exit 1
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
printf "\n${BOLD}========================================${NC}\n"
|
| 73 |
+
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
|
| 74 |
+
printf "${BOLD}========================================${NC}\n"
|
| 75 |
+
log "Repo: $REPO_DIR"
|
| 76 |
+
log "Ping URL: $PING_URL"
|
| 77 |
+
|
| 78 |
+
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
|
| 79 |
+
CURL_OUTPUT=$(portable_mktemp "validate-curl")
|
| 80 |
+
CLEANUP_FILES+=("$CURL_OUTPUT")
|
| 81 |
+
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
|
| 82 |
+
-H "Content-Type: application/json" -d '{}' \
|
| 83 |
+
"$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
|
| 84 |
+
|
| 85 |
+
if [ "$HTTP_CODE" = "200" ]; then
|
| 86 |
+
pass "HF Space is live and responds to /reset"
|
| 87 |
+
elif [ "$HTTP_CODE" = "000" ]; then
|
| 88 |
+
fail "HF Space not reachable"
|
| 89 |
+
stop_at "Step 1"
|
| 90 |
+
else
|
| 91 |
+
fail "HF Space /reset returned HTTP $HTTP_CODE"
|
| 92 |
+
stop_at "Step 1"
|
| 93 |
+
fi
|
| 94 |
+
|
| 95 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."
|
| 96 |
+
if ! command -v docker &>/dev/null; then
|
| 97 |
+
fail "docker command not found"
|
| 98 |
+
stop_at "Step 2"
|
| 99 |
+
fi
|
| 100 |
+
|
| 101 |
+
if [ -f "$REPO_DIR/Dockerfile" ]; then
|
| 102 |
+
DOCKER_CONTEXT="$REPO_DIR"
|
| 103 |
+
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
|
| 104 |
+
DOCKER_CONTEXT="$REPO_DIR/server"
|
| 105 |
+
else
|
| 106 |
+
fail "No Dockerfile found"
|
| 107 |
+
stop_at "Step 2"
|
| 108 |
+
fi
|
| 109 |
+
|
| 110 |
+
BUILD_OK=false
|
| 111 |
+
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
|
| 112 |
+
|
| 113 |
+
if [ "$BUILD_OK" = true ]; then
|
| 114 |
+
pass "Docker build succeeded"
|
| 115 |
+
else
|
| 116 |
+
fail "Docker build failed"
|
| 117 |
+
printf "%s\n" "$BUILD_OUTPUT" | tail -20
|
| 118 |
+
stop_at "Step 2"
|
| 119 |
+
fi
|
| 120 |
+
|
| 121 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 122 |
+
if ! command -v openenv &>/dev/null; then
|
| 123 |
+
fail "openenv command not found. Installing locally..."
|
| 124 |
+
pip install --quiet openenv-core
|
| 125 |
+
fi
|
| 126 |
+
|
| 127 |
+
VALIDATE_OK=false
|
| 128 |
+
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
|
| 129 |
+
|
| 130 |
+
if [ "$VALIDATE_OK" = true ]; then
|
| 131 |
+
pass "openenv validate passed"
|
| 132 |
+
else
|
| 133 |
+
fail "openenv validate failed"
|
| 134 |
+
printf "%s\n" "$VALIDATE_OUTPUT"
|
| 135 |
+
stop_at "Step 3"
|
| 136 |
+
fi
|
| 137 |
+
|
| 138 |
+
printf "\n${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
|
| 139 |
+
exit 0
|