Upload 13 files
Browse files- Dockerfile +28 -0
- README.md +39 -6
- env.py +121 -0
- full_validate.sh +185 -0
- inference.py +102 -0
- local_validate.sh +15 -0
- models.py +27 -0
- openenv.yaml +13 -0
- pyproject.toml +22 -0
- requirements.txt +6 -0
- server/app.py +21 -0
- tasks.py +46 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use lightweight python image
|
| 2 |
+
FROM python:3.10-slim
|
| 3 |
+
|
| 4 |
+
# Runtime defaults: unbuffered output plus model/API settings (override at run time)
|
| 5 |
+
ENV PYTHONUNBUFFERED=1
|
| 6 |
+
ENV API_BASE_URL="https://api.openai.com/v1"
|
| 7 |
+
ENV MODEL_NAME="gpt-4.1-mini"
|
| 8 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 9 |
+
|
| 10 |
+
# Setup working directory
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
ENV PYTHONPATH=/app
|
| 13 |
+
|
| 14 |
+
# Install dependencies securely
|
| 15 |
+
COPY requirements.txt .
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
+
|
| 18 |
+
# Copy all core files
|
| 19 |
+
COPY . /app
|
| 20 |
+
|
| 21 |
+
# Ensure HF Space port exposes correctly
|
| 22 |
+
EXPOSE 7860
|
| 23 |
+
|
| 24 |
+
# Alternative CMD for testing locally:
|
| 25 |
+
# CMD ["python", "inference.py"]
|
| 26 |
+
|
| 27 |
+
# CMD strictly runs the FastAPI server
|
| 28 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,11 +1,44 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
-
license: mit
|
| 9 |
---
|
|
|
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Customer Support Triage
|
| 3 |
+
emoji: 🎧
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: docker
|
| 7 |
+
sdk_version: "1.0.0"
|
| 8 |
+
python_version: "3.10.0"
|
| 9 |
+
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
| 11 |
---
|
| 12 |
+
# Customer Support Triage Environment
|
| 13 |
|
| 14 |
+
A real-world customer support simulation environment requiring information extraction, system actions, and routing.
|
| 15 |
+
Built using the official OpenEnv interface specification.
|
| 16 |
+
|
| 17 |
+
## Overview
|
| 18 |
+
This environment simulates a text-based customer support chat system. The agent acts on incoming support tickets and chooses actions such as asking the user for missing info, processing refunds, or routing the ticket to specialized teams.
|
| 19 |
+
|
| 20 |
+
## Action & Observation Spaces
|
| 21 |
+
- **Observation:** `ticket_id`, `customer_message`, `history`, `missing_info`, `status`, `refund_processed`, `done`, `reward`.
|
| 22 |
+
- **Action:**
|
| 23 |
+
- `action_type`: One of 'ROUTE', 'ASK_INFO', 'REFUND', 'CLOSE'
|
| 24 |
+
- `argument`: The specific queue (e.g., 'BILLING'), the info to ask for (e.g., 'serial_number'), or order to refund.
|
| 25 |
+
|
| 26 |
+
## Tasks
|
| 27 |
+
1. **easy_password_reset** (Easy): A simple IT support routing task requiring no additional info.
|
| 28 |
+
2. **medium_hardware_issue** (Medium): Requires the agent to first ask for the `serial_number` before routing to hardware support.
|
| 29 |
+
3. **hard_refund_processing** (Hard): Requires the agent to ask for `order_id` and `photo_evidence`, process a refund by taking the REFUND action, and finally route the ticket to billing.
|
| 30 |
+
|
| 31 |
+
## Usage
|
| 32 |
+
Start the environment server using Docker:
|
| 33 |
+
```bash
|
| 34 |
+
docker build -t customer-support-env .
|
| 35 |
+
docker run -p 8000:8000 customer-support-env
|
| 36 |
+
```
|
| 37 |
+
Or start directly via python:
|
| 38 |
+
```bash
|
| 39 |
+
uvicorn server.app:app --port 8000
|
| 40 |
+
```
|
| 41 |
+
Run the baseline baseline inference script:
|
| 42 |
+
```bash
|
| 43 |
+
python inference.py
|
| 44 |
+
```
|
env.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from models import Observation, Action, State
|
| 2 |
+
from tasks import TASKS, grader
|
| 3 |
+
import copy
|
| 4 |
+
from typing import Any, Optional
|
| 5 |
+
import openenv.core.env_server as es
|
| 6 |
+
|
| 7 |
+
class CustomerSupportEnv(es.Environment):
    """Text-based customer-support triage environment.

    Each episode presents one ticket taken from ``TASKS``. The agent may ask
    for missing information (``ASK_INFO``), process a refund (``REFUND``),
    and ends the episode by routing (``ROUTE``) or closing (``CLOSE``) the
    ticket, at which point ``grader`` scores the final state.
    """

    def __init__(self, **kwargs):
        """Initialise counters and start an episode on the first task."""
        super().__init__(**kwargs)
        self.current_task_idx = 0
        self.state_data = {}
        self.step_count = 0
        self.max_steps = 10
        self.done = False
        self.reset()

    def reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, task_idx: int = 0, **kwargs: Any) -> Observation:
        """Start a fresh episode for ``TASKS[task_idx]``.

        Args:
            seed: Accepted for interface compatibility; not used here.
            episode_id: Optional identifier stored in the episode state.
            task_idx: Index into ``TASKS`` selecting the ticket scenario.
            **kwargs: Ignored extra options.

        Returns:
            The initial observation (reward 0.0, empty feedback).

        Raises:
            IndexError: If ``task_idx`` is outside ``0..len(TASKS) - 1``.
        """
        # Reject negative indices explicitly: Python would otherwise wrap
        # around silently and select a task from the end of the list.
        if not 0 <= task_idx < len(TASKS):
            raise IndexError(
                f"task_idx {task_idx} out of range (expected 0..{len(TASKS) - 1})"
            )

        self.current_task_idx = task_idx
        task = TASKS[task_idx]
        self.step_count = 0
        self.done = False

        self.state_data = {
            "ticket_id": f"TKT-{1000 + task_idx}",
            "customer_message": task.initial_msg,
            "history": [f"Customer: {task.initial_msg}"],
            # Copy so that collecting info never mutates the task definition.
            "missing_info": list(task.required_info),
            "collected_info": [],
            "route": None,
            "refund_processed": False,
            "status": "OPEN",
            "episode_id": episode_id,
        }
        return self._get_obs(reward=0.0, feedback="")

    @property
    def state(self) -> State:
        """Return a detached snapshot of the current episode state."""
        data = self.state_data
        return State(
            ticket_id=data.get("ticket_id", ""),
            customer_message=data.get("customer_message", ""),
            history=copy.deepcopy(data.get("history", [])),
            missing_info=copy.deepcopy(data.get("missing_info", [])),
            status=data.get("status", "OPEN"),
            refund_processed=data.get("refund_processed", False),
            episode_id=data.get("episode_id", None),
            step_count=self.step_count,
        )

    def _get_obs(self, reward: float = 0.0, feedback: str = "") -> Observation:
        """Build an observation from the current state.

        Mutable containers are deep-copied so that observations already
        handed out are not altered by later steps, and so that consumers
        cannot modify the environment's internal state through them.
        """
        data = self.state_data
        return Observation(
            ticket_id=data["ticket_id"],
            customer_message=data["customer_message"],
            history=copy.deepcopy(data["history"]),
            missing_info=copy.deepcopy(data["missing_info"]),
            status=data["status"],
            refund_processed=data["refund_processed"],
            done=self.done,
            reward=reward,
            metadata={"feedback": feedback, "state": copy.deepcopy(data)},
        )

    def step(self, action: Action, timeout_s: Optional[float] = None, **kwargs: Any) -> Observation:
        """Apply one agent action and return the resulting observation.

        Args:
            action: The action to apply; dispatched on ``action.action_type``.
            timeout_s: Accepted for interface compatibility; not used here.
            **kwargs: Ignored extra options.

        Returns:
            The observation after the action, carrying the step reward and a
            feedback message in ``metadata``.
        """
        if self.done:
            return self._get_obs(reward=0.0, feedback="Episode already done")

        self.step_count += 1

        # Penalize infinite loops: once the step budget is reached the episode
        # ends with a penalty and the submitted action is not applied.
        if self.step_count >= self.max_steps:
            self.done = True
            return self._get_obs(reward=-0.5, feedback="Max steps reached")

        task = TASKS[self.current_task_idx]
        handlers = {
            "ASK_INFO": self._handle_ask_info,
            "REFUND": self._handle_refund,
            "ROUTE": self._handle_route,
            "CLOSE": self._handle_close,
        }
        handler = handlers.get(action.action_type)
        if handler is None:
            # Unknown action types are a no-op with neutral reward.
            reward_val, feedback = 0.0, ""
        else:
            reward_val, feedback = handler(action, task)

        return self._get_obs(reward=reward_val, feedback=feedback)

    def _handle_ask_info(self, action: Action, task) -> "tuple[float, str]":
        """Collect the first still-missing item named in the agent's question."""
        asked = action.argument.lower()
        state = self.state_data
        matched = next(
            (req for req in state["missing_info"] if req.lower() in asked),
            None,
        )
        if matched is None:
            return -0.1, "Asked for unnecessary information."

        state["missing_info"].remove(matched)
        state["collected_info"].append(matched)
        reply = f"Here is my {matched}: [MOCK_DATA]"
        state["history"].extend([f"Agent: {action.argument}", f"Customer: {reply}"])
        state["customer_message"] = reply
        return 0.2, f"Successfully collected {matched}"

    def _handle_refund(self, action: Action, task) -> "tuple[float, str]":
        """Process a refund once, when the task needs it and order_id is known."""
        state = self.state_data
        if state["refund_processed"]:
            # Without this guard the agent could repeat REFUND to farm reward.
            return -0.1, "Refund already processed."
        if task.needs_refund and "order_id" in state["collected_info"]:
            state["refund_processed"] = True
            return 0.3, "Refund processed successfully."
        return -0.5, "Cannot process refund without order ID or refund not required."

    def _handle_route(self, action: Action, task) -> "tuple[float, str]":
        """Record the route; finish and grade only if nothing is missing."""
        state = self.state_data
        state["route"] = action.argument
        if state["missing_info"]:
            return -0.5, "Routed prematurely without gathering required info."

        self.done = True
        final_score = grader(task, state)
        return float(final_score), f"Ticket routed. Final Score: {final_score}"

    def _handle_close(self, action: Action, task) -> "tuple[float, str]":
        """Close the ticket, end the episode and grade the final state."""
        self.done = True
        self.state_data["status"] = "CLOSED"
        final_score = grader(task, self.state_data)
        return float(final_score), f"Ticket closed. Final Score: {final_score}"
|
full_validate.sh
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
# curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
set -uo pipefail
|
| 29 |
+
|
| 30 |
+
DOCKER_BUILD_TIMEOUT=600
|
| 31 |
+
if [ -t 1 ]; then
|
| 32 |
+
RED='\033[0;31m'
|
| 33 |
+
GREEN='\033[0;32m'
|
| 34 |
+
YELLOW='\033[1;33m'
|
| 35 |
+
BOLD='\033[1m'
|
| 36 |
+
NC='\033[0m'
|
| 37 |
+
else
|
| 38 |
+
RED='' GREEN='' YELLOW='' BOLD='' NC=''
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
run_with_timeout() {
|
| 42 |
+
local secs="$1"; shift
|
| 43 |
+
if command -v timeout &>/dev/null; then
|
| 44 |
+
timeout "$secs" "$@"
|
| 45 |
+
elif command -v gtimeout &>/dev/null; then
|
| 46 |
+
gtimeout "$secs" "$@"
|
| 47 |
+
else
|
| 48 |
+
"$@" &
|
| 49 |
+
local pid=$!
|
| 50 |
+
( sleep "$secs" && kill "$pid" 2>/dev/null ) &
|
| 51 |
+
local watcher=$!
|
| 52 |
+
wait "$pid" 2>/dev/null
|
| 53 |
+
local rc=$?
|
| 54 |
+
kill "$watcher" 2>/dev/null
|
| 55 |
+
wait "$watcher" 2>/dev/null
|
| 56 |
+
return $rc
|
| 57 |
+
fi
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
portable_mktemp() {
|
| 61 |
+
local prefix="${1:-validate}"
|
| 62 |
+
mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
CLEANUP_FILES=()
|
| 66 |
+
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
|
| 67 |
+
trap cleanup EXIT
|
| 68 |
+
|
| 69 |
+
PING_URL="${1:-}"
|
| 70 |
+
REPO_DIR="${2:-.}"
|
| 71 |
+
|
| 72 |
+
if [ -z "$PING_URL" ]; then
|
| 73 |
+
printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
|
| 74 |
+
printf "\n"
|
| 75 |
+
printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
|
| 76 |
+
printf " repo_dir Path to your repo (default: current directory)\n"
|
| 77 |
+
exit 1
|
| 78 |
+
fi
|
| 79 |
+
|
| 80 |
+
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
|
| 81 |
+
printf "Error: directory '%s' not found\n" "${2:-.}"
|
| 82 |
+
exit 1
|
| 83 |
+
fi
|
| 84 |
+
PING_URL="${PING_URL%/}"
|
| 85 |
+
export PING_URL
|
| 86 |
+
PASS=0
|
| 87 |
+
|
| 88 |
+
log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
|
| 89 |
+
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
|
| 90 |
+
fail() { log "${RED}FAILED${NC} -- $1"; }
|
| 91 |
+
hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
|
| 92 |
+
stop_at() {
|
| 93 |
+
printf "\n"
|
| 94 |
+
printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
|
| 95 |
+
exit 1
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
printf "\n"
|
| 99 |
+
printf "${BOLD}========================================${NC}\n"
|
| 100 |
+
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
|
| 101 |
+
printf "${BOLD}========================================${NC}\n"
|
| 102 |
+
log "Repo: $REPO_DIR"
|
| 103 |
+
log "Ping URL: $PING_URL"
|
| 104 |
+
printf "\n"
|
| 105 |
+
|
| 106 |
+
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
|
| 107 |
+
|
| 108 |
+
CURL_OUTPUT=$(portable_mktemp "validate-curl")
|
| 109 |
+
CLEANUP_FILES+=("$CURL_OUTPUT")
|
| 110 |
+
# curl's -w already prints "000" when the request fails, so appending another
# "000" via `|| printf` produced "000000" and skipped the "not reachable"
# branch below. Assign the fallback instead of concatenating it.
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
|
| 113 |
+
|
| 114 |
+
if [ "$HTTP_CODE" = "200" ]; then
|
| 115 |
+
pass "HF Space is live and responds to /reset"
|
| 116 |
+
elif [ "$HTTP_CODE" = "000" ]; then
|
| 117 |
+
fail "HF Space not reachable (connection failed or timed out)"
|
| 118 |
+
hint "Check your network connection and that the Space is running."
|
| 119 |
+
hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
|
| 120 |
+
stop_at "Step 1"
|
| 121 |
+
else
|
| 122 |
+
fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
|
| 123 |
+
hint "Make sure your Space is running and the URL is correct."
|
| 124 |
+
hint "Try opening $PING_URL in your browser first."
|
| 125 |
+
stop_at "Step 1"
|
| 126 |
+
fi
|
| 127 |
+
|
| 128 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."
|
| 129 |
+
|
| 130 |
+
if ! command -v docker &>/dev/null; then
|
| 131 |
+
fail "docker command not found"
|
| 132 |
+
hint "Install Docker: https://docs.docker.com/get-docker/"
|
| 133 |
+
stop_at "Step 2"
|
| 134 |
+
fi
|
| 135 |
+
|
| 136 |
+
if [ -f "$REPO_DIR/Dockerfile" ]; then
|
| 137 |
+
DOCKER_CONTEXT="$REPO_DIR"
|
| 138 |
+
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
|
| 139 |
+
DOCKER_CONTEXT="$REPO_DIR/server"
|
| 140 |
+
else
|
| 141 |
+
fail "No Dockerfile found in repo root or server/ directory"
|
| 142 |
+
stop_at "Step 2"
|
| 143 |
+
fi
|
| 144 |
+
|
| 145 |
+
log " Found Dockerfile in $DOCKER_CONTEXT"
|
| 146 |
+
|
| 147 |
+
BUILD_OK=false
|
| 148 |
+
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
|
| 149 |
+
|
| 150 |
+
if [ "$BUILD_OK" = true ]; then
|
| 151 |
+
pass "Docker build succeeded"
|
| 152 |
+
else
|
| 153 |
+
fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
|
| 154 |
+
printf "%s\n" "$BUILD_OUTPUT" | tail -20
|
| 155 |
+
stop_at "Step 2"
|
| 156 |
+
fi
|
| 157 |
+
|
| 158 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 159 |
+
|
| 160 |
+
if ! command -v openenv &>/dev/null; then
|
| 161 |
+
fail "openenv command not found"
|
| 162 |
+
hint "Install it: pip install openenv-core"
|
| 163 |
+
stop_at "Step 3"
|
| 164 |
+
fi
|
| 165 |
+
|
| 166 |
+
VALIDATE_OK=false
|
| 167 |
+
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
|
| 168 |
+
|
| 169 |
+
if [ "$VALIDATE_OK" = true ]; then
|
| 170 |
+
pass "openenv validate passed"
|
| 171 |
+
[ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
|
| 172 |
+
else
|
| 173 |
+
fail "openenv validate failed"
|
| 174 |
+
printf "%s\n" "$VALIDATE_OUTPUT"
|
| 175 |
+
stop_at "Step 3"
|
| 176 |
+
fi
|
| 177 |
+
|
| 178 |
+
printf "\n"
|
| 179 |
+
printf "${BOLD}========================================${NC}\n"
|
| 180 |
+
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
|
| 181 |
+
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
|
| 182 |
+
printf "${BOLD}========================================${NC}\n"
|
| 183 |
+
printf "\n"
|
| 184 |
+
|
| 185 |
+
exit 0
|
inference.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from openai import OpenAI
|
| 4 |
+
from env import CustomerSupportEnv
|
| 5 |
+
from models import Action
|
| 6 |
+
from tasks import TASKS
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
load_dotenv()
|
| 11 |
+
except ImportError:
|
| 12 |
+
pass
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# 1. Required Environment Variables
|
| 16 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 17 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1-mini")
|
| 18 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 19 |
+
|
| 20 |
+
if HF_TOKEN is None:
|
| 21 |
+
raise ValueError("HF_TOKEN environment variable is required")
|
| 22 |
+
|
| 23 |
+
# 2. Open AI Client Only
|
| 24 |
+
client = OpenAI(
|
| 25 |
+
base_url=API_BASE_URL,
|
| 26 |
+
api_key=HF_TOKEN
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if any.

    Handles a leading "```json" or "```" marker and only removes a trailing
    "```" when one is actually present, so an unterminated fence no longer
    chops the last characters off the JSON payload.
    """
    text = text.strip()
    if not text.startswith("```"):
        return text
    text = text[3:]
    if text.startswith("json"):
        text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _build_prompt(obs):
    """Build the single user prompt sent to the model for one step."""
    return (
        "System: You are an automated customer support AI. You MUST respond strictly in JSON format matching this schema: "
        "{\"action_type\": \"ROUTE\"|\"ASK_INFO\"|\"REFUND\"|\"CLOSE\", \"argument\": \"string\"}\n\n"
        "CRITICAL RULES:\n"
        "1. If 'missing_info' in the observation is empty ([]), DO NOT use ASK_INFO. You must take action (ROUTE or REFUND).\n"
        "2. If 'missing_info' contains items, you MUST use ASK_INFO. The 'argument' MUST contain the EXACT string from 'missing_info' (e.g., 'serial_number', 'order_id', 'photo_evidence'). Ask for ONLY ONE missing item at a time\n"
        "3. When using ROUTE, the 'argument' MUST be exactly one of these three codes: 'IT_SUPPORT', 'HARDWARE_SUPPORT', or 'BILLING'. Do not output full sentences.\n"
        "4. If the user wants a refund, and you have collected 'order_id', you MUST first use the REFUND action. Then, in the next step, use ROUTE with 'BILLING'.\n\n"
        f"Observation: {obs.model_dump_json()}"
    )


def run_inference():
    """Run the baseline agent over every task and print the run log.

    For each task, prints one ``[START]`` line, one ``[STEP]`` line per
    model/environment step, and one ``[END]`` line with the summed rewards.
    Any error during a step is reported on that step's line and ends the
    episode for that task.
    """
    env = CustomerSupportEnv()

    for idx, task in enumerate(TASKS):
        obs = env.reset(task_idx=idx)
        done = False
        step_idx = 0
        rewards_history = []

        # [START] FORMAT
        print(f"[START] task={task.name} env=customer_support model={MODEL_NAME}")

        while not done:
            step_idx += 1
            error_msg = "null"
            reward_val = 0.00
            action_str = ""

            prompt = _build_prompt(obs)

            try:
                response = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[{"role": "user", "content": prompt}],
                    response_format={"type": "json_object"},
                )

                content = response.choices[0].message.content
                if not content:
                    raise ValueError("Model returned an empty response")

                # Some models wrap the JSON in Markdown fences despite the
                # json_object response format.
                action_data = json.loads(_strip_code_fence(content))

                # Pydantic validation
                action = Action(**action_data)
                action_str = f"{action.action_type}('{action.argument}')"

                # Env Step
                obs = env.step(action)
                done = obs.done
                reward_val = float(obs.reward) if obs.reward is not None else 0.0
                rewards_history.append(reward_val)

            except Exception as e:
                # Top-level boundary: any API, parsing or validation failure
                # is reported on the [STEP] line and ends this episode.
                error_msg = str(e).replace('\n', ' ')
                action_str = "ERROR"
                done = True
                rewards_history.append(0.00)

            # [STEP] FORMAT
            print(f"[STEP] step={step_idx} action={action_str} reward={reward_val:.2f} done={str(done).lower()} error={error_msg}")

        # [END] FORMAT
        # The score is the sum of all step rewards (shaping rewards plus the
        # final grader score); more than 0.8 is treated as success.
        final_score = sum(rewards_history) if rewards_history else 0.0
        success = final_score > 0.8
        rewards_str = ",".join([f"{r:.2f}" for r in rewards_history])
        print(f"[END] success={str(success).lower()} steps={step_idx} score={final_score:.2f} rewards={rewards_str}")
|
| 100 |
+
|
| 101 |
+
if __name__ == "__main__":
|
| 102 |
+
run_inference()
|
local_validate.sh
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -uo pipefail
|
| 3 |
+
|
| 4 |
+
PING_URL="http://localhost:7860"
|
| 5 |
+
REPO_DIR="."
|
| 6 |
+
|
| 7 |
+
echo "Step 1/3: Pinging HF Space ($PING_URL/reset) ..."
|
| 8 |
+
# curl -w already prints "000" on connection failure; assign the fallback instead of appending a second "000".
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST -H "Content-Type: application/json" -d '{}' "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
|
| 9 |
+
echo "HTTP_CODE: $HTTP_CODE"
|
| 10 |
+
|
| 11 |
+
echo "Step 2/3: Running docker build ..."
|
| 12 |
+
docker build "$REPO_DIR"
|
| 13 |
+
|
| 14 |
+
echo "Step 3/3: Running openenv validate ..."
|
| 15 |
+
openenv validate
|
models.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 2 |
+
from typing import List, Literal, Optional
|
| 3 |
+
import openenv.core.env_server.types as openenv_types
|
| 4 |
+
|
| 5 |
+
class Observation(openenv_types.Observation):
    # Ticket-level fields reported to the agent after each reset/step.
    # (Kept as comments rather than a docstring so the generated schema
    # is unchanged.)

    # Ticket identity and the most recent customer message.
    ticket_id: str = Field(
        default="",
        description="Unique ID of the ticket",
    )
    customer_message: str = Field(
        default="",
        description="The current message from the customer",
    )

    # Conversation so far and the information still to be gathered.
    history: List[str] = Field(
        default_factory=list,
        description="Conversation history",
    )
    missing_info: List[str] = Field(
        default_factory=list,
        description="Fields required before routing",
    )

    # Ticket progress flags.
    status: str = Field(
        default="OPEN",
        description="Ticket status",
    )
    refund_processed: bool = Field(
        default=False,
        description="True if a refund was already executed",
    )
|
| 12 |
+
|
| 13 |
+
class Action(openenv_types.Action):
    # One agent move: an action kind plus a free-text argument.

    # Restricted to the four supported action kinds.
    action_type: Literal["ROUTE", "ASK_INFO", "REFUND", "CLOSE"] = Field(
        ...,
        description="Type of action to take",
    )

    # Interpretation depends on action_type (queue, question, or order).
    argument: str = Field(
        ...,
        description="The category to route to, the question to ask, or the order ID to refund",
    )
|
| 16 |
+
|
| 17 |
+
class State(openenv_types.State):
    # Snapshot of the episode's ticket state; the fields declared here
    # match those declared on Observation.

    # Ticket identity and the most recent customer message.
    ticket_id: str = Field(
        default="",
        description="Unique ID of the ticket",
    )
    customer_message: str = Field(
        default="",
        description="The current message from the customer",
    )

    # Conversation so far and the information still to be gathered.
    history: List[str] = Field(
        default_factory=list,
        description="Conversation history",
    )
    missing_info: List[str] = Field(
        default_factory=list,
        description="Fields required before routing",
    )

    # Ticket progress flags.
    status: str = Field(
        default="OPEN",
        description="Ticket status",
    )
    refund_processed: bool = Field(
        default=False,
        description="True if a refund was already executed",
    )
|
| 24 |
+
|
| 25 |
+
class Reward(BaseModel):
    # Reward value paired with a feedback message.
    # NOTE(review): the description states a -1.0..1.0 range, but no
    # validation constraint enforces it here.

    value: float = Field(
        ...,
        description="Numerical reward between -1.0 and 1.0",
    )
    feedback: str = Field(
        ...,
        description="Incremental feedback for the agent",
    )
|
openenv.yaml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: "customer-support-triage"
|
| 3 |
+
version: "1.0.0"
|
| 4 |
+
type: "space"
|
| 5 |
+
runtime: "fastapi"
|
| 6 |
+
app: "server.app:app"
|
| 7 |
+
port: 7860
|
| 8 |
+
description: "A real-world customer support simulation environment requiring information extraction, system actions, and routing."
|
| 9 |
+
tags: ["openenv", "text", "customer-support"]
|
| 10 |
+
tasks:
|
| 11 |
+
- easy_password_reset
|
| 12 |
+
- medium_hardware_issue
|
| 13 |
+
- hard_refund_processing
|
pyproject.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "metapy"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "A real-world customer support simulation environment"
|
| 5 |
+
dependencies = [
|
| 6 |
+
"openenv-core",
|
| 7 |
+
"fastapi",
|
| 8 |
+
"uvicorn",
|
| 9 |
+
"openai>=1.12.0",
|
| 10 |
+
"pydantic>=2.0.0",
|
| 11 |
+
"python-dotenv>=1.0.0"
|
| 12 |
+
]
|
| 13 |
+
|
| 14 |
+
[build-system]
|
| 15 |
+
requires = ["setuptools>=42", "wheel"]
|
| 16 |
+
build-backend = "setuptools.build_meta"
|
| 17 |
+
|
| 18 |
+
[tool.setuptools.packages.find]
|
| 19 |
+
where = ["."]
|
| 20 |
+
|
| 21 |
+
[project.scripts]
|
| 22 |
+
server = "server.app:main"
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openai>=1.12.0
|
| 2 |
+
pydantic>=2.0.0
|
| 3 |
+
python-dotenv>=1.0.0
|
| 4 |
+
openenv-core>=0.1.0
|
| 5 |
+
fastapi>=0.100.0
|
| 6 |
+
uvicorn>=0.23.0
|
server/app.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
|
| 2 |
+
load_dotenv()
|
| 3 |
+
|
| 4 |
+
import uvicorn
|
| 5 |
+
from openenv.core.env_server.http_server import create_app
|
| 6 |
+
from env import CustomerSupportEnv
|
| 7 |
+
from models import Action, Observation
|
| 8 |
+
|
| 9 |
+
# Use the environment class and Pydantic types to create the app
|
| 10 |
+
app = create_app(
|
| 11 |
+
CustomerSupportEnv,
|
| 12 |
+
Action,
|
| 13 |
+
Observation,
|
| 14 |
+
env_name="customer_support_env"
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
def main(host: str = "0.0.0.0", port: int = 7860) -> None:
    """Serve the environment app with uvicorn.

    Args:
        host: Interface to bind; defaults to all interfaces.
        port: TCP port to listen on; defaults to 7860, the port exposed by
            the Dockerfile and declared in openenv.yaml.
    """
    uvicorn.run(app, host=host, port=port)
|
| 19 |
+
|
| 20 |
+
if __name__ == "__main__":
|
| 21 |
+
main()
|
tasks.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class TaskDef:
    """Definition of one support-ticket scenario.

    Attributes:
        name: Task identifier.
        initial_msg: The customer's opening message.
        expected_route: Queue the ticket should finally be routed to.
        required_info: Names of the items that must be collected first.
        needs_refund: Whether a refund must be processed for full credit.
    """

    def __init__(self, name: str, initial_msg: str, expected_route: str, required_info: list = None, needs_refund: bool = False):
        self.name = name
        self.initial_msg = initial_msg
        self.expected_route = expected_route
        # Copy so that later changes to the caller's list cannot alter the task.
        self.required_info = list(required_info) if required_info else []
        self.needs_refund = needs_refund
|
| 8 |
+
|
| 9 |
+
# Define 3 strict tasks (Easy, Medium, Hard)
|
| 10 |
+
# The three scenarios, ordered easy -> medium -> hard.
TASKS = [
    TaskDef(**spec)
    for spec in (
        {
            "name": "easy_password_reset",
            "initial_msg": "I forgot my password and cannot log in.",
            "expected_route": "IT_SUPPORT",
        },
        {
            "name": "medium_hardware_issue",
            "initial_msg": "My laptop won't turn on.",
            "expected_route": "HARDWARE_SUPPORT",
            "required_info": ["serial_number"],
        },
        {
            "name": "hard_refund_processing",
            "initial_msg": "I want a refund for my recent purchase, it arrived broken.",
            "expected_route": "BILLING",
            "required_info": ["order_id", "photo_evidence"],
            "needs_refund": True,
        },
    )
]
|
| 30 |
+
|
| 31 |
+
def grader(task: "TaskDef", final_state: dict) -> float:
    """Deterministic programmatic grader returning 0.0 to 1.0.

    One check is made for the route, one per required info item, and one
    for the refund when the task needs it; the score is the fraction of
    checks that pass.

    Args:
        task: Task definition providing ``expected_route``,
            ``required_info`` and ``needs_refund``.
        final_state: Episode state dict; reads the ``route``,
            ``collected_info`` and ``refund_processed`` keys, all optional.

    Returns:
        Passed checks divided by total checks.
    """
    # Look the collected list up once; treat a missing or None value as empty.
    collected = final_state.get("collected_info") or []

    checks = [final_state.get("route") == task.expected_route]
    checks.extend(info in collected for info in task.required_info)
    if task.needs_refund:
        checks.append(bool(final_state.get("refund_processed", False)))

    # There is always at least the route check, so no division by zero.
    return sum(checks) / len(checks)
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|