Spaces:
Sleeping
Sleeping
Vighnesh committed on
Commit ·
95dc191
1
Parent(s): a016315
Cleanup: remove junk files, update .gitignore
Browse files
- .gitignore +37 -4
- fix_metadata.py +0 -12
- fix_readme.py +0 -100
- inference.py +0 -267
- plot_reward_curve.py +0 -171
- reward_curve.png +0 -0
- uv.lock +0 -0
.gitignore
CHANGED
|
@@ -1,4 +1,37 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
*.
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
*.egg
|
| 10 |
+
|
| 11 |
+
# Environments
|
| 12 |
+
venv/
|
| 13 |
+
.venv/
|
| 14 |
+
env/
|
| 15 |
+
.env
|
| 16 |
+
|
| 17 |
+
# Jupyter
|
| 18 |
+
.ipynb_checkpoints/
|
| 19 |
+
*.ipynb_checkpoints
|
| 20 |
+
|
| 21 |
+
# Secrets
|
| 22 |
+
token.txt
|
| 23 |
+
*.token
|
| 24 |
+
.env*
|
| 25 |
+
|
| 26 |
+
# Lock files (not needed for this project)
|
| 27 |
+
uv.lock
|
| 28 |
+
|
| 29 |
+
# OS
|
| 30 |
+
.DS_Store
|
| 31 |
+
Thumbs.db
|
| 32 |
+
|
| 33 |
+
# One-off scripts (not part of environment)
|
| 34 |
+
fix_*.py
|
| 35 |
+
inference.py
|
| 36 |
+
plot_reward_curve.py
|
| 37 |
+
reward_curve.png
|
fix_metadata.py
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
content = open('server/support_environment.py', 'r', encoding='utf-8').read()
|
| 2 |
-
content = content.replace('\tdef get_metadata', ' def get_metadata')
|
| 3 |
-
content = content.replace('\t from openenv', ' from openenv')
|
| 4 |
-
content = content.replace('\t return EnvironmentMetadata', ' return EnvironmentMetadata')
|
| 5 |
-
content = content.replace('\t name=', ' name=')
|
| 6 |
-
content = content.replace('\t description=', ' description=')
|
| 7 |
-
content = content.replace('\t version=', ' version=')
|
| 8 |
-
content = content.replace('\t author=', ' author=')
|
| 9 |
-
content = content.replace('\t documentation_url=', ' documentation_url=')
|
| 10 |
-
content = content.replace('\t )', ' )')
|
| 11 |
-
open('server/support_environment.py', 'w', encoding='utf-8').write(content)
|
| 12 |
-
print('Done!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fix_readme.py
DELETED
|
@@ -1,100 +0,0 @@
|
|
| 1 |
-
f = open('README.md', 'w', encoding='utf-8')
|
| 2 |
-
f.write('---\n')
|
| 3 |
-
f.write('title: Support Ticket Env\n')
|
| 4 |
-
f.write('emoji: \U0001f3ab\n')
|
| 5 |
-
f.write('colorFrom: blue\n')
|
| 6 |
-
f.write('colorTo: green\n')
|
| 7 |
-
f.write('sdk: docker\n')
|
| 8 |
-
f.write('tags:\n')
|
| 9 |
-
f.write(' - openenv\n')
|
| 10 |
-
f.write('pinned: false\n')
|
| 11 |
-
f.write('---\n\n')
|
| 12 |
-
f.write('# Customer Support Ticket Resolution Environment\n\n')
|
| 13 |
-
f.write('A real-world [OpenEnv](https://github.com/meta-pytorch/OpenEnv) environment where an AI agent acts as a customer support executive, triaging and resolving incoming tickets.\n\n')
|
| 14 |
-
f.write('## Overview\n\n')
|
| 15 |
-
f.write('Customer support triage is one of the most common real-world tasks for AI agents. Every company handles thousands of tickets daily. Getting the classification wrong routes the ticket to the wrong team. Choosing the wrong action has direct business impact. This environment trains agents to handle exactly this challenge.\n\n')
|
| 16 |
-
f.write('## Quick Start\n\n')
|
| 17 |
-
f.write('```python\n')
|
| 18 |
-
f.write('from support_ticket_env import SupportAction, SupportTicketEnv\n\n')
|
| 19 |
-
f.write('with SupportTicketEnv(base_url="https://algocore-support-ticket-env.hf.space").sync() as env:\n')
|
| 20 |
-
f.write(' # Task 1 - Classify a ticket\n')
|
| 21 |
-
f.write(' result = env.reset(task_id=1, seed=42)\n')
|
| 22 |
-
f.write(' print(result.observation.ticket_text)\n\n')
|
| 23 |
-
f.write(' result = env.step(SupportAction(action_type="classify", category="billing"))\n')
|
| 24 |
-
f.write(' print(result.reward) # 1.0 if correct\n')
|
| 25 |
-
f.write('```\n\n')
|
| 26 |
-
f.write('## Tasks\n\n')
|
| 27 |
-
f.write('| Task | Difficulty | Description | Score Range |\n')
|
| 28 |
-
f.write('|------|-----------|-------------|-------------|\n')
|
| 29 |
-
f.write('| Task 1 | Easy | Classify ticket into correct category | 0.0 - 1.0 |\n')
|
| 30 |
-
f.write('| Task 2 | Medium | Classify then choose correct action | 0.0 - 1.0 |\n')
|
| 31 |
-
f.write('| Task 3 | Hard | Resolve a full queue of 3 tickets | 0.0 - 1.0 |\n\n')
|
| 32 |
-
f.write('## Action Space\n\n')
|
| 33 |
-
f.write('Actions are `SupportAction` Pydantic objects:\n\n')
|
| 34 |
-
f.write('| Field | Type | Required | Values |\n')
|
| 35 |
-
f.write('|-------|------|----------|--------|\n')
|
| 36 |
-
f.write('| `action_type` | str | always | `classify` / `reply` / `escalate` / `close` |\n')
|
| 37 |
-
f.write('| `category` | str | for classify | `billing` / `technical` / `account` / `general` / `refund` |\n')
|
| 38 |
-
f.write('| `reply_text` | str | for reply | free text |\n')
|
| 39 |
-
f.write('| `reason` | str | optional | free text |\n\n')
|
| 40 |
-
f.write('## Observation Space\n\n')
|
| 41 |
-
f.write('| Field | Type | Description |\n')
|
| 42 |
-
f.write('|-------|------|-------------|\n')
|
| 43 |
-
f.write('| `ticket_id` | str | Unique ticket ID |\n')
|
| 44 |
-
f.write('| `ticket_text` | str | Customer message |\n')
|
| 45 |
-
f.write('| `task_id` | int | 1, 2, or 3 |\n')
|
| 46 |
-
f.write('| `current_category` | str | Category assigned so far |\n')
|
| 47 |
-
f.write('| `resolved` | bool | Whether ticket is resolved |\n')
|
| 48 |
-
f.write('| `step_count` | int | Steps taken this episode |\n')
|
| 49 |
-
f.write('| `feedback` | str | Human-readable feedback |\n')
|
| 50 |
-
f.write('| `reward` | float | Reward signal |\n')
|
| 51 |
-
f.write('| `done` | bool | Episode finished |\n\n')
|
| 52 |
-
f.write('## Reward Function\n\n')
|
| 53 |
-
f.write('Rewards provide partial progress signals throughout the trajectory:\n\n')
|
| 54 |
-
f.write('- **Task 1:** 1.0 for correct category, 0.0 for wrong\n')
|
| 55 |
-
f.write('- **Task 2:** 1.0 correct action, 0.5 defensible alternative, 0.3 classification only\n')
|
| 56 |
-
f.write('- **Task 3:** 0.20 classification + 0.40 action + 0.25 reply quality + 0.15 efficiency bonus\n')
|
| 57 |
-
f.write('- **Penalty:** -0.05 per step over 10 (loop deterrent)\n\n')
|
| 58 |
-
f.write('## Project Structure\n\n')
|
| 59 |
-
f.write('```\n')
|
| 60 |
-
f.write('support_ticket_env/\n')
|
| 61 |
-
f.write('├── __init__.py # Package exports\n')
|
| 62 |
-
f.write('├── models.py # SupportAction, SupportObservation, SupportState\n')
|
| 63 |
-
f.write('├── tickets.py # Ticket dataset with ground-truth labels\n')
|
| 64 |
-
f.write('├── graders.py # Reward/grader functions for all 3 tasks\n')
|
| 65 |
-
f.write('├── client.py # EnvClient subclass\n')
|
| 66 |
-
f.write('├── baseline.py # Baseline inference script\n')
|
| 67 |
-
f.write('├── openenv.yaml # Environment metadata\n')
|
| 68 |
-
f.write('├── Dockerfile # Container definition\n')
|
| 69 |
-
f.write('└── server/\n')
|
| 70 |
-
f.write(' ├── app.py # FastAPI entry point\n')
|
| 71 |
-
f.write(' └── support_environment.py # Environment logic\n')
|
| 72 |
-
f.write('```\n\n')
|
| 73 |
-
f.write('## Setup\n\n')
|
| 74 |
-
f.write('```bash\n')
|
| 75 |
-
f.write('# Install dependencies\n')
|
| 76 |
-
f.write('pip install openenv-core fastapi uvicorn pydantic gradio openai\n\n')
|
| 77 |
-
f.write('# Run locally\n')
|
| 78 |
-
f.write('cd support_ticket_env\n')
|
| 79 |
-
f.write('uvicorn server.app:app --host 0.0.0.0 --port 7860\n\n')
|
| 80 |
-
f.write('# Docker\n')
|
| 81 |
-
f.write('docker build -t support-ticket-env .\n')
|
| 82 |
-
f.write('docker run -p 7860:7860 support-ticket-env\n\n')
|
| 83 |
-
f.write('# Run tests\n')
|
| 84 |
-
f.write('python run_tests.py\n')
|
| 85 |
-
f.write('```\n\n')
|
| 86 |
-
f.write('## Baseline Scores\n\n')
|
| 87 |
-
f.write('Measured with `gpt-4o-mini`, seeds `[42, 7, 123]`:\n\n')
|
| 88 |
-
f.write('| Task | Avg Score |\n')
|
| 89 |
-
f.write('|------|-----------|\n')
|
| 90 |
-
f.write('| Task 1 - Classification | 0.87 |\n')
|
| 91 |
-
f.write('| Task 2 - Action Selection | 0.71 |\n')
|
| 92 |
-
f.write('| Task 3 - Full Resolution | 0.58 |\n')
|
| 93 |
-
f.write('| **Overall** | **0.72** |\n\n')
|
| 94 |
-
f.write('## Links\n\n')
|
| 95 |
-
f.write('- **HuggingFace Space:** https://huggingface.co/spaces/AlgoCore/support-ticket-env\n')
|
| 96 |
-
f.write('- **GitHub:** https://github.com/TryingHardToBeDeveloper/support-ticket-env\n')
|
| 97 |
-
f.write('- **OpenEnv Docs:** https://meta-pytorch.org/OpenEnv/\n\n')
|
| 98 |
-
f.write('## License\n\nMIT\n')
|
| 99 |
-
f.close()
|
| 100 |
-
print('Done!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
DELETED
|
@@ -1,267 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
inference.py - Support Ticket Resolution Environment
|
| 3 |
-
Follows mandatory [START] [STEP] [END] logging format.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import asyncio
|
| 7 |
-
import os
|
| 8 |
-
import sys
|
| 9 |
-
import json
|
| 10 |
-
import re
|
| 11 |
-
from typing import List, Optional
|
| 12 |
-
|
| 13 |
-
ROOT = os.path.dirname(os.path.abspath(__file__))
|
| 14 |
-
sys.path.insert(0, ROOT)
|
| 15 |
-
|
| 16 |
-
from openai import OpenAI
|
| 17 |
-
from support_ticket_env.server.support_environment import SupportTicketEnvironment
|
| 18 |
-
from support_ticket_env.models import SupportAction
|
| 19 |
-
|
| 20 |
-
# ── Environment variables ────────────────────────────────────────
|
| 21 |
-
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 22 |
-
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 23 |
-
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 24 |
-
TASK_NAME = "support-ticket-resolution"
|
| 25 |
-
BENCHMARK = "support_ticket_env"
|
| 26 |
-
MAX_STEPS = 10
|
| 27 |
-
SUCCESS_SCORE_THRESHOLD = 0.5
|
| 28 |
-
|
| 29 |
-
VALID_CATEGORIES = ["billing", "technical", "account", "general", "refund"]
|
| 30 |
-
VALID_ACTIONS = ["classify", "reply", "escalate", "close"]
|
| 31 |
-
|
| 32 |
-
SYSTEM_PROMPT = """You are a customer support AI agent handling tickets.
|
| 33 |
-
You receive a JSON with ticket_text, task_id, feedback, and current_category.
|
| 34 |
-
|
| 35 |
-
Respond ONLY with a JSON object (no markdown, no explanation):
|
| 36 |
-
{
|
| 37 |
-
"action_type": "classify" | "reply" | "escalate" | "close",
|
| 38 |
-
"category": "billing" | "technical" | "account" | "general" | "refund",
|
| 39 |
-
"reply_text": "...",
|
| 40 |
-
"reason": "..."
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
Rules:
|
| 44 |
-
- For task 1: use action_type=classify and pick the correct category.
|
| 45 |
-
- For task 2: first classify, then on next step reply/escalate/close.
|
| 46 |
-
- For task 3: classify each ticket then resolve it (classify first, then action).
|
| 47 |
-
- category is ONLY needed when action_type=classify.
|
| 48 |
-
- reply_text is ONLY needed when action_type=reply.
|
| 49 |
-
|
| 50 |
-
Category detection rules:
|
| 51 |
-
- billing: mentions charge, invoice, payment, bill, subscription, price, cost, fee
|
| 52 |
-
- technical: mentions error, bug, crash, not working, broken, API, 500, upload, fail
|
| 53 |
-
- account: mentions login, password, account, access, sign in, email, cancel, subscription cancel
|
| 54 |
-
- refund: mentions refund, return, money back, reimburse, unused
|
| 55 |
-
- general: mentions hours, phone, contact, business hours, information
|
| 56 |
-
|
| 57 |
-
Action rules:
|
| 58 |
-
- technical tickets -> escalate (include 'escalate' and 'engineering' in reason)
|
| 59 |
-
- general tickets that are resolved/thank you -> close
|
| 60 |
-
- all others -> reply
|
| 61 |
-
|
| 62 |
-
When replying, your reply_text MUST include relevant keywords:
|
| 63 |
-
- billing reply: include words like 'charge', 'invoice', 'payment', 'billing'
|
| 64 |
-
- account reply: include words like 'account', 'password', 'login', 'subscription'
|
| 65 |
-
- refund reply: include words like 'refund', 'return', 'credit', 'process'
|
| 66 |
-
- general reply: include words like 'hours', 'contact', 'phone', 'information'
|
| 67 |
-
- technical escalation reason: include 'engineering', 'escalate', 'bug', 'error'
|
| 68 |
-
"""
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def log_start(task: str, env: str, model: str) -> None:
|
| 72 |
-
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 76 |
-
error_val = error if error else "null"
|
| 77 |
-
done_val = str(done).lower()
|
| 78 |
-
print(
|
| 79 |
-
f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
|
| 80 |
-
flush=True,
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 85 |
-
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 86 |
-
print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def parse_response(text: str) -> dict:
|
| 90 |
-
text = text.strip()
|
| 91 |
-
text = re.sub(r"^```(?:json)?\s*", "", text)
|
| 92 |
-
text = re.sub(r"\s*```$", "", text)
|
| 93 |
-
try:
|
| 94 |
-
return json.loads(text)
|
| 95 |
-
except Exception:
|
| 96 |
-
match = re.search(r"\{.*\}", text, re.DOTALL)
|
| 97 |
-
if match:
|
| 98 |
-
return json.loads(match.group())
|
| 99 |
-
raise
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
CATEGORY_KEYWORDS = {
|
| 103 |
-
"billing": ["charge", "invoice", "payment", "bill", "refund", "subscription", "price", "cost", "fee", "money"],
|
| 104 |
-
"technical": ["error", "bug", "crash", "not working", "broken", "issue", "problem", "fail", "500", "api"],
|
| 105 |
-
"account": ["login", "password", "account", "access", "sign in", "email", "username", "cancel"],
|
| 106 |
-
"refund": ["refund", "return", "money back", "reimburse", "cancel order"],
|
| 107 |
-
"general": ["hours", "contact", "phone", "help", "question", "info", "support"],
|
| 108 |
-
}
|
| 109 |
-
|
| 110 |
-
def rule_based_action(obs) -> dict:
|
| 111 |
-
"""Simple deterministic fallback agent — no API needed."""
|
| 112 |
-
text = obs.ticket_text.lower()
|
| 113 |
-
# Classify by keywords
|
| 114 |
-
if not obs.current_category:
|
| 115 |
-
best_cat = "general"
|
| 116 |
-
best_score = 0
|
| 117 |
-
for cat, keywords in CATEGORY_KEYWORDS.items():
|
| 118 |
-
score = sum(1 for kw in keywords if kw in text)
|
| 119 |
-
if score > best_score:
|
| 120 |
-
best_score = score
|
| 121 |
-
best_cat = cat
|
| 122 |
-
return {"action_type": "classify", "category": best_cat}
|
| 123 |
-
# After classification — choose action based on category
|
| 124 |
-
cat = obs.current_category
|
| 125 |
-
if cat == "technical":
|
| 126 |
-
return {"action_type": "escalate", "reason": "Technical issue requires engineering team"}
|
| 127 |
-
elif cat == "general":
|
| 128 |
-
return {"action_type": "close", "reason": "General inquiry resolved"}
|
| 129 |
-
else:
|
| 130 |
-
return {
|
| 131 |
-
"action_type": "reply",
|
| 132 |
-
"reply_text": f"Thank you for contacting us about your {cat} issue. We are looking into it and will resolve it shortly."
|
| 133 |
-
}
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
def get_model_action(client: OpenAI, obs, history: List[str]) -> dict:
|
| 137 |
-
"""Try LLM first, fall back to rule-based if API unavailable."""
|
| 138 |
-
if not API_KEY:
|
| 139 |
-
return rule_based_action(obs)
|
| 140 |
-
user_prompt = json.dumps({
|
| 141 |
-
"ticket_id": obs.ticket_id,
|
| 142 |
-
"ticket_text": obs.ticket_text,
|
| 143 |
-
"task_id": obs.task_id,
|
| 144 |
-
"current_category": obs.current_category,
|
| 145 |
-
"step_count": obs.step_count,
|
| 146 |
-
"feedback": obs.feedback,
|
| 147 |
-
})
|
| 148 |
-
messages = [
|
| 149 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 150 |
-
{"role": "user", "content": user_prompt},
|
| 151 |
-
]
|
| 152 |
-
try:
|
| 153 |
-
completion = client.chat.completions.create(
|
| 154 |
-
model=MODEL_NAME,
|
| 155 |
-
messages=messages,
|
| 156 |
-
temperature=0.0,
|
| 157 |
-
max_tokens=256,
|
| 158 |
-
stream=False,
|
| 159 |
-
)
|
| 160 |
-
text = (completion.choices[0].message.content or "").strip()
|
| 161 |
-
return parse_response(text)
|
| 162 |
-
except Exception as exc:
|
| 163 |
-
print(f"[DEBUG] Model request failed, using fallback: {exc}", flush=True)
|
| 164 |
-
return rule_based_action(obs)
|
| 165 |
-
user_prompt = json.dumps({
|
| 166 |
-
"ticket_id": obs.ticket_id,
|
| 167 |
-
"ticket_text": obs.ticket_text,
|
| 168 |
-
"task_id": obs.task_id,
|
| 169 |
-
"current_category": obs.current_category,
|
| 170 |
-
"step_count": obs.step_count,
|
| 171 |
-
"feedback": obs.feedback,
|
| 172 |
-
})
|
| 173 |
-
messages = [
|
| 174 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 175 |
-
{"role": "user", "content": user_prompt},
|
| 176 |
-
]
|
| 177 |
-
try:
|
| 178 |
-
completion = client.chat.completions.create(
|
| 179 |
-
model=MODEL_NAME,
|
| 180 |
-
messages=messages,
|
| 181 |
-
temperature=0.0,
|
| 182 |
-
max_tokens=256,
|
| 183 |
-
stream=False,
|
| 184 |
-
)
|
| 185 |
-
text = (completion.choices[0].message.content or "").strip()
|
| 186 |
-
return parse_response(text)
|
| 187 |
-
except Exception as exc:
|
| 188 |
-
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 189 |
-
return {"action_type": "classify", "category": "general"}
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
def run_task(task_id: int, seed: int, client: OpenAI) -> float:
|
| 193 |
-
env = SupportTicketEnvironment()
|
| 194 |
-
obs = env.reset(task_id=task_id, seed=seed)
|
| 195 |
-
|
| 196 |
-
history: List[str] = []
|
| 197 |
-
rewards: List[float] = []
|
| 198 |
-
steps_taken = 0
|
| 199 |
-
score = 0.0
|
| 200 |
-
success = False
|
| 201 |
-
|
| 202 |
-
log_start(task=f"{TASK_NAME}-task{task_id}", env=BENCHMARK, model=MODEL_NAME)
|
| 203 |
-
|
| 204 |
-
try:
|
| 205 |
-
for step in range(1, MAX_STEPS + 1):
|
| 206 |
-
if obs.done:
|
| 207 |
-
break
|
| 208 |
-
|
| 209 |
-
action_dict = get_model_action(client, obs, history)
|
| 210 |
-
action_str = f"{action_dict.get('action_type','?')}"
|
| 211 |
-
if action_dict.get("category"):
|
| 212 |
-
action_str += f"/{action_dict['category']}"
|
| 213 |
-
|
| 214 |
-
error = None
|
| 215 |
-
try:
|
| 216 |
-
action = SupportAction(**action_dict)
|
| 217 |
-
obs = env.step(action)
|
| 218 |
-
reward = obs.reward or 0.0
|
| 219 |
-
done = obs.done
|
| 220 |
-
except Exception as e:
|
| 221 |
-
reward = 0.0
|
| 222 |
-
done = False
|
| 223 |
-
error = str(e)
|
| 224 |
-
|
| 225 |
-
rewards.append(reward)
|
| 226 |
-
steps_taken = step
|
| 227 |
-
|
| 228 |
-
log_step(step=step, action=action_str, reward=reward, done=done, error=error)
|
| 229 |
-
|
| 230 |
-
history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")
|
| 231 |
-
|
| 232 |
-
if done:
|
| 233 |
-
break
|
| 234 |
-
|
| 235 |
-
total = sum(rewards)
|
| 236 |
-
score = min(max(round(total / max(steps_taken, 1), 3), 0.0), 1.0)
|
| 237 |
-
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 238 |
-
|
| 239 |
-
finally:
|
| 240 |
-
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 241 |
-
|
| 242 |
-
return score
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
def main() -> None:
|
| 246 |
-
if not API_KEY:
|
| 247 |
-
print("[DEBUG] HF_TOKEN not set", flush=True)
|
| 248 |
-
sys.exit(1)
|
| 249 |
-
|
| 250 |
-
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 251 |
-
|
| 252 |
-
all_scores = {}
|
| 253 |
-
for task_id in [1, 2, 3]:
|
| 254 |
-
scores = []
|
| 255 |
-
for seed in [42, 7, 123, 0, 99]:
|
| 256 |
-
score = run_task(task_id, seed, client)
|
| 257 |
-
scores.append(score)
|
| 258 |
-
avg = round(sum(scores) / len(scores), 4)
|
| 259 |
-
all_scores[f"task{task_id}"] = avg
|
| 260 |
-
print(f"[DEBUG] Task {task_id} avg score: {avg}", flush=True)
|
| 261 |
-
|
| 262 |
-
overall = round(sum(all_scores.values()) / len(all_scores), 4)
|
| 263 |
-
print(f"[DEBUG] Overall avg score: {overall}", flush=True)
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
if __name__ == "__main__":
|
| 267 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
plot_reward_curve.py
DELETED
|
@@ -1,171 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
plot_reward_curve.py — Reward curve bar chart for hackathon pitch.
|
| 3 |
-
Shows Before (rule-based baseline) vs After (LLM agent) scores for Task 1/2/3.
|
| 4 |
-
|
| 5 |
-
Usage:
|
| 6 |
-
python plot_reward_curve.py # uses hardcoded scores
|
| 7 |
-
python plot_reward_curve.py --run-inference # runs inference.py first (needs HF_TOKEN)
|
| 8 |
-
|
| 9 |
-
Output: reward_curve.png (saved next to this script)
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import os
|
| 13 |
-
import sys
|
| 14 |
-
import subprocess
|
| 15 |
-
import json
|
| 16 |
-
import re
|
| 17 |
-
import argparse
|
| 18 |
-
|
| 19 |
-
import matplotlib
|
| 20 |
-
matplotlib.use("Agg") # headless — safe on all machines
|
| 21 |
-
import matplotlib.pyplot as plt
|
| 22 |
-
import matplotlib.patches as mpatches
|
| 23 |
-
import numpy as np
|
| 24 |
-
|
| 25 |
-
# ── Baseline scores (rule-based, from session recap) ────────────────────────
|
| 26 |
-
BASELINE = {
|
| 27 |
-
"Task 1": 0.10,
|
| 28 |
-
"Task 2": 0.11,
|
| 29 |
-
"Task 3": 0.26,
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
# ── After scores — override these after running inference, or use --run-inference
|
| 33 |
-
AFTER = {
|
| 34 |
-
"Task 1": 0.72,
|
| 35 |
-
"Task 2": 0.65,
|
| 36 |
-
"Task 3": 0.54,
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def run_inference_and_parse() -> dict:
|
| 41 |
-
"""Run inference.py with seeds 42,7,123 and parse [DEBUG] avg lines."""
|
| 42 |
-
print("[plot] Running inference.py to collect live scores...", flush=True)
|
| 43 |
-
env = os.environ.copy()
|
| 44 |
-
result = subprocess.run(
|
| 45 |
-
[sys.executable, os.path.join(os.path.dirname(__file__), "inference.py")],
|
| 46 |
-
capture_output=True, text=True, env=env
|
| 47 |
-
)
|
| 48 |
-
output = result.stdout + result.stderr
|
| 49 |
-
print(output, flush=True)
|
| 50 |
-
|
| 51 |
-
scores = {}
|
| 52 |
-
for line in output.splitlines():
|
| 53 |
-
m = re.search(r"\[DEBUG\] Task (\d) avg score: ([0-9.]+)", line)
|
| 54 |
-
if m:
|
| 55 |
-
scores[f"Task {m.group(1)}"] = float(m.group(2))
|
| 56 |
-
|
| 57 |
-
if len(scores) < 3:
|
| 58 |
-
print("[plot] WARNING: Could not parse all 3 task scores. Using hardcoded AFTER values.", flush=True)
|
| 59 |
-
return AFTER
|
| 60 |
-
return scores
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
def plot_chart(baseline: dict, after: dict, out_path: str) -> None:
|
| 64 |
-
tasks = list(baseline.keys())
|
| 65 |
-
x = np.arange(len(tasks))
|
| 66 |
-
width = 0.32
|
| 67 |
-
|
| 68 |
-
# ── Colours ─────────────────────────────────────────────────────────────
|
| 69 |
-
COLOR_BEFORE = "#E05A5A" # warm red
|
| 70 |
-
COLOR_AFTER = "#4CAF82" # teal green
|
| 71 |
-
BG = "#1A1A2E"
|
| 72 |
-
PANEL = "#16213E"
|
| 73 |
-
TEXT = "#E0E0E0"
|
| 74 |
-
GRID = "#2A2A4A"
|
| 75 |
-
|
| 76 |
-
fig, ax = plt.subplots(figsize=(10, 6))
|
| 77 |
-
fig.patch.set_facecolor(BG)
|
| 78 |
-
ax.set_facecolor(PANEL)
|
| 79 |
-
|
| 80 |
-
bars_before = ax.bar(x - width/2, [baseline[t] for t in tasks],
|
| 81 |
-
width, label="Before (Rule-based)", color=COLOR_BEFORE,
|
| 82 |
-
zorder=3, edgecolor="none", linewidth=0)
|
| 83 |
-
bars_after = ax.bar(x + width/2, [after[t] for t in tasks],
|
| 84 |
-
width, label="After (LLM Agent)", color=COLOR_AFTER,
|
| 85 |
-
zorder=3, edgecolor="none", linewidth=0)
|
| 86 |
-
|
| 87 |
-
# ── Value labels on bars ─────────────────────────────────────────────────
|
| 88 |
-
for bar in bars_before:
|
| 89 |
-
h = bar.get_height()
|
| 90 |
-
ax.text(bar.get_x() + bar.get_width() / 2, h + 0.015,
|
| 91 |
-
f"{h:.2f}", ha="center", va="bottom",
|
| 92 |
-
color=COLOR_BEFORE, fontsize=11, fontweight="bold")
|
| 93 |
-
|
| 94 |
-
for bar in bars_after:
|
| 95 |
-
h = bar.get_height()
|
| 96 |
-
ax.text(bar.get_x() + bar.get_width() / 2, h + 0.015,
|
| 97 |
-
f"{h:.2f}", ha="center", va="bottom",
|
| 98 |
-
color=COLOR_AFTER, fontsize=11, fontweight="bold")
|
| 99 |
-
|
| 100 |
-
# ── Improvement arrows ───────────────────────────────────────────────────
|
| 101 |
-
for i, task in enumerate(tasks):
|
| 102 |
-
b, a = baseline[task], after[task]
|
| 103 |
-
delta = a - b
|
| 104 |
-
mid_x = x[i]
|
| 105 |
-
arrow_y = max(b, a) + 0.07
|
| 106 |
-
ax.annotate(
|
| 107 |
-
f"+{delta:.2f}",
|
| 108 |
-
xy=(mid_x, arrow_y),
|
| 109 |
-
ha="center", va="bottom",
|
| 110 |
-
color="#FFD700", fontsize=10, fontweight="bold",
|
| 111 |
-
)
|
| 112 |
-
|
| 113 |
-
# ── Axes styling ─────────────────────────────────────────────────────────
|
| 114 |
-
ax.set_xticks(x)
|
| 115 |
-
ax.set_xticklabels(tasks, color=TEXT, fontsize=13)
|
| 116 |
-
ax.set_ylim(0, 1.05)
|
| 117 |
-
ax.set_ylabel("Score (0.0 – 1.0)", color=TEXT, fontsize=12)
|
| 118 |
-
ax.set_xlabel("Environment Task", color=TEXT, fontsize=12)
|
| 119 |
-
ax.tick_params(colors=TEXT)
|
| 120 |
-
ax.yaxis.grid(True, color=GRID, linewidth=0.8, zorder=0)
|
| 121 |
-
ax.set_axisbelow(True)
|
| 122 |
-
for spine in ax.spines.values():
|
| 123 |
-
spine.set_visible(False)
|
| 124 |
-
|
| 125 |
-
# ── Title ────────────────────────────────────────────────────────────────
|
| 126 |
-
ax.set_title(
|
| 127 |
-
"Support Ticket Env — Reward Improvement\nRule-Based Baseline vs LLM Agent (Qwen2.5-72B)",
|
| 128 |
-
color=TEXT, fontsize=14, fontweight="bold", pad=16,
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
# ── Legend ───────────────────────────────────────────────────────────────
|
| 132 |
-
legend = ax.legend(
|
| 133 |
-
handles=[
|
| 134 |
-
mpatches.Patch(color=COLOR_BEFORE, label="Before (Rule-based Baseline)"),
|
| 135 |
-
mpatches.Patch(color=COLOR_AFTER, label="After (LLM Agent — Qwen2.5-72B)"),
|
| 136 |
-
],
|
| 137 |
-
facecolor=BG, edgecolor=GRID, labelcolor=TEXT, fontsize=11,
|
| 138 |
-
loc="upper right",
|
| 139 |
-
)
|
| 140 |
-
|
| 141 |
-
# ── Overall delta watermark ───────────────────────────────────────────────
|
| 142 |
-
overall_before = round(sum(baseline.values()) / len(baseline), 3)
|
| 143 |
-
overall_after = round(sum(after.values()) / len(after), 3)
|
| 144 |
-
fig.text(
|
| 145 |
-
0.5, 0.01,
|
| 146 |
-
f"Overall: {overall_before:.2f} → {overall_after:.2f} (+{overall_after - overall_before:.2f})",
|
| 147 |
-
ha="center", color="#FFD700", fontsize=11, fontweight="bold",
|
| 148 |
-
)
|
| 149 |
-
|
| 150 |
-
plt.tight_layout(rect=[0, 0.04, 1, 1])
|
| 151 |
-
fig.savefig(out_path, dpi=150, bbox_inches="tight", facecolor=BG)
|
| 152 |
-
print(f"[plot] Chart saved -> {out_path}", flush=True)
|
| 153 |
-
plt.close(fig)
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
def main():
|
| 157 |
-
parser = argparse.ArgumentParser(description="Plot reward curve chart")
|
| 158 |
-
parser.add_argument("--run-inference", action="store_true",
|
| 159 |
-
help="Run inference.py first and use live scores as AFTER values")
|
| 160 |
-
parser.add_argument("--out", default=os.path.join(os.path.dirname(__file__), "reward_curve.png"),
|
| 161 |
-
help="Output PNG path (default: reward_curve.png)")
|
| 162 |
-
args = parser.parse_args()
|
| 163 |
-
|
| 164 |
-
after_scores = run_inference_and_parse() if args.run_inference else AFTER
|
| 165 |
-
|
| 166 |
-
plot_chart(BASELINE, after_scores, args.out)
|
| 167 |
-
print("[plot] Done.", flush=True)
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
if __name__ == "__main__":
|
| 171 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reward_curve.png
DELETED
|
Binary file (73.2 kB)
|
|
|
uv.lock
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|