Vighnesh committed on
Commit
95dc191
·
1 Parent(s): a016315

Cleanup: remove junk files, update .gitignore

Browse files
Files changed (7) hide show
  1. .gitignore +37 -4
  2. fix_metadata.py +0 -12
  3. fix_readme.py +0 -100
  4. inference.py +0 -267
  5. plot_reward_curve.py +0 -171
  6. reward_curve.png +0 -0
  7. uv.lock +0 -0
.gitignore CHANGED
@@ -1,4 +1,37 @@
1
- __pycache__/
2
- *.pyc
3
- *.pyo
4
- token.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ *.egg
10
+
11
+ # Environments
12
+ venv/
13
+ .venv/
14
+ env/
15
+ .env
16
+
17
+ # Jupyter
18
+ .ipynb_checkpoints/
19
+ *.ipynb_checkpoints
20
+
21
+ # Secrets
22
+ token.txt
23
+ *.token
24
+ .env*
25
+
26
+ # Lock files (not needed for this project)
27
+ uv.lock
28
+
29
+ # OS
30
+ .DS_Store
31
+ Thumbs.db
32
+
33
+ # One-off scripts (not part of environment)
34
+ fix_*.py
35
+ inference.py
36
+ plot_reward_curve.py
37
+ reward_curve.png
fix_metadata.py DELETED
@@ -1,12 +0,0 @@
1
- content = open('server/support_environment.py', 'r', encoding='utf-8').read()
2
- content = content.replace('\tdef get_metadata', ' def get_metadata')
3
- content = content.replace('\t from openenv', ' from openenv')
4
- content = content.replace('\t return EnvironmentMetadata', ' return EnvironmentMetadata')
5
- content = content.replace('\t name=', ' name=')
6
- content = content.replace('\t description=', ' description=')
7
- content = content.replace('\t version=', ' version=')
8
- content = content.replace('\t author=', ' author=')
9
- content = content.replace('\t documentation_url=', ' documentation_url=')
10
- content = content.replace('\t )', ' )')
11
- open('server/support_environment.py', 'w', encoding='utf-8').write(content)
12
- print('Done!')
 
 
 
 
 
 
 
 
 
 
 
 
 
fix_readme.py DELETED
@@ -1,100 +0,0 @@
1
- f = open('README.md', 'w', encoding='utf-8')
2
- f.write('---\n')
3
- f.write('title: Support Ticket Env\n')
4
- f.write('emoji: \U0001f3ab\n')
5
- f.write('colorFrom: blue\n')
6
- f.write('colorTo: green\n')
7
- f.write('sdk: docker\n')
8
- f.write('tags:\n')
9
- f.write(' - openenv\n')
10
- f.write('pinned: false\n')
11
- f.write('---\n\n')
12
- f.write('# Customer Support Ticket Resolution Environment\n\n')
13
- f.write('A real-world [OpenEnv](https://github.com/meta-pytorch/OpenEnv) environment where an AI agent acts as a customer support executive, triaging and resolving incoming tickets.\n\n')
14
- f.write('## Overview\n\n')
15
- f.write('Customer support triage is one of the most common real-world tasks for AI agents. Every company handles thousands of tickets daily. Getting the classification wrong routes the ticket to the wrong team. Choosing the wrong action has direct business impact. This environment trains agents to handle exactly this challenge.\n\n')
16
- f.write('## Quick Start\n\n')
17
- f.write('```python\n')
18
- f.write('from support_ticket_env import SupportAction, SupportTicketEnv\n\n')
19
- f.write('with SupportTicketEnv(base_url="https://algocore-support-ticket-env.hf.space").sync() as env:\n')
20
- f.write(' # Task 1 - Classify a ticket\n')
21
- f.write(' result = env.reset(task_id=1, seed=42)\n')
22
- f.write(' print(result.observation.ticket_text)\n\n')
23
- f.write(' result = env.step(SupportAction(action_type="classify", category="billing"))\n')
24
- f.write(' print(result.reward) # 1.0 if correct\n')
25
- f.write('```\n\n')
26
- f.write('## Tasks\n\n')
27
- f.write('| Task | Difficulty | Description | Score Range |\n')
28
- f.write('|------|-----------|-------------|-------------|\n')
29
- f.write('| Task 1 | Easy | Classify ticket into correct category | 0.0 - 1.0 |\n')
30
- f.write('| Task 2 | Medium | Classify then choose correct action | 0.0 - 1.0 |\n')
31
- f.write('| Task 3 | Hard | Resolve a full queue of 3 tickets | 0.0 - 1.0 |\n\n')
32
- f.write('## Action Space\n\n')
33
- f.write('Actions are `SupportAction` Pydantic objects:\n\n')
34
- f.write('| Field | Type | Required | Values |\n')
35
- f.write('|-------|------|----------|--------|\n')
36
- f.write('| `action_type` | str | always | `classify` / `reply` / `escalate` / `close` |\n')
37
- f.write('| `category` | str | for classify | `billing` / `technical` / `account` / `general` / `refund` |\n')
38
- f.write('| `reply_text` | str | for reply | free text |\n')
39
- f.write('| `reason` | str | optional | free text |\n\n')
40
- f.write('## Observation Space\n\n')
41
- f.write('| Field | Type | Description |\n')
42
- f.write('|-------|------|-------------|\n')
43
- f.write('| `ticket_id` | str | Unique ticket ID |\n')
44
- f.write('| `ticket_text` | str | Customer message |\n')
45
- f.write('| `task_id` | int | 1, 2, or 3 |\n')
46
- f.write('| `current_category` | str | Category assigned so far |\n')
47
- f.write('| `resolved` | bool | Whether ticket is resolved |\n')
48
- f.write('| `step_count` | int | Steps taken this episode |\n')
49
- f.write('| `feedback` | str | Human-readable feedback |\n')
50
- f.write('| `reward` | float | Reward signal |\n')
51
- f.write('| `done` | bool | Episode finished |\n\n')
52
- f.write('## Reward Function\n\n')
53
- f.write('Rewards provide partial progress signals throughout the trajectory:\n\n')
54
- f.write('- **Task 1:** 1.0 for correct category, 0.0 for wrong\n')
55
- f.write('- **Task 2:** 1.0 correct action, 0.5 defensible alternative, 0.3 classification only\n')
56
- f.write('- **Task 3:** 0.20 classification + 0.40 action + 0.25 reply quality + 0.15 efficiency bonus\n')
57
- f.write('- **Penalty:** -0.05 per step over 10 (loop deterrent)\n\n')
58
- f.write('## Project Structure\n\n')
59
- f.write('```\n')
60
- f.write('support_ticket_env/\n')
61
- f.write('├── __init__.py # Package exports\n')
62
- f.write('├── models.py # SupportAction, SupportObservation, SupportState\n')
63
- f.write('├── tickets.py # Ticket dataset with ground-truth labels\n')
64
- f.write('├── graders.py # Reward/grader functions for all 3 tasks\n')
65
- f.write('├── client.py # EnvClient subclass\n')
66
- f.write('├── baseline.py # Baseline inference script\n')
67
- f.write('├── openenv.yaml # Environment metadata\n')
68
- f.write('├── Dockerfile # Container definition\n')
69
- f.write('└── server/\n')
70
- f.write(' ├── app.py # FastAPI entry point\n')
71
- f.write(' └── support_environment.py # Environment logic\n')
72
- f.write('```\n\n')
73
- f.write('## Setup\n\n')
74
- f.write('```bash\n')
75
- f.write('# Install dependencies\n')
76
- f.write('pip install openenv-core fastapi uvicorn pydantic gradio openai\n\n')
77
- f.write('# Run locally\n')
78
- f.write('cd support_ticket_env\n')
79
- f.write('uvicorn server.app:app --host 0.0.0.0 --port 7860\n\n')
80
- f.write('# Docker\n')
81
- f.write('docker build -t support-ticket-env .\n')
82
- f.write('docker run -p 7860:7860 support-ticket-env\n\n')
83
- f.write('# Run tests\n')
84
- f.write('python run_tests.py\n')
85
- f.write('```\n\n')
86
- f.write('## Baseline Scores\n\n')
87
- f.write('Measured with `gpt-4o-mini`, seeds `[42, 7, 123]`:\n\n')
88
- f.write('| Task | Avg Score |\n')
89
- f.write('|------|-----------|\n')
90
- f.write('| Task 1 - Classification | 0.87 |\n')
91
- f.write('| Task 2 - Action Selection | 0.71 |\n')
92
- f.write('| Task 3 - Full Resolution | 0.58 |\n')
93
- f.write('| **Overall** | **0.72** |\n\n')
94
- f.write('## Links\n\n')
95
- f.write('- **HuggingFace Space:** https://huggingface.co/spaces/AlgoCore/support-ticket-env\n')
96
- f.write('- **GitHub:** https://github.com/TryingHardToBeDeveloper/support-ticket-env\n')
97
- f.write('- **OpenEnv Docs:** https://meta-pytorch.org/OpenEnv/\n\n')
98
- f.write('## License\n\nMIT\n')
99
- f.close()
100
- print('Done!')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py DELETED
@@ -1,267 +0,0 @@
1
- """
2
- inference.py - Support Ticket Resolution Environment
3
- Follows mandatory [START] [STEP] [END] logging format.
4
- """
5
-
6
- import asyncio
7
- import os
8
- import sys
9
- import json
10
- import re
11
- from typing import List, Optional
12
-
13
- ROOT = os.path.dirname(os.path.abspath(__file__))
14
- sys.path.insert(0, ROOT)
15
-
16
- from openai import OpenAI
17
- from support_ticket_env.server.support_environment import SupportTicketEnvironment
18
- from support_ticket_env.models import SupportAction
19
-
20
- # ── Environment variables ────────────────────────────────────────
21
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
22
- API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
23
- MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
24
- TASK_NAME = "support-ticket-resolution"
25
- BENCHMARK = "support_ticket_env"
26
- MAX_STEPS = 10
27
- SUCCESS_SCORE_THRESHOLD = 0.5
28
-
29
- VALID_CATEGORIES = ["billing", "technical", "account", "general", "refund"]
30
- VALID_ACTIONS = ["classify", "reply", "escalate", "close"]
31
-
32
- SYSTEM_PROMPT = """You are a customer support AI agent handling tickets.
33
- You receive a JSON with ticket_text, task_id, feedback, and current_category.
34
-
35
- Respond ONLY with a JSON object (no markdown, no explanation):
36
- {
37
- "action_type": "classify" | "reply" | "escalate" | "close",
38
- "category": "billing" | "technical" | "account" | "general" | "refund",
39
- "reply_text": "...",
40
- "reason": "..."
41
- }
42
-
43
- Rules:
44
- - For task 1: use action_type=classify and pick the correct category.
45
- - For task 2: first classify, then on next step reply/escalate/close.
46
- - For task 3: classify each ticket then resolve it (classify first, then action).
47
- - category is ONLY needed when action_type=classify.
48
- - reply_text is ONLY needed when action_type=reply.
49
-
50
- Category detection rules:
51
- - billing: mentions charge, invoice, payment, bill, subscription, price, cost, fee
52
- - technical: mentions error, bug, crash, not working, broken, API, 500, upload, fail
53
- - account: mentions login, password, account, access, sign in, email, cancel, subscription cancel
54
- - refund: mentions refund, return, money back, reimburse, unused
55
- - general: mentions hours, phone, contact, business hours, information
56
-
57
- Action rules:
58
- - technical tickets -> escalate (include 'escalate' and 'engineering' in reason)
59
- - general tickets that are resolved/thank you -> close
60
- - all others -> reply
61
-
62
- When replying, your reply_text MUST include relevant keywords:
63
- - billing reply: include words like 'charge', 'invoice', 'payment', 'billing'
64
- - account reply: include words like 'account', 'password', 'login', 'subscription'
65
- - refund reply: include words like 'refund', 'return', 'credit', 'process'
66
- - general reply: include words like 'hours', 'contact', 'phone', 'information'
67
- - technical escalation reason: include 'engineering', 'escalate', 'bug', 'error'
68
- """
69
-
70
-
71
- def log_start(task: str, env: str, model: str) -> None:
72
- print(f"[START] task={task} env={env} model={model}", flush=True)
73
-
74
-
75
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
76
- error_val = error if error else "null"
77
- done_val = str(done).lower()
78
- print(
79
- f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
80
- flush=True,
81
- )
82
-
83
-
84
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
85
- rewards_str = ",".join(f"{r:.2f}" for r in rewards)
86
- print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
87
-
88
-
89
- def parse_response(text: str) -> dict:
90
- text = text.strip()
91
- text = re.sub(r"^```(?:json)?\s*", "", text)
92
- text = re.sub(r"\s*```$", "", text)
93
- try:
94
- return json.loads(text)
95
- except Exception:
96
- match = re.search(r"\{.*\}", text, re.DOTALL)
97
- if match:
98
- return json.loads(match.group())
99
- raise
100
-
101
-
102
- CATEGORY_KEYWORDS = {
103
- "billing": ["charge", "invoice", "payment", "bill", "refund", "subscription", "price", "cost", "fee", "money"],
104
- "technical": ["error", "bug", "crash", "not working", "broken", "issue", "problem", "fail", "500", "api"],
105
- "account": ["login", "password", "account", "access", "sign in", "email", "username", "cancel"],
106
- "refund": ["refund", "return", "money back", "reimburse", "cancel order"],
107
- "general": ["hours", "contact", "phone", "help", "question", "info", "support"],
108
- }
109
-
110
- def rule_based_action(obs) -> dict:
111
- """Simple deterministic fallback agent — no API needed."""
112
- text = obs.ticket_text.lower()
113
- # Classify by keywords
114
- if not obs.current_category:
115
- best_cat = "general"
116
- best_score = 0
117
- for cat, keywords in CATEGORY_KEYWORDS.items():
118
- score = sum(1 for kw in keywords if kw in text)
119
- if score > best_score:
120
- best_score = score
121
- best_cat = cat
122
- return {"action_type": "classify", "category": best_cat}
123
- # After classification — choose action based on category
124
- cat = obs.current_category
125
- if cat == "technical":
126
- return {"action_type": "escalate", "reason": "Technical issue requires engineering team"}
127
- elif cat == "general":
128
- return {"action_type": "close", "reason": "General inquiry resolved"}
129
- else:
130
- return {
131
- "action_type": "reply",
132
- "reply_text": f"Thank you for contacting us about your {cat} issue. We are looking into it and will resolve it shortly."
133
- }
134
-
135
-
136
- def get_model_action(client: OpenAI, obs, history: List[str]) -> dict:
137
- """Try LLM first, fall back to rule-based if API unavailable."""
138
- if not API_KEY:
139
- return rule_based_action(obs)
140
- user_prompt = json.dumps({
141
- "ticket_id": obs.ticket_id,
142
- "ticket_text": obs.ticket_text,
143
- "task_id": obs.task_id,
144
- "current_category": obs.current_category,
145
- "step_count": obs.step_count,
146
- "feedback": obs.feedback,
147
- })
148
- messages = [
149
- {"role": "system", "content": SYSTEM_PROMPT},
150
- {"role": "user", "content": user_prompt},
151
- ]
152
- try:
153
- completion = client.chat.completions.create(
154
- model=MODEL_NAME,
155
- messages=messages,
156
- temperature=0.0,
157
- max_tokens=256,
158
- stream=False,
159
- )
160
- text = (completion.choices[0].message.content or "").strip()
161
- return parse_response(text)
162
- except Exception as exc:
163
- print(f"[DEBUG] Model request failed, using fallback: {exc}", flush=True)
164
- return rule_based_action(obs)
165
- user_prompt = json.dumps({
166
- "ticket_id": obs.ticket_id,
167
- "ticket_text": obs.ticket_text,
168
- "task_id": obs.task_id,
169
- "current_category": obs.current_category,
170
- "step_count": obs.step_count,
171
- "feedback": obs.feedback,
172
- })
173
- messages = [
174
- {"role": "system", "content": SYSTEM_PROMPT},
175
- {"role": "user", "content": user_prompt},
176
- ]
177
- try:
178
- completion = client.chat.completions.create(
179
- model=MODEL_NAME,
180
- messages=messages,
181
- temperature=0.0,
182
- max_tokens=256,
183
- stream=False,
184
- )
185
- text = (completion.choices[0].message.content or "").strip()
186
- return parse_response(text)
187
- except Exception as exc:
188
- print(f"[DEBUG] Model request failed: {exc}", flush=True)
189
- return {"action_type": "classify", "category": "general"}
190
-
191
-
192
- def run_task(task_id: int, seed: int, client: OpenAI) -> float:
193
- env = SupportTicketEnvironment()
194
- obs = env.reset(task_id=task_id, seed=seed)
195
-
196
- history: List[str] = []
197
- rewards: List[float] = []
198
- steps_taken = 0
199
- score = 0.0
200
- success = False
201
-
202
- log_start(task=f"{TASK_NAME}-task{task_id}", env=BENCHMARK, model=MODEL_NAME)
203
-
204
- try:
205
- for step in range(1, MAX_STEPS + 1):
206
- if obs.done:
207
- break
208
-
209
- action_dict = get_model_action(client, obs, history)
210
- action_str = f"{action_dict.get('action_type','?')}"
211
- if action_dict.get("category"):
212
- action_str += f"/{action_dict['category']}"
213
-
214
- error = None
215
- try:
216
- action = SupportAction(**action_dict)
217
- obs = env.step(action)
218
- reward = obs.reward or 0.0
219
- done = obs.done
220
- except Exception as e:
221
- reward = 0.0
222
- done = False
223
- error = str(e)
224
-
225
- rewards.append(reward)
226
- steps_taken = step
227
-
228
- log_step(step=step, action=action_str, reward=reward, done=done, error=error)
229
-
230
- history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")
231
-
232
- if done:
233
- break
234
-
235
- total = sum(rewards)
236
- score = min(max(round(total / max(steps_taken, 1), 3), 0.0), 1.0)
237
- success = score >= SUCCESS_SCORE_THRESHOLD
238
-
239
- finally:
240
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
241
-
242
- return score
243
-
244
-
245
- def main() -> None:
246
- if not API_KEY:
247
- print("[DEBUG] HF_TOKEN not set", flush=True)
248
- sys.exit(1)
249
-
250
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
251
-
252
- all_scores = {}
253
- for task_id in [1, 2, 3]:
254
- scores = []
255
- for seed in [42, 7, 123, 0, 99]:
256
- score = run_task(task_id, seed, client)
257
- scores.append(score)
258
- avg = round(sum(scores) / len(scores), 4)
259
- all_scores[f"task{task_id}"] = avg
260
- print(f"[DEBUG] Task {task_id} avg score: {avg}", flush=True)
261
-
262
- overall = round(sum(all_scores.values()) / len(all_scores), 4)
263
- print(f"[DEBUG] Overall avg score: {overall}", flush=True)
264
-
265
-
266
- if __name__ == "__main__":
267
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
plot_reward_curve.py DELETED
@@ -1,171 +0,0 @@
1
- """
2
- plot_reward_curve.py — Reward curve bar chart for hackathon pitch.
3
- Shows Before (rule-based baseline) vs After (LLM agent) scores for Task 1/2/3.
4
-
5
- Usage:
6
- python plot_reward_curve.py # uses hardcoded scores
7
- python plot_reward_curve.py --run-inference # runs inference.py first (needs HF_TOKEN)
8
-
9
- Output: reward_curve.png (saved next to this script)
10
- """
11
-
12
- import os
13
- import sys
14
- import subprocess
15
- import json
16
- import re
17
- import argparse
18
-
19
- import matplotlib
20
- matplotlib.use("Agg") # headless — safe on all machines
21
- import matplotlib.pyplot as plt
22
- import matplotlib.patches as mpatches
23
- import numpy as np
24
-
25
- # ── Baseline scores (rule-based, from session recap) ────────────────────────
26
- BASELINE = {
27
- "Task 1": 0.10,
28
- "Task 2": 0.11,
29
- "Task 3": 0.26,
30
- }
31
-
32
- # ── After scores — override these after running inference, or use --run-inference
33
- AFTER = {
34
- "Task 1": 0.72,
35
- "Task 2": 0.65,
36
- "Task 3": 0.54,
37
- }
38
-
39
-
40
- def run_inference_and_parse() -> dict:
41
- """Run inference.py with seeds 42,7,123 and parse [DEBUG] avg lines."""
42
- print("[plot] Running inference.py to collect live scores...", flush=True)
43
- env = os.environ.copy()
44
- result = subprocess.run(
45
- [sys.executable, os.path.join(os.path.dirname(__file__), "inference.py")],
46
- capture_output=True, text=True, env=env
47
- )
48
- output = result.stdout + result.stderr
49
- print(output, flush=True)
50
-
51
- scores = {}
52
- for line in output.splitlines():
53
- m = re.search(r"\[DEBUG\] Task (\d) avg score: ([0-9.]+)", line)
54
- if m:
55
- scores[f"Task {m.group(1)}"] = float(m.group(2))
56
-
57
- if len(scores) < 3:
58
- print("[plot] WARNING: Could not parse all 3 task scores. Using hardcoded AFTER values.", flush=True)
59
- return AFTER
60
- return scores
61
-
62
-
63
- def plot_chart(baseline: dict, after: dict, out_path: str) -> None:
64
- tasks = list(baseline.keys())
65
- x = np.arange(len(tasks))
66
- width = 0.32
67
-
68
- # ── Colours ─────────────────────────────────────────────────────────────
69
- COLOR_BEFORE = "#E05A5A" # warm red
70
- COLOR_AFTER = "#4CAF82" # teal green
71
- BG = "#1A1A2E"
72
- PANEL = "#16213E"
73
- TEXT = "#E0E0E0"
74
- GRID = "#2A2A4A"
75
-
76
- fig, ax = plt.subplots(figsize=(10, 6))
77
- fig.patch.set_facecolor(BG)
78
- ax.set_facecolor(PANEL)
79
-
80
- bars_before = ax.bar(x - width/2, [baseline[t] for t in tasks],
81
- width, label="Before (Rule-based)", color=COLOR_BEFORE,
82
- zorder=3, edgecolor="none", linewidth=0)
83
- bars_after = ax.bar(x + width/2, [after[t] for t in tasks],
84
- width, label="After (LLM Agent)", color=COLOR_AFTER,
85
- zorder=3, edgecolor="none", linewidth=0)
86
-
87
- # ── Value labels on bars ─────────────────────────────────────────────────
88
- for bar in bars_before:
89
- h = bar.get_height()
90
- ax.text(bar.get_x() + bar.get_width() / 2, h + 0.015,
91
- f"{h:.2f}", ha="center", va="bottom",
92
- color=COLOR_BEFORE, fontsize=11, fontweight="bold")
93
-
94
- for bar in bars_after:
95
- h = bar.get_height()
96
- ax.text(bar.get_x() + bar.get_width() / 2, h + 0.015,
97
- f"{h:.2f}", ha="center", va="bottom",
98
- color=COLOR_AFTER, fontsize=11, fontweight="bold")
99
-
100
- # ── Improvement arrows ───────────────────────────────────────────────────
101
- for i, task in enumerate(tasks):
102
- b, a = baseline[task], after[task]
103
- delta = a - b
104
- mid_x = x[i]
105
- arrow_y = max(b, a) + 0.07
106
- ax.annotate(
107
- f"+{delta:.2f}",
108
- xy=(mid_x, arrow_y),
109
- ha="center", va="bottom",
110
- color="#FFD700", fontsize=10, fontweight="bold",
111
- )
112
-
113
- # ── Axes styling ─────────────────────────────────────────────────────────
114
- ax.set_xticks(x)
115
- ax.set_xticklabels(tasks, color=TEXT, fontsize=13)
116
- ax.set_ylim(0, 1.05)
117
- ax.set_ylabel("Score (0.0 – 1.0)", color=TEXT, fontsize=12)
118
- ax.set_xlabel("Environment Task", color=TEXT, fontsize=12)
119
- ax.tick_params(colors=TEXT)
120
- ax.yaxis.grid(True, color=GRID, linewidth=0.8, zorder=0)
121
- ax.set_axisbelow(True)
122
- for spine in ax.spines.values():
123
- spine.set_visible(False)
124
-
125
- # ── Title ────────────────────────────────────────────────────────────────
126
- ax.set_title(
127
- "Support Ticket Env — Reward Improvement\nRule-Based Baseline vs LLM Agent (Qwen2.5-72B)",
128
- color=TEXT, fontsize=14, fontweight="bold", pad=16,
129
- )
130
-
131
- # ── Legend ───────────────────────────────────────────────────────────────
132
- legend = ax.legend(
133
- handles=[
134
- mpatches.Patch(color=COLOR_BEFORE, label="Before (Rule-based Baseline)"),
135
- mpatches.Patch(color=COLOR_AFTER, label="After (LLM Agent — Qwen2.5-72B)"),
136
- ],
137
- facecolor=BG, edgecolor=GRID, labelcolor=TEXT, fontsize=11,
138
- loc="upper right",
139
- )
140
-
141
- # ── Overall delta watermark ───────────────────────────────────────────────
142
- overall_before = round(sum(baseline.values()) / len(baseline), 3)
143
- overall_after = round(sum(after.values()) / len(after), 3)
144
- fig.text(
145
- 0.5, 0.01,
146
- f"Overall: {overall_before:.2f} → {overall_after:.2f} (+{overall_after - overall_before:.2f})",
147
- ha="center", color="#FFD700", fontsize=11, fontweight="bold",
148
- )
149
-
150
- plt.tight_layout(rect=[0, 0.04, 1, 1])
151
- fig.savefig(out_path, dpi=150, bbox_inches="tight", facecolor=BG)
152
- print(f"[plot] Chart saved -> {out_path}", flush=True)
153
- plt.close(fig)
154
-
155
-
156
- def main():
157
- parser = argparse.ArgumentParser(description="Plot reward curve chart")
158
- parser.add_argument("--run-inference", action="store_true",
159
- help="Run inference.py first and use live scores as AFTER values")
160
- parser.add_argument("--out", default=os.path.join(os.path.dirname(__file__), "reward_curve.png"),
161
- help="Output PNG path (default: reward_curve.png)")
162
- args = parser.parse_args()
163
-
164
- after_scores = run_inference_and_parse() if args.run_inference else AFTER
165
-
166
- plot_chart(BASELINE, after_scores, args.out)
167
- print("[plot] Done.", flush=True)
168
-
169
-
170
- if __name__ == "__main__":
171
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
reward_curve.png DELETED
Binary file (73.2 kB)
 
uv.lock DELETED
The diff for this file is too large to render. See raw diff