vivekvish2004 committed on
Commit
08e86b6
·
1 Parent(s): 44db10b

feat: proper task design — realistic scenarios, clearer graders, auto-validate

Browse files

Task Design (evaluation criterion):
- tasks.json: 7 tasks with real-world scenarios embedded, example_input/actions,
scoring breakdowns, passing_threshold, actions_required per task
- EASY: binary single-action tasks (classify OR priority only)
- MEDIUM: 2-step tasks with partial credit (0.5 each)
- HARD: multi-step with strict per-component scoring
- env.py: 12 rich real-world scenarios covering all 6 categories:
refund/technical_issue/login_issue/general_inquiry/security/feedback
Each with context fields, varied sentiments, realistic urgency signals
- Better reward messages (emojis, specific feedback)
- Stricter resolve: requires classify+priority+response first
- Empathy detection improved for angry/panicked/concerned sentiment

Fix duplicate Operation ID warning in /reset endpoint

scripts/validate-submission.sh:
- Auto-starts server if not running (no manual server needed)
- Waits up to 15s for server ready
- Tests task design fields + difficulty spread
- Cleans up server after test
- Exit codes: 0=pass, 1=fail

Files changed (4) hide show
  1. scripts/validate-submission.sh +197 -66
  2. server/app.py +3 -2
  3. server/env.py +170 -85
  4. tasks.json +104 -28
scripts/validate-submission.sh CHANGED
@@ -1,132 +1,263 @@
1
  #!/usr/bin/env bash
2
  # ============================================================
3
  # OpenEnv Submission Validator
4
- # Tests all 4 evaluation criteria locally before submission
5
- # Usage: bash scripts/validate-submission.sh
6
  # ============================================================
7
 
8
- set -e
9
- BASE="http://localhost:7860"
10
  PASS=0
11
  FAIL=0
12
- RED='\033[0;31m'
13
- GREEN='\033[0;32m'
14
- YELLOW='\033[1;33m'
15
- NC='\033[0m'
16
 
17
  ok() { echo -e " ${GREEN}✅ PASS${NC} — $1"; ((PASS++)); }
18
  fail() { echo -e " ${RED}❌ FAIL${NC} — $1"; ((FAIL++)); }
19
  info() { echo -e " ${YELLOW}ℹ️ ${NC} $1"; }
 
20
 
21
  echo ""
22
  echo "╔══════════════════════════════════════════════════════╗"
23
  echo "║ OpenEnv Customer Support — Submission Validator ║"
24
  echo "╚══════════════════════════════════════════════════════╝"
25
- echo ""
26
 
27
- # ── 1. Runtime Correctness ──────────────────────────────────
28
- echo "▶ 1. RUNTIME CORRECTNESS — Runs without errors"
 
 
 
29
 
30
- STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/health")
31
- if [ "$STATUS" = "200" ]; then
32
- HEALTH=$(curl -s "$BASE/health")
33
- if echo "$HEALTH" | grep -q '"healthy"'; then
34
- ok "/health returns {status: healthy} (HTTP 200)"
35
  else
36
- fail "/health response missing 'healthy': $HEALTH"
 
37
  fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  else
39
- fail "/health endpoint unreachable (HTTP $STATUS)"
40
  fi
41
 
 
42
  STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/openapi.json")
43
- [ "$STATUS" = "200" ] && ok "/openapi.json reachable (HTTP 200)" || fail "/openapi.json not found"
44
 
45
- echo ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- # ── 2. Interface Compliance ─────────────────────────────────
48
- echo "▶ 2. INTERFACE COMPLIANCE — Follows OpenEnv standard"
 
 
49
 
50
  # /metadata
51
  META=$(curl -s "$BASE/metadata")
52
  echo "$META" | grep -q '"name"' && echo "$META" | grep -q '"description"' \
53
- && ok "/metadata has name + description" \
54
- || fail "/metadata missing required fields: $META"
55
 
56
  # /schema
57
  SCHEMA=$(curl -s "$BASE/schema")
58
- echo "$SCHEMA" | grep -q '"action"' && echo "$SCHEMA" | grep -q '"observation"' && echo "$SCHEMA" | grep -q '"state"' \
59
- && ok "/schema has action + observation + state" \
60
- || fail "/schema missing required fields"
 
 
 
 
 
 
61
 
62
- # /reset
63
  RESET=$(curl -s -X POST "$BASE/reset")
64
- echo "$RESET" | grep -q '"observation"' \
65
- && ok "/reset returns observation" \
66
- || fail "/reset bad response: $RESET"
 
 
 
 
 
 
67
 
68
- # /step
69
  STEP=$(curl -s -X POST "$BASE/step" \
70
  -H "Content-Type: application/json" \
71
- -d '{"action_type":"classify_ticket","payload":{"classification":"refund"}}')
72
- echo "$STEP" | grep -q '"observation"' && echo "$STEP" | grep -q '"reward"' && echo "$STEP" | grep -q '"done"' \
73
- && ok "/step returns observation + reward + done" \
74
- || fail "/step bad response: $STEP"
 
 
 
 
 
 
 
 
 
 
75
 
76
- # /state
77
  STATE=$(curl -s "$BASE/state")
78
- echo "$STATE" | grep -q '"observation"' \
79
- && ok "/state returns observation" \
80
- || fail "/state bad response: $STATE"
81
 
82
- echo ""
 
 
83
 
84
- # ── 3. Task Design ──────────────────────────────────────────
85
- echo "▶ 3. TASK DESIGN — Clear, realistic, testable"
 
 
86
 
87
  TASKS=$(curl -s "$BASE/tasks")
88
- TASK_COUNT=$(echo "$TASKS" | python3 -c "import sys,json; t=json.load(sys.stdin); print(len(t))" 2>/dev/null)
89
  GRADED_COUNT=$(echo "$TASKS" | python3 -c "import sys,json; t=json.load(sys.stdin); print(sum(1 for x in t if x.get('grader')))" 2>/dev/null)
90
 
91
- info "Found $TASK_COUNT tasks, $GRADED_COUNT with graders"
92
- [ "$TASK_COUNT" -ge 3 ] && ok "At least 3 tasks defined" || fail "Need ≥3 tasks, found $TASK_COUNT"
93
- [ "$GRADED_COUNT" -ge 3 ] && ok "At least 3 tasks have graders" || fail "Need ≥3 tasks with graders, found $GRADED_COUNT"
94
 
95
- echo ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- # ── 4. Grading Logic ────────────────────────────────────────
98
- echo "▶ 4. GRADING LOGIC — Reward system makes sense"
 
 
99
 
100
- TASK_IDS=$(echo "$TASKS" | python3 -c "import sys,json; t=json.load(sys.stdin); [print(x['id']) for x in t if x.get('grader')]" 2>/dev/null)
 
 
 
 
 
101
 
102
  GRADER_OK=0
103
  GRADER_FAIL=0
104
  for TID in $TASK_IDS; do
105
  RESULT=$(curl -s "$BASE/grader?task_id=$TID")
106
- SCORE=$(echo "$RESULT" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('score',''))" 2>/dev/null)
107
- if [ -n "$SCORE" ]; then
108
- IN_RANGE=$(python3 -c "s=float('$SCORE'); print('ok' if 0.0<=s<=1.0 else 'fail')" 2>/dev/null)
109
- if [ "$IN_RANGE" = "ok" ]; then
110
- info "$TID score=$SCORE ✅"
111
- ((GRADER_OK++))
112
- else
113
- info "$TID → score=$SCORE ❌ (out of range)"
114
- ((GRADER_FAIL++))
115
- fi
 
 
 
116
  else
117
- info "$TID → grader error: $RESULT"
118
  ((GRADER_FAIL++))
119
  fi
120
  done
121
 
122
- [ "$GRADER_OK" -ge 3 ] && ok "$GRADER_OK graders return valid scores in [0.0, 1.0]" \
123
- || fail "Only $GRADER_OK graders valid, need ≥3"
 
124
 
 
 
 
125
  echo ""
126
- echo "══════════════════════════════════════════════════════"
127
  echo -e " Results: ${GREEN}$PASS passed${NC} | ${RED}$FAIL failed${NC}"
128
- echo "══════════════════════════════════════════════════════"
129
  echo ""
 
 
 
 
 
 
 
130
  if [ "$FAIL" -eq 0 ]; then
131
  echo -e "${GREEN}🎉 ALL CHECKS PASSED — Ready for OpenEnv submission!${NC}"
132
  exit 0
 
1
  #!/usr/bin/env bash
2
  # ============================================================
3
  # OpenEnv Submission Validator
4
+ # Tests all 4 evaluation criteria — auto-starts server if not running
5
+ # Usage: bash scripts/validate-submission.sh [port]
6
  # ============================================================
7
 
8
+ PORT="${1:-7860}"
9
+ BASE="http://localhost:$PORT"
10
  PASS=0
11
  FAIL=0
12
+ SERVER_STARTED=false
13
+
14
+ RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
 
15
 
16
  ok() { echo -e " ${GREEN}✅ PASS${NC} — $1"; ((PASS++)); }
17
  fail() { echo -e " ${RED}❌ FAIL${NC} — $1"; ((FAIL++)); }
18
  info() { echo -e " ${YELLOW}ℹ️ ${NC} $1"; }
19
+ hdr() { echo -e "\n${BLUE}▶ $1${NC}"; }
20
 
21
  echo ""
22
  echo "╔══════════════════════════════════════════════════════╗"
23
  echo "║ OpenEnv Customer Support — Submission Validator ║"
24
  echo "╚══════════════════════════════════════════════════════╝"
 
25
 
26
+ # ── Auto-start server if not running ─────────────────────────
27
+ echo ""
28
+ echo "Checking server at $BASE ..."
29
+ if ! curl -s --max-time 2 "$BASE/health" > /dev/null 2>&1; then
30
+ echo -e "${YELLOW}⚡ Server not running — starting on port $PORT ...${NC}"
31
 
32
+ # Find python/uvicorn
33
+ if [ -f ".venv/bin/python" ]; then
34
+ PYTHON=".venv/bin/python"
35
+ elif command -v python3 &>/dev/null; then
36
+ PYTHON="python3"
37
  else
38
+ echo -e "${RED}❌ Python not found. Activate venv first.${NC}"
39
+ exit 1
40
  fi
41
+
42
+ $PYTHON -m uvicorn server.app:app --host 0.0.0.0 --port "$PORT" --log-level warning &
43
+ SERVER_PID=$!
44
+ SERVER_STARTED=true
45
+
46
+ # Wait for server to be ready (up to 15s)
47
+ for i in {1..15}; do
48
+ sleep 1
49
+ if curl -s --max-time 1 "$BASE/health" > /dev/null 2>&1; then
50
+ echo -e "${GREEN}✅ Server ready on port $PORT${NC}"
51
+ break
52
+ fi
53
+ if [ "$i" -eq 15 ]; then
54
+ echo -e "${RED}❌ Server failed to start after 15s${NC}"
55
+ kill $SERVER_PID 2>/dev/null
56
+ exit 1
57
+ fi
58
+ done
59
+ else
60
+ echo -e "${GREEN}✅ Server already running${NC}"
61
+ fi
62
+
63
+ echo ""
64
+
65
+ # ══════════════════════════════════════════════════════════════
66
+ # 1. RUNTIME CORRECTNESS — Runs without errors
67
+ # ══════════════════════════════════════════════════════════════
68
+ hdr "1. RUNTIME CORRECTNESS — Runs without errors"
69
+
70
+ # Health
71
+ HEALTH=$(curl -s "$BASE/health")
72
+ if echo "$HEALTH" | grep -q '"healthy"'; then
73
+ ok "/health → {status: healthy}"
74
  else
75
+ fail "/health bad response: $HEALTH"
76
  fi
77
 
78
+ # OpenAPI docs
79
  STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$BASE/openapi.json")
80
+ [ "$STATUS" = "200" ] && ok "/openapi.json available (HTTP 200)" || fail "/openapi.json HTTP $STATUS"
81
 
82
+ # Reset doesn't crash
83
+ RESET=$(curl -s -X POST "$BASE/reset" 2>&1)
84
+ if echo "$RESET" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then
85
+ ok "/reset returns valid JSON"
86
+ else
87
+ fail "/reset error: $RESET"
88
+ fi
89
+
90
+ # Step doesn't crash
91
+ STEP=$(curl -s -X POST "$BASE/step" \
92
+ -H "Content-Type: application/json" \
93
+ -d '{"action_type":"classify_ticket","payload":{"classification":"refund"}}' 2>&1)
94
+ if echo "$STEP" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then
95
+ ok "/step returns valid JSON"
96
+ else
97
+ fail "/step error: $STEP"
98
+ fi
99
 
100
+ # ══════════════════════════════════════════════════════════════
101
+ # 2. INTERFACE COMPLIANCE — Follows OpenEnv standard
102
+ # ══════════════════════════════════════════════════════════════
103
+ hdr "2. INTERFACE COMPLIANCE — Follows OpenEnv standard"
104
 
105
  # /metadata
106
  META=$(curl -s "$BASE/metadata")
107
  echo "$META" | grep -q '"name"' && echo "$META" | grep -q '"description"' \
108
+ && ok "/metadata has required fields (name, description)" \
109
+ || fail "/metadata missing fields: $META"
110
 
111
  # /schema
112
  SCHEMA=$(curl -s "$BASE/schema")
113
+ if echo "$SCHEMA" | python3 -c "
114
+ import sys, json
115
+ d = json.load(sys.stdin)
116
+ assert 'action' in d and 'observation' in d and 'state' in d
117
+ " 2>/dev/null; then
118
+ ok "/schema has action + observation + state"
119
+ else
120
+ fail "/schema missing required fields: $SCHEMA"
121
+ fi
122
 
123
+ # /reset response shape
124
  RESET=$(curl -s -X POST "$BASE/reset")
125
+ if echo "$RESET" | python3 -c "
126
+ import sys, json
127
+ d = json.load(sys.stdin)
128
+ assert 'observation' in d and 'reward' in d and 'done' in d
129
+ " 2>/dev/null; then
130
+ ok "/reset response has {observation, reward, done}"
131
+ else
132
+ fail "/reset bad shape: $RESET"
133
+ fi
134
 
135
+ # /step response shape
136
  STEP=$(curl -s -X POST "$BASE/step" \
137
  -H "Content-Type: application/json" \
138
+ -d '{"action_type":"assign_priority","payload":{"priority":"high"}}')
139
+ if echo "$STEP" | python3 -c "
140
+ import sys, json
141
+ d = json.load(sys.stdin)
142
+ assert 'observation' in d
143
+ assert 'reward' in d
144
+ assert 'done' in d
145
+ r = d['reward']
146
+ assert isinstance(r, (int, float)), f'reward must be float, got {type(r)}'
147
+ " 2>/dev/null; then
148
+ ok "/step returns {observation, reward(float), done, info}"
149
+ else
150
+ fail "/step bad shape or reward not float: $STEP"
151
+ fi
152
 
153
+ # /state response shape
154
  STATE=$(curl -s "$BASE/state")
155
+ echo "$STATE" | grep -q '"observation"' && ok "/state returns {observation}" || fail "/state bad shape: $STATE"
 
 
156
 
157
+ # /mcp JSON-RPC
158
+ MCP=$(curl -s -X POST "$BASE/mcp" -H "Content-Type: application/json" -d '{"jsonrpc":"2.0","method":"initialize","id":1}')
159
+ echo "$MCP" | grep -q '"jsonrpc"' && ok "/mcp returns JSON-RPC 2.0 response" || fail "/mcp bad response: $MCP"
160
 
161
+ # ══════════════════════════════════════════════════════════════
162
+ # 3. TASK DESIGN — Clear, realistic, testable
163
+ # ══════════════════════════════════════════════════════════════
164
+ hdr "3. TASK DESIGN — Clear, realistic, testable"
165
 
166
  TASKS=$(curl -s "$BASE/tasks")
167
+ TASK_COUNT=$(echo "$TASKS" | python3 -c "import sys,json; print(len(json.load(sys.stdin)))" 2>/dev/null)
168
  GRADED_COUNT=$(echo "$TASKS" | python3 -c "import sys,json; t=json.load(sys.stdin); print(sum(1 for x in t if x.get('grader')))" 2>/dev/null)
169
 
170
+ info "Total tasks: $TASK_COUNT | Tasks with graders: $GRADED_COUNT"
 
 
171
 
172
+ [ "${TASK_COUNT:-0}" -ge 3 ] \
173
+ && ok "≥3 tasks defined (found $TASK_COUNT)" \
174
+ || fail "Need ≥3 tasks, found ${TASK_COUNT:-0}"
175
+
176
+ [ "${GRADED_COUNT:-0}" -ge 3 ] \
177
+ && ok "≥3 tasks have grader=true (found $GRADED_COUNT)" \
178
+ || fail "Need ≥3 graded tasks, found ${GRADED_COUNT:-0}"
179
+
180
+ # Check tasks have required design fields
181
+ DESIGN_OK=$(echo "$TASKS" | python3 -c "
182
+ import sys, json
183
+ tasks = json.load(sys.stdin)
184
+ required = ['id','name','difficulty','objective','description']
185
+ missing = []
186
+ for t in tasks:
187
+ for f in required:
188
+ if f not in t:
189
+ missing.append(f'{t.get(\"id\",\"?\")}:{f}')
190
+ print(len(missing))
191
+ " 2>/dev/null)
192
+ [ "${DESIGN_OK:-1}" -eq 0 ] \
193
+ && ok "All tasks have required design fields (id, name, difficulty, objective, description)" \
194
+ || fail "Some tasks missing design fields: $DESIGN_OK"
195
+
196
+ # Check difficulty spread
197
+ DIFF_SPREAD=$(echo "$TASKS" | python3 -c "
198
+ import sys,json
199
+ t = json.load(sys.stdin)
200
+ diffs = set(x.get('difficulty','') for x in t)
201
+ print('ok' if len(diffs) >= 2 else 'fail')
202
+ " 2>/dev/null)
203
+ [ "$DIFF_SPREAD" = "ok" ] \
204
+ && ok "Tasks span multiple difficulty levels (EASY / MEDIUM / HARD)" \
205
+ || fail "All tasks same difficulty — needs spread"
206
 
207
+ # ══════════════════════════════════════════════════════════════
208
+ # 4. GRADING LOGIC — Reward system makes sense
209
+ # ══════════════════════════════════════════════════════════════
210
+ hdr "4. GRADING LOGIC — Reward system in [0.0, 1.0]"
211
 
212
+ TASK_IDS=$(echo "$TASKS" | python3 -c "
213
+ import sys,json
214
+ t=json.load(sys.stdin)
215
+ for x in t:
216
+ if x.get('grader'): print(x['id'])
217
+ " 2>/dev/null)
218
 
219
  GRADER_OK=0
220
  GRADER_FAIL=0
221
  for TID in $TASK_IDS; do
222
  RESULT=$(curl -s "$BASE/grader?task_id=$TID")
223
+ CHECK=$(echo "$RESULT" | python3 -c "
224
+ import sys,json
225
+ d=json.load(sys.stdin)
226
+ s=float(d.get('score','-1'))
227
+ if 0.0 <= s <= 1.0:
228
+ print(f'ok:{s}')
229
+ else:
230
+ print(f'fail:{s}')
231
+ " 2>/dev/null)
232
+ if echo "$CHECK" | grep -q "^ok:"; then
233
+ SCORE=$(echo "$CHECK" | cut -d: -f2)
234
+ info "$TID → score=$SCORE ✅"
235
+ ((GRADER_OK++))
236
  else
237
+ info "$TID → grader error or out-of-range: $RESULT"
238
  ((GRADER_FAIL++))
239
  fi
240
  done
241
 
242
+ [ "$GRADER_OK" -ge 3 ] \
243
+ && ok "$GRADER_OK graders return valid scores in [0.0, 1.0]" \
244
+ || fail "Only $GRADER_OK graders valid — need ≥3"
245
 
246
+ # ══════════════════════════════════════════════════════════════
247
+ # Summary
248
+ # ══════════════════════════════════════════════════════════════
249
  echo ""
250
+ echo "══════════════════════════════════════════════════════════"
251
  echo -e " Results: ${GREEN}$PASS passed${NC} | ${RED}$FAIL failed${NC}"
252
+ echo "══════════════════════════════════════════════════════════"
253
  echo ""
254
+
255
+ # Cleanup server if we started it
256
+ if [ "$SERVER_STARTED" = true ] && [ -n "$SERVER_PID" ]; then
257
+ kill $SERVER_PID 2>/dev/null
258
+ echo "Server stopped."
259
+ fi
260
+
261
  if [ "$FAIL" -eq 0 ]; then
262
  echo -e "${GREEN}🎉 ALL CHECKS PASSED — Ready for OpenEnv submission!${NC}"
263
  exit 0
server/app.py CHANGED
@@ -107,9 +107,10 @@ def get_schema():
107
  }
108
 
109
 
110
- @app.api_route("/reset", methods=["GET", "POST"], tags=["Environment Control"])
 
111
  def reset_env():
112
- """Reset the environment and yield the initial observation."""
113
  obs = env_instance.reset()
114
  return {
115
  "observation": obs.state,
 
107
  }
108
 
109
 
110
+ @app.get("/reset", tags=["Environment Control"], operation_id="reset_env_get")
111
+ @app.post("/reset", tags=["Environment Control"], operation_id="reset_env_post")
112
  def reset_env():
113
+ """Reset the environment and yield the initial observation (GET or POST)."""
114
  obs = env_instance.reset()
115
  return {
116
  "observation": obs.state,
server/env.py CHANGED
@@ -1,112 +1,170 @@
1
  import random
2
- import time
3
  import copy
4
  from typing import Tuple, List, Dict, Any
5
  from server.models import Action, Observation, Reward
6
  from server.tasks import TASKS
7
 
8
- # Expanded Scenarios with SLA metadata
 
 
9
  SCENARIOS = [
 
10
  {
11
- "ticket_text": "I bought a premium subscription but it's not working. I want my money back right now!",
12
  "sentiment": "angry",
13
  "expected_classification": "refund",
14
  "expected_priority": "high",
15
  "sla_steps": 5,
 
16
  },
 
17
  {
18
- "ticket_text": "How do I change my profile picture? I tried looking in the settings but couldn't find it.",
19
  "sentiment": "neutral",
20
- "expected_classification": "general_inquiry",
21
- "expected_priority": "low",
22
  "sla_steps": 8,
 
 
 
 
 
 
 
 
 
 
23
  },
 
24
  {
25
- "ticket_text": "I can't log into my account, and I have a huge presentation in 10 minutes that needs the data!",
26
  "sentiment": "panicked",
27
- "expected_classification": "login_issue",
28
  "expected_priority": "high",
29
  "sla_steps": 3,
 
30
  },
 
31
  {
32
- "ticket_text": "The latest update keeps crashing on my Android phone. Please fix it ASAP.",
33
- "sentiment": "angry",
34
  "expected_classification": "technical_issue",
35
- "expected_priority": "medium",
36
- "sla_steps": 6,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  },
 
38
  {
39
- "ticket_text": "Do you offer any discounts for students or non-profits?",
40
  "sentiment": "curious",
41
  "expected_classification": "general_inquiry",
42
  "expected_priority": "low",
43
  "sla_steps": 10,
 
44
  },
 
45
  {
46
- "ticket_text": "I just wanted to say that your support team is amazing! Thank you for the quick help.",
47
- "sentiment": "happy",
48
- "expected_classification": "feedback",
49
  "expected_priority": "low",
50
- "sla_steps": 12,
 
 
 
 
 
 
 
 
 
 
51
  },
 
52
  {
53
- "ticket_text": "I'm worried about my data privacy after the recent news. Can you explain your encryption?",
54
  "sentiment": "concerned",
55
  "expected_classification": "security",
56
  "expected_priority": "medium",
57
  "sla_steps": 7,
58
- }
 
 
 
 
 
 
 
 
 
 
59
  ]
60
 
 
61
  class CustomerSupportEnv:
62
  def __init__(self):
63
  """Initialize the Enterprise AI Customer Support environment."""
64
  self.queue: List[Dict] = []
65
  self.resolved_count = 0
66
  self.total_reward = 0.0
67
- self.max_steps_per_ticket = 10
68
  self.current_step = 0
69
- self.actions_taken = set()
70
- self.history = []
71
 
72
  def reset(self) -> Observation:
73
- """Initialize a new enterprise session with a queue of tickets."""
74
- # Pick 3 random unique scenarios for the queue
75
  self.queue = [copy.deepcopy(s) for s in random.sample(SCENARIOS, 3)]
76
  self.resolved_count = 0
77
  self.total_reward = 0.0
78
  self.current_step = 0
79
  self.actions_taken = set()
80
  self.history = []
81
-
82
  return self.state()
83
 
84
  def state(self) -> Observation:
85
  """Standard OpenEnv API: Retrieve the current observation state."""
86
- # Shared info for both state and info fields to satisfy frontend expectations
87
  current_info = {
88
- "queue": [t["ticket_text"][:30] + "..." for t in self.queue],
89
  "resolved": self.resolved_count,
90
  "total_reward": self.total_reward,
91
- "queue_size": len(self.queue)
92
  }
93
 
94
  if not self.queue:
95
  return Observation(
96
  state={
97
- "status": "session_complete",
98
  "message": "All tickets in queue processed.",
99
  "total_reward": self.total_reward,
100
  "resolved": self.resolved_count,
101
- "info": current_info
102
  },
103
- info=current_info
104
  )
105
-
106
  ticket = self.queue[0]
107
  obs_state = {
108
  "ticket_text": ticket["ticket_text"],
109
  "sentiment": ticket["sentiment"],
 
110
  "priority": ticket.get("priority"),
111
  "status": ticket.get("status", "open"),
112
  "steps_taken": self.current_step,
@@ -118,29 +176,25 @@ class CustomerSupportEnv:
118
  "total_reward": self.total_reward,
119
  "resolved": self.resolved_count,
120
  "last_step_status": self.history[-1]["status"] if self.history else "neutral",
121
- "info": current_info # Redundant but fixes frontend lookups
122
  }
123
-
124
  return Observation(state=obs_state, info=current_info)
125
 
126
  @property
127
- def current_state(self):
128
- """Helper for the grader to access the current ticket state dictionary."""
129
- obs = self.state()
130
- return obs.state
131
 
132
  @property
133
- def ground_truth(self):
134
- """Helper for the grader to access the expected values of the current ticket."""
135
- if not self.queue:
136
- return None
137
- return self.queue[0]
138
 
139
  # Static tasks attribute for discovery
140
  tasks = TASKS
141
 
142
  def get_tasks(self) -> List[Dict]:
143
- """Expose available tasks for OpenEnv discovery as a method."""
144
  return TASKS
145
 
146
  def grade(self, task_id: str, history: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
@@ -148,10 +202,9 @@ class CustomerSupportEnv:
148
  return self.grade_task(task_id, history, ground_truth)
149
 
150
  def grade_task(self, task_id: str, history: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
151
- """Convenience method for the validator to grade a specific task execution."""
152
  from server.grader import score_episode
153
 
154
- # Determine difficulty from task definition
155
  diff = "EASY"
156
  for t in TASKS:
157
  if t["id"] == task_id:
@@ -169,88 +222,120 @@ class CustomerSupportEnv:
169
  reward_val = 0.0
170
  is_terminal = False
171
  message = ""
172
-
173
  current_ticket = self.queue[0]
174
  a_type = action.action_type
175
  payload = action.payload
176
 
177
- # Action Logic
178
  if a_type == "classify_ticket":
179
- cat = payload.get("classification")
180
  current_ticket["classification"] = cat
181
- reward_val += 0.3 if cat == current_ticket["expected_classification"] else -0.2
182
- message = "Classified. "
183
-
 
 
 
 
184
  elif a_type == "assign_priority":
185
- pri = payload.get("priority")
186
  current_ticket["priority"] = pri
187
- reward_val += 0.2 if pri == current_ticket["expected_priority"] else -0.2
188
- message = "Priority set. "
189
-
 
 
 
 
 
 
 
190
  elif a_type == "generate_response":
191
  resp = payload.get("response", "")
192
  current_ticket["response"] = resp
193
- has_empathy = any(w in resp.lower() for w in ["sorry", "apologize", "understand", "help"])
194
- if current_ticket["sentiment"] == "angry" and not has_empathy:
195
- reward_val -= 0.1
196
- reward_val += 0.2 if resp.strip() else -0.2
197
- message = "Response drafted. "
198
-
 
 
 
 
 
 
 
 
 
 
199
  elif a_type == "resolve":
200
- # Check completion
201
- if current_ticket.get("classification") and current_ticket.get("priority") and current_ticket.get("response"):
 
 
 
202
  reward_val += 0.4
203
- message = "Ticket Resolved!"
204
  current_ticket["status"] = "closed"
205
  self.resolved_count += 1
206
- # SLA Check
 
207
  if self.current_step > current_ticket["sla_steps"]:
208
- reward_val -= 0.3
209
- message += " (SLA Breached)"
210
  else:
 
211
  reward_val -= 0.2
212
- message = "Resolution attempted without full data."
213
-
214
- # Move to next ticket
215
  self.queue.pop(0)
216
  self.current_step = 0
217
  self.actions_taken = set()
218
  if not self.queue:
219
  is_terminal = True
220
-
221
  elif a_type == "escalate":
222
- reward_val += 0.2 if current_ticket["sentiment"] == "angry" else -0.2
223
- message = "Escalated to Manager."
224
- # Escalation closes current and moves on
 
 
 
225
  self.queue.pop(0)
226
  self.current_step = 0
 
227
  if not self.queue:
228
  is_terminal = True
229
 
230
- # Penalize repeated or excessive steps
 
 
 
 
231
  if a_type in self.actions_taken:
232
  reward_val -= 0.1
 
233
  self.actions_taken.add(a_type)
234
- reward_val -= 0.05 # Smaller per-step cost
235
 
236
- self.total_reward += reward_val
 
237
 
238
- # Step-level Status Logic
239
  status = "success" if reward_val > 0 else "failed" if reward_val < 0 else "neutral"
240
-
241
- # Update History with detailed metadata
242
  self.history.append({
243
  "step_count": len(self.history) + 1,
244
  "action": a_type,
245
  "reward": reward_val,
246
  "status": status,
247
- "message": message
248
  })
249
-
250
  step_info = {
251
  "message": message,
252
  "status": status,
253
- "reward": reward_val
254
  }
255
-
256
  return self.state(), Reward(value=reward_val, is_terminal=is_terminal), is_terminal, step_info
 
1
  import random
 
2
  import copy
3
  from typing import Tuple, List, Dict, Any
4
  from server.models import Action, Observation, Reward
5
  from server.tasks import TASKS
6
 
7
+ # ── Real-world customer support scenarios ─────────────────────────────────────
8
+ # Each scenario covers a distinct category with strong classification signals
9
+ # and realistic urgency cues so agents can learn correct behavior.
10
  SCENARIOS = [
11
+ # REFUND — angry, clear billing error
12
  {
13
+ "ticket_text": "I was charged twice for my annual subscription this month. I have the bank statement to prove it. I want one payment refunded immediately.",
14
  "sentiment": "angry",
15
  "expected_classification": "refund",
16
  "expected_priority": "high",
17
  "sla_steps": 5,
18
+ "context": "Duplicate billing charge. Customer has proof. High urgency.",
19
  },
20
+ # REFUND — neutral, post-cancellation billing
21
  {
22
+ "ticket_text": "I cancelled my subscription 3 days ago but was still billed for next month. I need this refunded please.",
23
  "sentiment": "neutral",
24
+ "expected_classification": "refund",
25
+ "expected_priority": "medium",
26
  "sla_steps": 8,
27
+ "context": "Post-cancellation charge. Polite customer, standard urgency.",
28
+ },
29
+ # TECHNICAL ISSUE — angry, regression crash
30
+ {
31
+ "ticket_text": "The app crashes every single time I open a file larger than 50MB. This has been broken since last week's update — I cannot do my work.",
32
+ "sentiment": "angry",
33
+ "expected_classification": "technical_issue",
34
+ "expected_priority": "high",
35
+ "sla_steps": 6,
36
+ "context": "Regression bug blocking core workflow.",
37
  },
38
+ # TECHNICAL ISSUE — panicked, team outage
39
  {
40
+ "ticket_text": "Our entire development team cannot access the API since this morning. We have a production deployment in 2 hours this is a critical emergency!",
41
  "sentiment": "panicked",
42
+ "expected_classification": "technical_issue",
43
  "expected_priority": "high",
44
  "sla_steps": 3,
45
+ "context": "P0 outage. Production deadline imminent.",
46
  },
47
+ # TECHNICAL ISSUE — neutral, minor UI bug
48
  {
49
+ "ticket_text": "The dark mode setting doesn't save when I refresh the page. It reverts to light mode every time. Minor issue but a bit annoying.",
50
+ "sentiment": "neutral",
51
  "expected_classification": "technical_issue",
52
+ "expected_priority": "low",
53
+ "sla_steps": 10,
54
+ "context": "Minor UI preference bug. No business impact.",
55
+ },
56
+ # LOGIN ISSUE — panicked, team locked out
57
+ {
58
+ "ticket_text": "I reset my password twice but I still cannot log in. My whole team is locked out and we have a client demo starting in 15 minutes!",
59
+ "sentiment": "panicked",
60
+ "expected_classification": "login_issue",
61
+ "expected_priority": "high",
62
+ "sla_steps": 4,
63
+ "context": "Password reset loop, team locked out. Time critical.",
64
+ },
65
+ # LOGIN ISSUE — neutral, standard password reset
66
+ {
67
+ "ticket_text": "Hi, I forgot my password. Can you help me reset it or send me a recovery link? No rush, just let me know when you can.",
68
+ "sentiment": "neutral",
69
+ "expected_classification": "login_issue",
70
+ "expected_priority": "low",
71
+ "sla_steps": 12,
72
+ "context": "Standard password recovery. No urgency.",
73
  },
74
+ # GENERAL INQUIRY — curious, pricing
75
  {
76
+ "ticket_text": "Do you offer a non-profit discount? We are a registered charity and your standard price is a little high for our annual budget.",
77
  "sentiment": "curious",
78
  "expected_classification": "general_inquiry",
79
  "expected_priority": "low",
80
  "sla_steps": 10,
81
+ "context": "Pricing question. Low urgency.",
82
  },
83
+ # GENERAL INQUIRY — neutral, how-to
84
  {
85
+ "ticket_text": "How do I export all my project data to CSV? I need to share it with a client in a different format.",
86
+ "sentiment": "neutral",
87
+ "expected_classification": "general_inquiry",
88
  "expected_priority": "low",
89
+ "sla_steps": 10,
90
+ "context": "Basic how-to question. No urgency.",
91
+ },
92
+ # SECURITY — concerned, unauthorized login
93
+ {
94
+ "ticket_text": "I received an alert that someone logged into my account from a location I don't recognize. I did not authorize this. Is my account compromised?",
95
+ "sentiment": "concerned",
96
+ "expected_classification": "security",
97
+ "expected_priority": "high",
98
+ "sla_steps": 4,
99
+ "context": "Potential account takeover. Must be high priority.",
100
  },
101
+ # SECURITY — concerned, encryption question
102
  {
103
+ "ticket_text": "After reading about recent data breaches at other SaaS companies, I want to understand what encryption you use to protect my credit card details.",
104
  "sentiment": "concerned",
105
  "expected_classification": "security",
106
  "expected_priority": "medium",
107
  "sla_steps": 7,
108
+ "context": "Security assurance question. No active breach.",
109
+ },
110
+ # FEEDBACK — happy, positive
111
+ {
112
+ "ticket_text": "The new dashboard redesign is fantastic! Generating a report used to take me 10 minutes — now it's instant. Your team did an amazing job!",
113
+ "sentiment": "happy",
114
+ "expected_classification": "feedback",
115
+ "expected_priority": "low",
116
+ "sla_steps": 15,
117
+ "context": "Positive feedback. No action needed urgently.",
118
+ },
119
  ]
120
 
121
+
122
  class CustomerSupportEnv:
123
  def __init__(self):
124
  """Initialize the Enterprise AI Customer Support environment."""
125
  self.queue: List[Dict] = []
126
  self.resolved_count = 0
127
  self.total_reward = 0.0
 
128
  self.current_step = 0
129
+ self.actions_taken: set = set()
130
+ self.history: List[Dict] = []
131
 
132
  def reset(self) -> Observation:
133
+ """Standard OpenEnv API: Initialize a new session with a queue of 3 tickets."""
 
134
  self.queue = [copy.deepcopy(s) for s in random.sample(SCENARIOS, 3)]
135
  self.resolved_count = 0
136
  self.total_reward = 0.0
137
  self.current_step = 0
138
  self.actions_taken = set()
139
  self.history = []
 
140
  return self.state()
141
 
142
  def state(self) -> Observation:
143
  """Standard OpenEnv API: Retrieve the current observation state."""
 
144
  current_info = {
145
+ "queue": [t["ticket_text"][:40] + "..." for t in self.queue],
146
  "resolved": self.resolved_count,
147
  "total_reward": self.total_reward,
148
+ "queue_size": len(self.queue),
149
  }
150
 
151
  if not self.queue:
152
  return Observation(
153
  state={
154
+ "status": "session_complete",
155
  "message": "All tickets in queue processed.",
156
  "total_reward": self.total_reward,
157
  "resolved": self.resolved_count,
158
+ "info": current_info,
159
  },
160
+ info=current_info,
161
  )
162
+
163
  ticket = self.queue[0]
164
  obs_state = {
165
  "ticket_text": ticket["ticket_text"],
166
  "sentiment": ticket["sentiment"],
167
+ "context": ticket.get("context", ""),
168
  "priority": ticket.get("priority"),
169
  "status": ticket.get("status", "open"),
170
  "steps_taken": self.current_step,
 
176
  "total_reward": self.total_reward,
177
  "resolved": self.resolved_count,
178
  "last_step_status": self.history[-1]["status"] if self.history else "neutral",
179
+ "info": current_info,
180
  }
 
181
  return Observation(state=obs_state, info=current_info)
182
 
183
  @property
184
+ def current_state(self) -> Dict:
185
+ """Helper: current ticket state dict for grading."""
186
+ return self.state().state
 
187
 
188
  @property
189
+ def ground_truth(self) -> Dict | None:
190
+ """Helper: expected values for the current ticket."""
191
+ return self.queue[0] if self.queue else None
 
 
192
 
193
  # Static tasks attribute for discovery
194
  tasks = TASKS
195
 
196
  def get_tasks(self) -> List[Dict]:
197
+ """Expose available tasks for OpenEnv discovery."""
198
  return TASKS
199
 
200
  def grade(self, task_id: str, history: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
 
202
  return self.grade_task(task_id, history, ground_truth)
203
 
204
  def grade_task(self, task_id: str, history: List[Dict[str, Any]], ground_truth: Dict[str, Any]) -> float:
205
+ """Grade a specific task execution. Returns float in [0.0, 1.0]."""
206
  from server.grader import score_episode
207
 
 
208
  diff = "EASY"
209
  for t in TASKS:
210
  if t["id"] == task_id:
 
222
  reward_val = 0.0
223
  is_terminal = False
224
  message = ""
225
+
226
  current_ticket = self.queue[0]
227
  a_type = action.action_type
228
  payload = action.payload
229
 
230
+ # ── Action Logic ──────────────────────────────────────────────────────
231
  if a_type == "classify_ticket":
232
+ cat = payload.get("classification", "")
233
  current_ticket["classification"] = cat
234
+ if cat == current_ticket["expected_classification"]:
235
+ reward_val += 0.35
236
+ message = f"✅ Classified correctly as '{cat}'."
237
+ else:
238
+ reward_val -= 0.2
239
+ message = f"❌ Wrong classification '{cat}' (expected: {current_ticket['expected_classification']})."
240
+
241
  elif a_type == "assign_priority":
242
+ pri = payload.get("priority", "")
243
  current_ticket["priority"] = pri
244
+ if pri == current_ticket["expected_priority"]:
245
+ reward_val += 0.25
246
+ message = f"✅ Priority set to '{pri}' correctly."
247
+ elif pri in ("high", "medium", "low"):
248
+ reward_val -= 0.15
249
+ message = f"⚠️ Priority '{pri}' (expected: {current_ticket['expected_priority']})."
250
+ else:
251
+ reward_val -= 0.2
252
+ message = f"❌ Invalid priority value '{pri}'."
253
+
254
  elif a_type == "generate_response":
255
  resp = payload.get("response", "")
256
  current_ticket["response"] = resp
257
+ if not resp.strip():
258
+ reward_val -= 0.2
259
+ message = "❌ Empty response — no reward."
260
+ else:
261
+ reward_val += 0.2
262
+ # Empathy check for negative sentiment
263
+ if current_ticket["sentiment"] in ("angry", "panicked", "concerned"):
264
+ empathy_words = ["sorry", "apologize", "understand", "concern", "frustrat"]
265
+ if not any(w in resp.lower() for w in empathy_words):
266
+ reward_val -= 0.1
267
+ message = "⚠️ Response drafted but missing empathy for upset customer."
268
+ else:
269
+ message = "✅ Empathetic response drafted."
270
+ else:
271
+ message = "✅ Response drafted."
272
+
273
  elif a_type == "resolve":
274
+ has_classify = bool(current_ticket.get("classification"))
275
+ has_priority = bool(current_ticket.get("priority"))
276
+ has_response = bool(current_ticket.get("response"))
277
+
278
+ if has_classify and has_priority and has_response:
279
  reward_val += 0.4
 
280
  current_ticket["status"] = "closed"
281
  self.resolved_count += 1
282
+ message = "✅ Ticket fully resolved!"
283
+ # SLA penalty
284
  if self.current_step > current_ticket["sla_steps"]:
285
+ reward_val -= 0.25
286
+ message += " ⚠️ SLA breached."
287
  else:
288
+ missing = [k for k, v in [("classification", has_classify), ("priority", has_priority), ("response", has_response)] if not v]
289
  reward_val -= 0.2
290
+ message = f"❌ Cannot resolve — missing: {', '.join(missing)}."
291
+
 
292
  self.queue.pop(0)
293
  self.current_step = 0
294
  self.actions_taken = set()
295
  if not self.queue:
296
  is_terminal = True
297
+
298
  elif a_type == "escalate":
299
+ if current_ticket["sentiment"] in ("angry", "panicked"):
300
+ reward_val += 0.15
301
+ message = "✅ Escalated appropriately for high-urgency customer."
302
+ else:
303
+ reward_val -= 0.15
304
+ message = "⚠️ Escalated a non-urgent ticket — overkill."
305
  self.queue.pop(0)
306
  self.current_step = 0
307
+ self.actions_taken = set()
308
  if not self.queue:
309
  is_terminal = True
310
 
311
+ else:
312
+ reward_val -= 0.1
313
+ message = f"❌ Unknown action type '{a_type}'."
314
+
315
+ # Penalize repeated actions on the same ticket
316
  if a_type in self.actions_taken:
317
  reward_val -= 0.1
318
+ message += " (Repeated action penalty)"
319
  self.actions_taken.add(a_type)
 
320
 
321
+ # Small per-step cost to encourage efficiency
322
+ reward_val -= 0.02
323
 
324
+ self.total_reward += reward_val
325
  status = "success" if reward_val > 0 else "failed" if reward_val < 0 else "neutral"
326
+
 
327
  self.history.append({
328
  "step_count": len(self.history) + 1,
329
  "action": a_type,
330
  "reward": reward_val,
331
  "status": status,
332
+ "message": message,
333
  })
334
+
335
  step_info = {
336
  "message": message,
337
  "status": status,
338
+ "reward": reward_val,
339
  }
340
+
341
  return self.state(), Reward(value=reward_val, is_terminal=is_terminal), is_terminal, step_info
tasks.json CHANGED
@@ -3,110 +3,186 @@
3
  "id": "task_easy_1",
4
  "name": "Ticket Classification",
5
  "difficulty": "EASY",
6
- "objective": "Classify the issue correctly using classify_ticket. Choose the right category from: refund, technical_issue, login_issue, general_inquiry, feedback, security.",
7
- "description": "The agent reads a customer ticket and must call classify_ticket with the correct category. No other action is required. Score = 1.0 if classification matches, 0.0 otherwise.",
 
 
 
 
 
 
 
 
 
8
  "actions_required": ["classify_ticket"],
9
  "scoring": {
10
  "classification_correct": 1.0,
11
  "classification_wrong": 0.0
12
  },
 
13
  "has_grader": true,
14
  "has_evaluator": true,
15
  "grader": true
16
  },
17
  {
18
  "id": "task_easy_2",
19
- "name": "Priority Assignment",
20
  "difficulty": "EASY",
21
- "objective": "Assign the correct priority (low / medium / high) based on customer urgency and sentiment.",
22
- "description": "The agent reads the ticket sentiment and urgency signals, then calls assign_priority. Score = 1.0 for correct priority, 0.0 otherwise.",
 
 
 
 
 
 
 
 
 
23
  "actions_required": ["assign_priority"],
24
  "scoring": {
25
  "priority_correct": 1.0,
26
  "priority_wrong": 0.0
27
  },
 
28
  "has_grader": true,
29
  "has_evaluator": true,
30
  "grader": true
31
  },
32
  {
33
  "id": "task_medium_1",
34
- "name": "Classify and Respond",
35
  "difficulty": "MEDIUM",
36
- "objective": "Correctly classify the ticket AND generate an appropriate response. If the customer is angry, the response must include empathy keywords (sorry, apologize, understand, help, concern).",
37
- "description": "Two-step task: classify (0.5 pts) + empathetic response (0.5 pts). Angry customers without empathy score 0 on the response component.",
 
 
 
 
 
 
 
 
 
38
  "actions_required": ["classify_ticket", "generate_response"],
39
  "scoring": {
40
  "classification_correct": 0.5,
41
- "response_with_empathy": 0.5
42
  },
 
43
  "has_grader": true,
44
  "has_evaluator": true,
45
  "grader": true
46
  },
47
  {
48
  "id": "task_medium_2",
49
- "name": "Professional Resolution",
50
  "difficulty": "MEDIUM",
51
- "objective": "Classify the ticket and draft a professional response containing at least one solution-oriented keyword (help, support, assist, resolve, solution).",
52
- "description": "Two-step task: classify (0.5 pts) + professional keyword response (0.5 pts). Tests the agent's ability to generate actionable, not just sympathetic, responses.",
 
 
 
 
 
 
 
 
 
53
  "actions_required": ["classify_ticket", "generate_response"],
54
  "scoring": {
55
  "classification_correct": 0.5,
56
- "response_professional": 0.5
57
  },
 
58
  "has_grader": true,
59
  "has_evaluator": true,
60
  "grader": true
61
  },
62
  {
63
  "id": "task_hard_1",
64
- "name": "Full Support Workflow",
65
  "difficulty": "HARD",
66
- "objective": "Complete the full 4-step workflow: (1) classify, (2) assign priority, (3) generate empathetic response, (4) resolve the ticket.",
67
- "description": "Each of the 4 steps is worth 0.25. The agent must complete all steps correctly to achieve 1.0. Resolving without classification/priority/response penalizes the resolve step.",
 
 
 
 
 
 
 
 
 
 
 
68
  "actions_required": ["classify_ticket", "assign_priority", "generate_response", "resolve"],
69
  "scoring": {
70
  "classification_correct": 0.25,
71
  "priority_correct": 0.25,
72
- "response_with_empathy": 0.25,
73
- "ticket_resolved": 0.25
74
  },
 
75
  "has_grader": true,
76
  "has_evaluator": true,
77
  "grader": true
78
  },
79
  {
80
  "id": "task_hard_2",
81
- "name": "High-Priority Angry Customer",
82
  "difficulty": "HARD",
83
- "objective": "Identify an angry/panicked customer, set priority to high, write a reassuring empathetic response, and correctly classify the issue.",
84
- "description": "4-component grader: classification (0.25) + high priority (0.25) + empathy in response (0.25) + correct sentiment detection (0.25). Designed for de-escalation training.",
 
 
 
 
 
 
 
 
 
 
85
  "actions_required": ["classify_ticket", "assign_priority", "generate_response"],
86
  "scoring": {
87
  "classification_correct": 0.25,
88
- "priority_high": 0.25,
89
  "response_empathetic": 0.25,
90
- "sentiment_angry_or_panicked": 0.25
91
  },
 
92
  "has_grader": true,
93
  "has_evaluator": true,
94
  "grader": true
95
  },
96
  {
97
  "id": "task_hard_3",
98
- "name": "Efficiency Challenge",
99
  "difficulty": "HARD",
100
- "objective": "Complete the full workflow (classify + priority + respond + resolve) in 4 steps or fewer for a speed bonus. Accuracy on each step still matters.",
101
- "description": "5-component grader: classification (0.2) + priority (0.2) + response (0.2) + resolve (0.2) + efficiency bonus (0.2 for ≤4 steps, 0.1 for ≤6 steps). Max score = 1.0.",
 
 
 
 
 
 
 
 
 
 
 
102
  "actions_required": ["classify_ticket", "assign_priority", "generate_response", "resolve"],
103
  "scoring": {
104
  "classification_correct": 0.2,
105
  "priority_correct": 0.2,
106
- "response_present": 0.2,
107
  "ticket_resolved": 0.2,
108
- "efficiency_bonus": 0.2
 
109
  },
 
110
  "has_grader": true,
111
  "has_evaluator": true,
112
  "grader": true
 
3
  "id": "task_easy_1",
4
  "name": "Ticket Classification",
5
  "difficulty": "EASY",
6
+ "scenario": "A customer writes: 'I was charged twice for my subscription this month. Please refund one payment.' The agent must identify this is a billing/refund issue.",
7
+ "objective": "Call classify_ticket with the correct category. Categories: refund | technical_issue | login_issue | general_inquiry | feedback | security. Score = 1.0 for correct, 0.0 for wrong.",
8
+ "description": "Single-action task. The agent reads the ticket text, identifies the issue type from clear signal words (e.g. 'refund', 'charged', 'can't login', 'data breach'), and calls classify_ticket once. No priority or response needed.",
9
+ "example_input": {
10
+ "ticket_text": "I was charged twice for my subscription. Please refund one payment.",
11
+ "sentiment": "angry"
12
+ },
13
+ "example_action": {
14
+ "action_type": "classify_ticket",
15
+ "payload": {"classification": "refund"}
16
+ },
17
  "actions_required": ["classify_ticket"],
18
  "scoring": {
19
  "classification_correct": 1.0,
20
  "classification_wrong": 0.0
21
  },
22
+ "passing_threshold": 0.5,
23
  "has_grader": true,
24
  "has_evaluator": true,
25
  "grader": true
26
  },
27
  {
28
  "id": "task_easy_2",
29
+ "name": "Priority Triage",
30
  "difficulty": "EASY",
31
+ "scenario": "A panicked user writes: 'I cannot log in and my team demo starts in 5 minutes!' — High urgency requires HIGH priority. A general question like 'How do I export a CSV?' should get LOW priority.",
32
+ "objective": "Call assign_priority with the correct urgency level (low | medium | high). Sentiment and time-pressure signals in the ticket determine priority. Score = 1.0 for correct, 0.0 otherwise.",
33
+ "description": "Single-action triage task. The agent reads urgency signals (keywords like 'ASAP', 'urgent', 'presentation', crash reports) and maps them to correct priority. HIGH = emergency/angry/time-sensitive. MEDIUM = frustrated/recurring. LOW = curious/happy/general.",
34
+ "example_input": {
35
+ "ticket_text": "I can't log in and my client call starts in 5 minutes!",
36
+ "sentiment": "panicked"
37
+ },
38
+ "example_action": {
39
+ "action_type": "assign_priority",
40
+ "payload": {"priority": "high"}
41
+ },
42
  "actions_required": ["assign_priority"],
43
  "scoring": {
44
  "priority_correct": 1.0,
45
  "priority_wrong": 0.0
46
  },
47
+ "passing_threshold": 0.5,
48
  "has_grader": true,
49
  "has_evaluator": true,
50
  "grader": true
51
  },
52
  {
53
  "id": "task_medium_1",
54
+ "name": "Classify + Empathetic Reply",
55
  "difficulty": "MEDIUM",
56
+ "scenario": "An angry customer is frustrated their refund has not arrived after 10 days. The agent must (1) correctly classify as 'refund' and (2) write a response that acknowledges their frustration using empathy words like 'sorry', 'apologize', or 'understand'.",
57
+ "objective": "Two actions in sequence: classify_ticket correctly (0.5 pts) + generate_response containing at least one empathy keyword for angry customers (0.5 pts). Missing empathy for an angry customer scores 0 on the response component.",
58
+ "description": "Real-world de-escalation task. An angry customer needs both accurate issue categorization AND a tone-appropriate response. The grader checks: (a) classification matches expected_classification, (b) for angry/panicked sentiment, response must contain empathy words [sorry, apologize, understand, help, concern].",
59
+ "example_input": {
60
+ "ticket_text": "My refund was supposed to arrive 10 days ago. This is completely unacceptable!",
61
+ "sentiment": "angry"
62
+ },
63
+ "example_actions": [
64
+ {"action_type": "classify_ticket", "payload": {"classification": "refund"}},
65
+ {"action_type": "generate_response", "payload": {"response": "I sincerely apologize for the delay on your refund. I understand how frustrating this must be and I am escalating this to our billing team right now."}}
66
+ ],
67
  "actions_required": ["classify_ticket", "generate_response"],
68
  "scoring": {
69
  "classification_correct": 0.5,
70
+ "response_empathetic_for_angry_customer": 0.5
71
  },
72
+ "passing_threshold": 0.5,
73
  "has_grader": true,
74
  "has_evaluator": true,
75
  "grader": true
76
  },
77
  {
78
  "id": "task_medium_2",
79
+ "name": "Classify + Actionable Resolution",
80
  "difficulty": "MEDIUM",
81
+ "scenario": "A user reports a technical bug: 'The app crashes every time I try to export a PDF.' The agent must (1) classify as 'technical_issue' and (2) provide an actionable response that guides the user toward a solution (using keywords like 'help', 'support', 'resolve', 'fix', 'solution', 'assist').",
82
+ "objective": "Two actions: classify_ticket correctly (0.5 pts) + generate_response with at least one solution-oriented keyword (0.5 pts). Tests that the agent provides helpful guidance, not just sympathy.",
83
+ "description": "Actionable response task. Unlike task_medium_1 which checks for empathy, this task checks for solution orientation. The agent must show they can guide users toward resolution, not just acknowledge feelings. Keywords checked: [help, support, assist, resolve, solution, fix, guide, step, instructions].",
84
+ "example_input": {
85
+ "ticket_text": "The app crashes when I try to export PDF. This is blocking my work.",
86
+ "sentiment": "frustrated"
87
+ },
88
+ "example_actions": [
89
+ {"action_type": "classify_ticket", "payload": {"classification": "technical_issue"}},
90
+ {"action_type": "generate_response", "payload": {"response": "I understand the inconvenience. Please try clearing your app cache and updating to v2.3.1. If the issue persists, our support team will assist you directly with a fix."}}
91
+ ],
92
  "actions_required": ["classify_ticket", "generate_response"],
93
  "scoring": {
94
  "classification_correct": 0.5,
95
+ "response_solution_oriented": 0.5
96
  },
97
+ "passing_threshold": 0.5,
98
  "has_grader": true,
99
  "has_evaluator": true,
100
  "grader": true
101
  },
102
  {
103
  "id": "task_hard_1",
104
+ "name": "Full Ticket Lifecycle",
105
  "difficulty": "HARD",
106
+ "scenario": "A customer reports they cannot access their account after changing their password. The full workflow must be completed: classify the issue, set the right priority, write an empathetic response that offers next steps, and then close the ticket.",
107
+ "objective": "Complete all 4 lifecycle steps correctly. Each step earns 0.25: (1) classify_ticket correct, (2) assign_priority correct, (3) generate_response with empathy/solution keywords, (4) resolve (ticket must have classification + priority + response before resolving).",
108
+ "description": "End-to-end lifecycle task. This mirrors a real support agent's complete workflow. The grader is strict: resolve only scores 0.25 if the ticket also has classification, priority, and response set. This prevents agents from skipping steps and jumping straight to resolve.",
109
+ "example_input": {
110
+ "ticket_text": "I reset my password but still cannot log in. My entire team is locked out!",
111
+ "sentiment": "panicked"
112
+ },
113
+ "example_actions": [
114
+ {"action_type": "classify_ticket", "payload": {"classification": "login_issue"}},
115
+ {"action_type": "assign_priority", "payload": {"priority": "high"}},
116
+ {"action_type": "generate_response", "payload": {"response": "I am so sorry you're locked out. I understand how urgent this is. I am escalating this to our account team immediately — you should be back in within 10 minutes. Please try the 'Forgot Password' link in the meantime."}},
117
+ {"action_type": "resolve", "payload": {}}
118
+ ],
119
  "actions_required": ["classify_ticket", "assign_priority", "generate_response", "resolve"],
120
  "scoring": {
121
  "classification_correct": 0.25,
122
  "priority_correct": 0.25,
123
+ "response_empathetic_and_actionable": 0.25,
124
+ "ticket_properly_resolved": 0.25
125
  },
126
+ "passing_threshold": 0.5,
127
  "has_grader": true,
128
  "has_evaluator": true,
129
  "grader": true
130
  },
131
  {
132
  "id": "task_hard_2",
133
+ "name": "Angry Customer De-escalation",
134
  "difficulty": "HARD",
135
+ "scenario": "A furious customer threatens to cancel their subscription after being billed incorrectly three months in a row. The agent must correctly classify as 'refund', set priority to 'high' (angry + financial dispute), write an empathetic response addressing their anger, and the ticket must come from an angry/panicked sentiment.",
136
+ "objective": "4-component score: (1) correct classification (0.25), (2) priority set to 'high' (0.25) — any other priority scores 0, (3) response contains empathy keywords (0.25), (4) ticket sentiment is 'angry' or 'panicked' (0.25) — validates that the agent correctly identifies escalation scenarios.",
137
+ "description": "De-escalation specialization task. Real customer support teams have agents who specialize in handling angry customers. This task trains that skill: the agent must recognize the escalation signals, prioritize correctly, AND respond with appropriate emotional intelligence. Assigning low/medium priority to an angry billing complaint is a failure.",
138
+ "example_input": {
139
+ "ticket_text": "I've been billed incorrectly for 3 months! I want a full refund and I'm cancelling everything if this isn't fixed TODAY.",
140
+ "sentiment": "angry"
141
+ },
142
+ "example_actions": [
143
+ {"action_type": "classify_ticket", "payload": {"classification": "refund"}},
144
+ {"action_type": "assign_priority", "payload": {"priority": "high"}},
145
+ {"action_type": "generate_response", "payload": {"response": "I sincerely apologize for this ongoing billing error — this is completely unacceptable and I understand your frustration. I am immediately processing a full 3-month refund and flagging your account to prevent future errors. A senior account manager will call you within the hour."}}
146
+ ],
147
  "actions_required": ["classify_ticket", "assign_priority", "generate_response"],
148
  "scoring": {
149
  "classification_correct": 0.25,
150
+ "priority_must_be_high": 0.25,
151
  "response_empathetic": 0.25,
152
+ "sentiment_is_angry_or_panicked": 0.25
153
  },
154
+ "passing_threshold": 0.5,
155
  "has_grader": true,
156
  "has_evaluator": true,
157
  "grader": true
158
  },
159
  {
160
  "id": "task_hard_3",
161
+ "name": "SLA Speed Challenge",
162
  "difficulty": "HARD",
163
+ "scenario": "A high-SLA enterprise ticket has arrived — the customer's entire team is blocked and the contract mandates resolution within 5 actions. The agent must complete the full workflow (classify + priority + respond + resolve) accurately AND efficiently. Every extra action wastes SLA budget.",
164
+ "objective": "5-component score: classification (0.2) + priority (0.2) + response present (0.2) + ticket resolved (0.2) + efficiency bonus: 0.2 for ≤4 steps, 0.1 for ≤6 steps, 0.0 for >6 steps. Maximum achievable score = 1.0.",
165
+ "description": "Speed + accuracy combined task. A perfect agent scores 1.0 by doing exactly: classify → priority → respond → resolve (4 steps = maximum efficiency bonus). Extra actions (repeating classify, unnecessary escalations) drain the efficiency score. This tests an agent's ability to plan ahead, not just react to each observation.",
166
+ "example_input": {
167
+ "ticket_text": "Our entire development team cannot access the API. We have a production deployment in 2 hours.",
168
+ "sentiment": "panicked"
169
+ },
170
+ "example_actions": [
171
+ {"action_type": "classify_ticket", "payload": {"classification": "technical_issue"}},
172
+ {"action_type": "assign_priority", "payload": {"priority": "high"}},
173
+ {"action_type": "generate_response", "payload": {"response": "This is our highest priority. Our on-call engineering team has been paged and will resolve your API access within 30 minutes. We will keep you updated every 10 minutes."}},
174
+ {"action_type": "resolve", "payload": {}}
175
+ ],
176
  "actions_required": ["classify_ticket", "assign_priority", "generate_response", "resolve"],
177
  "scoring": {
178
  "classification_correct": 0.2,
179
  "priority_correct": 0.2,
180
+ "response_present_and_meaningful": 0.2,
181
  "ticket_resolved": 0.2,
182
+ "efficiency_bonus_4_steps": 0.2,
183
+ "efficiency_partial_6_steps": 0.1
184
  },
185
+ "passing_threshold": 0.5,
186
  "has_grader": true,
187
  "has_evaluator": true,
188
  "grader": true