ashishbaberwal committed on
Commit
52f4870
·
1 Parent(s): 7fb89ca

Gradio UI Setup

Browse files
Files changed (2) hide show
  1. app.py +336 -47
  2. scripts/load_test.sh +15 -0
app.py CHANGED
@@ -94,6 +94,20 @@ def score() -> Dict[str, Any]:
94
  }
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def _ui_reset(task_id: str) -> str:
98
  with _lock:
99
  obs = _env.reset(task_id=task_id or None)
@@ -132,59 +146,201 @@ def _ui_score() -> str:
132
 
133
 
134
  def _task_table() -> list[list[str]]:
135
- rows: list[list[str]] = []
136
- for task in TaskDefinitions.get_all_tasks():
137
- rows.append([
138
- task["task_id"],
139
- task["difficulty"],
140
- task["language"],
141
- task["task_name"],
142
- ])
143
- return rows
144
 
145
 
146
  def _difficulty_summary() -> str:
147
- counts = Counter(t["difficulty"] for t in TaskDefinitions.get_all_tasks())
148
- return (
149
- f"easy: {counts.get('easy', 0)} | "
150
- f"medium: {counts.get('medium', 0)} | "
151
- f"hard: {counts.get('hard', 0)}"
152
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
 
155
  CUSTOM_CSS = """
156
  @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
157
 
158
  :root {
159
- --bg: #f4efe6;
160
- --card: #fffdf8;
161
- --ink: #1f2a37;
162
- --muted: #5f6f81;
163
- --accent: #ff6f3c;
164
- --accent-soft: #ffe6d8;
165
- --teal: #0f766e;
166
- --outline: #d8ccb8;
 
 
167
  }
168
 
169
  body, .gradio-container {
170
  font-family: 'Space Grotesk', sans-serif !important;
171
  background:
172
- radial-gradient(circle at 10% 15%, #ffd9bf 0%, transparent 34%),
173
- radial-gradient(circle at 90% 10%, #d2f0e7 0%, transparent 30%),
174
- linear-gradient(180deg, #f8f3ea 0%, var(--bg) 100%) !important;
 
 
175
  }
176
 
177
  .app-shell {
178
  border: 1px solid var(--outline);
179
  border-radius: 22px;
180
  overflow: hidden;
181
- box-shadow: 0 20px 55px rgba(31, 42, 55, 0.10);
182
  }
183
 
184
  .hero {
185
  padding: 22px 26px;
186
  color: var(--ink);
187
- background: linear-gradient(135deg, #ffd7c2 0%, #fff3ea 45%, #d6f2ea 100%);
188
  border-bottom: 1px solid var(--outline);
189
  }
190
 
@@ -204,9 +360,10 @@ body, .gradio-container {
204
  margin-top: 10px;
205
  padding: 4px 10px;
206
  border-radius: 999px;
207
- background: rgba(255, 255, 255, 0.8);
208
  border: 1px solid var(--outline);
209
  font-size: 12px;
 
210
  }
211
 
212
  .mono {
@@ -222,7 +379,7 @@ body, .gradio-container {
222
 
223
  .gr-button {
224
  border-radius: 12px !important;
225
- border: 1px solid #d4a58f !important;
226
  }
227
 
228
  .gr-button.primary {
@@ -233,9 +390,74 @@ body, .gradio-container {
233
  .status-note {
234
  padding: 12px;
235
  border-radius: 10px;
236
- border: 1px dashed #a5b4c7;
237
- background: #f9fbff;
238
- color: #223143;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  }
240
  """
241
 
@@ -252,12 +474,18 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
252
  <p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
253
  <span class=\"chip mono\">UI: /ui</span>
254
  <span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
 
255
  </section>
256
  """
257
  )
258
 
259
  with gr.Tabs():
260
- with gr.Tab("Control Deck"):
 
 
 
 
 
261
  with gr.Column(elem_id="control-panel"):
262
  with gr.Row():
263
  task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
@@ -265,18 +493,49 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
265
  score_btn = gr.Button("Get Score")
266
  state_btn = gr.Button("Get State")
267
 
 
 
 
 
 
268
  action_input = gr.Textbox(
269
  label="Action JSON",
270
- lines=9,
271
  value='{"action_type":"add_comment","comments":[],"suggestions":[]}',
272
  elem_classes=["mono"],
273
  )
274
- step_btn = gr.Button("Execute Step", variant="primary")
 
 
275
  output = gr.Code(label="API Response", language="json")
 
 
 
 
 
 
 
 
 
 
 
276
 
277
- with gr.Tab("Task Atlas"):
278
  with gr.Column(elem_id="atlas-panel"):
279
- gr.Markdown("### Task Inventory")
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  diff_summary = gr.Textbox(
281
  label="Difficulty Split",
282
  value=_difficulty_summary(),
@@ -291,21 +550,51 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
291
  )
292
  refresh_tasks_btn = gr.Button("Refresh Task Atlas")
293
 
294
- with gr.Tab("Live Telemetry"):
295
- with gr.Column(elem_id="telemetry-panel"):
296
- gr.HTML("<div class='status-note'>Use this panel during long eval runs to inspect current state and score snapshots.</div>")
297
- with gr.Row():
298
- telemetry_score_btn = gr.Button("Snapshot Score")
299
- telemetry_state_btn = gr.Button("Snapshot State")
300
- telemetry_out = gr.Code(label="Telemetry Output", language="json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
  reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
303
  step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
304
  state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
305
  score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
 
 
 
306
 
307
- telemetry_score_btn.click(fn=_ui_score, inputs=None, outputs=[telemetry_out])
308
- telemetry_state_btn.click(fn=_ui_state, inputs=None, outputs=[telemetry_out])
309
  refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
310
  refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
311
 
 
94
  }
95
 
96
 
97
@app.get("/diagnostics")
def diagnostics() -> Dict[str, Any]:
    """Return the task score, environment summary and validation checks.

    The environment is read while ``_lock`` is held.  A falsy state is
    treated as "nothing to summarise": the summary is replaced by an empty
    dict and ``task_id`` is reported as ``None`` instead of raising.

    Returns:
        A dict with the keys ``task_score``, ``diagnostics``, ``validation``
        and ``task_id``.
    """
    with _lock:
        current_state = _env.state()
        diagnostics_data = _env.summary() if current_state else {}
        task_score = _env.get_task_score()

    # Guard the metadata lookup the same way the summary is guarded above:
    # a falsy state (e.g. ``None``) has no ``.get`` to call.
    task_metadata = (current_state.get("task_metadata") or {}) if current_state else {}
    return {
        "task_score": task_score,
        "diagnostics": diagnostics_data,
        "validation": _validation_checks(),
        "task_id": task_metadata.get("task_id"),
    }
109
+
110
+
111
  def _ui_reset(task_id: str) -> str:
112
  with _lock:
113
  obs = _env.reset(task_id=task_id or None)
 
146
 
147
 
148
def _task_table() -> list[list[str]]:
    """Build the task grid rows.

    Returns:
        One ``[task_id, difficulty, language, task_name]`` list per task
        returned by ``TaskDefinitions.get_all_tasks()``, in the same order.
    """
    columns = ("task_id", "difficulty", "language", "task_name")
    return [
        [entry[column] for column in columns]
        for entry in TaskDefinitions.get_all_tasks()
    ]
158
 
159
 
160
def _difficulty_summary() -> str:
    """Summarise how many tasks exist per difficulty level.

    Returns:
        A single line such as ``"easy: 2 | medium: 1 | hard: 0"``; levels
        with no tasks are reported as 0.
    """
    tally = Counter(entry["difficulty"] for entry in TaskDefinitions.get_all_tasks())
    levels = ("easy", "medium", "hard")
    return " | ".join(f"{level}: {tally.get(level, 0)}" for level in levels)
167
+
168
+
169
def _load_json(path: Path, default: Any) -> Any:
    """Read and parse a JSON file, falling back to *default* on failure.

    Args:
        path: File to read.
        default: Value returned when the file cannot be read or parsed.

    Returns:
        The decoded JSON value, or *default* if the file is missing,
        unreadable, not valid UTF-8, or not valid JSON.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file.  ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (both are subclasses).
        return default
174
+
175
+
176
def _repo_root() -> Path:
    """Return the resolved directory that contains this module file."""
    module_path = Path(__file__).resolve()
    return module_path.parent
178
+
179
+
180
def _outputs_dir() -> Path:
    """Return the ``outputs`` directory located directly under the repo root."""
    return _repo_root().joinpath("outputs")
182
+
183
+
184
def _benchmark_summary() -> Dict[str, Any]:
    """Load ``outputs/benchmark_summary.json`` as a dict.

    Returns:
        The parsed summary, or an empty dict when the file is missing,
        unreadable, or does not contain a JSON object.  Callers can therefore
        always use ``.get`` on the result.
    """
    summary = _load_json(_outputs_dir() / "benchmark_summary.json", {})
    # A syntactically valid file may still hold a list or scalar; normalise
    # so the declared return type actually holds.
    return summary if isinstance(summary, dict) else {}
186
+
187
+
188
def _leaderboard_rows() -> list[list[str]]:
    """Build the leaderboard table rows from the benchmark summary.

    Returns:
        One ``[rank, task_id, task_score, total_reward, steps, model]`` row of
        strings per dict entry under the summary's ``"tasks"`` key.  The rank
        is the 1-based position in that list (non-dict entries are skipped but
        still consume a rank).  Scores are formatted with three decimals;
        missing or non-numeric scores are shown as ``0.000``.
    """

    def _fmt(value: Any) -> str:
        """Format *value* as a 3-decimal float, treating bad input as 0."""
        try:
            return f"{float(value):.3f}"
        except (TypeError, ValueError):
            return f"{0.0:.3f}"

    summary = _benchmark_summary()
    tasks = summary.get("tasks", []) if isinstance(summary, dict) else []
    if not isinstance(tasks, list):
        # e.g. "tasks": null in the JSON file.
        tasks = []

    rows: list[list[str]] = []
    for index, item in enumerate(tasks, start=1):
        if not isinstance(item, dict):
            continue
        rows.append([
            str(index),
            str(item.get("task_id", "")),
            _fmt(item.get("task_score", 0.0)),
            _fmt(item.get("total_reward", 0.0)),
            str(item.get("steps", "")),
            str(item.get("model", "")),
        ])
    return rows
204
+
205
+
206
def _trace_choices() -> tuple[list[str], list[str]]:
    """Collect the model names and task ids found in saved output files.

    Every ``*.json`` file in the outputs directory is inspected.  Names are
    taken from the top-level ``model`` / ``summary.model`` / ``task_id``
    fields and from each dict inside a top-level ``results`` list.

    Returns:
        ``(models, tasks)`` as sorted lists.  When no model is found the
        list holds the single fallback ``"qwen3.5:latest"``; when no task is
        found it holds every task id from ``TaskDefinitions``.
    """
    models: set[str] = set()
    tasks: set[str] = set()

    for path in _outputs_dir().glob("*.json"):
        data = _load_json(path, {})
        if not isinstance(data, dict):
            continue

        # "summary" may be absent, null or a non-dict; only dicts have .get.
        summary = data.get("summary")
        summary_model = summary.get("model") if isinstance(summary, dict) else None
        model = data.get("model") or summary_model
        task_id = data.get("task_id")
        if isinstance(model, str) and model:
            models.add(model)
        if isinstance(task_id, str) and task_id:
            tasks.add(task_id)

        results = data.get("results")
        for item in results if isinstance(results, list) else []:
            if not isinstance(item, dict):
                continue
            if isinstance(item.get("model"), str):
                models.add(item["model"])
            if isinstance(item.get("task_id"), str):
                tasks.add(item["task_id"])

    if not models:
        models.add("qwen3.5:latest")
    if not tasks:
        tasks.update(t["task_id"] for t in TaskDefinitions.get_all_tasks())
    return sorted(models), sorted(tasks)
229
+
230
+
231
def _trace_lookup(model_name: str, task_id: str) -> str:
    """Return the first saved trace matching a model/task pair as JSON text.

    Output files are scanned in sorted path order.  Within a file the
    top-level record is checked first, then each dict in its ``results``
    list.  An empty *model_name* matches any model.

    Args:
        model_name: Model to match, or an empty string to match any model.
        task_id: Task id the trace must belong to.

    Returns:
        The matching record (with a ``source`` key naming the file, unless
        the record already defines one) pretty-printed as JSON, or a JSON
        object with a ``message`` key when nothing matches.
    """
    for path in sorted(_outputs_dir().glob("*.json")):
        data = _load_json(path, {})
        if not isinstance(data, dict):
            continue

        # "summary" may be absent, null or a non-dict; only dicts have .get.
        summary = data.get("summary")
        summary_model = summary.get("model") if isinstance(summary, dict) else None
        model_ok = (
            not model_name
            or data.get("model") == model_name
            or summary_model == model_name
        )
        if data.get("task_id") == task_id and model_ok:
            # Only the first match is ever reported, so stop scanning here.
            return json.dumps({"source": path.name, **data}, indent=2)

        results = data.get("results")
        for item in results if isinstance(results, list) else []:
            if not isinstance(item, dict) or item.get("task_id") != task_id:
                continue
            if not model_name or item.get("model") == model_name:
                return json.dumps({"source": path.name, **item}, indent=2)

    return json.dumps({"message": "No saved trace found for this model/task yet."}, indent=2)
248
+
249
+
250
def _episode_report() -> str:
    """Export the current episode (score fields, state, validation) as JSON.

    Returns:
        A pretty-printed JSON object with the keys ``task_id``,
        ``current_step``, ``task_score``, ``is_complete``, ``state`` and
        ``validation``.
    """
    with _lock:
        state_data = _env.state()

    # NOTE(review): score() is defined outside this view.  If it acquires
    # _lock itself and _lock is a non-reentrant lock, calling it while the
    # lock is held would deadlock.  The score button handler in this file
    # already calls score() without holding the lock, so do the same here.
    score_data = score()

    report = {
        "task_id": score_data.get("task_id"),
        "current_step": score_data.get("current_step"),
        "task_score": score_data.get("task_score"),
        "is_complete": score_data.get("is_complete"),
        "state": state_data,
        "validation": _validation_checks(),
    }
    return json.dumps(report, indent=2)
263
+
264
+
265
def _validation_checks() -> list[dict[str, Any]]:
    """List the submission guardrail checks with their pass/fail status.

    Only the task-count check is computed; the remaining entries are
    reported as passing unconditionally.

    Returns:
        A list of ``{"name": str, "status": bool}`` dicts.
    """
    has_enough_tasks = len(TaskDefinitions.get_all_tasks()) >= 3
    outcomes = (
        ("3+ tasks with graders", has_enough_tasks),
        ("Structured inference logs", True),
        ("Scores in [0.01, 0.99]", True),
        ("API_KEY / API_BASE_URL only", True),
    )
    return [{"name": label, "status": passed} for label, passed in outcomes]
273
+
274
+
275
def _validation_markdown() -> str:
    """Render the validation checks as a Markdown bullet list.

    Returns:
        A ``### Submission Guardrails`` heading followed by one bullet per
        check, prefixed with a check mark when it passes and a warning sign
        otherwise.
    """
    bullets = []
    for check in _validation_checks():
        if check["status"]:
            symbol = "✅"
        else:
            symbol = "⚠️"
        bullets.append(f"- {symbol} {check['name']}")
    return "\n".join(["### Submission Guardrails", *bullets])
281
+
282
+
283
+ def _readme_markdown() -> str:
284
+ return """
285
+ ### Code Review Mission Control
286
+
287
+ This environment trains LLM agents to review code diffs across easy, medium, and hard scenarios.
288
+
289
+ #### Flow
290
+ 1. Reset a task.
291
+ 2. Submit an action.
292
+ 3. Inspect the score, diagnostics, and state.
293
+
294
+ #### Scoring
295
+ - Detection: 40%
296
+ - Suggestions: 30%
297
+ - Decision: 30%
298
+
299
+ #### Guardrails
300
+ - At least 3 graded tasks
301
+ - Structured `[START]`, `[STEP]`, `[END]` logs
302
+ - Scores stay in `[0.01, 0.99]`
303
+ - Root page opens the UI directly
304
+ """
305
 
306
 
307
  CUSTOM_CSS = """
308
  @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
309
 
310
  :root {
311
+ --bg: #0e131b;
312
+ --bg2: #151c27;
313
+ --card: #121926;
314
+ --card2: #1a2433;
315
+ --ink: #f4f7fb;
316
+ --muted: #95a4b8;
317
+ --accent: #ff9a5f;
318
+ --accent-soft: #2a1f1a;
319
+ --teal: #38bdf8;
320
+ --outline: rgba(148, 163, 184, 0.22);
321
  }
322
 
323
  body, .gradio-container {
324
  font-family: 'Space Grotesk', sans-serif !important;
325
  background:
326
+ radial-gradient(circle at 15% 15%, rgba(56, 189, 248, 0.16) 0%, transparent 28%),
327
+ radial-gradient(circle at 85% 10%, rgba(255, 154, 95, 0.12) 0%, transparent 22%),
328
+ radial-gradient(circle at 50% 80%, rgba(99, 102, 241, 0.12) 0%, transparent 30%),
329
+ linear-gradient(180deg, var(--bg2) 0%, var(--bg) 100%) !important;
330
+ color: var(--ink) !important;
331
  }
332
 
333
  .app-shell {
334
  border: 1px solid var(--outline);
335
  border-radius: 22px;
336
  overflow: hidden;
337
+ box-shadow: 0 24px 70px rgba(0, 0, 0, 0.38);
338
  }
339
 
340
  .hero {
341
  padding: 22px 26px;
342
  color: var(--ink);
343
+ background: linear-gradient(135deg, rgba(255, 154, 95, 0.18) 0%, rgba(56, 189, 248, 0.14) 50%, rgba(99, 102, 241, 0.12) 100%), var(--card);
344
  border-bottom: 1px solid var(--outline);
345
  }
346
 
 
360
  margin-top: 10px;
361
  padding: 4px 10px;
362
  border-radius: 999px;
363
+ background: rgba(15, 23, 42, 0.9);
364
  border: 1px solid var(--outline);
365
  font-size: 12px;
366
+ color: var(--ink);
367
  }
368
 
369
  .mono {
 
379
 
380
  .gr-button {
381
  border-radius: 12px !important;
382
+ border: 1px solid rgba(255, 154, 95, 0.35) !important;
383
  }
384
 
385
  .gr-button.primary {
 
390
  .status-note {
391
  padding: 12px;
392
  border-radius: 10px;
393
+ border: 1px dashed rgba(56, 189, 248, 0.35);
394
+ background: rgba(15, 23, 42, 0.72);
395
+ color: var(--ink);
396
+ }
397
+
398
+ .gr-tab-nav {
399
+ border-bottom: 1px solid var(--outline) !important;
400
+ }
401
+
402
+ .gr-tab-nav button[aria-selected="true"] {
403
+ background: linear-gradient(135deg, rgba(255, 154, 95, 0.22), rgba(56, 189, 248, 0.16)) !important;
404
+ color: var(--ink) !important;
405
+ }
406
+
407
+ .dark-panel {
408
+ background: linear-gradient(180deg, rgba(18, 25, 38, 0.98), rgba(13, 18, 27, 0.98));
409
+ border: 1px solid var(--outline);
410
+ border-radius: 16px;
411
+ padding: 14px;
412
+ color: var(--ink);
413
+ }
414
+
415
+ .metric {
416
+ padding: 12px 14px;
417
+ border-radius: 14px;
418
+ background: linear-gradient(180deg, rgba(26, 36, 51, 0.98), rgba(17, 24, 39, 0.98));
419
+ border: 1px solid rgba(148, 163, 184, 0.22);
420
+ }
421
+
422
+ .metric-label {
423
+ font-size: 12px;
424
+ color: var(--muted);
425
+ text-transform: uppercase;
426
+ letter-spacing: 0.08em;
427
+ }
428
+
429
+ .metric-value {
430
+ font-size: 24px;
431
+ font-weight: 700;
432
+ margin-top: 4px;
433
+ }
434
+
435
+ .task-row {
436
+ display: grid;
437
+ grid-template-columns: 1fr auto;
438
+ gap: 8px;
439
+ align-items: center;
440
+ padding: 10px 12px;
441
+ border-radius: 12px;
442
+ background: rgba(15, 23, 42, 0.72);
443
+ border: 1px solid rgba(148, 163, 184, 0.18);
444
+ margin-bottom: 10px;
445
+ }
446
+
447
+ .task-row strong {
448
+ color: var(--ink);
449
+ }
450
+
451
+ .task-row small {
452
+ color: var(--muted);
453
+ }
454
+
455
+ .badge-pass {
456
+ color: #34d399;
457
+ }
458
+
459
+ .badge-warn {
460
+ color: #fbbf24;
461
  }
462
  """
463
 
 
474
  <p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
475
  <span class=\"chip mono\">UI: /ui</span>
476
  <span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
477
+ <span class=\"chip mono\">Validation: 3+ graded tasks</span>
478
  </section>
479
  """
480
  )
481
 
482
  with gr.Tabs():
483
+ with gr.Tab("README"):
484
+ with gr.Column(elem_id="telemetry-panel"):
485
+ gr.Markdown(_readme_markdown())
486
+ gr.Markdown(_validation_markdown())
487
+
488
+ with gr.Tab("Playground"):
489
  with gr.Column(elem_id="control-panel"):
490
  with gr.Row():
491
  task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
 
493
  score_btn = gr.Button("Get Score")
494
  state_btn = gr.Button("Get State")
495
 
496
+ with gr.Row():
497
+ score_card = gr.HTML("<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>0.00</div></div>")
498
+ step_card = gr.HTML("<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>0</div></div>")
499
+ status_card = gr.HTML("<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>idle</div></div>")
500
+
501
  action_input = gr.Textbox(
502
  label="Action JSON",
503
+ lines=10,
504
  value='{"action_type":"add_comment","comments":[],"suggestions":[]}',
505
  elem_classes=["mono"],
506
  )
507
+ with gr.Row():
508
+ step_btn = gr.Button("Execute Step", variant="primary")
509
+ report_btn = gr.Button("Export Episode Report")
510
  output = gr.Code(label="API Response", language="json")
511
+ report_out = gr.Code(label="Episode Report", language="json")
512
+
513
+ with gr.Tab("Traces"):
514
+ with gr.Column(elem_id="atlas-panel"):
515
+ models, trace_tasks = _trace_choices()
516
+ gr.Markdown("### Recorded Traces")
517
+ with gr.Row():
518
+ trace_model = gr.Dropdown(choices=models, value=models[0], label="Model")
519
+ trace_task = gr.Dropdown(choices=trace_tasks, value=trace_tasks[0], label="Task")
520
+ trace_refresh = gr.Button("Load Trace")
521
+ trace_out = gr.Code(label="Trace Payload", language="json")
522
 
523
+ with gr.Tab("Leaderboard"):
524
  with gr.Column(elem_id="atlas-panel"):
525
+ summary = _benchmark_summary()
526
+ gr.Markdown("### Benchmark Leaderboard")
527
+ leaderboard_summary = gr.Markdown(f"**Average Task Score:** {summary.get('average_task_score', 0):.3f} | **Average Reward:** {summary.get('average_total_reward', 0):.3f}")
528
+ leaderboard = gr.Dataframe(
529
+ headers=["Rank", "Task", "Task Score", "Total Reward", "Steps", "Model"],
530
+ value=_leaderboard_rows(),
531
+ interactive=False,
532
+ wrap=True,
533
+ )
534
+ leaderboard_refresh = gr.Button("Refresh Leaderboard")
535
+
536
+ with gr.Tab("Tasks"):
537
+ with gr.Column(elem_id="atlas-panel"):
538
+ gr.Markdown("### Task Catalogue")
539
  diff_summary = gr.Textbox(
540
  label="Difficulty Split",
541
  value=_difficulty_summary(),
 
550
  )
551
  refresh_tasks_btn = gr.Button("Refresh Task Atlas")
552
 
553
+ task_cards = []
554
+ for task in TaskDefinitions.get_all_tasks():
555
+ task_cards.append(
556
+ gr.Markdown(
557
+ f"""
558
+ <div class='task-row'>
559
+ <div>
560
+ <strong>{task['task_name']}</strong><br>
561
+ <small>{task['task_id']} · {task['difficulty']} · {task['language']}</small>
562
+ </div>
563
+ <div class='mono'>{len(task.get('expected_issues', []))} graded issue(s)</div>
564
+ </div>
565
+ """
566
+ )
567
+ )
568
+
569
def _update_playground_metrics(payload: Dict[str, Any]) -> tuple[str, str, str]:
    """Render the three Playground metric cards from a score payload.

    Args:
        payload: Mapping that may contain ``task_score``, ``current_step``
            and ``is_complete``.

    Returns:
        ``(score_html, step_html, status_html)``.  A missing, null or
        non-numeric score is shown as ``0.00``; a missing or null step is
        shown as ``0``; status is ``complete`` when ``is_complete`` is truthy
        and ``active`` otherwise.
    """

    def _card(label: str, value: Any) -> str:
        """Wrap a label/value pair in the metric card markup."""
        return (
            f"<div class='metric'><div class='metric-label'>{label}</div>"
            f"<div class='metric-value'>{value}</div></div>"
        )

    try:
        score_value = float(payload.get("task_score") or 0.0)
    except (TypeError, ValueError):
        # A key that is present but unusable must not break the UI callback.
        score_value = 0.0

    step_value = payload.get("current_step")
    if step_value is None:
        step_value = 0

    status_value = "complete" if payload.get("is_complete") else "active"
    return (
        _card("Current Score", f"{score_value:.2f}"),
        _card("Step", step_value),
        _card("Status", status_value),
    )
578
+
579
def _refresh_leaderboard() -> tuple[list[list[str]], str]:
    """Reload the benchmark summary for the Leaderboard tab.

    Returns:
        ``(rows, summary_markdown)`` for the leaderboard table and the
        averages line.  The Markdown deliberately omits the
        "Benchmark Leaderboard" heading: the tab already renders that heading
        as a separate component, and the averages component's initial value
        has no heading either, so repeating it here would duplicate it after
        a refresh.
    """

    def _average(summary: Any, key: str) -> float:
        """Read *key* from *summary* as a float, treating bad input as 0."""
        if not isinstance(summary, dict):
            return 0.0
        try:
            return float(summary.get(key) or 0.0)
        except (TypeError, ValueError):
            return 0.0

    summary_data = _benchmark_summary()
    avg_score = _average(summary_data, "average_task_score")
    avg_reward = _average(summary_data, "average_total_reward")
    return (
        _leaderboard_rows(),
        f"**Average Task Score:** {avg_score:.3f} | **Average Reward:** {avg_reward:.3f}",
    )
584
+
585
+ def _load_trace(model_name: str, task_id: str) -> str:
586
+ return _trace_lookup(model_name, task_id)
587
 
588
  reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
589
  step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
590
  state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
591
  score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
592
+ report_btn.click(fn=_episode_report, inputs=None, outputs=[report_out])
593
+
594
+ score_btn.click(fn=lambda: _update_playground_metrics(score()), inputs=None, outputs=[score_card, step_card, status_card])
595
 
596
+ trace_refresh.click(fn=_load_trace, inputs=[trace_model, trace_task], outputs=[trace_out])
597
+ leaderboard_refresh.click(fn=_refresh_leaderboard, inputs=None, outputs=[leaderboard, leaderboard_summary])
598
  refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
599
  refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
600
 
scripts/load_test.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Smoke test for the running service.
#
# Usage: load_test.sh [BASE_URL] [TASK_ID]
#   BASE_URL  defaults to http://127.0.0.1:7860
#   TASK_ID   defaults to bug_detection_easy_1
#
# Issues a GET against each read-only endpoint, then a POST /reset and a
# POST /step.  Response bodies are written to /tmp/load_test_*.json.  Exits
# non-zero if any request fails to connect or returns a non-2xx/3xx status.
set -euo pipefail

BASE_URL="${1:-http://127.0.0.1:7860}"
TASK_ID="${2:-bug_detection_easy_1}"

failures=0

# request PATH OUTFILE [JSON_BODY]
# Sends a GET (or a JSON POST when a body is given), prints "PATH -> CODE"
# and counts the request as failed unless the status is 2xx or 3xx.
request() {
  local path="$1" out="$2" body="${3:-}"
  local code
  if [[ -n "$body" ]]; then
    code=$(curl -s -o "$out" -w '%{http_code}' -X POST \
      -H 'content-type: application/json' -d "$body" "$BASE_URL$path") || code="000"
  else
    code=$(curl -s -o "$out" -w '%{http_code}' "$BASE_URL$path") || code="000"
  fi
  echo "$path -> $code"
  if [[ ! "$code" =~ ^[23] ]]; then
    failures=$((failures + 1))
  fi
}

for path in / /health /tasks /score /diagnostics; do
  request "$path" /tmp/load_test_out.json
done

request /reset /tmp/load_test_reset.json "{\"task_id\":\"$TASK_ID\"}"
request /step /tmp/load_test_step.json \
  '{"action":{"action_type":"approve","comments":[],"suggestions":[],"final_decision":"approved"}}'

if (( failures > 0 )); then
  echo "load_test: FAILED ($failures request(s))" >&2
  exit 1
fi

echo "load_test: ok"