Spaces:
Sleeping
Sleeping
Commit ·
52f4870
1
Parent(s): 7fb89ca
Gradio UI Setup
Browse files- app.py +336 -47
- scripts/load_test.sh +15 -0
app.py
CHANGED
|
@@ -94,6 +94,20 @@ def score() -> Dict[str, Any]:
|
|
| 94 |
}
|
| 95 |
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
def _ui_reset(task_id: str) -> str:
|
| 98 |
with _lock:
|
| 99 |
obs = _env.reset(task_id=task_id or None)
|
|
@@ -132,59 +146,201 @@ def _ui_score() -> str:
|
|
| 132 |
|
| 133 |
|
| 134 |
def _task_table() -> list[list[str]]:
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
|
| 145 |
|
| 146 |
def _difficulty_summary() -> str:
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
CUSTOM_CSS = """
|
| 156 |
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
|
| 157 |
|
| 158 |
:root {
|
| 159 |
-
--bg: #
|
| 160 |
-
--
|
| 161 |
-
--
|
| 162 |
-
--
|
| 163 |
-
--
|
| 164 |
-
--
|
| 165 |
-
--
|
| 166 |
-
--
|
|
|
|
|
|
|
| 167 |
}
|
| 168 |
|
| 169 |
body, .gradio-container {
|
| 170 |
font-family: 'Space Grotesk', sans-serif !important;
|
| 171 |
background:
|
| 172 |
-
radial-gradient(circle at
|
| 173 |
-
radial-gradient(circle at
|
| 174 |
-
|
|
|
|
|
|
|
| 175 |
}
|
| 176 |
|
| 177 |
.app-shell {
|
| 178 |
border: 1px solid var(--outline);
|
| 179 |
border-radius: 22px;
|
| 180 |
overflow: hidden;
|
| 181 |
-
box-shadow: 0
|
| 182 |
}
|
| 183 |
|
| 184 |
.hero {
|
| 185 |
padding: 22px 26px;
|
| 186 |
color: var(--ink);
|
| 187 |
-
background: linear-gradient(135deg,
|
| 188 |
border-bottom: 1px solid var(--outline);
|
| 189 |
}
|
| 190 |
|
|
@@ -204,9 +360,10 @@ body, .gradio-container {
|
|
| 204 |
margin-top: 10px;
|
| 205 |
padding: 4px 10px;
|
| 206 |
border-radius: 999px;
|
| 207 |
-
background: rgba(
|
| 208 |
border: 1px solid var(--outline);
|
| 209 |
font-size: 12px;
|
|
|
|
| 210 |
}
|
| 211 |
|
| 212 |
.mono {
|
|
@@ -222,7 +379,7 @@ body, .gradio-container {
|
|
| 222 |
|
| 223 |
.gr-button {
|
| 224 |
border-radius: 12px !important;
|
| 225 |
-
border: 1px solid
|
| 226 |
}
|
| 227 |
|
| 228 |
.gr-button.primary {
|
|
@@ -233,9 +390,74 @@ body, .gradio-container {
|
|
| 233 |
.status-note {
|
| 234 |
padding: 12px;
|
| 235 |
border-radius: 10px;
|
| 236 |
-
border: 1px dashed
|
| 237 |
-
background:
|
| 238 |
-
color:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
}
|
| 240 |
"""
|
| 241 |
|
|
@@ -252,12 +474,18 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
|
|
| 252 |
<p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
|
| 253 |
<span class=\"chip mono\">UI: /ui</span>
|
| 254 |
<span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
|
|
|
|
| 255 |
</section>
|
| 256 |
"""
|
| 257 |
)
|
| 258 |
|
| 259 |
with gr.Tabs():
|
| 260 |
-
with gr.Tab("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
with gr.Column(elem_id="control-panel"):
|
| 262 |
with gr.Row():
|
| 263 |
task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
|
|
@@ -265,18 +493,49 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
|
|
| 265 |
score_btn = gr.Button("Get Score")
|
| 266 |
state_btn = gr.Button("Get State")
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
action_input = gr.Textbox(
|
| 269 |
label="Action JSON",
|
| 270 |
-
lines=
|
| 271 |
value='{"action_type":"add_comment","comments":[],"suggestions":[]}',
|
| 272 |
elem_classes=["mono"],
|
| 273 |
)
|
| 274 |
-
|
|
|
|
|
|
|
| 275 |
output = gr.Code(label="API Response", language="json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
-
with gr.Tab("
|
| 278 |
with gr.Column(elem_id="atlas-panel"):
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
diff_summary = gr.Textbox(
|
| 281 |
label="Difficulty Split",
|
| 282 |
value=_difficulty_summary(),
|
|
@@ -291,21 +550,51 @@ with gr.Blocks(title="Code Review Agent Environment") as demo:
|
|
| 291 |
)
|
| 292 |
refresh_tasks_btn = gr.Button("Refresh Task Atlas")
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
|
| 303 |
step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
|
| 304 |
state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
|
| 305 |
score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
-
|
| 308 |
-
|
| 309 |
refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
|
| 310 |
refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
|
| 311 |
|
|
|
|
| 94 |
}
|
| 95 |
|
| 96 |
|
| 97 |
+
@app.get("/diagnostics")
def diagnostics() -> Dict[str, Any]:
    """Return a diagnostics snapshot for the current episode.

    Returns:
        A dict with the current task score, the environment summary (empty
        when there is no current state), the static validation checks, and
        the id of the active task (``None`` when it cannot be determined).
    """
    with _lock:
        current_state = _env.state()
        diagnostics_data = _env.summary() if current_state else {}
        task_score = _env.get_task_score()

    # The summary above already treats a falsy state as "no episode"; apply
    # the same guard here so a missing state yields task_id=None instead of
    # raising AttributeError.
    task_metadata = (current_state or {}).get("task_metadata") or {}
    return {
        "task_score": task_score,
        "diagnostics": diagnostics_data,
        "validation": _validation_checks(),
        "task_id": task_metadata.get("task_id"),
    }
|
| 109 |
+
|
| 110 |
+
|
| 111 |
def _ui_reset(task_id: str) -> str:
|
| 112 |
with _lock:
|
| 113 |
obs = _env.reset(task_id=task_id or None)
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
def _task_table() -> list[list[str]]:
    """Return one ``[task_id, difficulty, language, task_name]`` row per task."""
    columns = ("task_id", "difficulty", "language", "task_name")
    return [
        [entry[column] for column in columns]
        for entry in TaskDefinitions.get_all_tasks()
    ]
|
| 158 |
|
| 159 |
|
| 160 |
def _difficulty_summary() -> str:
    """Summarise how many tasks exist per difficulty level.

    Returns:
        A single line such as ``"easy: 2 | medium: 1 | hard: 0"``.
    """
    tally = Counter(
        entry["difficulty"] for entry in TaskDefinitions.get_all_tasks()
    )
    parts = [f"{level}: {tally.get(level, 0)}" for level in ("easy", "medium", "hard")]
    return " | ".join(parts)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def _load_json(path: Path, default: Any) -> Any:
|
| 170 |
+
try:
|
| 171 |
+
return json.loads(path.read_text())
|
| 172 |
+
except Exception:
|
| 173 |
+
return default
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def _repo_root() -> Path:
|
| 177 |
+
return Path(__file__).resolve().parent
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def _outputs_dir() -> Path:
    """Return the ``outputs`` directory located under the repository root."""
    root = _repo_root()
    return root / "outputs"
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _benchmark_summary() -> Dict[str, Any]:
    """Load ``outputs/benchmark_summary.json``; an empty dict if unavailable."""
    summary_path = _outputs_dir() / "benchmark_summary.json"
    return _load_json(summary_path, {})
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _leaderboard_rows() -> list[list[str]]:
    """Build the leaderboard table rows from the saved benchmark summary.

    Returns:
        One row per dict entry under the summary's ``"tasks"`` list, shaped
        ``[rank, task_id, task_score, total_reward, steps, model]`` with every
        cell rendered as a string. Non-dict entries are skipped and ranks
        number only the rows actually emitted. An empty list is returned when
        the summary has no usable ``"tasks"`` list.
    """

    def _metric(value: Any) -> str:
        # Saved summaries may contain null or non-numeric values; render
        # 0.000 for those instead of failing the whole table.
        try:
            return f"{float(value):.3f}"
        except (TypeError, ValueError):
            return f"{0.0:.3f}"

    summary = _benchmark_summary()
    tasks = summary.get("tasks") if isinstance(summary, dict) else None
    if not isinstance(tasks, list):
        return []

    rows: list[list[str]] = []
    for item in tasks:
        if not isinstance(item, dict):
            continue
        rows.append([
            str(len(rows) + 1),  # rank follows emitted rows, so no gaps
            str(item.get("task_id", "")),
            _metric(item.get("task_score", 0.0)),
            _metric(item.get("total_reward", 0.0)),
            str(item.get("steps", "")),
            str(item.get("model", "")),
        ])
    return rows
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _trace_choices() -> tuple[list[str], list[str]]:
    """Collect the model names and task ids found in saved output files.

    Scans every ``*.json`` file in the outputs directory, reading the
    top-level ``model`` / ``summary.model`` / ``task_id`` fields and the same
    fields on each dict inside a top-level ``results`` list.

    Returns:
        ``(models, tasks)`` as sorted lists. When no model is found the list
        falls back to ``["qwen3.5:latest"]``; when no task is found it falls
        back to every task id known to ``TaskDefinitions``.
    """
    models: set[str] = set()
    tasks: set[str] = set()

    def _collect(model: Any, task_id: Any) -> None:
        # Only non-empty strings are usable as dropdown choices.
        if isinstance(model, str) and model:
            models.add(model)
        if isinstance(task_id, str) and task_id:
            tasks.add(task_id)

    for path in _outputs_dir().glob("*.json"):
        data = _load_json(path, {})
        if not isinstance(data, dict):
            continue

        # "summary" may be absent, null, or a non-object; only a dict can
        # carry a model name.
        summary = data.get("summary")
        summary_model = summary.get("model") if isinstance(summary, dict) else None
        _collect(data.get("model") or summary_model, data.get("task_id"))

        results = data.get("results")
        if isinstance(results, list):
            for item in results:
                if isinstance(item, dict):
                    _collect(item.get("model"), item.get("task_id"))

    if not models:
        models.add("qwen3.5:latest")
    if not tasks:
        tasks.update(t["task_id"] for t in TaskDefinitions.get_all_tasks())
    return sorted(models), sorted(tasks)
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _trace_lookup(model_name: str, task_id: str) -> str:
    """Return the first saved trace matching a model/task pair as JSON text.

    Output files are scanned in sorted path order. Within a file, the
    top-level record is checked before the entries of its ``results`` list.

    Args:
        model_name: Model to match; an empty value matches any model.
        task_id: Task id the trace must have.

    Returns:
        The matching record, with an added ``"source"`` file name, dumped as
        indented JSON, or a JSON ``{"message": ...}`` object when nothing
        matches.
    """
    for path in sorted(_outputs_dir().glob("*.json")):
        data = _load_json(path, {})
        if not isinstance(data, dict):
            continue

        # "summary" may be absent, null, or a non-object; guard before .get().
        summary = data.get("summary")
        summary_model = summary.get("model") if isinstance(summary, dict) else None
        model_ok = (
            not model_name
            or data.get("model") == model_name
            or summary_model == model_name
        )
        if data.get("task_id") == task_id and model_ok:
            # Only the first match is ever shown, so stop scanning here.
            return json.dumps({"source": path.name, **data}, indent=2)

        results = data.get("results")
        if not isinstance(results, list):
            continue
        for item in results:
            if not isinstance(item, dict) or item.get("task_id") != task_id:
                continue
            if not model_name or item.get("model") == model_name:
                return json.dumps({"source": path.name, **item}, indent=2)

    return json.dumps({"message": "No saved trace found for this model/task yet."}, indent=2)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def _episode_report() -> str:
    """Export the current episode as an indented JSON report.

    Returns:
        JSON text with ``task_id``, ``current_step``, ``task_score`` and
        ``is_complete`` copied from ``score()``, the raw environment state
        under ``state``, and the static validation checks under
        ``validation``.
    """
    with _lock:
        state_data = _env.state()
    # NOTE(review): score() is called after releasing _lock. If score()
    # acquires the same lock and it is not reentrant, calling it while
    # holding the lock would deadlock — confirm the lock type and whether an
    # atomic state+score snapshot is required.
    score_data = score()

    report = {
        field: score_data.get(field)
        for field in ("task_id", "current_step", "task_score", "is_complete")
    }
    report["state"] = state_data
    report["validation"] = _validation_checks()
    return json.dumps(report, indent=2)
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def _validation_checks() -> list[dict[str, Any]]:
    """Return the submission guardrail checks as ``{"name", "status"}`` dicts.

    Only the task-count check is computed; the remaining checks are reported
    as passing unconditionally.
    """
    enough_tasks = len(TaskDefinitions.get_all_tasks()) >= 3
    statuses = (
        ("3+ tasks with graders", enough_tasks),
        ("Structured inference logs", True),
        ("Scores in [0.01, 0.99]", True),
        ("API_KEY / API_BASE_URL only", True),
    )
    return [{"name": label, "status": passed} for label, passed in statuses]
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _validation_markdown() -> str:
    """Render the validation checks as a Markdown bullet list with a heading."""
    bullets = [
        f"- {'✅' if check['status'] else '⚠️'} {check['name']}"
        for check in _validation_checks()
    ]
    return "\n".join(["### Submission Guardrails", *bullets])
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def _readme_markdown() -> str:
|
| 284 |
+
return """
|
| 285 |
+
### Code Review Mission Control
|
| 286 |
+
|
| 287 |
+
This environment trains LLM agents to review code diffs across easy, medium, and hard scenarios.
|
| 288 |
+
|
| 289 |
+
#### Flow
|
| 290 |
+
1. Reset a task.
|
| 291 |
+
2. Submit an action.
|
| 292 |
+
3. Inspect the score, diagnostics, and state.
|
| 293 |
+
|
| 294 |
+
#### Scoring
|
| 295 |
+
- Detection: 40%
|
| 296 |
+
- Suggestions: 30%
|
| 297 |
+
- Decision: 30%
|
| 298 |
+
|
| 299 |
+
#### Guardrails
|
| 300 |
+
- At least 3 graded tasks
|
| 301 |
+
- Structured `[START]`, `[STEP]`, `[END]` logs
|
| 302 |
+
- Scores stay in `[0.01, 0.99]`
|
| 303 |
+
- Root page opens the UI directly
|
| 304 |
+
"""
|
| 305 |
|
| 306 |
|
| 307 |
CUSTOM_CSS = """
|
| 308 |
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
|
| 309 |
|
| 310 |
:root {
|
| 311 |
+
--bg: #0e131b;
|
| 312 |
+
--bg2: #151c27;
|
| 313 |
+
--card: #121926;
|
| 314 |
+
--card2: #1a2433;
|
| 315 |
+
--ink: #f4f7fb;
|
| 316 |
+
--muted: #95a4b8;
|
| 317 |
+
--accent: #ff9a5f;
|
| 318 |
+
--accent-soft: #2a1f1a;
|
| 319 |
+
--teal: #38bdf8;
|
| 320 |
+
--outline: rgba(148, 163, 184, 0.22);
|
| 321 |
}
|
| 322 |
|
| 323 |
body, .gradio-container {
|
| 324 |
font-family: 'Space Grotesk', sans-serif !important;
|
| 325 |
background:
|
| 326 |
+
radial-gradient(circle at 15% 15%, rgba(56, 189, 248, 0.16) 0%, transparent 28%),
|
| 327 |
+
radial-gradient(circle at 85% 10%, rgba(255, 154, 95, 0.12) 0%, transparent 22%),
|
| 328 |
+
radial-gradient(circle at 50% 80%, rgba(99, 102, 241, 0.12) 0%, transparent 30%),
|
| 329 |
+
linear-gradient(180deg, var(--bg2) 0%, var(--bg) 100%) !important;
|
| 330 |
+
color: var(--ink) !important;
|
| 331 |
}
|
| 332 |
|
| 333 |
.app-shell {
|
| 334 |
border: 1px solid var(--outline);
|
| 335 |
border-radius: 22px;
|
| 336 |
overflow: hidden;
|
| 337 |
+
box-shadow: 0 24px 70px rgba(0, 0, 0, 0.38);
|
| 338 |
}
|
| 339 |
|
| 340 |
.hero {
|
| 341 |
padding: 22px 26px;
|
| 342 |
color: var(--ink);
|
| 343 |
+
background: linear-gradient(135deg, rgba(255, 154, 95, 0.18) 0%, rgba(56, 189, 248, 0.14) 50%, rgba(99, 102, 241, 0.12) 100%), var(--card);
|
| 344 |
border-bottom: 1px solid var(--outline);
|
| 345 |
}
|
| 346 |
|
|
|
|
| 360 |
margin-top: 10px;
|
| 361 |
padding: 4px 10px;
|
| 362 |
border-radius: 999px;
|
| 363 |
+
background: rgba(15, 23, 42, 0.9);
|
| 364 |
border: 1px solid var(--outline);
|
| 365 |
font-size: 12px;
|
| 366 |
+
color: var(--ink);
|
| 367 |
}
|
| 368 |
|
| 369 |
.mono {
|
|
|
|
| 379 |
|
| 380 |
.gr-button {
|
| 381 |
border-radius: 12px !important;
|
| 382 |
+
border: 1px solid rgba(255, 154, 95, 0.35) !important;
|
| 383 |
}
|
| 384 |
|
| 385 |
.gr-button.primary {
|
|
|
|
| 390 |
.status-note {
|
| 391 |
padding: 12px;
|
| 392 |
border-radius: 10px;
|
| 393 |
+
border: 1px dashed rgba(56, 189, 248, 0.35);
|
| 394 |
+
background: rgba(15, 23, 42, 0.72);
|
| 395 |
+
color: var(--ink);
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
.gr-tab-nav {
|
| 399 |
+
border-bottom: 1px solid var(--outline) !important;
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
.gr-tab-nav button[aria-selected="true"] {
|
| 403 |
+
background: linear-gradient(135deg, rgba(255, 154, 95, 0.22), rgba(56, 189, 248, 0.16)) !important;
|
| 404 |
+
color: var(--ink) !important;
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
.dark-panel {
|
| 408 |
+
background: linear-gradient(180deg, rgba(18, 25, 38, 0.98), rgba(13, 18, 27, 0.98));
|
| 409 |
+
border: 1px solid var(--outline);
|
| 410 |
+
border-radius: 16px;
|
| 411 |
+
padding: 14px;
|
| 412 |
+
color: var(--ink);
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
.metric {
|
| 416 |
+
padding: 12px 14px;
|
| 417 |
+
border-radius: 14px;
|
| 418 |
+
background: linear-gradient(180deg, rgba(26, 36, 51, 0.98), rgba(17, 24, 39, 0.98));
|
| 419 |
+
border: 1px solid rgba(148, 163, 184, 0.22);
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
.metric-label {
|
| 423 |
+
font-size: 12px;
|
| 424 |
+
color: var(--muted);
|
| 425 |
+
text-transform: uppercase;
|
| 426 |
+
letter-spacing: 0.08em;
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
.metric-value {
|
| 430 |
+
font-size: 24px;
|
| 431 |
+
font-weight: 700;
|
| 432 |
+
margin-top: 4px;
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
.task-row {
|
| 436 |
+
display: grid;
|
| 437 |
+
grid-template-columns: 1fr auto;
|
| 438 |
+
gap: 8px;
|
| 439 |
+
align-items: center;
|
| 440 |
+
padding: 10px 12px;
|
| 441 |
+
border-radius: 12px;
|
| 442 |
+
background: rgba(15, 23, 42, 0.72);
|
| 443 |
+
border: 1px solid rgba(148, 163, 184, 0.18);
|
| 444 |
+
margin-bottom: 10px;
|
| 445 |
+
}
|
| 446 |
+
|
| 447 |
+
.task-row strong {
|
| 448 |
+
color: var(--ink);
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
.task-row small {
|
| 452 |
+
color: var(--muted);
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
.badge-pass {
|
| 456 |
+
color: #34d399;
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
.badge-warn {
|
| 460 |
+
color: #fbbf24;
|
| 461 |
}
|
| 462 |
"""
|
| 463 |
|
|
|
|
| 474 |
<p>High-clarity operator UI for environment resets, action stepping, and live scoring telemetry.</p>
|
| 475 |
<span class=\"chip mono\">UI: /ui</span>
|
| 476 |
<span class=\"chip mono\">API: /reset /step /state /score /tasks</span>
|
| 477 |
+
<span class=\"chip mono\">Validation: 3+ graded tasks</span>
|
| 478 |
</section>
|
| 479 |
"""
|
| 480 |
)
|
| 481 |
|
| 482 |
with gr.Tabs():
|
| 483 |
+
with gr.Tab("README"):
|
| 484 |
+
with gr.Column(elem_id="telemetry-panel"):
|
| 485 |
+
gr.Markdown(_readme_markdown())
|
| 486 |
+
gr.Markdown(_validation_markdown())
|
| 487 |
+
|
| 488 |
+
with gr.Tab("Playground"):
|
| 489 |
with gr.Column(elem_id="control-panel"):
|
| 490 |
with gr.Row():
|
| 491 |
task_id_input = gr.Dropdown(choices=task_choices, value=task_choices[0], label="Task ID")
|
|
|
|
| 493 |
score_btn = gr.Button("Get Score")
|
| 494 |
state_btn = gr.Button("Get State")
|
| 495 |
|
| 496 |
+
with gr.Row():
|
| 497 |
+
score_card = gr.HTML("<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>0.00</div></div>")
|
| 498 |
+
step_card = gr.HTML("<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>0</div></div>")
|
| 499 |
+
status_card = gr.HTML("<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>idle</div></div>")
|
| 500 |
+
|
| 501 |
action_input = gr.Textbox(
|
| 502 |
label="Action JSON",
|
| 503 |
+
lines=10,
|
| 504 |
value='{"action_type":"add_comment","comments":[],"suggestions":[]}',
|
| 505 |
elem_classes=["mono"],
|
| 506 |
)
|
| 507 |
+
with gr.Row():
|
| 508 |
+
step_btn = gr.Button("Execute Step", variant="primary")
|
| 509 |
+
report_btn = gr.Button("Export Episode Report")
|
| 510 |
output = gr.Code(label="API Response", language="json")
|
| 511 |
+
report_out = gr.Code(label="Episode Report", language="json")
|
| 512 |
+
|
| 513 |
+
with gr.Tab("Traces"):
|
| 514 |
+
with gr.Column(elem_id="atlas-panel"):
|
| 515 |
+
models, trace_tasks = _trace_choices()
|
| 516 |
+
gr.Markdown("### Recorded Traces")
|
| 517 |
+
with gr.Row():
|
| 518 |
+
trace_model = gr.Dropdown(choices=models, value=models[0], label="Model")
|
| 519 |
+
trace_task = gr.Dropdown(choices=trace_tasks, value=trace_tasks[0], label="Task")
|
| 520 |
+
trace_refresh = gr.Button("Load Trace")
|
| 521 |
+
trace_out = gr.Code(label="Trace Payload", language="json")
|
| 522 |
|
| 523 |
+
with gr.Tab("Leaderboard"):
|
| 524 |
with gr.Column(elem_id="atlas-panel"):
|
| 525 |
+
summary = _benchmark_summary()
|
| 526 |
+
gr.Markdown("### Benchmark Leaderboard")
|
| 527 |
+
leaderboard_summary = gr.Markdown(f"**Average Task Score:** {summary.get('average_task_score', 0):.3f} | **Average Reward:** {summary.get('average_total_reward', 0):.3f}")
|
| 528 |
+
leaderboard = gr.Dataframe(
|
| 529 |
+
headers=["Rank", "Task", "Task Score", "Total Reward", "Steps", "Model"],
|
| 530 |
+
value=_leaderboard_rows(),
|
| 531 |
+
interactive=False,
|
| 532 |
+
wrap=True,
|
| 533 |
+
)
|
| 534 |
+
leaderboard_refresh = gr.Button("Refresh Leaderboard")
|
| 535 |
+
|
| 536 |
+
with gr.Tab("Tasks"):
|
| 537 |
+
with gr.Column(elem_id="atlas-panel"):
|
| 538 |
+
gr.Markdown("### Task Catalogue")
|
| 539 |
diff_summary = gr.Textbox(
|
| 540 |
label="Difficulty Split",
|
| 541 |
value=_difficulty_summary(),
|
|
|
|
| 550 |
)
|
| 551 |
refresh_tasks_btn = gr.Button("Refresh Task Atlas")
|
| 552 |
|
| 553 |
+
task_cards = []
|
| 554 |
+
for task in TaskDefinitions.get_all_tasks():
|
| 555 |
+
task_cards.append(
|
| 556 |
+
gr.Markdown(
|
| 557 |
+
f"""
|
| 558 |
+
<div class='task-row'>
|
| 559 |
+
<div>
|
| 560 |
+
<strong>{task['task_name']}</strong><br>
|
| 561 |
+
<small>{task['task_id']} · {task['difficulty']} · {task['language']}</small>
|
| 562 |
+
</div>
|
| 563 |
+
<div class='mono'>{len(task.get('expected_issues', []))} graded issue(s)</div>
|
| 564 |
+
</div>
|
| 565 |
+
"""
|
| 566 |
+
)
|
| 567 |
+
)
|
| 568 |
+
|
| 569 |
+
def _update_playground_metrics(payload: Dict[str, Any]) -> tuple[str, str, str]:
|
| 570 |
+
score_value = payload.get("task_score", 0.0)
|
| 571 |
+
step_value = payload.get("current_step", 0)
|
| 572 |
+
status_value = "complete" if payload.get("is_complete") else "active"
|
| 573 |
+
return (
|
| 574 |
+
f"<div class='metric'><div class='metric-label'>Current Score</div><div class='metric-value'>{float(score_value):.2f}</div></div>",
|
| 575 |
+
f"<div class='metric'><div class='metric-label'>Step</div><div class='metric-value'>{step_value}</div></div>",
|
| 576 |
+
f"<div class='metric'><div class='metric-label'>Status</div><div class='metric-value'>{status_value}</div></div>",
|
| 577 |
+
)
|
| 578 |
+
|
| 579 |
+
def _refresh_leaderboard() -> tuple[list[list[str]], str]:
    """Reload the benchmark summary for the Leaderboard tab.

    Returns:
        ``(rows, summary_markdown)``: the leaderboard table rows and the
        averages line for the summary Markdown component. Missing, null or
        non-numeric averages render as ``0.000``.
    """

    def _average(data: Any, key: str) -> float:
        if not isinstance(data, dict):
            return 0.0
        try:
            return float(data.get(key, 0.0))
        except (TypeError, ValueError):
            return 0.0

    summary_data = _benchmark_summary()
    avg_score = _average(summary_data, "average_task_score")
    avg_reward = _average(summary_data, "average_total_reward")
    # The "### Benchmark Leaderboard" heading is its own component in the
    # tab, so only the averages line is returned; this matches the
    # component's initial value and avoids a duplicated heading.
    return (
        _leaderboard_rows(),
        f"**Average Task Score:** {avg_score:.3f} | **Average Reward:** {avg_reward:.3f}",
    )
|
| 584 |
+
|
| 585 |
+
def _load_trace(model_name: str, task_id: str) -> str:
    """UI callback: look up the saved trace for the selected model and task."""
    trace_json = _trace_lookup(model_name, task_id)
    return trace_json
|
| 587 |
|
| 588 |
reset_btn.click(fn=_ui_reset, inputs=[task_id_input], outputs=[output])
|
| 589 |
step_btn.click(fn=_ui_step, inputs=[action_input], outputs=[output])
|
| 590 |
state_btn.click(fn=_ui_state, inputs=None, outputs=[output])
|
| 591 |
score_btn.click(fn=_ui_score, inputs=None, outputs=[output])
|
| 592 |
+
report_btn.click(fn=_episode_report, inputs=None, outputs=[report_out])
|
| 593 |
+
|
| 594 |
+
score_btn.click(fn=lambda: _update_playground_metrics(score()), inputs=None, outputs=[score_card, step_card, status_card])
|
| 595 |
|
| 596 |
+
trace_refresh.click(fn=_load_trace, inputs=[trace_model, trace_task], outputs=[trace_out])
|
| 597 |
+
leaderboard_refresh.click(fn=_refresh_leaderboard, inputs=None, outputs=[leaderboard, leaderboard_summary])
|
| 598 |
refresh_tasks_btn.click(fn=_difficulty_summary, inputs=None, outputs=[diff_summary])
|
| 599 |
refresh_tasks_btn.click(fn=_task_table, inputs=None, outputs=[task_grid])
|
| 600 |
|
scripts/load_test.sh
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Smoke-test a running server: GET the read-only endpoints, then POST one
# reset and one step. Exits non-zero if any request does not return 2xx/3xx.
#
# Usage: load_test.sh [BASE_URL] [TASK_ID]
set -euo pipefail

BASE_URL="${1:-http://127.0.0.1:7860}"
TASK_ID="${2:-bug_detection_easy_1}"

failures=0

# Print "<label> -> <code>" and count anything outside 2xx/3xx as a failure.
check_status() {
  local label="$1" code="$2"
  echo "$label -> $code"
  if [[ ! "$code" =~ ^[23][0-9][0-9]$ ]]; then
    failures=$((failures + 1))
  fi
}

for path in / /health /tasks /score /diagnostics; do
  # `|| true`: on a connection error curl exits non-zero but still prints
  # 000 as the status; record it instead of letting `set -e` abort silently.
  code=$(curl -s -o /tmp/load_test_out.json -w '%{http_code}' "$BASE_URL$path" || true)
  check_status "$path" "$code"
done

code=$(curl -s -o /tmp/load_test_reset.json -w '%{http_code}' -X POST "$BASE_URL/reset" -H 'content-type: application/json' -d "{\"task_id\":\"$TASK_ID\"}" || true)
check_status "POST /reset" "$code"

code=$(curl -s -o /tmp/load_test_step.json -w '%{http_code}' -X POST "$BASE_URL/step" -H 'content-type: application/json' -d '{"action":{"action_type":"approve","comments":[],"suggestions":[],"final_decision":"approved"}}' || true)
check_status "POST /step" "$code"

if (( failures > 0 )); then
  echo "load_test: $failures request(s) failed" >&2
  exit 1
fi

echo "load_test: ok"
|