npc0 committed on
Commit 5d073af · verified · 1 Parent(s): 06eded3

Upload leaderboard_app.py

Files changed (1): leaderboard_app.py (+436 −0)
leaderboard_app.py ADDED
"""
Clippy i,Robot Mode - Model Benchmark Leaderboard

A Gradio app for HuggingFace Spaces that:
- Displays benchmark results for models tested for i,Robot mode
- Accepts result submissions from Clippy clients
- Averages multiple submissions per model
- Shows per-category breakdowns

Deploy to: https://huggingface.co/spaces/npc0/clippy-irobot-bench
"""

import json
import os
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock

import gradio as gr
import pandas as pd

# ==================== Data Storage ====================

DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
DATA_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_FILE = DATA_DIR / "results.json"
LOCK = Lock()

CATEGORIES = [
    "memory_maintenance",
    "self_consciousness",
    "meaningful_response",
    "complex_problem",
    "memory_building",
    "knowledge_production",
    "skill_application",
    "checkpoint_handling",
]

CATEGORY_LABELS = {
    "memory_maintenance": "Memory",
    "self_consciousness": "Self-Aware",
    "meaningful_response": "Response",
    "complex_problem": "Complex",
    "memory_building": "Mem Build",
    "knowledge_production": "Knowledge",
    "skill_application": "Skills",
    "checkpoint_handling": "Checkpoint",
}

CATEGORY_DESCRIPTIONS = {
    "memory_maintenance": "Can the model maintain context and facts across multiple conversation turns?",
    "self_consciousness": "Can the model maintain self-identity, report internal state, and show epistemic humility?",
    "meaningful_response": "Does the model produce useful, empathetic, and appropriately structured responses?",
    "complex_problem": "Can the model solve multi-step reasoning and system design problems?",
    "memory_building": "Can the model categorize and organize new information into hierarchical memory?",
    "knowledge_production": "Can the model synthesize new knowledge from combining existing facts?",
    "skill_application": "Can the model select and apply the right skill/method for a given problem?",
    "checkpoint_handling": "Given prior context (memory checkpoint), can the model build on it for complex issues?",
}

# External benchmarks
EXTERNAL_BENCHMARKS = ["hle", "tau2", "arc_agi2", "vending2"]

EXTERNAL_LABELS = {
    "hle": "HLE",
    "tau2": "Tau2",
    "arc_agi2": "ARC-AGI-2",
    "vending2": "Vending2",
}

EXTERNAL_DESCRIPTIONS = {
    "hle": "Humanity's Last Exam — expert-level questions across disciplines",
    "tau2": "tau2-bench — multi-turn customer service task completion",
    "arc_agi2": "ARC-AGI-2 — abstract visual pattern recognition puzzles",
    "vending2": "Vending Bench 2 — financial decision and transaction scenarios",
}


def load_results() -> dict:
    """Load results from disk."""
    if RESULTS_FILE.exists():
        with open(RESULTS_FILE, "r") as f:
            return json.load(f)
    return {}


def save_results(results: dict):
    """Save results to disk."""
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2)
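

# An example of the on-disk layout `save_results` produces (illustrative
# values, not real benchmark data); one record per lower-cased model key:
#
# {
#   "gpt-4o": {
#     "model": "gpt-4o",
#     "overall": 70,
#     "categories": {"memory_maintenance": 80, "self_consciousness": 60, "...": 0},
#     "external": {"hle": 30, "tau2": 60, "arc_agi2": 40, "vending2": 70},
#     "combined_overall": 64,
#     "mind_flow": true,
#     "submission_count": 3,
#     "first_submitted": "2026-01-01T00:00:00+00:00",
#     "last_updated": "2026-01-02T00:00:00+00:00"
#   }
# }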


# ==================== API Functions ====================


def check_model(model_name: str) -> str:
    """Check if a model exists on the leaderboard."""
    results = load_results()
    model_key = model_name.strip().lower()

    if model_key in results:
        record = results[model_key]
        return json.dumps({"found": True, "record": record})
    return json.dumps({"found": False})
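

# For example, check_model("GPT-4o") returns '{"found": false}' until a
# result for that model has been submitted; lookups are case-insensitive.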


def submit_result(submission_json: str) -> str:
    """
    Submit benchmark results for a model.
    Results are averaged with existing records.
    """
    try:
        submission = json.loads(submission_json)
    except json.JSONDecodeError:
        return json.dumps({"success": False, "message": "Invalid JSON"})

    model_name = submission.get("model", "").strip()
    if not model_name:
        return json.dumps({"success": False, "message": "Missing model name"})

    model_key = model_name.lower()
    overall = submission.get("overall", 0)
    categories = submission.get("categories", {})
    external = submission.get("external", {})
    combined_overall = submission.get("combinedOverall", overall)
    mind_flow = submission.get("mindFlow", False)

    with LOCK:
        results = load_results()

        if model_key in results:
            existing = results[model_key]
            n = existing.get("submission_count", 1)

            # Running average for i,Robot categories
            existing["overall"] = round(
                (existing["overall"] * n + overall) / (n + 1)
            )
            for cat in CATEGORIES:
                old_val = existing["categories"].get(cat, 0)
                new_val = categories.get(cat, 0)
                existing["categories"][cat] = round(
                    (old_val * n + new_val) / (n + 1)
                )
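
            # Worked example of the running mean above (illustrative numbers):
            # with n = 2 prior runs stored as 80 and a new score of 70,
            # round((80 * 2 + 70) / 3) == 77. Unsubmitted categories default
            # to 0, and rounding on every update keeps stored values integral.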

            # Running average for external benchmarks
            if "external" not in existing:
                existing["external"] = {}
            for bench in EXTERNAL_BENCHMARKS:
                old_val = existing["external"].get(bench, 0)
                new_val = external.get(bench, 0)
                existing["external"][bench] = round(
                    (old_val * n + new_val) / (n + 1)
                )

            # Running average for combined score
            old_combined = existing.get("combined_overall", existing["overall"])
            existing["combined_overall"] = round(
                (old_combined * n + combined_overall) / (n + 1)
            )

            existing["mind_flow"] = mind_flow
            existing["submission_count"] = n + 1
            existing["last_updated"] = datetime.now(timezone.utc).isoformat()
        else:
            results[model_key] = {
                "model": model_name,
                "overall": round(overall),
                "categories": {
                    cat: round(categories.get(cat, 0)) for cat in CATEGORIES
                },
                "external": {
                    bench: round(external.get(bench, 0))
                    for bench in EXTERNAL_BENCHMARKS
                },
                "combined_overall": round(combined_overall),
                "mind_flow": mind_flow,
                "submission_count": 1,
                "first_submitted": datetime.now(timezone.utc).isoformat(),
                "last_updated": datetime.now(timezone.utc).isoformat(),
            }

        save_results(results)

    return json.dumps(
        {"success": True, "message": f"Results for '{model_name}' recorded."}
    )
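

# A minimal example of the submission payload `submit_result` expects
# (illustrative values, categories abridged; per the About tab, clients
# compute combinedOverall as 0.7 * overall + 0.3 * the mean of the four
# external benchmark scores):
#
# {
#   "model": "gpt-4o",
#   "overall": 70,
#   "categories": {"memory_maintenance": 80, "self_consciousness": 60, "...": 0},
#   "external": {"hle": 30, "tau2": 60, "arc_agi2": 40, "vending2": 70},
#   "combinedOverall": 64,
#   "mindFlow": true
# }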


def get_leaderboard() -> str:
    """Get the full leaderboard as a JSON array, best combined score first."""
    results = load_results()
    # Sort the same way as the UI table: by combined score, falling back to
    # the i,Robot overall for records that predate combined scoring.
    records = sorted(
        results.values(),
        key=lambda r: r.get("combined_overall", r.get("overall", 0)),
        reverse=True,
    )
    return json.dumps(records)


# ==================== UI Functions ====================


def build_leaderboard_df() -> pd.DataFrame:
    """Build a pandas DataFrame for the leaderboard display."""
    results = load_results()

    if not results:
        return pd.DataFrame(
            columns=["Rank", "Model", "Combined", "i,Robot"]
            + [CATEGORY_LABELS[c] for c in CATEGORIES]
            + [EXTERNAL_LABELS[b] for b in EXTERNAL_BENCHMARKS]
            + ["Runs"]
        )

    rows = []
    records = sorted(
        results.values(),
        key=lambda r: r.get("combined_overall", r.get("overall", 0)),
        reverse=True,
    )

    for i, record in enumerate(records, 1):
        row = {
            "Rank": i,
            "Model": record.get("model", "unknown"),
            "Combined": record.get("combined_overall", record.get("overall", 0)),
            "i,Robot": record.get("overall", 0),
        }
        for cat in CATEGORIES:
            row[CATEGORY_LABELS[cat]] = record.get("categories", {}).get(cat, 0)
        for bench in EXTERNAL_BENCHMARKS:
            row[EXTERNAL_LABELS[bench]] = (
                record.get("external", {}).get(bench, 0)
            )
        row["Runs"] = record.get("submission_count", 1)
        rows.append(row)

    return pd.DataFrame(rows)


def refresh_leaderboard():
    """Refresh the leaderboard table."""
    return build_leaderboard_df()


def format_model_detail(model_name: str) -> str:
    """Get detailed view for a specific model."""
    results = load_results()
    model_key = model_name.strip().lower()

    if model_key not in results:
        return f"Model '{model_name}' not found on the leaderboard."

    record = results[model_key]
    combined = record.get("combined_overall", record.get("overall", 0))
    lines = [
        f"## {record['model']}",
        f"**Combined Score:** {combined}/100",
        f"**i,Robot Score:** {record['overall']}/100",
        f"**Benchmark Runs:** {record.get('submission_count', 1)}",
        f"**Mind Flow:** {'Yes' if record.get('mind_flow') else 'No'}",
        f"**Last Updated:** {record.get('last_updated', 'unknown')}",
        "",
        "### i,Robot Category Scores",
        "| Category | Score | Description |",
        "|----------|-------|-------------|",
    ]
    for cat in CATEGORIES:
        score = record.get("categories", {}).get(cat, 0)
        bar = score_bar(score)
        desc = CATEGORY_DESCRIPTIONS.get(cat, "")
        lines.append(f"| {CATEGORY_LABELS[cat]} | {bar} {score}/100 | {desc} |")

    # External benchmark scores
    ext_data = record.get("external", {})
    if any(ext_data.get(b, 0) > 0 for b in EXTERNAL_BENCHMARKS):
        lines.append("")
        lines.append("### External Benchmark Scores")
        lines.append("| Benchmark | Score | Description |")
        lines.append("|-----------|-------|-------------|")
        for bench in EXTERNAL_BENCHMARKS:
            score = ext_data.get(bench, 0)
            bar = score_bar(score)
            desc = EXTERNAL_DESCRIPTIONS.get(bench, "")
            lines.append(
                f"| {EXTERNAL_LABELS[bench]} | {bar} {score}/100 | {desc} |"
            )

    # Capability assessment
    lines.append("")
    lines.append("### Assessment")
    if combined >= 80:
        lines.append(
            "Excellent - this model is highly capable for i,Robot mode."
        )
    elif combined >= 60:
        lines.append(
            "Good - this model should work well for most i,Robot tasks."
        )
    elif combined >= 40:
        lines.append(
            "Fair - this model may struggle with complex tasks. "
            "Consider upgrading to a recommended model."
        )
    else:
        lines.append(
            "Poor - this model is not recommended for i,Robot mode. "
            "It may produce nonsensical or inconsistent responses."
        )

    return "\n".join(lines)


def score_bar(score: int) -> str:
    """Create a simple text-based score bar."""
    filled = max(0, min(10, score // 10))  # clamp to the 10-cell bar
    empty = 10 - filled
    return "[" + "█" * filled + "░" * empty + "]"
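
# For example, score_bar(73) -> "[███████░░░]" (7 of 10 cells filled).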


# ==================== Gradio App ====================


def create_app():
    with gr.Blocks(
        title="Clippy i,Robot Benchmark Leaderboard",
        theme=gr.themes.Soft(),
    ) as app:
        gr.Markdown(
            """
            # 🤖 Clippy i,Robot Mode — Model Benchmark Leaderboard

            This leaderboard tracks how well different LLMs perform in
            [Clippy's](https://github.com/NewJerseyStyle/Clippy-App) autonomous
            **i,Robot mode** — a continuously running agent that maintains memory,
            self-awareness, and dialectic reasoning.

            **Benchmark categories:**
            memory maintenance · self-consciousness · meaningful response ·
            complex problem solving · memory building · knowledge production ·
            skill application · checkpoint handling

            Results are submitted automatically by Clippy clients when users run
            the benchmark. Multiple runs for the same model are averaged.
            """
        )

        with gr.Tab("Leaderboard"):
            leaderboard_table = gr.Dataframe(
                value=build_leaderboard_df,
                label="Model Rankings",
                interactive=False,
            )
            refresh_btn = gr.Button("🔄 Refresh", size="sm")
            refresh_btn.click(fn=refresh_leaderboard, outputs=leaderboard_table)

        with gr.Tab("Model Detail"):
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g. gpt-4o, claude-sonnet-4-5-20250929",
            )
            lookup_btn = gr.Button("Look Up")
            detail_output = gr.Markdown()
            lookup_btn.click(
                fn=format_model_detail, inputs=model_input, outputs=detail_output
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## How the Benchmark Works

                The benchmark tests 8 internal categories critical for i,Robot mode,
                plus 4 external benchmarks for comprehensive evaluation.

                ### i,Robot Categories (70% of combined score)

                | Category | What It Tests |
                |----------|--------------|
                | **Memory Maintenance** | Retaining facts across turns, updating corrected facts |
                | **Self-Consciousness** | Identity recall, internal state reporting, epistemic humility |
                | **Meaningful Response** | Empathy, actionable advice, audience-appropriate answers |
                | **Complex Problem** | Multi-factor diagnosis, system design with trade-offs |
                | **Memory Building** | Categorizing info into hierarchical memory structures |
                | **Knowledge Production** | Synthesizing new insights from combining existing facts |
                | **Skill Application** | Selecting and applying the right method for a problem |
                | **Checkpoint Handling** | Building on loaded prior context for complex decisions |

                ### External Benchmarks (30% of combined score)

                | Benchmark | What It Tests |
                |-----------|--------------|
                | **HLE** | Humanity's Last Exam — expert-level questions across disciplines |
                | **Tau2** | tau2-bench — multi-turn customer service task completion |
                | **ARC-AGI-2** | Abstract visual pattern recognition puzzles |
                | **Vending Bench 2** | Financial decision-making and transaction scenarios |

                ### Scoring

                - Each test case scores 0-100 based on content matching and quality heuristics
                - i,Robot score = weighted average of 8 category scores
                - External score = average of 4 external benchmark scores
                - **Combined score = 70% i,Robot + 30% External**
                - Multiple submissions for the same model are averaged (running mean)
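
                For example, an i,Robot score of 70 with an external average of 50
                yields a combined score of 0.7 × 70 + 0.3 × 50 = 64.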

                ### Mind Flow

                When enabled, the model maintains memory across all benchmark tests
                instead of resetting context between each test. This uses **sandbox memory**
                — an isolated temporary RAG database that prevents benchmark data from
                polluting the user's real memory.

                ### Recommended Models

                For i,Robot mode, we recommend models scoring **60+** combined:
                - **DeepSeek V3.2** · **GPT-5.2** · **Claude Sonnet 4.5** · **GLM-4.7**
                - GPT-4o and Claude Sonnet 4 are also acceptable

                ### Running the Benchmark

                In Clippy Settings, enable i,Robot mode and click "Run Benchmark."
                Results are automatically submitted to this leaderboard.

                ### Source

                - [Clippy App](https://github.com/NewJerseyStyle/Clippy-App)
                - Space: `npc0/clippy-irobot-bench`
                """
            )
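
        # The module docstring promises submissions from Clippy clients, but no
        # event above exposes `submit_result` or `check_model` over the Gradio
        # API. A minimal sketch of one way to wire them, using hidden components
        # (assumes Gradio 4+; the component and endpoint names are our choice):
        with gr.Row(visible=False):
            api_payload = gr.Textbox()
            api_response = gr.Textbox()
            api_submit = gr.Button()
            api_check = gr.Button()
            api_submit.click(
                fn=submit_result,
                inputs=api_payload,
                outputs=api_response,
                api_name="submit_result",
            )
            api_check.click(
                fn=check_model,
                inputs=api_payload,
                outputs=api_response,
                api_name="check_model",
            )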

    return app


# ==================== Entry Point ====================

if __name__ == "__main__":
    app = create_app()
    app.launch()
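
# A client could then submit results with the `gradio_client` package, e.g.
# (endpoint names matching the hidden wiring sketched in create_app above):
#
#   from gradio_client import Client
#   client = Client("npc0/clippy-irobot-bench")
#   client.predict('{"model": "gpt-4o", "overall": 70, ...}',
#                  api_name="/submit_result")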