# Uploaded to HuggingFace Spaces by npc0 (commit 06eded3, verified).
"""
Clippy i,Robot Mode - Model Benchmark Leaderboard
A Gradio app for HuggingFace Spaces that:
- Displays benchmark results for models tested for i,Robot mode
- Accepts result submissions from Clippy clients
- Averages multiple submissions per model
- Shows per-category breakdowns
Deploy to: https://huggingface.co/spaces/npc0/clippy-irobot-bench
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock

import gradio as gr
import pandas as pd
# ==================== Data Storage ====================
DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
DATA_DIR.mkdir(exist_ok=True)
RESULTS_FILE = DATA_DIR / "results.json"
LOCK = Lock()
CATEGORIES = [
"memory_maintenance",
"self_consciousness",
"meaningful_response",
"complex_problem",
"memory_building",
"knowledge_production",
"skill_application",
"checkpoint_handling",
]
CATEGORY_LABELS = {
"memory_maintenance": "Memory",
"self_consciousness": "Self-Aware",
"meaningful_response": "Response",
"complex_problem": "Complex",
"memory_building": "Mem Build",
"knowledge_production": "Knowledge",
"skill_application": "Skills",
"checkpoint_handling": "Checkpoint",
}
CATEGORY_DESCRIPTIONS = {
"memory_maintenance": "Can the model maintain context and facts across multiple conversation turns?",
"self_consciousness": "Can the model maintain self-identity, report internal state, and show epistemic humility?",
"meaningful_response": "Does the model produce useful, empathetic, and appropriately structured responses?",
"complex_problem": "Can the model solve multi-step reasoning and system design problems?",
"memory_building": "Can the model categorize and organize new information into hierarchical memory?",
"knowledge_production": "Can the model synthesize new knowledge from combining existing facts?",
"skill_application": "Can the model select and apply the right skill/method for a given problem?",
"checkpoint_handling": "Given prior context (memory checkpoint), can the model build on it for complex issues?",
}
def load_results() -> dict:
"""Load results from disk."""
if RESULTS_FILE.exists():
with open(RESULTS_FILE, "r") as f:
return json.load(f)
return {}
def save_results(results: dict):
"""Save results to disk."""
with open(RESULTS_FILE, "w") as f:
json.dump(results, f, indent=2)
# ==================== API Functions ====================
def check_model(model_name: str) -> str:
"""Check if a model exists on the leaderboard."""
results = load_results()
model_key = model_name.strip().lower()
if model_key in results:
record = results[model_key]
return json.dumps({"found": True, "record": record})
return json.dumps({"found": False})
def submit_result(submission_json: str) -> str:
"""
Submit benchmark results for a model.
Results are averaged with existing records.
"""
try:
submission = json.loads(submission_json)
except json.JSONDecodeError:
return json.dumps({"success": False, "message": "Invalid JSON"})
model_name = submission.get("model", "").strip()
if not model_name:
return json.dumps({"success": False, "message": "Missing model name"})
model_key = model_name.lower()
overall = submission.get("overall", 0)
categories = submission.get("categories", {})
with LOCK:
results = load_results()
if model_key in results:
existing = results[model_key]
n = existing.get("submission_count", 1)
# Running average
existing["overall"] = round(
(existing["overall"] * n + overall) / (n + 1)
)
for cat in CATEGORIES:
old_val = existing["categories"].get(cat, 0)
new_val = categories.get(cat, 0)
existing["categories"][cat] = round(
(old_val * n + new_val) / (n + 1)
)
existing["submission_count"] = n + 1
existing["last_updated"] = datetime.utcnow().isoformat()
else:
results[model_key] = {
"model": model_name,
"overall": round(overall),
"categories": {
cat: round(categories.get(cat, 0)) for cat in CATEGORIES
},
"submission_count": 1,
"first_submitted": datetime.utcnow().isoformat(),
"last_updated": datetime.utcnow().isoformat(),
}
save_results(results)
return json.dumps(
{"success": True, "message": f"Results for '{model_name}' recorded."}
)
def get_leaderboard() -> str:
"""Get the full leaderboard as sorted JSON array."""
results = load_results()
records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
return json.dumps(records)
# ==================== UI Functions ====================
def build_leaderboard_df() -> pd.DataFrame:
"""Build a pandas DataFrame for the leaderboard display."""
results = load_results()
if not results:
return pd.DataFrame(
columns=["Rank", "Model", "Overall"]
+ [CATEGORY_LABELS[c] for c in CATEGORIES]
+ ["Runs"]
)
rows = []
records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
for i, record in enumerate(records, 1):
row = {
"Rank": i,
"Model": record.get("model", "unknown"),
"Overall": record.get("overall", 0),
}
for cat in CATEGORIES:
row[CATEGORY_LABELS[cat]] = record.get("categories", {}).get(cat, 0)
row["Runs"] = record.get("submission_count", 1)
rows.append(row)
return pd.DataFrame(rows)
def refresh_leaderboard():
"""Refresh the leaderboard table."""
return build_leaderboard_df()
def format_model_detail(model_name: str) -> str:
"""Get detailed view for a specific model."""
results = load_results()
model_key = model_name.strip().lower()
if model_key not in results:
return f"Model '{model_name}' not found on the leaderboard."
record = results[model_key]
lines = [
f"## {record['model']}",
f"**Overall Score:** {record['overall']}/100",
f"**Benchmark Runs:** {record.get('submission_count', 1)}",
f"**Last Updated:** {record.get('last_updated', 'unknown')}",
"",
"### Category Scores",
"| Category | Score | Description |",
"|----------|-------|-------------|",
]
for cat in CATEGORIES:
score = record.get("categories", {}).get(cat, 0)
bar = score_bar(score)
desc = CATEGORY_DESCRIPTIONS.get(cat, "")
lines.append(f"| {CATEGORY_LABELS[cat]} | {bar} {score}/100 | {desc} |")
# Capability assessment
lines.append("")
lines.append("### Assessment")
if record["overall"] >= 80:
lines.append("Excellent - this model is highly capable for i,Robot mode.")
elif record["overall"] >= 60:
lines.append("Good - this model should work well for most i,Robot tasks.")
elif record["overall"] >= 40:
lines.append(
"Fair - this model may struggle with complex tasks. "
"Consider upgrading to a recommended model."
)
else:
lines.append(
"Poor - this model is not recommended for i,Robot mode. "
"It may produce nonsensical or inconsistent responses."
)
return "\n".join(lines)
def score_bar(score: int) -> str:
"""Create a simple text-based score bar."""
filled = score // 10
empty = 10 - filled
return "[" + "█" * filled + "░" * empty + "]"
# ==================== Gradio App ====================
def create_app():
with gr.Blocks(
title="Clippy i,Robot Benchmark Leaderboard",
theme=gr.themes.Soft(),
) as app:
gr.Markdown(
"""
# 🤖 Clippy i,Robot Mode — Model Benchmark Leaderboard
This leaderboard tracks how well different LLMs perform in
[Clippy's](https://github.com/NewJerseyStyle/Clippy-App) autonomous
**i,Robot mode** — a continuously running agent that maintains memory,
self-awareness, and dialectic reasoning.
**Benchmark categories:**
memory maintenance · self-consciousness · meaningful response ·
complex problem solving · memory building · knowledge production ·
skill application · checkpoint handling
Results are submitted automatically by Clippy clients when users run
the benchmark. Multiple runs for the same model are averaged.
"""
)
with gr.Tab("Leaderboard"):
leaderboard_table = gr.Dataframe(
value=build_leaderboard_df,
label="Model Rankings",
interactive=False,
)
refresh_btn = gr.Button("🔄 Refresh", size="sm")
refresh_btn.click(fn=refresh_leaderboard, outputs=leaderboard_table)
with gr.Tab("Model Detail"):
model_input = gr.Textbox(
label="Model Name",
placeholder="e.g. gpt-4o, claude-sonnet-4-5-20250929",
)
lookup_btn = gr.Button("Look Up")
detail_output = gr.Markdown()
lookup_btn.click(
fn=format_model_detail, inputs=model_input, outputs=detail_output
)
with gr.Tab("About"):
gr.Markdown(
"""
## How the Benchmark Works
The benchmark tests 8 categories critical for i,Robot mode:
| Category | What It Tests |
|----------|--------------|
| **Memory Maintenance** | Retaining facts across turns, updating corrected facts |
| **Self-Consciousness** | Identity recall, internal state reporting, epistemic humility |
| **Meaningful Response** | Empathy, actionable advice, audience-appropriate answers |
| **Complex Problem** | Multi-factor diagnosis, system design with trade-offs |
| **Memory Building** | Categorizing info into hierarchical memory structures |
| **Knowledge Production** | Synthesizing new insights from combining existing facts |
| **Skill Application** | Selecting and applying the right method for a problem |
| **Checkpoint Handling** | Building on loaded prior context for complex decisions |
### Scoring
- Each test case scores 0-100 based on content matching and quality heuristics
- Category score = average of test case scores
- Overall score = weighted average of category scores
- Multiple submissions for the same model are averaged (running mean)
### Recommended Models
For i,Robot mode, we recommend models scoring **60+** overall:
- **DeepSeek V3.2** · **GPT-5.2** · **Claude Sonnet 4.5** · **GLM-4.7**
- GPT-4o and Claude Sonnet 4 are also acceptable
### Running the Benchmark
In Clippy Settings, enable i,Robot mode and click "Run Benchmark."
Results are automatically submitted to this leaderboard.
### Source
- [Clippy App](https://github.com/NewJerseyStyle/Clippy-App)
- Space: `npc0/clippy-irobot-bench`
"""
)
return app
# ==================== Entry Point ====================
if __name__ == "__main__":
app = create_app()
app.launch()