| """ | |
| Clippy i,Robot Mode - Model Benchmark Leaderboard | |
| A Gradio app for HuggingFace Spaces that: | |
| - Displays benchmark results for models tested for i,Robot mode | |
| - Accepts result submissions from Clippy clients | |
| - Averages multiple submissions per model | |
| - Shows per-category breakdowns | |
| Deploy to: https://huggingface.co/spaces/npc0/clippy-irobot-bench | |
| """ | |
import json
import os
from datetime import datetime, timezone  # timezone: datetime.utcnow() is deprecated
from pathlib import Path
from threading import Lock

import gradio as gr
import pandas as pd

# ==================== Data Storage ====================

DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
DATA_DIR.mkdir(parents=True, exist_ok=True)  # parents=True so a nested DATA_DIR also works
RESULTS_FILE = DATA_DIR / "results.json"
LOCK = Lock()

CATEGORIES = [
    "memory_maintenance",
    "self_consciousness",
    "meaningful_response",
    "complex_problem",
    "memory_building",
    "knowledge_production",
    "skill_application",
    "checkpoint_handling",
]

CATEGORY_LABELS = {
    "memory_maintenance": "Memory",
    "self_consciousness": "Self-Aware",
    "meaningful_response": "Response",
    "complex_problem": "Complex",
    "memory_building": "Mem Build",
    "knowledge_production": "Knowledge",
    "skill_application": "Skills",
    "checkpoint_handling": "Checkpoint",
}

CATEGORY_DESCRIPTIONS = {
    "memory_maintenance": "Can the model maintain context and facts across multiple conversation turns?",
    "self_consciousness": "Can the model maintain self-identity, report internal state, and show epistemic humility?",
    "meaningful_response": "Does the model produce useful, empathetic, and appropriately structured responses?",
    "complex_problem": "Can the model solve multi-step reasoning and system design problems?",
    "memory_building": "Can the model categorize and organize new information into hierarchical memory?",
    "knowledge_production": "Can the model synthesize new knowledge from combining existing facts?",
    "skill_application": "Can the model select and apply the right skill/method for a given problem?",
    "checkpoint_handling": "Given prior context (memory checkpoint), can the model build on it for complex issues?",
}

def load_results() -> dict:
    """Load results from disk."""
    if RESULTS_FILE.exists():
        with open(RESULTS_FILE, "r") as f:
            return json.load(f)
    return {}


def save_results(results: dict):
    """Save results to disk."""
    with open(RESULTS_FILE, "w") as f:
        json.dump(results, f, indent=2)

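# Shape of results.json on disk (a sketch; the values below are illustrative,
# not real benchmark data). Top-level keys are lowercased model names:
#
# {
#   "gpt-4o": {
#     "model": "gpt-4o",
#     "overall": 72,
#     "categories": {"memory_maintenance": 80, "...": 0},
#     "submission_count": 3,
#     "first_submitted": "2025-01-01T00:00:00+00:00",
#     "last_updated": "2025-01-02T00:00:00+00:00"
#   }
# }
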
# ==================== API Functions ====================

def check_model(model_name: str) -> str:
    """Check if a model exists on the leaderboard."""
    results = load_results()
    model_key = model_name.strip().lower()
    if model_key in results:
        record = results[model_key]
        return json.dumps({"found": True, "record": record})
    return json.dumps({"found": False})

def submit_result(submission_json: str) -> str:
    """
    Submit benchmark results for a model.

    Results are averaged with existing records.
    """
    try:
        submission = json.loads(submission_json)
    except json.JSONDecodeError:
        return json.dumps({"success": False, "message": "Invalid JSON"})

    model_name = submission.get("model", "")
    if not isinstance(model_name, str) or not model_name.strip():
        return json.dumps({"success": False, "message": "Missing model name"})
    model_name = model_name.strip()
    model_key = model_name.lower()

    overall = submission.get("overall", 0)
    categories = submission.get("categories", {})
    if not isinstance(overall, (int, float)) or not isinstance(categories, dict):
        return json.dumps({"success": False, "message": "Malformed submission"})
    # Clamp client-supplied scores to 0-100 so out-of-range values
    # cannot skew the leaderboard or break the score bars.
    overall = max(0, min(100, overall))
    categories = {
        cat: max(0, min(100, val))
        for cat, val in categories.items()
        if isinstance(val, (int, float))
    }

    with LOCK:
        results = load_results()
        if model_key in results:
            existing = results[model_key]
            n = existing.get("submission_count", 1)
            # Running average: new_mean = (old_mean * n + new_value) / (n + 1)
            existing["overall"] = round(
                (existing["overall"] * n + overall) / (n + 1)
            )
            for cat in CATEGORIES:
                old_val = existing["categories"].get(cat, 0)
                new_val = categories.get(cat, 0)
                existing["categories"][cat] = round(
                    (old_val * n + new_val) / (n + 1)
                )
            existing["submission_count"] = n + 1
            existing["last_updated"] = datetime.now(timezone.utc).isoformat()
        else:
            results[model_key] = {
                "model": model_name,
                "overall": round(overall),
                "categories": {
                    cat: round(categories.get(cat, 0)) for cat in CATEGORIES
                },
                "submission_count": 1,
                "first_submitted": datetime.now(timezone.utc).isoformat(),
                "last_updated": datetime.now(timezone.utc).isoformat(),
            }
        save_results(results)

    return json.dumps(
        {"success": True, "message": f"Results for '{model_name}' recorded."}
    )

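# Example payload (illustrative values). A first submission stores the scores
# as-is; a second one with overall=80 would update the stored running mean to
# round((72 * 1 + 80) / 2) == 76:
#
#   submit_result(json.dumps({
#       "model": "gpt-4o",
#       "overall": 72,
#       "categories": {"memory_maintenance": 80, "complex_problem": 64},
#   }))
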
def get_leaderboard() -> str:
    """Get the full leaderboard as a sorted JSON array."""
    results = load_results()
    records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
    return json.dumps(records)

# ==================== UI Functions ====================

def build_leaderboard_df() -> pd.DataFrame:
    """Build a pandas DataFrame for the leaderboard display."""
    results = load_results()
    if not results:
        return pd.DataFrame(
            columns=["Rank", "Model", "Overall"]
            + [CATEGORY_LABELS[c] for c in CATEGORIES]
            + ["Runs"]
        )
    rows = []
    records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
    for i, record in enumerate(records, 1):
        row = {
            "Rank": i,
            "Model": record.get("model", "unknown"),
            "Overall": record.get("overall", 0),
        }
        for cat in CATEGORIES:
            row[CATEGORY_LABELS[cat]] = record.get("categories", {}).get(cat, 0)
        row["Runs"] = record.get("submission_count", 1)
        rows.append(row)
    return pd.DataFrame(rows)


def refresh_leaderboard():
    """Refresh the leaderboard table."""
    return build_leaderboard_df()

def format_model_detail(model_name: str) -> str:
    """Get a detailed view for a specific model."""
    results = load_results()
    model_key = model_name.strip().lower()
    if model_key not in results:
        return f"Model '{model_name}' not found on the leaderboard."
    record = results[model_key]
    lines = [
        f"## {record['model']}",
        f"**Overall Score:** {record['overall']}/100",
        f"**Benchmark Runs:** {record.get('submission_count', 1)}",
        f"**Last Updated:** {record.get('last_updated', 'unknown')}",
        "",
        "### Category Scores",
        "| Category | Score | Description |",
        "|----------|-------|-------------|",
    ]
    for cat in CATEGORIES:
        score = record.get("categories", {}).get(cat, 0)
        bar = score_bar(score)
        desc = CATEGORY_DESCRIPTIONS.get(cat, "")
        lines.append(f"| {CATEGORY_LABELS[cat]} | {bar} {score}/100 | {desc} |")

    # Capability assessment
    lines.append("")
    lines.append("### Assessment")
    if record["overall"] >= 80:
        lines.append("Excellent - this model is highly capable for i,Robot mode.")
    elif record["overall"] >= 60:
        lines.append("Good - this model should work well for most i,Robot tasks.")
    elif record["overall"] >= 40:
        lines.append(
            "Fair - this model may struggle with complex tasks. "
            "Consider upgrading to a recommended model."
        )
    else:
        lines.append(
            "Poor - this model is not recommended for i,Robot mode. "
            "It may produce nonsensical or inconsistent responses."
        )
    return "\n".join(lines)

def score_bar(score: int) -> str:
    """Create a simple text-based score bar."""
    filled = max(0, min(10, score // 10))  # clamp so stale out-of-range data cannot break the bar
    empty = 10 - filled
    return "[" + "█" * filled + "░" * empty + "]"

# ==================== Gradio App ====================

def create_app():
    with gr.Blocks(
        title="Clippy i,Robot Benchmark Leaderboard",
        theme=gr.themes.Soft(),
    ) as app:
        gr.Markdown(
            """
            # 🤖 Clippy i,Robot Mode — Model Benchmark Leaderboard

            This leaderboard tracks how well different LLMs perform in
            [Clippy's](https://github.com/NewJerseyStyle/Clippy-App) autonomous
            **i,Robot mode** — a continuously running agent that maintains memory,
            self-awareness, and dialectic reasoning.

            **Benchmark categories:**
            memory maintenance · self-consciousness · meaningful response ·
            complex problem solving · memory building · knowledge production ·
            skill application · checkpoint handling

            Results are submitted automatically by Clippy clients when users run
            the benchmark. Multiple runs for the same model are averaged.
            """
        )

        with gr.Tab("Leaderboard"):
            leaderboard_table = gr.Dataframe(
                value=build_leaderboard_df,
                label="Model Rankings",
                interactive=False,
            )
            refresh_btn = gr.Button("🔄 Refresh", size="sm")
            refresh_btn.click(fn=refresh_leaderboard, outputs=leaderboard_table)

        with gr.Tab("Model Detail"):
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g. gpt-4o, claude-sonnet-4-5-20250929",
            )
            lookup_btn = gr.Button("Look Up")
            detail_output = gr.Markdown()
            lookup_btn.click(
                fn=format_model_detail, inputs=model_input, outputs=detail_output
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## How the Benchmark Works

                The benchmark tests 8 categories critical for i,Robot mode:

                | Category | What It Tests |
                |----------|--------------|
                | **Memory Maintenance** | Retaining facts across turns, updating corrected facts |
                | **Self-Consciousness** | Identity recall, internal state reporting, epistemic humility |
                | **Meaningful Response** | Empathy, actionable advice, audience-appropriate answers |
                | **Complex Problem** | Multi-factor diagnosis, system design with trade-offs |
                | **Memory Building** | Categorizing info into hierarchical memory structures |
                | **Knowledge Production** | Synthesizing new insights from combining existing facts |
                | **Skill Application** | Selecting and applying the right method for a problem |
                | **Checkpoint Handling** | Building on loaded prior context for complex decisions |

                ### Scoring

                - Each test case scores 0-100 based on content matching and quality heuristics
                - Category score = average of test case scores
                - Overall score = weighted average of category scores
                - Multiple submissions for the same model are averaged (running mean)

                ### Recommended Models

                For i,Robot mode, we recommend models scoring **60+** overall:

                - **DeepSeek V3.2** · **GPT-5.2** · **Claude Sonnet 4.5** · **GLM-4.7**
                - GPT-4o and Claude Sonnet 4 are also acceptable

                ### Running the Benchmark

                In Clippy Settings, enable i,Robot mode and click "Run Benchmark."
                Results are automatically submitted to this leaderboard.

                ### Source

                - [Clippy App](https://github.com/NewJerseyStyle/Clippy-App)
                - Space: `npc0/clippy-irobot-bench`
                """
            )
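        # Hidden API endpoints so Clippy clients can submit and query results
        # programmatically, as the module docstring promises. This wiring is a
        # sketch: it uses Gradio's api_name routing, and the component and
        # route names here are illustrative; the deployed Space may expose
        # these endpoints differently.
        with gr.Row(visible=False):
            api_payload = gr.Textbox()
            api_response = gr.Textbox()
            api_check_btn = gr.Button()
            api_submit_btn = gr.Button()
            api_check_btn.click(
                fn=check_model,
                inputs=api_payload,
                outputs=api_response,
                api_name="check_model",
            )
            api_submit_btn.click(
                fn=submit_result,
                inputs=api_payload,
                outputs=api_response,
                api_name="submit_result",
            )
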
    return app

# ==================== Entry Point ====================

if __name__ == "__main__":
    app = create_app()
    app.launch()
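
# Example client-side call (a sketch: assumes the hidden api_name routes
# registered in create_app above and the `gradio_client` package):
#
#   from gradio_client import Client
#   client = Client("npc0/clippy-irobot-bench")
#   payload = json.dumps({"model": "my-model", "overall": 70, "categories": {}})
#   print(client.predict(payload, api_name="/submit_result"))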