"""
Clippy i,Robot Mode - Model Benchmark Leaderboard

A Gradio app for HuggingFace Spaces that:
- Displays benchmark results for models tested for i,Robot mode
- Accepts result submissions from Clippy clients
- Averages multiple submissions per model
- Shows per-category breakdowns

Deploy to: https://huggingface.co/spaces/npc0/clippy-irobot-bench
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock
import gradio as gr
import pandas as pd
# ==================== Data Storage ====================
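# Note: standard HuggingFace Spaces storage is ephemeral, so "data/" is wiped
# on every restart or rebuild. If the Space has persistent storage enabled,
# setting DATA_DIR=/data (an assumption about the deployment, not something
# this file configures) keeps results across restarts.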
DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
DATA_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_FILE = DATA_DIR / "results.json"
LOCK = Lock()
CATEGORIES = [
    "memory_maintenance",
    "self_consciousness",
    "meaningful_response",
    "complex_problem",
    "memory_building",
    "knowledge_production",
    "skill_application",
    "checkpoint_handling",
]

CATEGORY_LABELS = {
    "memory_maintenance": "Memory",
    "self_consciousness": "Self-Aware",
    "meaningful_response": "Response",
    "complex_problem": "Complex",
    "memory_building": "Mem Build",
    "knowledge_production": "Knowledge",
    "skill_application": "Skills",
    "checkpoint_handling": "Checkpoint",
}

CATEGORY_DESCRIPTIONS = {
    "memory_maintenance": "Can the model maintain context and facts across multiple conversation turns?",
    "self_consciousness": "Can the model maintain self-identity, report internal state, and show epistemic humility?",
    "meaningful_response": "Does the model produce useful, empathetic, and appropriately structured responses?",
    "complex_problem": "Can the model solve multi-step reasoning and system design problems?",
    "memory_building": "Can the model categorize and organize new information into hierarchical memory?",
    "knowledge_production": "Can the model synthesize new knowledge from combining existing facts?",
    "skill_application": "Can the model select and apply the right skill/method for a given problem?",
    "checkpoint_handling": "Given prior context (memory checkpoint), can the model build on it for complex issues?",
}
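
# Illustrative shape of one record in results.json, keyed by the lowercased
# model name (values are made up; the fields match what submit_result() writes):
#
#   "gpt-4o": {
#       "model": "gpt-4o",
#       "overall": 71,
#       "categories": {"memory_maintenance": 78, "complex_problem": 64, ...},
#       "submission_count": 3,
#       "first_submitted": "2026-01-10T09:00:00+00:00",
#       "last_updated": "2026-01-12T14:30:00+00:00",
#   }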
def load_results() -> dict:
    """Load results from disk."""
    if RESULTS_FILE.exists():
        with open(RESULTS_FILE, "r") as f:
            return json.load(f)
    return {}

def save_results(results: dict):
    """Save results to disk atomically (write a temp file, then rename),
    so a crash mid-write cannot leave a truncated results.json."""
    tmp_file = RESULTS_FILE.with_name(RESULTS_FILE.name + ".tmp")
    with open(tmp_file, "w") as f:
        json.dump(results, f, indent=2)
    tmp_file.replace(RESULTS_FILE)

# ==================== API Functions ====================
def check_model(model_name: str) -> str:
    """Check if a model exists on the leaderboard."""
    results = load_results()
    model_key = model_name.strip().lower()
    if model_key in results:
        record = results[model_key]
        return json.dumps({"found": True, "record": record})
    return json.dumps({"found": False})

def submit_result(submission_json: str) -> str:
    """
    Submit benchmark results for a model.
    Results are averaged with existing records.
    """
    try:
        submission = json.loads(submission_json)
    except json.JSONDecodeError:
        return json.dumps({"success": False, "message": "Invalid JSON"})
    model_name = str(submission.get("model", "")).strip()
    if not model_name:
        return json.dumps({"success": False, "message": "Missing model name"})
    model_key = model_name.lower()

    def clamp(value) -> float:
        """Coerce a submitted score to a float in [0, 100]; junk becomes 0."""
        try:
            return max(0.0, min(100.0, float(value)))
        except (TypeError, ValueError):
            return 0.0

    overall = clamp(submission.get("overall", 0))
    categories = submission.get("categories", {}) or {}
    with LOCK:
        results = load_results()
        if model_key in results:
            existing = results[model_key]
            n = existing.get("submission_count", 1)
            # Running mean: new_mean = (old_mean * n + new_value) / (n + 1)
            existing["overall"] = round(
                (existing["overall"] * n + overall) / (n + 1)
            )
            for cat in CATEGORIES:
                old_val = existing["categories"].get(cat, 0)
                new_val = clamp(categories.get(cat, 0))
                existing["categories"][cat] = round(
                    (old_val * n + new_val) / (n + 1)
                )
            existing["submission_count"] = n + 1
            existing["last_updated"] = datetime.now(timezone.utc).isoformat()
        else:
            now = datetime.now(timezone.utc).isoformat()
            results[model_key] = {
                "model": model_name,
                "overall": round(overall),
                "categories": {
                    cat: round(clamp(categories.get(cat, 0))) for cat in CATEGORIES
                },
                "submission_count": 1,
                "first_submitted": now,
                "last_updated": now,
            }
        save_results(results)
    return json.dumps(
        {"success": True, "message": f"Results for '{model_name}' recorded."}
    )

def get_leaderboard() -> str:
    """Get the full leaderboard as sorted JSON array."""
    results = load_results()
    records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
    return json.dumps(records)
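
# Usage sketch: how a Clippy client could reach these functions over the
# Gradio API, assuming the hidden endpoints registered in create_app() below
# (the api_name values are assumptions, and the payload values are illustrative):
#
#   from gradio_client import Client
#
#   client = Client("npc0/clippy-irobot-bench")
#   payload = {
#       "model": "example-model",
#       "overall": 72,
#       "categories": {"memory_maintenance": 80, "complex_problem": 64},
#   }
#   print(client.predict(json.dumps(payload), api_name="/submit_result"))
#   print(client.predict(api_name="/get_leaderboard"))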
# ==================== UI Functions ====================
def build_leaderboard_df() -> pd.DataFrame:
    """Build a pandas DataFrame for the leaderboard display."""
    results = load_results()
    if not results:
        return pd.DataFrame(
            columns=["Rank", "Model", "Overall"]
            + [CATEGORY_LABELS[c] for c in CATEGORIES]
            + ["Runs"]
        )
    rows = []
    records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
    for i, record in enumerate(records, 1):
        row = {
            "Rank": i,
            "Model": record.get("model", "unknown"),
            "Overall": record.get("overall", 0),
        }
        for cat in CATEGORIES:
            row[CATEGORY_LABELS[cat]] = record.get("categories", {}).get(cat, 0)
        row["Runs"] = record.get("submission_count", 1)
        rows.append(row)
    return pd.DataFrame(rows)


def refresh_leaderboard():
    """Refresh the leaderboard table."""
    return build_leaderboard_df()

def format_model_detail(model_name: str) -> str:
    """Get detailed view for a specific model."""
    results = load_results()
    model_key = model_name.strip().lower()
    if model_key not in results:
        return f"Model '{model_name}' not found on the leaderboard."
    record = results[model_key]
    lines = [
        f"## {record['model']}",
        f"**Overall Score:** {record['overall']}/100",
        f"**Benchmark Runs:** {record.get('submission_count', 1)}",
        f"**Last Updated:** {record.get('last_updated', 'unknown')}",
        "",
        "### Category Scores",
        "| Category | Score | Description |",
        "|----------|-------|-------------|",
    ]
    for cat in CATEGORIES:
        score = record.get("categories", {}).get(cat, 0)
        bar = score_bar(score)
        desc = CATEGORY_DESCRIPTIONS.get(cat, "")
        lines.append(f"| {CATEGORY_LABELS[cat]} | {bar} {score}/100 | {desc} |")
    # Capability assessment
    lines.append("")
    lines.append("### Assessment")
    if record["overall"] >= 80:
        lines.append("Excellent - this model is highly capable for i,Robot mode.")
    elif record["overall"] >= 60:
        lines.append("Good - this model should work well for most i,Robot tasks.")
    elif record["overall"] >= 40:
        lines.append(
            "Fair - this model may struggle with complex tasks. "
            "Consider upgrading to a recommended model."
        )
    else:
        lines.append(
            "Poor - this model is not recommended for i,Robot mode. "
            "It may produce nonsensical or inconsistent responses."
        )
    return "\n".join(lines)

def score_bar(score: int) -> str:
    """Create a simple text-based score bar."""
    filled = max(0, min(10, score // 10))  # clamp so out-of-range scores can't break the bar
    empty = 10 - filled
    return "[" + "█" * filled + "░" * empty + "]"
# ==================== Gradio App ====================
def create_app():
    with gr.Blocks(
        title="Clippy i,Robot Benchmark Leaderboard",
        theme=gr.themes.Soft(),
    ) as app:
        gr.Markdown(
            """
            # 🤖 Clippy i,Robot Mode — Model Benchmark Leaderboard

            This leaderboard tracks how well different LLMs perform in
            [Clippy's](https://github.com/NewJerseyStyle/Clippy-App) autonomous
            **i,Robot mode** — a continuously running agent that maintains memory,
            self-awareness, and dialectic reasoning.

            **Benchmark categories:**
            memory maintenance · self-consciousness · meaningful response ·
            complex problem solving · memory building · knowledge production ·
            skill application · checkpoint handling

            Results are submitted automatically by Clippy clients when users run
            the benchmark. Multiple runs for the same model are averaged.
            """
        )

        with gr.Tab("Leaderboard"):
            leaderboard_table = gr.Dataframe(
                value=build_leaderboard_df,
                label="Model Rankings",
                interactive=False,
            )
            refresh_btn = gr.Button("🔄 Refresh", size="sm")
            refresh_btn.click(fn=refresh_leaderboard, outputs=leaderboard_table)

        with gr.Tab("Model Detail"):
            model_input = gr.Textbox(
                label="Model Name",
                placeholder="e.g. gpt-4o, claude-sonnet-4-5-20250929",
            )
            lookup_btn = gr.Button("Look Up")
            detail_output = gr.Markdown()
            lookup_btn.click(
                fn=format_model_detail, inputs=model_input, outputs=detail_output
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## How the Benchmark Works

                The benchmark tests 8 categories critical for i,Robot mode:

                | Category | What It Tests |
                |----------|--------------|
                | **Memory Maintenance** | Retaining facts across turns, updating corrected facts |
                | **Self-Consciousness** | Identity recall, internal state reporting, epistemic humility |
                | **Meaningful Response** | Empathy, actionable advice, audience-appropriate answers |
                | **Complex Problem** | Multi-factor diagnosis, system design with trade-offs |
                | **Memory Building** | Categorizing info into hierarchical memory structures |
                | **Knowledge Production** | Synthesizing new insights from combining existing facts |
                | **Skill Application** | Selecting and applying the right method for a problem |
                | **Checkpoint Handling** | Building on loaded prior context for complex decisions |

                ### Scoring

                - Each test case scores 0-100 based on content matching and quality heuristics
                - Category score = average of test case scores
                - Overall score = weighted average of category scores
                - Multiple submissions for the same model are averaged (running mean)

                ### Recommended Models

                For i,Robot mode, we recommend models scoring **60+** overall:

                - **DeepSeek V3.2** · **GPT-5.2** · **Claude Sonnet 4.5** · **GLM-4.7**
                - GPT-4o and Claude Sonnet 4 are also acceptable

                ### Running the Benchmark

                In Clippy Settings, enable i,Robot mode and click "Run Benchmark."
                Results are automatically submitted to this leaderboard.

                ### Source

                - [Clippy App](https://github.com/NewJerseyStyle/Clippy-App)
                - Space: `npc0/clippy-irobot-bench`
                """
            )
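
        # Hidden API endpoints so Clippy clients can submit and query results
        # programmatically. A minimal sketch: without event bindings,
        # check_model / submit_result / get_leaderboard are never exposed over
        # the Gradio API; the api_name values here are assumptions, not
        # confirmed names from the Clippy client.
        with gr.Group(visible=False):
            api_in = gr.Textbox()
            api_out = gr.Textbox()
            check_btn = gr.Button()
            check_btn.click(
                fn=check_model, inputs=api_in, outputs=api_out,
                api_name="check_model",
            )
            submit_api_btn = gr.Button()
            submit_api_btn.click(
                fn=submit_result, inputs=api_in, outputs=api_out,
                api_name="submit_result",
            )
            board_btn = gr.Button()
            board_btn.click(
                fn=get_leaderboard, outputs=api_out,
                api_name="get_leaderboard",
            )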

    return app

# ==================== Entry Point ====================
if __name__ == "__main__":
    app = create_app()
    app.launch()