# Uploaded to HuggingFace Spaces by npc0 (commit 06eded3, verified).
"""
Clippy i,Robot Mode - Model Benchmark Leaderboard
A Gradio app for HuggingFace Spaces that:
- Displays benchmark results for models tested for i,Robot mode
- Accepts result submissions from Clippy clients
- Averages multiple submissions per model
- Shows per-category breakdowns
Deploy to: https://huggingface.co/spaces/npc0/clippy-irobot-bench
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path
from threading import Lock

import gradio as gr
import pandas as pd
# ==================== Data Storage ====================
DATA_DIR = Path(os.environ.get("DATA_DIR", "data"))
DATA_DIR.mkdir(exist_ok=True)
RESULTS_FILE = DATA_DIR / "results.json"
LOCK = Lock()
CATEGORIES = [
"memory_maintenance",
"self_consciousness",
"meaningful_response",
"complex_problem",
"memory_building",
"knowledge_production",
"skill_application",
"checkpoint_handling",
]
CATEGORY_LABELS = {
"memory_maintenance": "Memory",
"self_consciousness": "Self-Aware",
"meaningful_response": "Response",
"complex_problem": "Complex",
"memory_building": "Mem Build",
"knowledge_production": "Knowledge",
"skill_application": "Skills",
"checkpoint_handling": "Checkpoint",
}
CATEGORY_DESCRIPTIONS = {
"memory_maintenance": "Can the model maintain context and facts across multiple conversation turns?",
"self_consciousness": "Can the model maintain self-identity, report internal state, and show epistemic humility?",
"meaningful_response": "Does the model produce useful, empathetic, and appropriately structured responses?",
"complex_problem": "Can the model solve multi-step reasoning and system design problems?",
"memory_building": "Can the model categorize and organize new information into hierarchical memory?",
"knowledge_production": "Can the model synthesize new knowledge from combining existing facts?",
"skill_application": "Can the model select and apply the right skill/method for a given problem?",
"checkpoint_handling": "Given prior context (memory checkpoint), can the model build on it for complex issues?",
}
def load_results() -> dict:
"""Load results from disk."""
if RESULTS_FILE.exists():
with open(RESULTS_FILE, "r") as f:
return json.load(f)
return {}
def save_results(results: dict):
"""Save results to disk."""
with open(RESULTS_FILE, "w") as f:
json.dump(results, f, indent=2)
# ==================== API Functions ====================
def check_model(model_name: str) -> str:
"""Check if a model exists on the leaderboard."""
results = load_results()
model_key = model_name.strip().lower()
if model_key in results:
record = results[model_key]
return json.dumps({"found": True, "record": record})
return json.dumps({"found": False})
def submit_result(submission_json: str) -> str:
"""
Submit benchmark results for a model.
Results are averaged with existing records.
"""
try:
submission = json.loads(submission_json)
except json.JSONDecodeError:
return json.dumps({"success": False, "message": "Invalid JSON"})
model_name = submission.get("model", "").strip()
if not model_name:
return json.dumps({"success": False, "message": "Missing model name"})
model_key = model_name.lower()
overall = submission.get("overall", 0)
categories = submission.get("categories", {})
with LOCK:
results = load_results()
if model_key in results:
existing = results[model_key]
n = existing.get("submission_count", 1)
# Running average
existing["overall"] = round(
(existing["overall"] * n + overall) / (n + 1)
)
for cat in CATEGORIES:
old_val = existing["categories"].get(cat, 0)
new_val = categories.get(cat, 0)
existing["categories"][cat] = round(
(old_val * n + new_val) / (n + 1)
)
existing["submission_count"] = n + 1
existing["last_updated"] = datetime.utcnow().isoformat()
else:
results[model_key] = {
"model": model_name,
"overall": round(overall),
"categories": {
cat: round(categories.get(cat, 0)) for cat in CATEGORIES
},
"submission_count": 1,
"first_submitted": datetime.utcnow().isoformat(),
"last_updated": datetime.utcnow().isoformat(),
}
save_results(results)
return json.dumps(
{"success": True, "message": f"Results for '{model_name}' recorded."}
)
def get_leaderboard() -> str:
"""Get the full leaderboard as sorted JSON array."""
results = load_results()
records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
return json.dumps(records)
# ==================== UI Functions ====================
def build_leaderboard_df() -> pd.DataFrame:
"""Build a pandas DataFrame for the leaderboard display."""
results = load_results()
if not results:
return pd.DataFrame(
columns=["Rank", "Model", "Overall"]
+ [CATEGORY_LABELS[c] for c in CATEGORIES]
+ ["Runs"]
)
rows = []
records = sorted(results.values(), key=lambda r: r.get("overall", 0), reverse=True)
for i, record in enumerate(records, 1):
row = {
"Rank": i,
"Model": record.get("model", "unknown"),
"Overall": record.get("overall", 0),
}
for cat in CATEGORIES:
row[CATEGORY_LABELS[cat]] = record.get("categories", {}).get(cat, 0)
row["Runs"] = record.get("submission_count", 1)
rows.append(row)
return pd.DataFrame(rows)
def refresh_leaderboard():
"""Refresh the leaderboard table."""
return build_leaderboard_df()
def format_model_detail(model_name: str) -> str:
"""Get detailed view for a specific model."""
results = load_results()
model_key = model_name.strip().lower()
if model_key not in results:
return f"Model '{model_name}' not found on the leaderboard."
record = results[model_key]
lines = [
f"## {record['model']}",
f"**Overall Score:** {record['overall']}/100",
f"**Benchmark Runs:** {record.get('submission_count', 1)}",
f"**Last Updated:** {record.get('last_updated', 'unknown')}",
"",
"### Category Scores",
"| Category | Score | Description |",
"|----------|-------|-------------|",
]
for cat in CATEGORIES:
score = record.get("categories", {}).get(cat, 0)
bar = score_bar(score)
desc = CATEGORY_DESCRIPTIONS.get(cat, "")
lines.append(f"| {CATEGORY_LABELS[cat]} | {bar} {score}/100 | {desc} |")
# Capability assessment
lines.append("")
lines.append("### Assessment")
if record["overall"] >= 80:
lines.append("Excellent - this model is highly capable for i,Robot mode.")
elif record["overall"] >= 60:
lines.append("Good - this model should work well for most i,Robot tasks.")
elif record["overall"] >= 40:
lines.append(
"Fair - this model may struggle with complex tasks. "
"Consider upgrading to a recommended model."
)
else:
lines.append(
"Poor - this model is not recommended for i,Robot mode. "
"It may produce nonsensical or inconsistent responses."
)
return "\n".join(lines)
def score_bar(score: int) -> str:
"""Create a simple text-based score bar."""
filled = score // 10
empty = 10 - filled
return "[" + "█" * filled + "░" * empty + "]"
# ==================== Gradio App ====================
def create_app():
with gr.Blocks(
title="Clippy i,Robot Benchmark Leaderboard",
theme=gr.themes.Soft(),
) as app:
gr.Markdown(
"""
# 🤖 Clippy i,Robot Mode — Model Benchmark Leaderboard
This leaderboard tracks how well different LLMs perform in
[Clippy's](https://github.com/NewJerseyStyle/Clippy-App) autonomous
**i,Robot mode** — a continuously running agent that maintains memory,
self-awareness, and dialectic reasoning.
**Benchmark categories:**
memory maintenance · self-consciousness · meaningful response ·
complex problem solving · memory building · knowledge production ·
skill application · checkpoint handling
Results are submitted automatically by Clippy clients when users run
the benchmark. Multiple runs for the same model are averaged.
"""
)
with gr.Tab("Leaderboard"):
leaderboard_table = gr.Dataframe(
value=build_leaderboard_df,
label="Model Rankings",
interactive=False,
)
refresh_btn = gr.Button("🔄 Refresh", size="sm")
refresh_btn.click(fn=refresh_leaderboard, outputs=leaderboard_table)
with gr.Tab("Model Detail"):
model_input = gr.Textbox(
label="Model Name",
placeholder="e.g. gpt-4o, claude-sonnet-4-5-20250929",
)
lookup_btn = gr.Button("Look Up")
detail_output = gr.Markdown()
lookup_btn.click(
fn=format_model_detail, inputs=model_input, outputs=detail_output
)
with gr.Tab("About"):
gr.Markdown(
"""
## How the Benchmark Works
The benchmark tests 8 categories critical for i,Robot mode:
| Category | What It Tests |
|----------|--------------|
| **Memory Maintenance** | Retaining facts across turns, updating corrected facts |
| **Self-Consciousness** | Identity recall, internal state reporting, epistemic humility |
| **Meaningful Response** | Empathy, actionable advice, audience-appropriate answers |
| **Complex Problem** | Multi-factor diagnosis, system design with trade-offs |
| **Memory Building** | Categorizing info into hierarchical memory structures |
| **Knowledge Production** | Synthesizing new insights from combining existing facts |
| **Skill Application** | Selecting and applying the right method for a problem |
| **Checkpoint Handling** | Building on loaded prior context for complex decisions |
### Scoring
- Each test case scores 0-100 based on content matching and quality heuristics
- Category score = average of test case scores
- Overall score = weighted average of category scores
- Multiple submissions for the same model are averaged (running mean)
### Recommended Models
For i,Robot mode, we recommend models scoring **60+** overall:
- **DeepSeek V3.2** · **GPT-5.2** · **Claude Sonnet 4.5** · **GLM-4.7**
- GPT-4o and Claude Sonnet 4 are also acceptable
### Running the Benchmark
In Clippy Settings, enable i,Robot mode and click "Run Benchmark."
Results are automatically submitted to this leaderboard.
### Source
- [Clippy App](https://github.com/NewJerseyStyle/Clippy-App)
- Space: `npc0/clippy-irobot-bench`
"""
)
return app
# ==================== Entry Point ====================
if __name__ == "__main__":
app = create_app()
app.launch()