Spaces:

zai-org
/

RPC-Bench

Running

App Files Files Community

RPC-Bench / app.py

ZHANGYUXUAN-zR

Upload RPC-Bench leaderboard Space

e9313f3 verified 10 days ago

Raw

History Blame Contribute Delete

15.2 kB

	import html
	import json
	import re
	from pathlib import Path

	import gradio as gr
	import pandas as pd


	# Leaderboard data files live in a "data" directory next to this module.
	ROOT = Path(__file__).parent
	DATA_DIR = ROOT / "data"
	LEADERBOARD_JSON_PATH = DATA_DIR / "leaderboard.json"
	LEADERBOARD_CSV_PATH = DATA_DIR / "leaderboard_seed.csv"

	# Matches a value that is exactly one Markdown link, "[name](url)", and
	# captures the two parts as the named groups "name" and "url".
	# The brackets and parentheses are escaped so they match literally.
	MODEL_LINK_RE = re.compile(r"^\[(?P<name>.*)\]\((?P<url>.*)\)$")
	# Columns rendered in the leaderboard table, in display order.
	DISPLAY_COLUMNS = [
	    "Rank",
	    "Model",
	    "Organization",
	    "Input Config",
	    "Conciseness",
	    "Correctness",
	    "Completeness",
	    "F1-like",
	    "Informativeness",
	    "Date",
	]
	# Score columns: coerced to numbers when loading and shown with two decimals.
	NUMERIC_COLUMNS = ["Conciseness", "Correctness", "Completeness", "F1-like", "Informativeness"]

	# Stylesheet passed to gr.Blocks(css=...). It hides the Gradio footer and
	# API-docs elements, widens the container, and styles the title links, the
	# Input Config filter toolbar, the scrollable leaderboard table (sticky,
	# sortable headers), the TEXT/VISUAL badges, and the submission panel.
	CUSTOM_CSS = """
	footer { display: none !important; }
	.api-docs, .show-api, .built-with, [data-testid="api-info"] { display: none !important; }
	.gradio-container { max-width: 100% !important; padding: 18px 24px 16px !important; }
	#component-0 { max-width: 100% !important; }
	.rpc-title h1 { margin-bottom: 4px !important; }
	.rpc-title p { margin-top: 0 !important; color: #555; }
	.rpc-links {
	display: flex;
	justify-content: center;
	align-items: center;
	gap: 8px;
	flex-wrap: wrap;
	margin: 4px 0 12px;
	color: #4b5563;
	}
	.rpc-links a { color: #2563eb; text-decoration: none; }
	.rpc-links a:hover { text-decoration: underline; }
	.leaderboard-toolbar {
	display: flex;
	align-items: center;
	justify-content: space-between;
	gap: 12px;
	margin: 8px 0 12px;
	flex-wrap: wrap;
	}
	.config-filter {
	display: inline-flex;
	align-items: center;
	gap: 4px;
	padding: 3px;
	border: 1px solid #e5e7eb;
	border-radius: 8px;
	background: #f8fafc;
	}
	.config-filter button {
	border: 0;
	background: transparent;
	color: #4b5563;
	cursor: pointer;
	font-size: 13px;
	font-weight: 600;
	padding: 6px 10px;
	border-radius: 6px;
	}
	.config-filter button.active {
	background: #111827;
	color: #ffffff;
	}
	.table-count {
	color: #6b7280;
	font-size: 13px;
	}
	.leaderboard-shell {
	height: calc(100vh - 245px);
	min-height: 460px;
	max-height: 780px;
	overflow: auto;
	border: 1px solid #e5e7eb;
	border-radius: 8px;
	background: white;
	}
	.rpc-table {
	width: 100%;
	border-collapse: separate;
	border-spacing: 0;
	font-size: 14px;
	}
	.rpc-table th {
	position: sticky;
	top: 0;
	z-index: 2;
	background: #f8fafc;
	color: #111827;
	font-weight: 650;
	text-align: left;
	border-bottom: 1px solid #d1d5db;
	padding: 9px 11px;
	white-space: nowrap;
	cursor: pointer;
	user-select: none;
	}
	.rpc-table th::after {
	content: "↕";
	color: #9ca3af;
	font-size: 11px;
	margin-left: 6px;
	}
	.rpc-table th.sort-asc::after { content: "↑"; color: #111827; }
	.rpc-table th.sort-desc::after { content: "↓"; color: #111827; }
	.rpc-table td {
	border-bottom: 1px solid #eef2f7;
	padding: 9px 11px;
	vertical-align: middle;
	white-space: nowrap;
	}
	.rpc-table tbody tr:hover { background: #f9fafb; }
	.rpc-table .num { text-align: right; font-variant-numeric: tabular-nums; }
	.rpc-table .rank { width: 64px; text-align: right; color: #4b5563; }
	.rpc-table .model { min-width: 210px; font-weight: 600; }
	.rpc-table .org { min-width: 180px; }
	.config-badge {
	display: inline-flex;
	align-items: center;
	justify-content: center;
	min-width: 58px;
	padding: 3px 8px;
	border-radius: 999px;
	font-size: 12px;
	font-weight: 700;
	letter-spacing: 0.02em;
	}
	.config-text {
	background: #e0f2fe;
	color: #075985;
	}
	.config-visual {
	background: #fef3c7;
	color: #92400e;
	}
	.rpc-table a { color: #2563eb; text-decoration: none; }
	.rpc-table a:hover { text-decoration: underline; }
	.submit-panel { max-width: 980px; }
	.submit-panel pre { border-radius: 8px; }
	"""


	# Client-side behaviour for the leaderboard table, appended to the HTML that
	# render_leaderboard_html() returns. The script sorts rows when a header with
	# a data-sort attribute is clicked, filters rows by the data-config attribute
	# via the ".config-filter" buttons, renumbers the Rank cell of the visible
	# rows, and updates the "#table-count" label.
	# The logical-OR operators are written as plain "||": a backslash-escaped
	# pipe is a JavaScript syntax error and an invalid escape in a Python string.
	TABLE_SCRIPT = """
	<script>
	(function () {
	const table = document.getElementById("rpc-leaderboard-table");
	if (!table) return;

	const tbody = table.querySelector("tbody");
	const headers = table.querySelectorAll("th[data-sort]");
	const filterButtons = document.querySelectorAll(".config-filter button");
	const countEl = document.getElementById("table-count");
	let activeConfig = "ALL";
	let sortColumn = "Informativeness";
	let sortDirection = "desc";

	function parseValue(row, column) {
	const cell = row.querySelector(`[data-col="${column}"]`);
	if (!cell) return "";
	const raw = cell.getAttribute("data-value") || cell.textContent || "";
	if (["Rank", "Conciseness", "Correctness", "Completeness", "F1-like", "Informativeness"].includes(column)) {
	const num = Number.parseFloat(raw);
	return Number.isNaN(num) ? -Infinity : num;
	}
	if (column === "Date") {
	const time = Date.parse(raw);
	return Number.isNaN(time) ? 0 : time;
	}
	return raw.toLowerCase();
	}


	function apply() {
	const rows = Array.from(tbody.querySelectorAll("tr"));
	const sorted = rows.slice().sort((a, b) => {
	const av = parseValue(a, sortColumn);
	const bv = parseValue(b, sortColumn);
	if (av < bv) return sortDirection === "asc" ? -1 : 1;
	if (av > bv) return sortDirection === "asc" ? 1 : -1;
	return 0;
	});

	sorted.forEach(row => tbody.appendChild(row));

	let shown = 0;
	Array.from(tbody.querySelectorAll("tr")).forEach(row => {
	const visible = activeConfig === "ALL" || row.dataset.config === activeConfig;
	row.style.display = visible ? "" : "none";
	if (visible) {
	shown += 1;
	const rankCell = row.querySelector('[data-col="Rank"]');
	if (rankCell) {
	rankCell.textContent = shown;
	rankCell.setAttribute("data-value", String(shown));
	}
	}
	});

	if (countEl) countEl.textContent = `${shown} entries`;
	headers.forEach(header => {
	header.classList.remove("sort-asc", "sort-desc");
	if (header.dataset.sort === sortColumn) {
	header.classList.add(sortDirection === "asc" ? "sort-asc" : "sort-desc");
	}
	});
	}

	headers.forEach(header => {
	header.addEventListener("click", () => {
	const column = header.dataset.sort;
	if (sortColumn === column) {
	sortDirection = sortDirection === "asc" ? "desc" : "asc";
	} else {
	sortColumn = column;
	sortDirection = ["Model", "Organization", "Input Config", "Date"].includes(column) ? "asc" : "desc";
	}
	apply();
	});
	});

	filterButtons.forEach(button => {
	button.addEventListener("click", () => {
	activeConfig = button.dataset.config;
	filterButtons.forEach(item => item.classList.toggle("active", item === button));
	apply();
	});
	});

	apply();
	})();
	</script>
	"""

	# Markdown (wrapped in a "submit-panel" div) describing how to submit results:
	# the PR workflow, the required submission directory layout, the
	# metadata.yaml and predictions.jsonl formats, and the validation rules.
	# It is rendered with gr.Markdown in the Blocks layout below.
	SUBMISSION_GUIDE = """
	<div class="submit-panel">

	### How to Submit

	1. Fork <a href="https://huggingface.co/datasets/zai-org/RPC-Bench" target="_blank" rel="noopener noreferrer">this repository</a>.
	2. Create a new branch for your submission.
	3. Add your submission folder under
	`submissions/<organization>__<model>__<input_config>/`.
	4. Open a Pull Request with the new submission folder.

	### Submission Directory Requirements

	Each submission directory must contain the metadata and predictions for one
	model/input configuration pair:

	```text
	<organization>__<model>__<input_config>/
	metadata.yaml
	predictions.jsonl
	generation_config.json # optional, recommended
	artifacts/ # optional logs or prompt notes
	```

	Use URL-safe directory names. Replace spaces, slashes, and special characters
	with hyphens; keep `input_config` as `TEXT` or `VISUAL`.

	### `metadata.yaml`

	```yaml
	model_name: "My Model"
	organization: "My Org"
	model_url: https://... # optional work link: paper, GitHub, model card, etc.
	date: "2026-06-17" # model release date, not submission date
	split: test
	input_config: TEXT # TEXT or VISUAL
	```

	### `predictions.jsonl`

	Each line must be one JSON object:

	```json
	{
	"id": "paper-id",
	"part_idx": 1,
	"question": "question text",
	"category": "category",
	"gen_answer": "model answer"
	}
	```

	`part_idx` is the question index in the current paper's `qa_pairs` list (`1` for the first item). `category` must match the corresponding item in `test.json`.

	### Validation Rules

	Your submission will be validated before evaluation. To pass:

	- `metadata.yaml` must include `model_name`, `organization`, `date`, `split`,
	and `input_config`.
	- `model_url` is optional.
	- `date` is the model release date, not the submission date.
	- `split` must be `test`.
	- `input_config` must be `TEXT` or `VISUAL`.
	- `predictions.jsonl` must contain exactly one line for every QA item in
	`test.json`.
	- `part_idx` is the question index in the current paper's `qa_pairs` list
	(`1` for the first item).
	- `id`, `part_idx`, `question`, and `category` must exactly match the benchmark
	item.
	- `gen_answer` must be a string.
	- For `Claim_Verification`, `gen_answer` must be exactly `True` or `False`.

	### Submission Process

	1. Open PR: add your folder under
	`submissions/<organization>__<model>__<input_config>/`.
	2. Fix issues: if validation fails, update the PR with corrected files.
	3. Review: once validation passes, a maintainer reviews the submission.
	4. Evaluate: maintainers run the official evaluator in a controlled local
	environment.
	5. Import: accepted aggregate results are imported to the leaderboard.

	</div>
	"""


	def _parse_markdown_link(value):
	    """Split a cell value into a display name and a URL using MODEL_LINK_RE.

	    Args:
	        value: Any object; it is converted with ``str`` and stripped of
	            surrounding whitespace before matching.

	    Returns:
	        A ``(name, url)`` tuple. When the stripped text matches
	        ``MODEL_LINK_RE`` the pattern's ``name`` and ``url`` groups are
	        returned; otherwise the stripped text is returned with an empty URL.
	    """
	    cleaned = str(value).strip()
	    found = MODEL_LINK_RE.match(cleaned)
	    if found is None:
	        return cleaned, ""
	    return found.group("name"), found.group("url")


	def _read_csv_leaderboard():
	    """Load the seed CSV at LEADERBOARD_CSV_PATH as a ranked DataFrame.

	    Steps:
	      * a legacy ``Info`` column is renamed to ``Informativeness`` when the
	        latter is absent;
	      * each ``Model`` cell is split with ``_parse_markdown_link`` into a
	        plain name (kept in ``Model``) and a link target (stored in ``url``);
	      * the columns listed in NUMERIC_COLUMNS are coerced to numbers, with
	        unparseable values becoming NaN;
	      * rows are sorted by ``Informativeness`` descending (NaN last) and a
	        1-based ``Rank`` column is inserted at position 0.

	    Returns:
	        pandas.DataFrame: the ranked leaderboard.

	    Raises:
	        KeyError: if the CSV has neither ``Informativeness`` nor ``Info``.
	        Exception: whatever ``pandas.read_csv`` raises for an unreadable file.
	    """
	    df = pd.read_csv(LEADERBOARD_CSV_PATH)
	    if "Info" in df.columns and "Informativeness" not in df.columns:
	        df = df.rename(columns={"Info": "Informativeness"})

	    # Rank is always recomputed below. DataFrame.insert raises ValueError when
	    # the column already exists, so a pre-ranked CSV must lose its stale Rank.
	    if "Rank" in df.columns:
	        df = df.drop(columns=["Rank"])

	    # Only touch Model/url when there is a Model column; assigning an empty
	    # url list to a non-empty frame would fail with a length mismatch.
	    if "Model" in df.columns:
	        parsed = [_parse_markdown_link(value) for value in df["Model"]]
	        df["Model"] = [name for name, _ in parsed]
	        df["url"] = [url for _, url in parsed]

	    for col in NUMERIC_COLUMNS:
	        if col in df.columns:
	            df[col] = pd.to_numeric(df[col], errors="coerce")

	    # A stable sort keeps tied scores in file order, so ranks are deterministic.
	    df = df.sort_values(
	        "Informativeness", ascending=False, na_position="last", kind="stable"
	    ).reset_index(drop=True)
	    df.insert(0, "Rank", range(1, len(df) + 1))
	    return df


	def _read_json_leaderboard():
	    """Load LEADERBOARD_JSON_PATH and flatten it into a ranked DataFrame.

	    Every entry of each season's ``models`` list (under the top-level
	    ``seasons`` mapping) becomes one row. Missing text fields default to an
	    empty string and missing scores to 0; ``F1-like`` falls back to the
	    entry's ``informativeness`` value, and ``Informativeness`` falls back to
	    ``info`` and then ``overall``.

	    Returns:
	        pandas.DataFrame: rows sorted by ``Informativeness`` descending (NaN
	        last) with a 1-based ``Rank`` column inserted first, or an empty
	        frame with the DISPLAY_COLUMNS plus ``url`` columns when the file
	        lists no models.
	    """
	    with LEADERBOARD_JSON_PATH.open("r", encoding="utf-8") as handle:
	        payload = json.load(handle)

	    records = [
	        {
	            "Model": entry.get("name", ""),
	            "url": entry.get("url", ""),
	            "Organization": entry.get("org", ""),
	            "Input Config": str(entry.get("modality", "")).upper(),
	            "Conciseness": entry.get("conciseness", 0),
	            "Correctness": entry.get("correctness", 0),
	            "Completeness": entry.get("completeness", 0),
	            "F1-like": entry.get("f1_like", entry.get("informativeness", 0)),
	            "Informativeness": entry.get("informativeness", entry.get("info", entry.get("overall", 0))),
	            "Date": entry.get("date", ""),
	        }
	        for season in payload.get("seasons", {}).values()
	        for entry in season.get("models", [])
	    ]

	    table = pd.DataFrame(records)
	    if table.empty:
	        return pd.DataFrame(columns=DISPLAY_COLUMNS + ["url"])

	    ranked = table.sort_values("Informativeness", ascending=False, na_position="last")
	    ranked = ranked.reset_index(drop=True)
	    ranked.insert(0, "Rank", range(1, len(ranked) + 1))
	    return ranked


	def load_leaderboard_table():
	    """Return the leaderboard DataFrame, preferring the seed CSV over JSON.

	    When LEADERBOARD_CSV_PATH exists it is read with
	    ``_read_csv_leaderboard``. If that read fails for any reason, the failure
	    is logged with its traceback and the JSON file is used instead, so a
	    broken CSV degrades the page rather than crashing it.

	    Returns:
	        pandas.DataFrame: the ranked leaderboard table.

	    Raises:
	        Exception: whatever ``_read_json_leaderboard`` raises when the JSON
	        fallback itself cannot be loaded.
	    """
	    # Imported locally so this function does not rely on a module-level import.
	    import logging

	    if LEADERBOARD_CSV_PATH.exists():
	        try:
	            return _read_csv_leaderboard()
	        except Exception:
	            # Deliberately broad: any CSV problem should fall back to JSON,
	            # but the cause must be visible in the logs instead of vanishing.
	            logging.getLogger(__name__).exception(
	                "Could not load %s; falling back to %s",
	                LEADERBOARD_CSV_PATH,
	                LEADERBOARD_JSON_PATH,
	            )
	    return _read_json_leaderboard()


	def _format_cell(value, column):
	    """Render one table cell value as HTML-safe text.

	    Args:
	        value: The raw cell value.
	        column: Name of the column the value belongs to.

	    Returns:
	        str: an empty string when ``pd.isna(value)`` is true, the value
	        formatted with two decimals when ``column`` is listed in
	        NUMERIC_COLUMNS, and otherwise the HTML-escaped ``str(value)``.
	    """
	    if pd.isna(value):
	        return ""
	    is_score = column in NUMERIC_COLUMNS
	    return f"{float(value):.2f}" if is_score else html.escape(str(value))


	def _render_input_config(value):
	config = str(value).upper()
	if config == "TEXT":
	return '<span class="config-badge config-text">TEXT</span>'
	if config == "VISUAL":
	return '<span class="config-badge config-visual">VISUAL</span>'
	return html.escape(config)


	def render_leaderboard_html():
	    """Build the HTML for the leaderboard toolbar, table and script.

	    The table comes from ``load_leaderboard_table``; only the DISPLAY_COLUMNS
	    present in it are rendered, in DISPLAY_COLUMNS order. Each header cell
	    carries a ``data-sort`` attribute, each row a ``data-config`` attribute
	    (the upper-cased Input Config), and each body cell ``data-col`` and
	    ``data-value`` attributes, which are the hooks TABLE_SCRIPT reads. All
	    text and attribute values are HTML-escaped.

	    Returns:
	        str: the toolbar, the table wrapped in ``leaderboard-shell``, and
	        TABLE_SCRIPT, in that order.
	    """
	    table = load_leaderboard_table()
	    shown_columns = [name for name in DISPLAY_COLUMNS if name in table.columns]
	    # CSS class specific to a column; score columns and Rank also get "num".
	    column_class = {"Rank": "rank", "Model": "model", "Organization": "org"}

	    header_cells = []
	    for name in shown_columns:
	        header_cells.append(
	            f'<th data-sort="{html.escape(name, quote=True)}">{html.escape(name)}</th>'
	        )
	    thead = "".join(header_cells)

	    body_rows = []
	    for _, record in table.iterrows():
	        row_config = html.escape(str(record.get("Input Config", "")).upper(), quote=True)
	        cells = []
	        for name in shown_columns:
	            classes = [column_class[name]] if name in column_class else []
	            if name == "Rank" or name in NUMERIC_COLUMNS:
	                classes.append("num")
	            if classes:
	                joined = " ".join(classes)
	                class_attr = f' class="{joined}"'
	            else:
	                class_attr = ""

	            raw = record[name]
	            attr_value = html.escape(str(raw), quote=True)
	            attr_col = html.escape(name, quote=True)

	            if name == "Model" and record.get("url"):
	                # Model names link to their URL whenever one is available.
	                label = html.escape(str(raw))
	                href = html.escape(str(record["url"]), quote=True)
	                inner = f'<a href="{href}" target="_blank" rel="noopener noreferrer">{label}</a>'
	            elif name == "Input Config":
	                inner = _render_input_config(raw)
	            else:
	                inner = _format_cell(raw, name)

	            cells.append(
	                f'<td{class_attr} data-col="{attr_col}" data-value="{attr_value}">{inner}</td>'
	            )
	        body_rows.append(f'<tr data-config="{row_config}">' + "".join(cells) + "</tr>")

	    tbody = "".join(body_rows)
	    entry_count = len(table)
	    return f"""
	<div class="leaderboard-toolbar">
	<div class="config-filter" aria-label="Input Config filter">
	<button type="button" class="active" data-config="ALL">All</button>
	<button type="button" data-config="TEXT">TEXT</button>
	<button type="button" data-config="VISUAL">VISUAL</button>
	</div>
	<div id="table-count" class="table-count">{entry_count} entries</div>
	</div>
	<div class="leaderboard-shell">
	<table id="rpc-leaderboard-table" class="rpc-table">
	<thead><tr>{thead}</tr></thead>
	<tbody>{tbody}</tbody>
	</table>
	</div>
	{TABLE_SCRIPT}
	"""



	# Page layout, built once at import time: title with project links, the
	# submission guide, a "Leaderboard" heading, and the rendered table HTML.
	# CUSTOM_CSS is applied to the whole page and analytics are disabled.
	with gr.Blocks(title="RPC-Bench Leaderboard", analytics_enabled=False, css=CUSTOM_CSS) as demo:
	# Title plus the row of external links; "rpc-title" is targeted by CUSTOM_CSS.
	gr.Markdown(
	"""
	# RPC-Bench Leaderboard

	<div class="rpc-links">
	<span>🌐 <a href="https://rpc-bench.github.io/" target="_blank" rel="noopener noreferrer">Project Page</a></span>
	<span>•</span>
	<span>📖 <a href="https://arxiv.org/abs/2601.14289" target="_blank" rel="noopener noreferrer">Paper</a></span>
	<span>•</span>
	<span>🤗 <a href="https://huggingface.co/datasets/zai-org/RPC-Bench" target="_blank" rel="noopener noreferrer">Hugging Face</a></span>
	<span>•</span>
	<span>🧭 <a href="https://modelscope.cn/datasets/ZhipuAI/RPC-Bench" target="_blank" rel="noopener noreferrer">ModelScope</a></span>
	</div>
	""",
	elem_classes=["rpc-title"],
	)

	# Submission instructions are rendered above the leaderboard itself.
	gr.Markdown(SUBMISSION_GUIDE)
	gr.Markdown("### Leaderboard")
	# The table HTML is rendered once, when this module is imported.
	# NOTE(review): the payload embeds TABLE_SCRIPT as a <script> tag; confirm the
	# deployed Gradio version executes scripts inserted through gr.HTML.
	gr.HTML(render_leaderboard_html())


	# Start the app when run as a script; show_api=False is passed to launch().
	if __name__ == "__main__":
	demo.launch(show_api=False)