RPC-Bench / app.py
ZHANGYUXUAN-zR's picture
Upload RPC-Bench leaderboard Space
e9313f3 verified
Raw
History Blame Contribute Delete
15.2 kB
import html
import json
import logging
import re
from pathlib import Path

import gradio as gr
import pandas as pd
# Filesystem layout: leaderboard data lives in a "data" folder next to this file.
ROOT = Path(__file__).parent
DATA_DIR = ROOT.joinpath("data")
LEADERBOARD_JSON_PATH = DATA_DIR.joinpath("leaderboard.json")
LEADERBOARD_CSV_PATH = DATA_DIR.joinpath("leaderboard_seed.csv")

# Matches a value that is, in its entirety, one Markdown link: "[name](url)".
MODEL_LINK_RE = re.compile(r"^\[(?P<name>.*)\]\((?P<url>.*)\)$")
# Score columns: coerced to numbers on load and rendered with two decimals.
NUMERIC_COLUMNS = [
    "Conciseness",
    "Correctness",
    "Completeness",
    "F1-like",
    "Informativeness",
]

# Columns shown in the table, in display order (scores sit between the
# descriptive columns and the date).
DISPLAY_COLUMNS = [
    "Rank",
    "Model",
    "Organization",
    "Input Config",
    *NUMERIC_COLUMNS,
    "Date",
]
# Stylesheet handed to gr.Blocks(css=...). It:
#   * hides the footer and the API-docs / "built with" elements,
#   * lets the container use the full page width,
#   * styles the centered link row under the title (.rpc-links),
#   * styles the filter toolbar (.leaderboard-toolbar, .config-filter, .table-count),
#   * makes the table area a scrollable box with sticky, clickable headers whose
#     ::after arrow reflects the sort-asc / sort-desc class,
#   * defines the TEXT / VISUAL pill badges (.config-badge, .config-text, .config-visual).
CUSTOM_CSS = """
footer { display: none !important; }
.api-docs, .show-api, .built-with, [data-testid="api-info"] { display: none !important; }
.gradio-container { max-width: 100% !important; padding: 18px 24px 16px !important; }
#component-0 { max-width: 100% !important; }
.rpc-title h1 { margin-bottom: 4px !important; }
.rpc-title p { margin-top: 0 !important; color: #555; }
.rpc-links {
display: flex;
justify-content: center;
align-items: center;
gap: 8px;
flex-wrap: wrap;
margin: 4px 0 12px;
color: #4b5563;
}
.rpc-links a { color: #2563eb; text-decoration: none; }
.rpc-links a:hover { text-decoration: underline; }
.leaderboard-toolbar {
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
margin: 8px 0 12px;
flex-wrap: wrap;
}
.config-filter {
display: inline-flex;
align-items: center;
gap: 4px;
padding: 3px;
border: 1px solid #e5e7eb;
border-radius: 8px;
background: #f8fafc;
}
.config-filter button {
border: 0;
background: transparent;
color: #4b5563;
cursor: pointer;
font-size: 13px;
font-weight: 600;
padding: 6px 10px;
border-radius: 6px;
}
.config-filter button.active {
background: #111827;
color: #ffffff;
}
.table-count {
color: #6b7280;
font-size: 13px;
}
.leaderboard-shell {
height: calc(100vh - 245px);
min-height: 460px;
max-height: 780px;
overflow: auto;
border: 1px solid #e5e7eb;
border-radius: 8px;
background: white;
}
.rpc-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
font-size: 14px;
}
.rpc-table th {
position: sticky;
top: 0;
z-index: 2;
background: #f8fafc;
color: #111827;
font-weight: 650;
text-align: left;
border-bottom: 1px solid #d1d5db;
padding: 9px 11px;
white-space: nowrap;
cursor: pointer;
user-select: none;
}
.rpc-table th::after {
content: "↕";
color: #9ca3af;
font-size: 11px;
margin-left: 6px;
}
.rpc-table th.sort-asc::after { content: "↑"; color: #111827; }
.rpc-table th.sort-desc::after { content: "↓"; color: #111827; }
.rpc-table td {
border-bottom: 1px solid #eef2f7;
padding: 9px 11px;
vertical-align: middle;
white-space: nowrap;
}
.rpc-table tbody tr:hover { background: #f9fafb; }
.rpc-table .num { text-align: right; font-variant-numeric: tabular-nums; }
.rpc-table .rank { width: 64px; text-align: right; color: #4b5563; }
.rpc-table .model { min-width: 210px; font-weight: 600; }
.rpc-table .org { min-width: 180px; }
.config-badge {
display: inline-flex;
align-items: center;
justify-content: center;
min-width: 58px;
padding: 3px 8px;
border-radius: 999px;
font-size: 12px;
font-weight: 700;
letter-spacing: 0.02em;
}
.config-text {
background: #e0f2fe;
color: #075985;
}
.config-visual {
background: #fef3c7;
color: #92400e;
}
.rpc-table a { color: #2563eb; text-decoration: none; }
.rpc-table a:hover { text-decoration: underline; }
.submit-panel { max-width: 980px; }
.submit-panel pre { border-radius: 8px; }
"""
# Inline <script> appended to the leaderboard markup by render_leaderboard_html().
# It drives the table with id "rpc-leaderboard-table" entirely on the client:
#   * Initial state: all configs shown, sorted by "Informativeness" descending.
#   * Clicking a th[data-sort] header sorts by that column; clicking the active
#     column toggles the direction. A newly selected column starts ascending for
#     Model / Organization / Input Config / Date and descending otherwise.
#   * Sort keys come from each cell's data-value attribute (falling back to its
#     text): score and Rank columns are parsed as floats (unparsable -> -Infinity),
#     Date via Date.parse (unparsable -> 0), everything else as lower-cased text.
#   * The ".config-filter" buttons hide rows whose data-config differs from the
#     selected value ("ALL" shows everything).
#   * After every sort/filter the Rank cells of the visible rows are renumbered
#     1..N and "#table-count" is set to "N entries".
TABLE_SCRIPT = """
<script>
(function () {
const table = document.getElementById("rpc-leaderboard-table");
if (!table) return;
const tbody = table.querySelector("tbody");
const headers = table.querySelectorAll("th[data-sort]");
const filterButtons = document.querySelectorAll(".config-filter button");
const countEl = document.getElementById("table-count");
let activeConfig = "ALL";
let sortColumn = "Informativeness";
let sortDirection = "desc";
function parseValue(row, column) {
const cell = row.querySelector(`[data-col="${column}"]`);
if (!cell) return "";
const raw = cell.getAttribute("data-value") || cell.textContent || "";
if (["Rank", "Conciseness", "Correctness", "Completeness", "F1-like", "Informativeness"].includes(column)) {
const num = Number.parseFloat(raw);
return Number.isNaN(num) ? -Infinity : num;
}
if (column === "Date") {
const time = Date.parse(raw);
return Number.isNaN(time) ? 0 : time;
}
return raw.toLowerCase();
}
function apply() {
const rows = Array.from(tbody.querySelectorAll("tr"));
const sorted = rows.slice().sort((a, b) => {
const av = parseValue(a, sortColumn);
const bv = parseValue(b, sortColumn);
if (av < bv) return sortDirection === "asc" ? -1 : 1;
if (av > bv) return sortDirection === "asc" ? 1 : -1;
return 0;
});
sorted.forEach(row => tbody.appendChild(row));
let shown = 0;
Array.from(tbody.querySelectorAll("tr")).forEach(row => {
const visible = activeConfig === "ALL" || row.dataset.config === activeConfig;
row.style.display = visible ? "" : "none";
if (visible) {
shown += 1;
const rankCell = row.querySelector('[data-col="Rank"]');
if (rankCell) {
rankCell.textContent = shown;
rankCell.setAttribute("data-value", String(shown));
}
}
});
if (countEl) countEl.textContent = `${shown} entries`;
headers.forEach(header => {
header.classList.remove("sort-asc", "sort-desc");
if (header.dataset.sort === sortColumn) {
header.classList.add(sortDirection === "asc" ? "sort-asc" : "sort-desc");
}
});
}
headers.forEach(header => {
header.addEventListener("click", () => {
const column = header.dataset.sort;
if (sortColumn === column) {
sortDirection = sortDirection === "asc" ? "desc" : "asc";
} else {
sortColumn = column;
sortDirection = ["Model", "Organization", "Input Config", "Date"].includes(column) ? "asc" : "desc";
}
apply();
});
});
filterButtons.forEach(button => {
button.addEventListener("click", () => {
activeConfig = button.dataset.config;
filterButtons.forEach(item => item.classList.toggle("active", item === button));
apply();
});
});
apply();
})();
</script>
"""
# Submission instructions shown on the page through gr.Markdown(SUBMISSION_GUIDE).
# The text is Markdown wrapped in a <div class="submit-panel"> (styled by
# CUSTOM_CSS) and covers: how to open a submission PR, the required directory
# layout, the metadata.yaml and predictions.jsonl formats, the validation rules,
# and the review/evaluation process.
SUBMISSION_GUIDE = """
<div class="submit-panel">
### How to Submit
1. Fork <a href="https://huggingface.co/datasets/zai-org/RPC-Bench" target="_blank" rel="noopener noreferrer">this repository</a>.
2. Create a new branch for your submission.
3. Add your submission folder under
`submissions/<organization>__<model>__<input_config>/`.
4. Open a Pull Request with the new submission folder.
### Submission Directory Requirements
Each submission directory must contain the metadata and predictions for one
model/input configuration pair:
```text
<organization>__<model>__<input_config>/
metadata.yaml
predictions.jsonl
generation_config.json # optional, recommended
artifacts/ # optional logs or prompt notes
```
Use URL-safe directory names. Replace spaces, slashes, and special characters
with hyphens; keep `input_config` as `TEXT` or `VISUAL`.
### `metadata.yaml`
```yaml
model_name: "My Model"
organization: "My Org"
model_url: https://... # optional work link: paper, GitHub, model card, etc.
date: "2026-06-17" # model release date, not submission date
split: test
input_config: TEXT # TEXT or VISUAL
```
### `predictions.jsonl`
Each line must be one JSON object:
```json
{
"id": "paper-id",
"part_idx": 1,
"question": "question text",
"category": "category",
"gen_answer": "model answer"
}
```
`part_idx` is the question index in the current paper's `qa_pairs` list (`1` for the first item). `category` must match the corresponding item in `test.json`.
### Validation Rules
Your submission will be validated before evaluation. To pass:
- `metadata.yaml` must include `model_name`, `organization`, `date`, `split`,
and `input_config`.
- `model_url` is optional.
- `date` is the model release date, not the submission date.
- `split` must be `test`.
- `input_config` must be `TEXT` or `VISUAL`.
- `predictions.jsonl` must contain exactly one line for every QA item in
`test.json`.
- `part_idx` is the question index in the current paper's `qa_pairs` list
(`1` for the first item).
- `id`, `part_idx`, `question`, and `category` must exactly match the benchmark
item.
- `gen_answer` must be a string.
- For `Claim_Verification`, `gen_answer` must be exactly `True` or `False`.
### Submission Process
1. Open PR: add your folder under
`submissions/<organization>__<model>__<input_config>/`.
2. Fix issues: if validation fails, update the PR with corrected files.
3. Review: once validation passes, a maintainer reviews the submission.
4. Evaluate: maintainers run the official evaluator in a controlled local
environment.
5. Import: accepted aggregate results are imported to the leaderboard.
</div>
"""
def _parse_markdown_link(value):
text = str(value).strip()
match = MODEL_LINK_RE.match(text)
if match:
return match.group("name"), match.group("url")
return text, ""
def _read_csv_leaderboard():
    """Load the seed CSV and return it as a ranked leaderboard frame.

    Steps, in order:
      1. Read ``LEADERBOARD_CSV_PATH``.
      2. Accept an "Info" column as an alias when "Informativeness" is absent.
      3. Split Markdown links in "Model" into a plain name plus a "url" column.
      4. Coerce the score columns that are present to numbers (bad values -> NaN).
      5. Sort by "Informativeness" descending (NaN last) and prepend a 1-based
         "Rank" column.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    table = pd.read_csv(LEADERBOARD_CSV_PATH)

    only_alias_present = "Info" in table.columns and "Informativeness" not in table.columns
    if only_alias_present:
        table = table.rename(columns={"Info": "Informativeness"})

    if "Model" in table.columns:
        parsed_links = [_parse_markdown_link(cell) for cell in table["Model"]]
        table["Model"] = [name for name, _ in parsed_links]
        table["url"] = [url for _, url in parsed_links]

    score_columns = [col for col in NUMERIC_COLUMNS if col in table.columns]
    for col in score_columns:
        table[col] = pd.to_numeric(table[col], errors="coerce")

    ranked = table.sort_values("Informativeness", ascending=False, na_position="last")
    ranked = ranked.reset_index(drop=True)
    ranked.insert(0, "Rank", range(1, len(ranked) + 1))
    return ranked
def _read_json_leaderboard():
    """Load ``LEADERBOARD_JSON_PATH`` and return it as a ranked leaderboard frame.

    The JSON document is read as ``{"seasons": {<key>: {"models": [...]}}}``;
    the models of every season are flattened into one table. Per-model keys
    read are ``name``, ``url``, ``org``, ``modality`` (upper-cased),
    ``conciseness``, ``correctness``, ``completeness``, ``f1_like``,
    ``informativeness`` and ``date``. "F1-like" falls back to
    ``informativeness`` when ``f1_like`` is absent, and "Informativeness"
    falls back to ``info`` and then ``overall``.

    Returns:
        A ``pandas.DataFrame`` sorted by "Informativeness" descending (NaN
        last) with a 1-based "Rank" column in front. With no models, an empty
        frame with the display columns plus "url" is returned.

    Raises:
        OSError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with LEADERBOARD_JSON_PATH.open("r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for season in data.get("seasons", {}).values():
        for row in season.get("models", []):
            rows.append({
                "Model": row.get("name", ""),
                "url": row.get("url", ""),
                "Organization": row.get("org", ""),
                "Input Config": str(row.get("modality", "")).upper(),
                "Conciseness": row.get("conciseness", 0),
                "Correctness": row.get("correctness", 0),
                "Completeness": row.get("completeness", 0),
                "F1-like": row.get("f1_like", row.get("informativeness", 0)),
                "Informativeness": row.get("informativeness", row.get("info", row.get("overall", 0))),
                "Date": row.get("date", ""),
            })

    df = pd.DataFrame(rows)
    if df.empty:
        return pd.DataFrame(columns=DISPLAY_COLUMNS + ["url"])

    # Mirror the CSV loader: scores stored as strings or null in the JSON would
    # otherwise make the sort fail or order lexicographically, and would break
    # the two-decimal formatting later on.
    for col in NUMERIC_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.sort_values("Informativeness", ascending=False, na_position="last").reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    return df
def load_leaderboard_table():
    """Return the leaderboard frame, preferring the seed CSV over the JSON file.

    The CSV is used when it exists and can be read. If reading it fails for
    any reason the failure is logged (with traceback) and the JSON file is
    used instead, so a broken CSV does not take the page down silently.

    Returns:
        The ``pandas.DataFrame`` produced by ``_read_csv_leaderboard`` or
        ``_read_json_leaderboard``.

    Raises:
        Whatever ``_read_json_leaderboard`` raises when the fallback also fails.
    """
    if LEADERBOARD_CSV_PATH.exists():
        try:
            return _read_csv_leaderboard()
        except Exception:
            # Deliberate best-effort fallback, but never a silent one.
            logging.getLogger(__name__).exception(
                "Could not read %s; falling back to %s",
                LEADERBOARD_CSV_PATH,
                LEADERBOARD_JSON_PATH,
            )
    return _read_json_leaderboard()
def _format_cell(value, column):
if pd.isna(value):
return ""
if column in NUMERIC_COLUMNS:
return f"{float(value):.2f}"
return html.escape(str(value))
def _render_input_config(value):
config = str(value).upper()
if config == "TEXT":
return '<span class="config-badge config-text">TEXT</span>'
if config == "VISUAL":
return '<span class="config-badge config-visual">VISUAL</span>'
return html.escape(config)
def _is_missing(value):
    """Return True when *value* is a scalar that pandas treats as missing."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _safe_href(url):
    """Return *url* escaped for an ``href`` attribute, or "" if it must not be linked.

    Model URLs come from leaderboard data files, so only ``http://`` and
    ``https://`` targets are accepted; HTML escaping alone would still let a
    ``javascript:`` URL through as a clickable link.
    """
    if _is_missing(url):
        return ""
    text = str(url).strip()
    if not text.lower().startswith(("http://", "https://")):
        return ""
    return html.escape(text, quote=True)


def render_leaderboard_html():
    """Build the leaderboard markup: filter toolbar, table and sorting script.

    The table contains the ``DISPLAY_COLUMNS`` that exist in the loaded frame.
    Every cell carries ``data-col`` (column name) and ``data-value`` (raw value,
    empty for missing data) attributes, and every row carries ``data-config``
    (upper-cased input config); ``TABLE_SCRIPT`` reads these for client-side
    sorting and filtering. Model names are rendered as links only when the row
    has an http(s) URL.

    Returns:
        The HTML fragment as a string, ending with ``TABLE_SCRIPT``.
    """
    df = load_leaderboard_table()
    columns = [col for col in DISPLAY_COLUMNS if col in df.columns]
    thead = "".join(
        f'<th data-sort="{html.escape(col, quote=True)}">{html.escape(col)}</th>'
        for col in columns
    )

    body_rows = []
    for _, row in df.iterrows():
        config_value = html.escape(str(row.get("Input Config", "")).upper(), quote=True)
        href = _safe_href(row.get("url"))
        cells = []
        for col in columns:
            raw = row[col]

            classes = []
            if col == "Rank":
                classes.append("rank")
            if col == "Model":
                classes.append("model")
            if col == "Organization":
                classes.append("org")
            if col in NUMERIC_COLUMNS or col == "Rank":
                classes.append("num")
            class_attr = f' class="{" ".join(classes)}"' if classes else ""

            # Missing values get an empty sort key instead of the text "nan".
            data_value = "" if _is_missing(raw) else html.escape(str(raw), quote=True)
            data_col = html.escape(col, quote=True)

            if col == "Model" and href:
                text = _format_cell(raw, col)
                value = f'<a href="{href}" target="_blank" rel="noopener noreferrer">{text}</a>'
            elif col == "Input Config":
                value = _render_input_config(raw)
            else:
                value = _format_cell(raw, col)
            cells.append(f'<td{class_attr} data-col="{data_col}" data-value="{data_value}">{value}</td>')
        body_rows.append(f'<tr data-config="{config_value}">' + "".join(cells) + "</tr>")

    tbody = "".join(body_rows)
    # NOTE(review): the sorting/filtering logic ships as an inline <script>;
    # confirm that the deployed Gradio version executes scripts placed inside
    # a gr.HTML component.
    return f"""
<div class="leaderboard-toolbar">
<div class="config-filter" aria-label="Input Config filter">
<button type="button" class="active" data-config="ALL">All</button>
<button type="button" data-config="TEXT">TEXT</button>
<button type="button" data-config="VISUAL">VISUAL</button>
</div>
<div id="table-count" class="table-count">{len(df)} entries</div>
</div>
<div class="leaderboard-shell">
<table id="rpc-leaderboard-table" class="rpc-table">
<thead><tr>{thead}</tr></thead>
<tbody>{tbody}</tbody>
</table>
</div>
{TABLE_SCRIPT}
"""
# Page layout: title with project links, submission guide, then the leaderboard.
# The leaderboard HTML is rendered once, when this module is executed.
with gr.Blocks(title="RPC-Bench Leaderboard", analytics_enabled=False, css=CUSTOM_CSS) as demo:
    # The separator bullets and the Paper / Hugging Face emoji below were
    # mis-encoded (mojibake) and are restored to the intended characters.
    gr.Markdown(
        """
# RPC-Bench Leaderboard
<div class="rpc-links">
<span>🌐 <a href="https://rpc-bench.github.io/" target="_blank" rel="noopener noreferrer">Project Page</a></span>
<span>•</span>
<span>📖 <a href="https://arxiv.org/abs/2601.14289" target="_blank" rel="noopener noreferrer">Paper</a></span>
<span>•</span>
<span>🤗 <a href="https://huggingface.co/datasets/zai-org/RPC-Bench" target="_blank" rel="noopener noreferrer">Hugging Face</a></span>
<span>•</span>
<span>🧭 <a href="https://modelscope.cn/datasets/ZhipuAI/RPC-Bench" target="_blank" rel="noopener noreferrer">ModelScope</a></span>
</div>
""",
        elem_classes=["rpc-title"],
    )
    gr.Markdown(SUBMISSION_GUIDE)
    gr.Markdown("### Leaderboard")
    gr.HTML(render_leaderboard_html())

# Start the app when run as a script; the API page is turned off.
if __name__ == "__main__":
    demo.launch(show_api=False)