Spaces:

zai-org
/

RPC-Bench

Running

File size: 15,170 Bytes

e9313f3

import html
import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd


# All data paths are resolved relative to the directory containing this file.
ROOT = Path(__file__).parent
DATA_DIR = ROOT / "data"
LEADERBOARD_JSON_PATH = DATA_DIR / "leaderboard.json"
LEADERBOARD_CSV_PATH = DATA_DIR / "leaderboard_seed.csv"

# Matches a value that consists solely of one markdown link: "[name](url)".
MODEL_LINK_RE = re.compile(r"^\[(?P<name>.*)\]\((?P<url>.*)\)$")
# Columns shown in the rendered table, in display order.
DISPLAY_COLUMNS = [
    "Rank",
    "Model",
    "Organization",
    "Input Config",
    "Conciseness",
    "Correctness",
    "Completeness",
    "F1-like",
    "Informativeness",
    "Date",
]
# Score columns: coerced to numbers by the CSV reader and formatted with two
# decimals by _format_cell.
NUMERIC_COLUMNS = ["Conciseness", "Correctness", "Completeness", "F1-like", "Informativeness"]

# Stylesheet passed to gr.Blocks(css=...). It hides the Gradio footer/API
# elements and styles the title links, the filter toolbar, the leaderboard
# table (sticky sortable headers, numeric alignment) and the config badges.
CUSTOM_CSS = """
footer { display: none !important; }
.api-docs, .show-api, .built-with, [data-testid="api-info"] { display: none !important; }
.gradio-container { max-width: 100% !important; padding: 18px 24px 16px !important; }
#component-0 { max-width: 100% !important; }
.rpc-title h1 { margin-bottom: 4px !important; }
.rpc-title p { margin-top: 0 !important; color: #555; }
.rpc-links {
  display: flex;
  justify-content: center;
  align-items: center;
  gap: 8px;
  flex-wrap: wrap;
  margin: 4px 0 12px;
  color: #4b5563;
}
.rpc-links a { color: #2563eb; text-decoration: none; }
.rpc-links a:hover { text-decoration: underline; }
.leaderboard-toolbar {
  display: flex;
  align-items: center;
  justify-content: space-between;
  gap: 12px;
  margin: 8px 0 12px;
  flex-wrap: wrap;
}
.config-filter {
  display: inline-flex;
  align-items: center;
  gap: 4px;
  padding: 3px;
  border: 1px solid #e5e7eb;
  border-radius: 8px;
  background: #f8fafc;
}
.config-filter button {
  border: 0;
  background: transparent;
  color: #4b5563;
  cursor: pointer;
  font-size: 13px;
  font-weight: 600;
  padding: 6px 10px;
  border-radius: 6px;
}
.config-filter button.active {
  background: #111827;
  color: #ffffff;
}
.table-count {
  color: #6b7280;
  font-size: 13px;
}
.leaderboard-shell {
  height: calc(100vh - 245px);
  min-height: 460px;
  max-height: 780px;
  overflow: auto;
  border: 1px solid #e5e7eb;
  border-radius: 8px;
  background: white;
}
.rpc-table {
  width: 100%;
  border-collapse: separate;
  border-spacing: 0;
  font-size: 14px;
}
.rpc-table th {
  position: sticky;
  top: 0;
  z-index: 2;
  background: #f8fafc;
  color: #111827;
  font-weight: 650;
  text-align: left;
  border-bottom: 1px solid #d1d5db;
  padding: 9px 11px;
  white-space: nowrap;
  cursor: pointer;
  user-select: none;
}
.rpc-table th::after {
  content: "↕";
  color: #9ca3af;
  font-size: 11px;
  margin-left: 6px;
}
.rpc-table th.sort-asc::after { content: "↑"; color: #111827; }
.rpc-table th.sort-desc::after { content: "↓"; color: #111827; }
.rpc-table td {
  border-bottom: 1px solid #eef2f7;
  padding: 9px 11px;
  vertical-align: middle;
  white-space: nowrap;
}
.rpc-table tbody tr:hover { background: #f9fafb; }
.rpc-table .num { text-align: right; font-variant-numeric: tabular-nums; }
.rpc-table .rank { width: 64px; text-align: right; color: #4b5563; }
.rpc-table .model { min-width: 210px; font-weight: 600; }
.rpc-table .org { min-width: 180px; }
.config-badge {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  min-width: 58px;
  padding: 3px 8px;
  border-radius: 999px;
  font-size: 12px;
  font-weight: 700;
  letter-spacing: 0.02em;
}
.config-text {
  background: #e0f2fe;
  color: #075985;
}
.config-visual {
  background: #fef3c7;
  color: #92400e;
}
.rpc-table a { color: #2563eb; text-decoration: none; }
.rpc-table a:hover { text-decoration: underline; }
.submit-panel { max-width: 980px; }
.submit-panel pre { border-radius: 8px; }
"""


# Client-side script that render_leaderboard_html() appends after the table
# markup. It sorts rows by the clicked header (initially "Informativeness",
# descending), filters rows by their data-config attribute via the toolbar
# buttons, renumbers the Rank cell of each visible row in display order, and
# updates the "N entries" counter.
# NOTE(review): some Gradio versions do not execute <script> tags injected
# through gr.HTML - confirm this script actually runs on the deployed version.
TABLE_SCRIPT = """
<script>
(function () {
  const table = document.getElementById("rpc-leaderboard-table");
  if (!table) return;

  const tbody = table.querySelector("tbody");
  const headers = table.querySelectorAll("th[data-sort]");
  const filterButtons = document.querySelectorAll(".config-filter button");
  const countEl = document.getElementById("table-count");
  let activeConfig = "ALL";
  let sortColumn = "Informativeness";
  let sortDirection = "desc";

  function parseValue(row, column) {
    const cell = row.querySelector(`[data-col="${column}"]`);
    if (!cell) return "";
    const raw = cell.getAttribute("data-value") || cell.textContent || "";
    if (["Rank", "Conciseness", "Correctness", "Completeness", "F1-like", "Informativeness"].includes(column)) {
      const num = Number.parseFloat(raw);
      return Number.isNaN(num) ? -Infinity : num;
    }
    if (column === "Date") {
      const time = Date.parse(raw);
      return Number.isNaN(time) ? 0 : time;
    }
    return raw.toLowerCase();
  }


  function apply() {
    const rows = Array.from(tbody.querySelectorAll("tr"));
    const sorted = rows.slice().sort((a, b) => {
      const av = parseValue(a, sortColumn);
      const bv = parseValue(b, sortColumn);
      if (av < bv) return sortDirection === "asc" ? -1 : 1;
      if (av > bv) return sortDirection === "asc" ? 1 : -1;
      return 0;
    });

    sorted.forEach(row => tbody.appendChild(row));

    let shown = 0;
    Array.from(tbody.querySelectorAll("tr")).forEach(row => {
      const visible = activeConfig === "ALL" || row.dataset.config === activeConfig;
      row.style.display = visible ? "" : "none";
      if (visible) {
        shown += 1;
        const rankCell = row.querySelector('[data-col="Rank"]');
        if (rankCell) {
          rankCell.textContent = shown;
          rankCell.setAttribute("data-value", String(shown));
        }
      }
    });

    if (countEl) countEl.textContent = `${shown} entries`;
    headers.forEach(header => {
      header.classList.remove("sort-asc", "sort-desc");
      if (header.dataset.sort === sortColumn) {
        header.classList.add(sortDirection === "asc" ? "sort-asc" : "sort-desc");
      }
    });
  }

  headers.forEach(header => {
    header.addEventListener("click", () => {
      const column = header.dataset.sort;
      if (sortColumn === column) {
        sortDirection = sortDirection === "asc" ? "desc" : "asc";
      } else {
        sortColumn = column;
        sortDirection = ["Model", "Organization", "Input Config", "Date"].includes(column) ? "asc" : "desc";
      }
      apply();
    });
  });

  filterButtons.forEach(button => {
    button.addEventListener("click", () => {
      activeConfig = button.dataset.config;
      filterButtons.forEach(item => item.classList.toggle("active", item === button));
      apply();
    });
  });

  apply();
})();
</script>
"""

# Markdown (wrapped in a "submit-panel" div) rendered through gr.Markdown:
# explains the submission folder layout, the metadata/prediction file formats,
# the validation rules and the review process.
SUBMISSION_GUIDE = """
<div class="submit-panel">

### How to Submit

1. Fork <a href="https://huggingface.co/datasets/zai-org/RPC-Bench" target="_blank" rel="noopener noreferrer">this repository</a>.
2. Create a new branch for your submission.
3. Add your submission folder under
   `submissions/<organization>__<model>__<input_config>/`.
4. Open a Pull Request with the new submission folder.

### Submission Directory Requirements

Each submission directory must contain the metadata and predictions for one
model/input configuration pair:

```text
<organization>__<model>__<input_config>/
  metadata.yaml
  predictions.jsonl
  generation_config.json      # optional, recommended
  artifacts/                  # optional logs or prompt notes
```

Use URL-safe directory names. Replace spaces, slashes, and special characters
with hyphens; keep `input_config` as `TEXT` or `VISUAL`.

### `metadata.yaml`

```yaml
model_name: "My Model"
organization: "My Org"
model_url: https://...        # optional work link: paper, GitHub, model card, etc.
date: "2026-06-17"           # model release date, not submission date
split: test
input_config: TEXT            # TEXT or VISUAL
```

### `predictions.jsonl`

Each line must be one JSON object:

```json
{
  "id": "paper-id",
  "part_idx": 1,
  "question": "question text",
  "category": "category",
  "gen_answer": "model answer"
}
```

`part_idx` is the question index in the current paper's `qa_pairs` list (`1` for the first item). `category` must match the corresponding item in `test.json`.

### Validation Rules

Your submission will be validated before evaluation. To pass:

- `metadata.yaml` must include `model_name`, `organization`, `date`, `split`,
  and `input_config`.
- `model_url` is optional.
- `date` is the model release date, not the submission date.
- `split` must be `test`.
- `input_config` must be `TEXT` or `VISUAL`.
- `predictions.jsonl` must contain exactly one line for every QA item in
  `test.json`.
- `part_idx` is the question index in the current paper's `qa_pairs` list
  (`1` for the first item).
- `id`, `part_idx`, `question`, and `category` must exactly match the benchmark
  item.
- `gen_answer` must be a string.
- For `Claim_Verification`, `gen_answer` must be exactly `True` or `False`.

### Submission Process

1. Open PR: add your folder under
   `submissions/<organization>__<model>__<input_config>/`.
2. Fix issues: if validation fails, update the PR with corrected files.
3. Review: once validation passes, a maintainer reviews the submission.
4. Evaluate: maintainers run the official evaluator in a controlled local
   environment.
5. Import: accepted aggregate results are imported to the leaderboard.

</div>
"""


def _parse_markdown_link(value):
    """Split a ``[name](url)`` markdown link into a ``(name, url)`` pair.

    The value is stringified and stripped first. When it is not exactly one
    markdown link, the stripped text is returned as the name with an empty
    URL.
    """
    label = str(value).strip()
    found = MODEL_LINK_RE.match(label)
    if found is None:
        return label, ""
    return found.group("name"), found.group("url")


def _read_csv_leaderboard():
    """Load the seed CSV and return it as a ranked leaderboard DataFrame.

    Steps:
      * a legacy ``Info`` column is renamed to ``Informativeness``;
      * markdown links in ``Model`` are split into a plain ``Model`` name and
        a separate ``url`` column;
      * score columns are coerced to numbers (unparseable values become NaN);
      * rows are sorted by ``Informativeness`` descending, NaN last, and a
        fresh 1-based ``Rank`` column is placed first.

    Raises:
        KeyError: if the file has no ``Informativeness`` (or ``Info``) column.
        Any error raised by ``pandas.read_csv`` is propagated as well.
    """
    df = pd.read_csv(LEADERBOARD_CSV_PATH)
    if "Info" in df.columns and "Informativeness" not in df.columns:
        df = df.rename(columns={"Info": "Informativeness"})

    if "Model" in df.columns:
        parsed = [_parse_markdown_link(value) for value in df["Model"]]
        df["Model"] = [name for name, _ in parsed]
        df["url"] = [url for _, url in parsed]

    for col in NUMERIC_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.sort_values("Informativeness", ascending=False, na_position="last").reset_index(drop=True)

    # DataFrame.insert raises ValueError when the column already exists, which
    # would make the caller discard an otherwise valid CSV. A rank stored in
    # the file is stale after re-sorting anyway, so replace it.
    df = df.drop(columns=["Rank"], errors="ignore")
    df.insert(0, "Rank", range(1, len(df) + 1))
    return df


def _read_json_leaderboard():
    """Load ``leaderboard.json`` and return it as a ranked leaderboard DataFrame.

    Every entry in the ``models`` list of every value under the top-level
    ``seasons`` mapping becomes one row. Missing scores default to 0;
    ``F1-like`` falls back to the entry's ``informativeness`` value, and
    ``Informativeness`` falls back to ``info`` and then ``overall``.

    Score columns are coerced to numbers (unparseable values become NaN), the
    same way the CSV reader does, so sorting never compares strings with
    numbers. Rows are sorted by ``Informativeness`` descending, NaN last, and
    a 1-based ``Rank`` column is placed first.

    Returns:
        The ranked DataFrame, or an empty DataFrame with the display columns
        plus ``url`` when the file lists no models.

    Raises:
        FileNotFoundError / json.JSONDecodeError: if the file is missing or
        is not valid JSON.
    """
    with LEADERBOARD_JSON_PATH.open("r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    # ``or {}`` / ``or []`` also cover keys that are present but null.
    for season in (data.get("seasons") or {}).values():
        for row in season.get("models") or []:
            rows.append({
                "Model": row.get("name", ""),
                "url": row.get("url", ""),
                "Organization": row.get("org", ""),
                "Input Config": str(row.get("modality", "")).upper(),
                "Conciseness": row.get("conciseness", 0),
                "Correctness": row.get("correctness", 0),
                "Completeness": row.get("completeness", 0),
                "F1-like": row.get("f1_like", row.get("informativeness", 0)),
                "Informativeness": row.get("informativeness", row.get("info", row.get("overall", 0))),
                "Date": row.get("date", ""),
            })

    df = pd.DataFrame(rows)
    if df.empty:
        return pd.DataFrame(columns=DISPLAY_COLUMNS + ["url"])

    # JSON scores may arrive as strings or null; normalise before sorting.
    for col in NUMERIC_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.sort_values("Informativeness", ascending=False, na_position="last").reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    return df


def load_leaderboard_table():
    """Return the leaderboard DataFrame, preferring the seed CSV over the JSON file.

    The CSV is used when it exists and can be read. If reading it fails for
    any reason, the failure is logged with its traceback and the JSON file is
    used instead. Errors from the JSON reader are not caught.
    """
    import logging

    if LEADERBOARD_CSV_PATH.exists():
        try:
            return _read_csv_leaderboard()
        except Exception:
            # Deliberate best-effort: a broken seed CSV must not take the app
            # down while the JSON file is available. Record why the CSV was
            # rejected instead of discarding the error silently.
            logging.getLogger(__name__).warning(
                "Could not read %s; falling back to %s",
                LEADERBOARD_CSV_PATH,
                LEADERBOARD_JSON_PATH,
                exc_info=True,
            )
    return _read_json_leaderboard()


def _format_cell(value, column):
    """Return the display text for one table cell.

    Missing values render as an empty string, score columns as a number with
    two decimals, and everything else as HTML-escaped text.
    """
    if pd.isna(value):
        return ""
    is_score = column in NUMERIC_COLUMNS
    return f"{float(value):.2f}" if is_score else html.escape(str(value))


def _render_input_config(value):
    config = str(value).upper()
    if config == "TEXT":
        return '<span class="config-badge config-text">TEXT</span>'
    if config == "VISUAL":
        return '<span class="config-badge config-visual">VISUAL</span>'
    return html.escape(config)


def _safe_link_url(value):
    """Return ``value`` stripped if it is an ``http``/``https`` URL string, else ``""``."""
    if not isinstance(value, str):
        return ""
    url = value.strip()
    return url if url.lower().startswith(("http://", "https://")) else ""


def render_leaderboard_html():
    """Render the leaderboard as an HTML fragment.

    The fragment contains the input-config filter toolbar with the entry
    count, the table (one ``<th data-sort>`` per display column and one
    ``<tr data-config>`` per row, every cell carrying ``data-col`` and
    ``data-value`` attributes), followed by ``TABLE_SCRIPT``.

    All data-derived text and attribute values are HTML-escaped. The model
    name becomes a link only when the row's ``url`` is a string starting with
    ``http://`` or ``https://``. HTML escaping alone does not neutralise
    schemes such as ``javascript:``, and a NaN url is truthy, so anything
    else is shown as plain text.
    """
    df = load_leaderboard_table()
    columns = [col for col in DISPLAY_COLUMNS if col in df.columns]

    thead = "".join(
        f'<th data-sort="{html.escape(col, quote=True)}">{html.escape(col)}</th>'
        for col in columns
    )

    # The CSS classes and the escaped column name depend only on the column,
    # so build the opening part of each <td> once instead of once per row.
    cell_open = {}
    for col in columns:
        classes = []
        if col == "Rank":
            classes.append("rank")
        if col == "Model":
            classes.append("model")
        if col == "Organization":
            classes.append("org")
        if col in NUMERIC_COLUMNS or col == "Rank":
            classes.append("num")
        class_attr = ' class="' + " ".join(classes) + '"' if classes else ""
        data_col = html.escape(col, quote=True)
        cell_open[col] = f'<td{class_attr} data-col="{data_col}"'

    body_rows = []
    for _, row in df.iterrows():
        config_value = str(row.get("Input Config", "")).upper()
        link_url = _safe_link_url(row.get("url"))
        cells = []
        for col in columns:
            # Raw value kept in an attribute for the client-side sorter.
            data_value = html.escape(str(row[col]), quote=True)
            if col == "Model" and link_url:
                text = html.escape(str(row[col]))
                url = html.escape(link_url, quote=True)
                value = f'<a href="{url}" target="_blank" rel="noopener noreferrer">{text}</a>'
            elif col == "Input Config":
                value = _render_input_config(row[col])
            else:
                value = _format_cell(row[col], col)
            cells.append(f'{cell_open[col]} data-value="{data_value}">{value}</td>')
        body_rows.append(f'<tr data-config="{html.escape(config_value, quote=True)}">' + "".join(cells) + "</tr>")

    return f"""
<div class="leaderboard-toolbar">
  <div class="config-filter" aria-label="Input Config filter">
    <button type="button" class="active" data-config="ALL">All</button>
    <button type="button" data-config="TEXT">TEXT</button>
    <button type="button" data-config="VISUAL">VISUAL</button>
  </div>
  <div id="table-count" class="table-count">{len(df)} entries</div>
</div>
<div class="leaderboard-shell">
  <table id="rpc-leaderboard-table" class="rpc-table">
    <thead><tr>{thead}</tr></thead>
    <tbody>{''.join(body_rows)}</tbody>
  </table>
</div>
{TABLE_SCRIPT}
"""



# Assemble the page at import time; `demo` is the module-level Blocks app.
with gr.Blocks(title="RPC-Bench Leaderboard", analytics_enabled=False, css=CUSTOM_CSS) as demo:
    # Page title plus external project links (styled by the .rpc-title and
    # .rpc-links rules in CUSTOM_CSS).
    gr.Markdown(
        """
# RPC-Bench Leaderboard

<div class="rpc-links">
  <span>🌐 <a href="https://rpc-bench.github.io/" target="_blank" rel="noopener noreferrer">Project Page</a></span>
  <span>•</span>
  <span>📖 <a href="https://arxiv.org/abs/2601.14289" target="_blank" rel="noopener noreferrer">Paper</a></span>
  <span>•</span>
  <span>🤗 <a href="https://huggingface.co/datasets/zai-org/RPC-Bench" target="_blank" rel="noopener noreferrer">Hugging Face</a></span>
  <span>•</span>
  <span>🧭 <a href="https://modelscope.cn/datasets/ZhipuAI/RPC-Bench" target="_blank" rel="noopener noreferrer">ModelScope</a></span>
</div>
""",
        elem_classes=["rpc-title"],
    )

    # The submission instructions are created before the table, so they come
    # first in the component order.
    gr.Markdown(SUBMISSION_GUIDE)
    gr.Markdown("### Leaderboard")
    # The table HTML is rendered once, when this module is imported.
    gr.HTML(render_leaderboard_html())


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(show_api=False)