Clemson-Computing-User's picture
VANTAGE-Bench v1.0
8b5161a
Raw
History Blame Contribute Delete
18.5 kB
"""Build per-pillar DataFrames for the VANTAGE-Bench leaderboard.
Primary API
-----------
build_all_tables(filtered_models, global_ranks) -> dict[str, pd.DataFrame]
Returns one DataFrame per pillar key in config.PILLARS.
Keys: 'overall', 'spatial', 'st', 'temporal', 'semantic'.
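Model column: HTML cell built by _model_html — model name (bold if rank #1
in that pillar, linked when a model URL is set), organization sub-line and
badge spans; badge appearance is controlled by CSS classes.
Score columns: floats (NaN for missing) wrapped in a pandas Styler; shown as
'—' for missing, '42.31' otherwise, with the best-in-column cell in bold.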
make_radar_svg(model) -> str
4-axis radar chart SVG (220 × 160 px) for the model detail side panel.
Axes: Semantic · Spatial · Sp-Temp · Temporal (clockwise from top).
Gradio compatibility API (app.py)
----------------------------------
render_tab(data, rank_map, tab, ...) -> (pd.DataFrame, status_str)
headers_for_tab(tab, selected_tasks=None) -> list[str]
datatypes_for_tab(tab, selected_tasks=None) -> list[str]
These accept the legacy tab keys ('spatio_temporal', etc.) used by app.py.
"""
from __future__ import annotations
import math
import pandas as pd
from .config import PILLARS, TASKS, TASK_METRIC_LABELS
from .data import LeaderboardData, ModelRecord
from .filters import apply_filters, is_filter_active
# Placeholder text rendered in a score cell when a model has no value for it.
_MISSING = "—"
# Short task key (config.TASKS) → JSON score field in ModelRecord.scores.
# Used to translate the task keys listed under each pillar in config.PILLARS
# into the field names that are looked up on a model's `scores` mapping.
_TASK_JSON_FIELD: dict[str, str] = {
    "loc": "2d_localization",
    "ground": "2d_referring_expressions",
    "pointing": "2d_spatial_pointing",
    "sot": "single_object_tracking",
    "temploc": "temporal_localization",
    "dvc": "dense_video_captioning",
    "ev": "event_verification",
    "vqa": "video_qa",
}
# Overall tab: ordered (display_label, json_field) pairs.
# Task columns ordered to align with pillar super-header groups injected by JS
# (see _RESIZE_JS in app.py): Spatial (3) · Spatio-Temp (1) · Temporal (2) · Semantic (2).
# The order here is also the left-to-right column order emitted by
# build_overall_html_table, whose group-header colspans assume this 3/1/2/2 layout.
_OVERALL_SCORE_COLS: list[tuple[str, str]] = [
    # Spatial
    ("Obj Loc", "2d_localization"),
    ("Ref Exp", "2d_referring_expressions"),
    ("Pointing", "2d_spatial_pointing"),
    # Spatio-Temporal
    ("SOT", "single_object_tracking"),
    # Temporal
    ("Temp Loc", "temporal_localization"),
    ("DVC", "dense_video_captioning"),
    # Semantic
    ("Event Ver", "event_verification"),
    ("VQA", "video_qa"),
]
# Pillar group spec for the Overall-tab super-header row.
# Each entry: (colspan, label). Two empty leading cells cover the # and Name columns.
# NOTE(review): build_overall_html_table renders three leading columns
# (#, Name, Overall) and hard-codes its own group spans rather than reading
# this constant — confirm which consumer still relies on the 2-cell lead-in.
OVERALL_PILLAR_GROUPS: list[tuple[int, str]] = [
    (2, ""), # # + Name
    (3, "Spatial"),
    (1, "Spatio-Temp"),
    (2, "Temporal"),
    (2, "Semantic"),
]
# Pillar aggregate score column shown immediately after Name on each pillar tab.
# Reuses the same pre-computed pillar field from the model record — no recomputation.
# Maps short pillar key → (display_label, json_field). There is no entry for
# 'overall', so the Overall tab gets no aggregate column from this table.
_PILLAR_AGGREGATE_COL: dict[str, tuple[str, str]] = {
    "spatial": ("Spatial", "spatial"),
    "st": ("Sp-Temp", "spatio_temporal"),
    "temporal": ("Temporal", "temporal"),
    "semantic": ("Semantic", "semantic"),
}
# Legacy Gradio tab key → short pillar key used in config.PILLARS.
# Only 'spatio_temporal' actually differs ('st'); the other entries map to
# themselves. Callers fall back to the tab key itself when it is not listed.
_OLD_TAB_TO_PILLAR: dict[str, str] = {
    "overall": "overall",
    "spatial": "spatial",
    "spatio_temporal": "st",
    "temporal": "temporal",
    "semantic": "semantic",
}
# -- Shared primitives -----------------------------------------------------
def _sort_by_rank(
    models: list[ModelRecord], rank_map: dict[str, int]
) -> list[ModelRecord]:
    """Return a new list of models ordered by ascending rank, then by name.

    A model whose id is not present in ``rank_map`` is given an infinite rank,
    so it is placed after every ranked model. The input list is not modified.
    """
    unranked = float("inf")

    def _rank_then_name(model):
        # Tuple key: primary = rank (or +inf), secondary = name as tie-breaker.
        return rank_map.get(model.id, unranked), model.name

    return sorted(models, key=_rank_then_name)
def _column_maxes(
    models: list[ModelRecord], json_fields: list[str]
) -> dict[str, float]:
    """Maximum score per field across the given model set.

    Parameters
    ----------
    models:
        Records exposing a ``scores`` mapping of field name → value.
    json_fields:
        Score field names to inspect.

    Returns
    -------
    A dict holding the maximum for every field that has at least one usable
    value. Fields with no usable value are left out entirely.

    A score that is absent *or* stored as ``None`` counts as missing, which
    matches the ``scores.get(field) is None`` handling used when cells are
    rendered. Without this, a ``None`` entry would either raise ``TypeError``
    inside ``max`` or be reported as the column maximum.
    """
    maxes: dict[str, float] = {}
    for field in json_fields:
        candidates = (m.scores.get(field) for m in models)
        vals = [v for v in candidates if v is not None]
        if vals:
            maxes[field] = max(vals)
    return maxes
def _model_html(m: ModelRecord, rank_one_id: str | None) -> str:
    """Build the HTML cell for the Model column.

    Layout (all wrapped in ``<span class="mc" data-n="...">``):
      * name row — model name, bold when ``m.id == rank_one_id``, wrapped in a
        link when ``m.model_url`` is truthy, followed by a ✓ badge when
        ``m.verified`` is truthy;
      * organization sub-line;
      * badge row — result type (ensemble vs. single), openness (open vs.
        prop.), and a "new" badge when ``m.is_new`` is truthy.

    The name, organization and URL come from record data, so they are
    HTML-escaped (including quotes) before being placed in element text or
    attribute values. For values without ``& < > " '`` the output is
    unchanged from the unescaped form.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    safe_name = escape(str(m.name), quote=True)
    safe_org = escape(str(m.organization), quote=True)

    name_part = f"<b>{safe_name}</b>" if m.id == rank_one_id else safe_name
    if m.model_url:
        safe_url = escape(str(m.model_url), quote=True)
        name_core = (
            f'<a href="{safe_url}" target="_blank" class="mc-link">{name_part}</a>'
        )
    else:
        name_core = name_part

    verified_badge = '<span class="b b-verified">✓</span>' if m.verified else ""
    name_html = f'<span class="mc-name-row">{name_core}{verified_badge}</span>'

    badges: list[str] = []
    if m.result_type == "ensemble":
        badges.append('<span class="b b-ensemble">system / pipeline</span>')
    else:
        badges.append('<span class="b b-single">single</span>')
    if m.type == "open":
        badges.append('<span class="b b-open">open</span>')
    else:
        badges.append('<span class="b b-prop">prop.</span>')
    if m.is_new:
        badges.append('<span class="b b-new">new</span>')
    badge_html = "".join(badges)

    return (
        f'<span data-n="{safe_name}" class="mc">'
        f'{name_html}'
        f'<span class="mc-org">{safe_org}</span>'
        f'<span class="mc-badges">{badge_html}</span>'
        f'</span>'
    )
def _fmt_score_val(v: float) -> str:
    """Render one score cell: the missing-value placeholder for NaN/NA,
    otherwise the number with exactly two decimal places."""
    return _MISSING if pd.isna(v) else f"{v:.2f}"
def _score_cols_for_pillar(
    pillar: str,
    selected_tasks: list[str] | None = None,
) -> list[tuple[str, str]]:
    """List the score columns of one pillar as (display_label, json_field).

    pillar: short pillar key. 'overall' yields a copy of the fixed
        Overall-tab column list and ignores ``selected_tasks``.
    selected_tasks: JSON field names to keep (Gradio column-toggle);
        None keeps every task of the pillar.

    For the other pillars the list starts with the pillar's aggregate column
    (when one is registered), followed by one column per task. A task label
    gets a " (metric)" suffix when TASK_METRIC_LABELS has a non-empty entry
    for its JSON field.
    """
    if pillar == "overall":
        return list(_OVERALL_SCORE_COLS)

    task_keys = PILLARS[pillar]
    if selected_tasks is not None:
        task_keys = [
            key for key in task_keys if _TASK_JSON_FIELD[key] in selected_tasks
        ]

    # The aggregate column is not a task, so the task filter never removes it.
    aggregate = _PILLAR_AGGREGATE_COL.get(pillar)
    columns: list[tuple[str, str]] = [] if aggregate is None else [aggregate]

    for key in task_keys:
        field = _TASK_JSON_FIELD[key]
        metric = TASK_METRIC_LABELS.get(field, "")
        if metric:
            columns.append((f"{TASKS[key]} ({metric})", field))
        else:
            columns.append((TASKS[key], field))
    return columns
def _assemble_df(
    sorted_models: list[ModelRecord],
    score_cols: list[tuple[str, str]],
    rank_one_id: str | None,
) -> "pd.io.formats.style.Styler":
    """Build the styled table for one tab.

    Columns are '#', 'Name', then one column per ``score_cols`` label.
      - '#' is the 1-based position in ``sorted_models``, stored as a string
      - 'Name' is the HTML produced by _model_html
      - score cells are floats, NaN when the model has no value, so the
        column stays numeric
    The returned Styler bolds every cell equal to its column maximum and
    formats scores via _fmt_score_val ('—' for NaN, two decimals otherwise).
    With no score columns the plain ``DataFrame.style`` is returned.
    """
    score_labels = [label for label, _ in score_cols]

    records: list[dict] = []
    for position, model in enumerate(sorted_models, start=1):
        record: dict = {
            "#": str(position),
            "Name": _model_html(model, rank_one_id),
        }
        for label, json_field in score_cols:
            raw = model.scores.get(json_field)
            record[label] = float("nan") if raw is None else float(raw)
        records.append(record)

    frame = pd.DataFrame(records, columns=["#", "Name"] + score_labels)

    if not score_labels:
        return frame.style

    def _bold_max(col: pd.Series) -> list[str]:
        # One CSS string per cell; NaN cells are never highlighted.
        top = col.max(skipna=True)
        return [
            "" if (pd.isna(cell) or cell != top) else "font-weight: bold"
            for cell in col
        ]

    highlighted = frame.style.apply(_bold_max, subset=score_labels, axis=0)
    return highlighted.format(_fmt_score_val, subset=score_labels, na_rep=_MISSING)
# -- Primary API -----------------------------------------------------------
def build_overall_html_table(
    filtered_models: list[ModelRecord],
    rank_map: dict[str, int],
) -> str:
    """Render the Overall tab as a hand-written HTML <table> string.

    Header (two rows):
      row 1: '#', 'Name', 'Overall' (each rowspan=2), then the pillar groups
             Spatial (×3) | Spatio-Temp (×1) | Temporal (×2) | Semantic (×2)
      row 2: one <th> per entry of _OVERALL_SCORE_COLS
    Body: one <tr data-id=...> per model in rank order, holding position,
    the _model_html cell, the stored ``scores["overall"]`` value and the
    task scores. Missing scores show the placeholder; cells equal to the
    column maximum (computed over ``filtered_models``) carry the extra
    'lb-max' class. With no models a single full-width "no match" row is
    emitted.
    """
    ordered = _sort_by_rank(filtered_models, rank_map)
    rank_one_id = next(
        (model.id for model in ordered if rank_map.get(model.id) == 1), None
    )

    task_cols = _OVERALL_SCORE_COLS
    task_fields = [field for _, field in task_cols]
    best = _column_maxes(filtered_models, task_fields + ["overall"])

    def _score_cell(value, field: str, base_cls: str) -> str:
        # One <td>: placeholder when absent, else 2-d.p. value, flagged if max.
        if value is None:
            return f'<td class="{base_cls}">{_MISSING}</td>'
        top = best.get(field)
        if top is not None and value == top:
            return f'<td class="{base_cls} lb-max">{value:.2f}</td>'
        return f'<td class="{base_cls}">{value:.2f}</td>'

    # The table is emitted bare (no wrapper element, no resize handle).
    # NOTE(review): the previous comment pointed at `.lb-table-overall` in
    # css.py, but the class emitted here is `lb-overall-table` — confirm the
    # stylesheet selector.
    html: list[str] = ['<table class="lb-overall-table">']

    # <colgroup> gives per-column width hooks that are independent of the
    # two header rows. The lone Spatio-Temporal task column gets an extra
    # class so its wider "Spatio-Temp" group header can fit on one line.
    st_field = "single_object_tracking"
    html.append('<colgroup>')
    html.extend(
        [
            '<col class="col-rank">',
            '<col class="col-name">',
            '<col class="col-overall">',
        ]
    )
    html.extend(
        '<col class="col-score col-score-st">'
        if field == st_field
        else '<col class="col-score">'
        for field in task_fields
    )
    html.append('</colgroup>')

    # Header row 1: corner cells spanning both rows + pillar group cells.
    pillar_spans = (
        (3, "Spatial"),
        (1, "Spatio-Temp"),
        (2, "Temporal"),
        (2, "Semantic"),
    )
    html.append('<thead>')
    html.append('<tr class="lb-group-row">')
    html.append('<th class="lb-corner" rowspan="2">#</th>')
    html.append('<th class="lb-corner" rowspan="2">Name</th>')
    html.append('<th class="lb-corner lb-corner-num" rowspan="2">Overall</th>')
    html.extend(
        f'<th class="lb-group" colspan="{span}">{label}</th>'
        for span, label in pillar_spans
    )
    html.append('</tr>')
    # Header row 2: task labels only (corner cells are covered by rowspan).
    html.append('<tr class="lb-task-row">')
    html.extend(f'<th>{label}</th>' for label, _ in task_cols)
    html.append('</tr>')
    html.append('</thead>')

    html.append('<tbody>')
    if not ordered:
        # Placeholder row spans '#', 'Name', 'Overall' and every task column.
        column_count = 3 + len(task_cols)
        html.append(
            f'<tr><td colspan="{column_count}" class="lb-empty-row">'
            f'No models match — adjust the filters.</td></tr>'
        )
    for position, model in enumerate(ordered, start=1):
        html.append(f'<tr data-id="{model.id}">')
        html.append(f'<td class="lb-rank">{position}</td>')
        html.append(f'<td class="lb-name">{_model_html(model, rank_one_id)}</td>')
        html.append(
            _score_cell(model.scores.get("overall"), "overall", "lb-score lb-overall")
        )
        html.extend(
            _score_cell(model.scores.get(field), field, "lb-score")
            for field in task_fields
        )
        html.append('</tr>')
    html.append('</tbody>')
    html.append('</table>')
    return "".join(html)
def build_all_tables(
    filtered_models: list[ModelRecord],
    global_ranks: dict[str, dict[str, int]],
) -> dict[str, pd.DataFrame]:
    """Build one styled table per pillar key in config.PILLARS.

    Parameters
    ----------
    filtered_models:
        Models to display; the same list is used for every pillar.
    global_ranks:
        {pillar_key: {model_id: rank}}. Must contain an entry for every
        key in PILLARS (a missing key raises KeyError).

    Returns
    -------
    {pillar_key: table}. Each table is whatever _assemble_df returns — a
    pandas Styler wrapping the frame — even though the annotation still
    says DataFrame.

    Per pillar
    ----------
    Rows    : models ordered by that pillar's ranks, then by name
    Columns : '#', 'Name', then the columns from _score_cols_for_pillar
              (all tasks shown; no column-toggle filtering here)
    Name    : HTML cell; the model ranked 1 in that pillar is passed to
              _assemble_df as the rank-one id
    """

    def _table_for(pillar: str):
        ranks = global_ranks[pillar]
        ordered = _sort_by_rank(filtered_models, ranks)
        columns = _score_cols_for_pillar(pillar)
        # The rank-1 model may be filtered out, in which case there is no leader.
        leader_id = next(
            (model.id for model in ordered if ranks.get(model.id) == 1), None
        )
        return _assemble_df(ordered, columns, leader_id)

    return {pillar: _table_for(pillar) for pillar in PILLARS}
# -- Radar SVG ------------------------------------------------------------
def make_radar_svg(m: ModelRecord) -> str:
    """4-axis radar chart SVG for the model detail side panel.

    Axes (clockwise from top): Semantic · Spatial · Sp-Temp · Temporal,
    read from ``m.scores`` fields 'semantic', 'spatial', 'spatio_temporal'
    and 'temporal'.
    Canvas: 220 × 160 px. Scores are divided by 100 and clamped to [0, 1].

    A score that is absent, None or NaN is plotted at the centre (0).
    NaN needs its own check: ``min(1.0, nan)`` evaluates to 1.0, so without
    it a NaN score would be drawn at full scale.

    Returns the complete ``<svg>…</svg>`` markup as one string.
    """
    W, H = 220, 160
    cx, cy = W / 2, 82.0  # slightly below centre for top-label room
    chart_r = 50.0        # polygon radius
    label_r = 67.0        # label ring radius
    labels = ["Semantic", "Spatial", "Sp-Temp", "Temporal"]
    fields = ["semantic", "spatial", "spatio_temporal", "temporal"]

    vals: list[float] = []
    for f in fields:
        raw = m.scores.get(f)
        v = float(raw) / 100.0 if raw is not None else 0.0
        if math.isnan(v):
            v = 0.0  # treat NaN like a missing score
        vals.append(max(0.0, min(1.0, v)))

    N = 4
    step = 2 * math.pi / N
    off = -math.pi / 2  # index 0 points straight up
    angles = [off + i * step for i in range(N)]

    parts: list[str] = [
        f'<svg xmlns="http://www.w3.org/2000/svg" '
        f'width="{W}" height="{H}" viewBox="0 0 {W} {H}">'
    ]

    # Grid rings (4 concentric, at 25/50/75/100 % of the chart radius)
    for g in range(1, 5):
        rg = chart_r * g / 4
        parts.append(
            f'<circle cx="{cx}" cy="{cy}" r="{rg:.1f}" '
            f'fill="none" stroke="#e5e7eb" stroke-width="0.5"/>'
        )

    # Axis lines from the centre to the outer ring
    for a in angles:
        x2 = cx + chart_r * math.cos(a)
        y2 = cy + chart_r * math.sin(a)
        parts.append(
            f'<line x1="{cx}" y1="{cy}" '
            f'x2="{x2:.1f}" y2="{y2:.1f}" '
            f'stroke="#e5e7eb" stroke-width="0.5"/>'
        )

    # Data polygon: one vertex per axis, at distance value × chart radius
    poly_pts = " ".join(
        f"{cx + chart_r * v * math.cos(a):.1f},"
        f"{cy + chart_r * v * math.sin(a):.1f}"
        for a, v in zip(angles, vals)
    )
    parts.append(
        f'<polygon points="{poly_pts}" '
        f'fill="rgba(37,99,235,0.12)" stroke="#2563eb" '
        f'stroke-width="1.5" stroke-linejoin="round"/>'
    )

    # Labels, placed on a ring just outside the chart
    for a, lbl in zip(angles, labels):
        lx = cx + label_r * math.cos(a)
        ly = cy + label_r * math.sin(a)
        parts.append(
            f'<text x="{lx:.1f}" y="{ly:.1f}" '
            f'text-anchor="middle" dominant-baseline="middle" '
            f'font-size="9" fill="#6b7280" font-family="sans-serif">'
            f'{lbl}</text>'
        )

    parts.append("</svg>")
    return "".join(parts)
# -- Gradio compatibility API ----------------------------------------------
def headers_for_tab(tab: str, selected_tasks: list[str] | None = None) -> list[str]:
    """Header strings for a tab's table: '#', 'Name', then the score labels.

    tab may be a legacy key ('overall', 'spatial', 'spatio_temporal',
    'temporal', 'semantic'); a key with no legacy mapping is used as the
    pillar key unchanged. selected_tasks is forwarded to
    _score_cols_for_pillar.
    """
    pillar = _OLD_TAB_TO_PILLAR.get(tab, tab)
    headers = ["#", "Name"]
    headers.extend(
        label for label, _ in _score_cols_for_pillar(pillar, selected_tasks)
    )
    return headers
def datatypes_for_tab(tab: str, selected_tasks: list[str] | None = None) -> list[str]:
    """Gradio datatype names, one per header from headers_for_tab.

    '#' maps to 'str', 'Name' to 'html', and every other header to 'number'.
    """
    fixed_types = {"#": "str", "Name": "html"}
    return [
        fixed_types.get(header, "number")
        for header in headers_for_tab(tab, selected_tasks)
    ]
def render_tab(
    data: LeaderboardData,
    rank_map: dict[str, int],
    tab: str,
    search: str | None,
    params_bucket: str | None,
    model_type: str | None,
    selected_tasks: list[str] | None = None,
) -> tuple[pd.DataFrame, str]:
    """Build (table, status_str) for one Gradio tab.

    Parameters
    ----------
    data:
        Leaderboard data; ``data.models`` is the unfiltered model list.
    rank_map:
        {model_id: rank} used for row order and to find the rank-1 model.
    tab:
        Tab key; legacy keys such as 'spatio_temporal' are translated to
        the short pillar key, unknown keys are used as-is.
    search, params_bucket, model_type:
        Filter settings, passed to apply_filters and to the status line.
    selected_tasks:
        JSON field names to show (Gradio column-toggle); None = all.

    Returns
    -------
    (table, status). ``table`` is the object returned by _assemble_df (a
    pandas Styler wrapping the frame, despite the DataFrame annotation);
    ``status`` is the text from _status_line, empty when no filter is
    active.
    """
    filtered = apply_filters(data.models, search, params_bucket, model_type)
    sorted_models = _sort_by_rank(filtered, rank_map)
    pillar = _OLD_TAB_TO_PILLAR.get(tab, tab)
    score_cols = _score_cols_for_pillar(pillar, selected_tasks)
    rank_one_id = next(
        (m.id for m in sorted_models if rank_map.get(m.id) == 1), None
    )
    # _assemble_df takes exactly (models, score_cols, rank_one_id) and works
    # out column maxima itself; the earlier five-argument call raised
    # TypeError on every invocation.
    df = _assemble_df(sorted_models, score_cols, rank_one_id)
    status = _status_line(
        total=len(data.models),
        shown=len(filtered),
        search=search,
        params_bucket=params_bucket,
        model_type=model_type,
    )
    return df, status
def _status_line(
    *,
    total: int,
    shown: int,
    search: str | None,
    params_bucket: str | None,
    model_type: str | None,
) -> str:
    """Status text shown with a tab.

    Returns '' when is_filter_active reports no active filter; otherwise
    'Showing <shown> of <total> models', with a ' — no matches' suffix
    when nothing is shown.
    """
    filtering = is_filter_active(search, params_bucket, model_type)
    if not filtering:
        return ""
    if shown != 0:
        return f"Showing {shown} of {total} models"
    return f"Showing 0 of {total} models — no matches"