Yif29's picture
Update leaderboard sorting and readability
dfd445e verified
Raw
History Blame Contribute Delete
13.3 kB
from __future__ import annotations
import os
from pathlib import Path
import gradio as gr
from leaderboard import (
DATA_PATH,
SORT_CHOICES,
filter_leaderboard,
load_leaderboard,
metric_standings,
model_choices,
render_methodology,
render_profile,
render_summary,
render_table,
)
from submission import submit_score
# Bundled assets are located relative to the directory that holds this file.
ROOT = Path(__file__).parent
OVERVIEW_IMAGE = ROOT.joinpath("assets", "avbench_outline.png")
SUBMISSION_TEMPLATE = ROOT.joinpath("data", "submission_template.csv")

# Loaded once at import time; the UI callbacks in this module read these
# module-level values instead of reloading the data per request.
LEADERBOARD = load_leaderboard(DATA_PATH)
STANDINGS = metric_standings(LEADERBOARD)
def update_leaderboard(component_type: str, query: str, sort_by: str, sort_order: str):
    """Recompute the leaderboard tab for the current filter and sort controls.

    The module-level ``LEADERBOARD`` is narrowed and ordered by
    ``filter_leaderboard``; the resulting view is then rendered twice.

    Args:
        component_type: Value of the "Component type" dropdown.
        query: Text typed into the search box.
        sort_by: Value of the "Sort item" dropdown.
        sort_order: Value of the "Sort order" radio group.

    Returns:
        A ``(summary, table)`` pair, in the order expected by the
        ``[summary, table]`` outputs this callback is wired to.
    """
    filtered_view = filter_leaderboard(
        LEADERBOARD,
        component_type,
        query,
        sort_by,
        sort_order,
    )
    # The summary receives both the full board and the filtered view.
    summary_markup = render_summary(LEADERBOARD, filtered_view)
    table_markup = render_table(filtered_view, STANDINGS, sort_by, sort_order)
    return summary_markup, table_markup
def update_profile(model: str):
    """Render the profile panel for the model picked in the "Model" dropdown.

    Args:
        model: The currently selected dropdown value.

    Returns:
        Whatever ``render_profile`` produces for that model from the
        module-level ``LEADERBOARD``; it is sent to the profile output.
    """
    profile_markup = render_profile(LEADERBOARD, model)
    return profile_markup
# Stylesheet for the whole app. It is handed to Gradio through the ``css``
# entry of the launch keyword arguments at the bottom of this file. The rules
# cover the page header, summary cards, leaderboard table, badges, model
# profile panel, methodology cards, submission status cards, and two
# responsive breakpoints (980px and 640px).
CSS = """
:root {
--avgen-bg: #f6f7f9;
--avgen-panel: #ffffff;
--avgen-ink: #18202b;
--avgen-muted: #5f6b7a;
--avgen-line: #d9dee7;
--avgen-blue: #1d5f9f;
--avgen-teal: #087a73;
--avgen-orange: #b45309;
--avgen-green: #24784f;
}
.gradio-container {
max-width: 1280px !important;
margin: 0 auto !important;
background: var(--avgen-bg) !important;
color: var(--avgen-ink) !important;
}
.app-header {
display: grid;
grid-template-columns: minmax(0, 1fr) auto;
gap: 24px;
align-items: end;
padding: 26px 0 12px;
border-bottom: 1px solid var(--avgen-line);
}
.app-header h1 {
margin: 0;
color: #111827 !important;
font-size: clamp(28px, 4vw, 44px);
font-weight: 850;
line-height: 1.05;
letter-spacing: 0;
}
.app-header p {
margin: 10px 0 0;
max-width: 860px;
color: #4b5563 !important;
font-size: 16px;
}
.header-links {
display: flex;
flex-wrap: wrap;
gap: 8px;
justify-content: flex-end;
}
.header-links a {
color: var(--avgen-blue);
text-decoration: none;
border: 1px solid var(--avgen-line);
background: #fff;
padding: 8px 11px;
border-radius: 8px;
font-weight: 650;
}
.summary-grid {
display: grid;
grid-template-columns: repeat(5, minmax(130px, 1fr));
gap: 10px;
margin: 14px 0;
}
.summary-card,
.method-card {
border: 1px solid var(--avgen-line);
background: var(--avgen-panel);
border-radius: 8px;
padding: 12px;
}
.summary-card span,
.profile-total span,
.eyebrow {
display: block;
color: #4f5d6e !important;
font-size: 12px;
text-transform: uppercase;
letter-spacing: 0;
font-weight: 700;
}
.summary-card strong {
display: block;
margin-top: 4px;
color: #111827 !important;
font-size: 26px;
font-weight: 850;
line-height: 1;
}
.summary-card small {
display: block;
margin-top: 6px;
color: #4b5563 !important;
min-height: 32px;
}
.table-shell {
overflow-x: auto;
border: 1px solid var(--avgen-line);
border-radius: 8px;
background: #fff;
}
.leaderboard-table {
width: 100%;
border-collapse: collapse;
min-width: 1180px;
font-size: 14px;
}
.leaderboard-table th,
.leaderboard-table td {
padding: 11px 10px;
border-bottom: 1px solid #edf0f5;
vertical-align: middle;
}
.leaderboard-table tbody td {
color: var(--avgen-ink) !important;
}
.leaderboard-table thead th {
position: sticky;
top: 0;
background: #f1f4f8;
color: #202936 !important;
text-align: left;
white-space: nowrap;
z-index: 1;
}
.leaderboard-table thead th.sorted {
background: #e6eef8;
color: #111827 !important;
box-shadow: inset 0 -2px 0 var(--avgen-blue);
}
.sort-indicator {
display: inline-block;
margin-left: 6px;
color: var(--avgen-blue);
font-weight: 850;
}
.leaderboard-table tbody tr:hover {
background: #faf7f1;
}
.rank-cell {
width: 62px;
color: #4f5d6e !important;
font-weight: 700;
}
.model-cell {
min-width: 180px;
color: #16202d !important;
font-weight: 750;
}
.components-cell {
min-width: 220px;
}
.metric-cell {
text-align: right;
color: #253142 !important;
font-weight: 650;
font-variant-numeric: tabular-nums;
}
.metric-cell.best {
color: #0f6a43 !important;
font-weight: 800;
background: #edf8f2;
}
.metric-cell.second {
color: #8a4b08 !important;
font-weight: 750;
background: #fff7e8;
}
.component-badge,
.type-badge {
display: inline-flex;
align-items: center;
max-width: 100%;
margin: 2px 4px 2px 0;
padding: 3px 8px;
border-radius: 999px;
font-size: 12px;
font-weight: 700;
white-space: nowrap;
border: 1px solid transparent;
}
.component-badge.proprietary,
.type-badge.proprietary {
color: var(--avgen-orange);
background: #fff4e5;
border-color: #f2d4aa;
}
.component-badge.open,
.type-badge.opensource {
color: var(--avgen-blue);
background: #edf5ff;
border-color: #c7dff8;
}
.component-badge.neutral,
.type-badge.mixed {
color: var(--avgen-teal);
background: #eaf7f5;
border-color: #bde0dc;
}
.profile-panel {
display: grid;
grid-template-columns: minmax(0, 1fr) auto;
gap: 16px;
border: 1px solid var(--avgen-line);
background: #fff;
border-radius: 8px;
padding: 18px;
}
.profile-panel h2 {
margin: 2px 0 8px;
font-size: 26px;
}
.profile-total {
min-width: 140px;
text-align: right;
}
.profile-total strong {
display: block;
font-size: 42px;
line-height: 1;
}
.profile-grid {
grid-column: 1 / -1;
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 10px;
}
.profile-metric {
border: 1px solid #edf0f5;
border-radius: 8px;
padding: 10px;
}
.profile-metric-head {
display: flex;
justify-content: space-between;
gap: 10px;
font-variant-numeric: tabular-nums;
}
.bar-track {
height: 8px;
margin: 9px 0 6px;
border-radius: 999px;
background: #e7ebf1;
overflow: hidden;
}
.bar-fill {
height: 100%;
border-radius: inherit;
background: linear-gradient(90deg, var(--avgen-teal), var(--avgen-green));
}
.profile-metric small,
.methodology p {
color: var(--avgen-muted);
}
.method-grid {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 10px;
margin-bottom: 12px;
}
.method-card h3 {
margin: 0 0 4px;
font-size: 16px;
}
.method-card strong {
display: block;
font-size: 30px;
color: var(--avgen-blue);
}
.empty-state {
border: 1px solid var(--avgen-line);
background: #fff;
border-radius: 8px;
padding: 22px;
color: var(--avgen-muted);
}
.overview-image img {
border: 1px solid var(--avgen-line);
border-radius: 8px;
background: #fff;
}
.submission-copy {
color: var(--avgen-muted);
margin: 0 0 14px;
}
.submission-copy strong {
color: var(--avgen-ink);
}
.status-card {
border: 1px solid var(--avgen-line);
border-radius: 8px;
padding: 14px;
background: #fff;
}
.status-card strong {
display: block;
margin-bottom: 4px;
}
.status-card p {
margin: 4px 0 0;
}
.status-card.success {
border-color: #b8dbc9;
background: #f1faf5;
}
.status-card.error {
border-color: #efc2bd;
background: #fff3f1;
}
@media (max-width: 980px) {
.app-header {
grid-template-columns: 1fr;
}
.header-links {
justify-content: flex-start;
}
.summary-grid,
.profile-grid,
.method-grid {
grid-template-columns: 1fr 1fr;
}
}
@media (max-width: 640px) {
.summary-grid,
.profile-grid,
.method-grid {
grid-template-columns: 1fr;
}
.profile-panel {
grid-template-columns: 1fr;
}
.profile-total {
text-align: left;
}
}
"""
# Static HTML banner placed at the top of the page via ``gr.HTML(HEADER)``:
# the title, a short description of the benchmark, and outbound links to the
# GitHub repository, the paper, and the dataset. The class names
# (``app-header``, ``header-links``) are styled by the ``CSS`` string.
HEADER = """
<div class="app-header">
<div>
<h1>AVGen-Bench Leaderboard</h1>
<p>
A leaderboard for multi-granular evaluation of Text-to-Audio-Video generation,
covering visual/audio quality, synchronization, fine-grained controllability,
physical plausibility, and holistic semantic alignment.
</p>
</div>
<div class="header-links">
<a href="https://github.com/microsoft/AVGen-Bench" target="_blank" rel="noopener">GitHub</a>
<a href="https://arxiv.org/abs/2604.08540" target="_blank" rel="noopener">Paper</a>
<a href="https://huggingface.co/datasets/microsoft/AVGen-Bench" target="_blank" rel="noopener">Dataset</a>
</div>
</div>
"""
# Build the four-tab UI (Leaderboard, Model Profile, Metric Scheme,
# Submission) and wire its events. Event listeners are registered inside the
# ``with gr.Blocks`` context.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped. In particular, confirm whether the
# search box belongs inside the first Row of the Leaderboard tab and whether
# the notes box sits outside the Rows of the Submission tab.
with gr.Blocks(title="AVGen-Bench Leaderboard") as demo:
    gr.HTML(HEADER)

    with gr.Tab("Leaderboard"):
        with gr.Row():
            component_type = gr.Dropdown(
                choices=["All", "Proprietary", "Open-source", "Mixed"],
                value="All",
                label="Component type",
            )
            sort_by = gr.Dropdown(
                choices=SORT_CHOICES,
                value="Total",
                label="Sort item",
            )
            sort_order = gr.Radio(
                choices=["Descending", "Ascending", "Best first"],
                value="Descending",
                label="Sort order",
            )
            query = gr.Textbox(label="Search", placeholder="Model or component")
        summary = gr.HTML()
        table = gr.HTML()

    with gr.Tab("Model Profile"):
        # Compute the choices once and tolerate an empty leaderboard: indexing
        # ``[0]`` on an empty sequence would abort the whole app at import.
        profile_models = list(model_choices(LEADERBOARD))
        model = gr.Dropdown(
            choices=profile_models,
            value=profile_models[0] if profile_models else None,
            label="Model",
        )
        profile = gr.HTML()

    with gr.Tab("Metric Scheme"):
        gr.HTML(render_methodology())
        # The overview figure is optional; skip it when the asset is absent.
        if OVERVIEW_IMAGE.exists():
            gr.Image(
                value=str(OVERVIEW_IMAGE),
                label="AVGen-Bench evaluation suite",
                show_label=False,
                interactive=False,
                elem_classes=["overview-image"],
            )

    with gr.Tab("Submission"):
        gr.HTML(
            """
<p class="submission-copy">
Submit raw AVGen-Bench metrics for review. The app recomputes
<strong>Total</strong> from the raw metrics and sends the entry to a
pending-review backend. Accepted entries are still merged into the
official leaderboard manually.
</p>
"""
        )
        with gr.Row():
            submit_model = gr.Textbox(label="Model name", placeholder="Your Model")
            submit_component_type = gr.Dropdown(
                choices=["Proprietary", "Open-source", "Mixed"],
                value="Open-source",
                label="Component type",
            )
            submit_components = gr.Textbox(
                label="Components",
                placeholder="VideoModel (Open-source)|AudioModel (Open-source)",
            )
        with gr.Row():
            submit_contact = gr.Textbox(label="Public contact", placeholder="GitHub handle or email")
            submit_model_url = gr.Textbox(label="Model or paper URL", placeholder="https://...")
            submit_results_url = gr.Textbox(label="Evaluation artifact URL", placeholder="https://...")
        submit_notes = gr.Textbox(label="Notes", lines=3, placeholder="Optional evaluation details")
        with gr.Accordion("Raw metric scores", open=True):
            with gr.Row():
                submit_vis = gr.Number(label="Vis")
                submit_aud = gr.Number(label="Aud (PQ)")
                submit_av = gr.Number(label="AV")
                submit_lip = gr.Number(label="Lip")
            with gr.Row():
                submit_text = gr.Number(label="Text")
                submit_face = gr.Number(label="Face")
                submit_music = gr.Number(label="Music")
                submit_speech = gr.Number(label="Speech")
            with gr.Row():
                submit_lophy = gr.Number(label="Lo-Phy")
                submit_hiphy = gr.Number(label="Hi-Phy")
                submit_holistic = gr.Number(label="Holistic")
        submit_button = gr.Button("Submit for Review", variant="primary")
        submit_status = gr.HTML()
        submit_payload = gr.Code(label="Submission JSON", language="json")
        # Same treatment as the overview image: a missing template file should
        # hide the download widget rather than break construction of the app.
        if SUBMISSION_TEMPLATE.exists():
            gr.File(value=str(SUBMISSION_TEMPLATE), label="CSV template", interactive=False)

    # The initial page load and every control change share the same wiring.
    leaderboard_inputs = [component_type, query, sort_by, sort_order]
    leaderboard_outputs = [summary, table]

    # Populate both dynamic panels when the page first loads.
    demo.load(
        fn=update_leaderboard,
        inputs=leaderboard_inputs,
        outputs=leaderboard_outputs,
    )
    demo.load(fn=update_profile, inputs=[model], outputs=[profile])

    # Re-render the leaderboard whenever any filter or sort control changes.
    for control in leaderboard_inputs:
        control.change(
            fn=update_leaderboard,
            inputs=leaderboard_inputs,
            outputs=leaderboard_outputs,
        )
    model.change(fn=update_profile, inputs=[model], outputs=[profile])

    # The order of ``inputs`` must match the positional parameters of
    # ``submit_score`` (defined in the ``submission`` module).
    submit_button.click(
        fn=submit_score,
        inputs=[
            submit_model,
            submit_components,
            submit_component_type,
            submit_contact,
            submit_model_url,
            submit_results_url,
            submit_notes,
            submit_vis,
            submit_aud,
            submit_av,
            submit_lip,
            submit_text,
            submit_face,
            submit_music,
            submit_speech,
            submit_lophy,
            submit_hiphy,
            submit_holistic,
        ],
        outputs=[submit_status, submit_payload],
        api_name="submit_score",
    )
# Script entry point: start the Gradio server with settings from the environment.
if __name__ == "__main__":
    # NOTE(review): passing ``css`` to ``launch()`` relies on a Gradio release
    # whose ``launch()`` accepts it (believed to be Gradio 6+, where it moved
    # from the ``Blocks`` constructor) — confirm against the pinned version.
    launch_kwargs = dict(
        css=CSS,
        server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
    )

    # First non-empty value wins: PORT, then GRADIO_SERVER_PORT. ``map`` is
    # lazy, so the second variable is only read when the first is unset/empty.
    configured_port = next(
        (value for value in map(os.environ.get, ("PORT", "GRADIO_SERVER_PORT")) if value),
        None,
    )
    if configured_port:
        launch_kwargs["server_port"] = int(configured_port)

    demo.launch(**launch_kwargs)