wangruisi1 committed on
Commit ·
c49b672
1
Parent(s): ecf3e7e
Init Commit
Browse files- app.py +182 -170
- src/about.py +77 -27
app.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
| 3 |
import pandas as pd
|
| 4 |
-
from
|
| 5 |
-
from huggingface_hub import snapshot_download
|
| 6 |
|
| 7 |
from src.about import (
|
| 8 |
CITATION_BUTTON_LABEL,
|
|
@@ -13,181 +11,198 @@ from src.about import (
|
|
| 13 |
TITLE,
|
| 14 |
)
|
| 15 |
from src.display.css_html_js import custom_css
|
| 16 |
-
from src.display.utils import (
|
| 17 |
-
BENCHMARK_COLS,
|
| 18 |
-
COLS,
|
| 19 |
-
EVAL_COLS,
|
| 20 |
-
EVAL_TYPES,
|
| 21 |
-
AutoEvalColumn,
|
| 22 |
-
ModelType,
|
| 23 |
-
fields,
|
| 24 |
-
WeightType,
|
| 25 |
-
Precision
|
| 26 |
-
)
|
| 27 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 28 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 29 |
-
from src.submission.submit import add_new_eval
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def restart_space():
|
| 33 |
-
API.restart_space(repo_id=REPO_ID)
|
| 34 |
-
|
| 35 |
-
### Space initialisation
|
| 36 |
-
try:
|
| 37 |
-
print(EVAL_REQUESTS_PATH)
|
| 38 |
-
snapshot_download(
|
| 39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 40 |
-
)
|
| 41 |
-
except Exception:
|
| 42 |
-
restart_space()
|
| 43 |
-
try:
|
| 44 |
-
print(EVAL_RESULTS_PATH)
|
| 45 |
-
snapshot_download(
|
| 46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 47 |
-
)
|
| 48 |
-
except Exception:
|
| 49 |
-
restart_space()
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 53 |
-
|
| 54 |
-
(
|
| 55 |
-
finished_eval_queue_df,
|
| 56 |
-
running_eval_queue_df,
|
| 57 |
-
pending_eval_queue_df,
|
| 58 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 59 |
-
|
| 60 |
-
def init_leaderboard(dataframe):
|
| 61 |
-
if dataframe is None or dataframe.empty:
|
| 62 |
-
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 63 |
-
return Leaderboard(
|
| 64 |
-
value=dataframe,
|
| 65 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
| 66 |
-
select_columns=SelectColumns(
|
| 67 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
| 68 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
| 69 |
-
label="Select Columns to Display:",
|
| 70 |
-
),
|
| 71 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
| 72 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
| 73 |
-
filter_columns=[
|
| 74 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 75 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
| 76 |
-
ColumnFilter(
|
| 77 |
-
AutoEvalColumn.params.name,
|
| 78 |
-
type="slider",
|
| 79 |
-
min=0.01,
|
| 80 |
-
max=150,
|
| 81 |
-
label="Select the number of parameters (B)",
|
| 82 |
-
),
|
| 83 |
-
ColumnFilter(
|
| 84 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
| 85 |
-
),
|
| 86 |
-
],
|
| 87 |
-
bool_checkboxgroup_label="Hide models",
|
| 88 |
-
interactive=False,
|
| 89 |
-
)
|
| 90 |
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
demo = gr.Blocks(css=custom_css)
|
| 93 |
with demo:
|
| 94 |
gr.HTML(TITLE)
|
| 95 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 96 |
|
| 97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 98 |
-
with gr.TabItem("🏅
|
| 99 |
-
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 100 |
-
|
| 101 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 102 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 103 |
-
|
| 104 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 105 |
-
with gr.Column():
|
| 106 |
-
with gr.Row():
|
| 107 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 108 |
-
|
| 109 |
-
with gr.Column():
|
| 110 |
-
with gr.Accordion(
|
| 111 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
| 112 |
-
open=False,
|
| 113 |
-
):
|
| 114 |
-
with gr.Row():
|
| 115 |
-
finished_eval_table = gr.components.Dataframe(
|
| 116 |
-
value=finished_eval_queue_df,
|
| 117 |
-
headers=EVAL_COLS,
|
| 118 |
-
datatype=EVAL_TYPES,
|
| 119 |
-
row_count=5,
|
| 120 |
-
)
|
| 121 |
-
with gr.Accordion(
|
| 122 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
| 123 |
-
open=False,
|
| 124 |
-
):
|
| 125 |
-
with gr.Row():
|
| 126 |
-
running_eval_table = gr.components.Dataframe(
|
| 127 |
-
value=running_eval_queue_df,
|
| 128 |
-
headers=EVAL_COLS,
|
| 129 |
-
datatype=EVAL_TYPES,
|
| 130 |
-
row_count=5,
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
with gr.Accordion(
|
| 134 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 135 |
-
open=False,
|
| 136 |
-
):
|
| 137 |
-
with gr.Row():
|
| 138 |
-
pending_eval_table = gr.components.Dataframe(
|
| 139 |
-
value=pending_eval_queue_df,
|
| 140 |
-
headers=EVAL_COLS,
|
| 141 |
-
datatype=EVAL_TYPES,
|
| 142 |
-
row_count=5,
|
| 143 |
-
)
|
| 144 |
with gr.Row():
|
| 145 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
model_type = gr.Dropdown(
|
| 152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 153 |
-
label="Model type",
|
| 154 |
-
multiselect=False,
|
| 155 |
-
value=None,
|
| 156 |
-
interactive=True,
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
with gr.Column():
|
| 160 |
-
precision = gr.Dropdown(
|
| 161 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 162 |
-
label="Precision",
|
| 163 |
-
multiselect=False,
|
| 164 |
-
value="float16",
|
| 165 |
-
interactive=True,
|
| 166 |
-
)
|
| 167 |
-
weight_type = gr.Dropdown(
|
| 168 |
-
choices=[i.value.name for i in WeightType],
|
| 169 |
-
label="Weights type",
|
| 170 |
-
multiselect=False,
|
| 171 |
-
value="Original",
|
| 172 |
-
interactive=True,
|
| 173 |
-
)
|
| 174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 175 |
-
|
| 176 |
-
submit_button = gr.Button("Submit Eval")
|
| 177 |
-
submission_result = gr.Markdown()
|
| 178 |
-
submit_button.click(
|
| 179 |
-
add_new_eval,
|
| 180 |
-
[
|
| 181 |
-
model_name_textbox,
|
| 182 |
-
base_model_name_textbox,
|
| 183 |
-
revision_name_textbox,
|
| 184 |
-
precision,
|
| 185 |
-
weight_type,
|
| 186 |
-
model_type,
|
| 187 |
-
],
|
| 188 |
-
submission_result,
|
| 189 |
)
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
with gr.Row():
|
| 192 |
with gr.Accordion("📙 Citation", open=False):
|
| 193 |
citation_button = gr.Textbox(
|
|
@@ -198,7 +213,4 @@ with demo:
|
|
| 198 |
show_copy_button=True,
|
| 199 |
)
|
| 200 |
|
| 201 |
-
|
| 202 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 203 |
-
scheduler.start()
|
| 204 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
+
from collections import OrderedDict
|
|
|
|
| 4 |
|
| 5 |
from src.about import (
|
| 6 |
CITATION_BUTTON_LABEL,
|
|
|
|
| 11 |
TITLE,
|
| 12 |
)
|
| 13 |
from src.display.css_html_js import custom_css
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
+
# ============================================================
|
| 17 |
+
# Static Leaderboard Data for VBVR-Bench
|
| 18 |
+
# ============================================================
|
| 19 |
+
|
| 20 |
+
# Column group definitions (ordered for display)
|
| 21 |
+
COLUMN_GROUPS = OrderedDict([
|
| 22 |
+
("Overall", ["Overall"]),
|
| 23 |
+
("Overall by Category", [
|
| 24 |
+
"Abst.(All)", "Know.(All)", "Perc.(All)", "Spat.(All)", "Trans.(All)",
|
| 25 |
+
]),
|
| 26 |
+
("In-Domain (ID)", ["ID"]),
|
| 27 |
+
("In-Domain by Category", [
|
| 28 |
+
"Abst.(ID)", "Know.(ID)", "Perc.(ID)", "Spat.(ID)", "Trans.(ID)",
|
| 29 |
+
]),
|
| 30 |
+
("Out-of-Domain (OOD)", ["OOD"]),
|
| 31 |
+
("Out-of-Domain by Category", [
|
| 32 |
+
"Abst.(OOD)", "Know.(OOD)", "Perc.(OOD)", "Spat.(OOD)", "Trans.(OOD)",
|
| 33 |
+
]),
|
| 34 |
+
])
|
| 35 |
+
|
| 36 |
+
# Default column groups to show (matching LaTeX table layout)
|
| 37 |
+
DEFAULT_GROUPS = [
|
| 38 |
+
"Overall",
|
| 39 |
+
"In-Domain (ID)",
|
| 40 |
+
"In-Domain by Category",
|
| 41 |
+
"Out-of-Domain (OOD)",
|
| 42 |
+
"Out-of-Domain by Category",
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
# Columns always shown regardless of group selection
|
| 46 |
+
ALWAYS_VISIBLE_COLS = ["Model", "Type"]
|
| 47 |
+
|
| 48 |
+
# ============================================================
|
| 49 |
+
# Static model scores data
|
| 50 |
+
# ============================================================
|
| 51 |
+
MODELS_DATA = [
|
| 52 |
+
{
|
| 53 |
+
"Model": "Human",
|
| 54 |
+
"Type": "👤 Reference",
|
| 55 |
+
"Overall": 0.974, "ID": 0.960, "OOD": 0.988,
|
| 56 |
+
"Abst.(All)": 0.947, "Know.(All)": 0.972, "Perc.(All)": 0.994, "Spat.(All)": 0.969, "Trans.(All)": 0.981,
|
| 57 |
+
"Abst.(ID)": 0.919, "Know.(ID)": 0.956, "Perc.(ID)": 1.000, "Spat.(ID)": 0.950, "Trans.(ID)": 1.000,
|
| 58 |
+
"Abst.(OOD)": 1.000, "Know.(OOD)": 1.000, "Perc.(OOD)": 0.990, "Spat.(OOD)": 1.000, "Trans.(OOD)": 0.970,
|
| 59 |
+
},
|
| 60 |
+
# ---- Open-source Models ----
|
| 61 |
+
{
|
| 62 |
+
"Model": "CogVideoX1.5-5B-I2V",
|
| 63 |
+
"Type": "🟢 Open-source",
|
| 64 |
+
"Overall": 0.2727, "ID": 0.2831, "OOD": 0.2623,
|
| 65 |
+
"Abst.(All)": 0.2548, "Know.(All)": 0.2952, "Perc.(All)": 0.2525, "Spat.(All)": 0.2996, "Trans.(All)": 0.2903,
|
| 66 |
+
"Abst.(ID)": 0.2408, "Know.(ID)": 0.3285, "Perc.(ID)": 0.2567, "Spat.(ID)": 0.3281, "Trans.(ID)": 0.3051,
|
| 67 |
+
"Abst.(OOD)": 0.2809, "Know.(OOD)": 0.2352, "Perc.(OOD)": 0.2501, "Spat.(OOD)": 0.2539, "Trans.(OOD)": 0.2824,
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"Model": "HunyuanVideo-I2V",
|
| 71 |
+
"Type": "🟢 Open-source",
|
| 72 |
+
"Overall": 0.2726, "ID": 0.2799, "OOD": 0.2653,
|
| 73 |
+
"Abst.(All)": 0.1956, "Know.(All)": 0.3614, "Perc.(All)": 0.2910, "Spat.(All)": 0.2698, "Trans.(All)": 0.2733,
|
| 74 |
+
"Abst.(ID)": 0.2068, "Know.(ID)": 0.3573, "Perc.(ID)": 0.2933, "Spat.(ID)": 0.2802, "Trans.(ID)": 0.3160,
|
| 75 |
+
"Abst.(OOD)": 0.1747, "Know.(OOD)": 0.3688, "Perc.(OOD)": 0.2897, "Spat.(OOD)": 0.2530, "Trans.(OOD)": 0.2502,
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"Model": "Wan2.2-I2V-A14B",
|
| 79 |
+
"Type": "🟢 Open-source",
|
| 80 |
+
"Overall": 0.3714, "ID": 0.4125, "OOD": 0.3287,
|
| 81 |
+
"Abst.(All)": 0.4212, "Know.(All)": 0.3556, "Perc.(All)": 0.3710, "Spat.(All)": 0.3397, "Trans.(All)": 0.3465,
|
| 82 |
+
"Abst.(ID)": 0.4301, "Know.(ID)": 0.3823, "Perc.(ID)": 0.4147, "Spat.(ID)": 0.4043, "Trans.(ID)": 0.4192,
|
| 83 |
+
"Abst.(OOD)": 0.4046, "Know.(OOD)": 0.3077, "Perc.(OOD)": 0.3427, "Spat.(OOD)": 0.2364, "Trans.(OOD)": 0.3073,
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"Model": "LTX-2",
|
| 87 |
+
"Type": "🟢 Open-source",
|
| 88 |
+
"Overall": 0.3129, "ID": 0.3287, "OOD": 0.2971,
|
| 89 |
+
"Abst.(All)": 0.2908, "Know.(All)": 0.3531, "Perc.(All)": 0.3200, "Spat.(All)": 0.2980, "Trans.(All)": 0.3093,
|
| 90 |
+
"Abst.(ID)": 0.3156, "Know.(ID)": 0.3621, "Perc.(ID)": 0.3257, "Spat.(ID)": 0.3399, "Trans.(ID)": 0.3060,
|
| 91 |
+
"Abst.(OOD)": 0.2444, "Know.(OOD)": 0.3369, "Perc.(OOD)": 0.3167, "Spat.(OOD)": 0.2308, "Trans.(OOD)": 0.3110,
|
| 92 |
+
},
|
| 93 |
+
# ---- Proprietary Models ----
|
| 94 |
+
{
|
| 95 |
+
"Model": "Runway Gen-4 Turbo",
|
| 96 |
+
"Type": "🔵 Proprietary",
|
| 97 |
+
"Overall": 0.4031, "ID": 0.3920, "OOD": 0.4141,
|
| 98 |
+
"Abst.(All)": 0.4370, "Know.(All)": 0.4165, "Perc.(All)": 0.4223, "Spat.(All)": 0.3357, "Trans.(All)": 0.3696,
|
| 99 |
+
"Abst.(ID)": 0.3956, "Know.(ID)": 0.4094, "Perc.(ID)": 0.4288, "Spat.(ID)": 0.3409, "Trans.(ID)": 0.3629,
|
| 100 |
+
"Abst.(OOD)": 0.5147, "Know.(OOD)": 0.4294, "Perc.(OOD)": 0.4185, "Spat.(OOD)": 0.3274, "Trans.(OOD)": 0.3733,
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"Model": "Sora 2",
|
| 104 |
+
"Type": "🔵 Proprietary",
|
| 105 |
+
"Overall": 0.5457, "ID": 0.5691, "OOD": 0.5225,
|
| 106 |
+
"Abst.(All)": 0.5824, "Know.(All)": 0.4749, "Perc.(All)": 0.5458, "Spat.(All)": 0.5298, "Trans.(All)": 0.5640,
|
| 107 |
+
"Abst.(ID)": 0.6023, "Know.(ID)": 0.4767, "Perc.(ID)": 0.5810, "Spat.(ID)": 0.5720, "Trans.(ID)": 0.5967,
|
| 108 |
+
"Abst.(OOD)": 0.5462, "Know.(OOD)": 0.4715, "Perc.(OOD)": 0.5254, "Spat.(OOD)": 0.4623, "Trans.(OOD)": 0.5465,
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"Model": "Kling 2.6",
|
| 112 |
+
"Type": "🔵 Proprietary",
|
| 113 |
+
"Overall": 0.3691, "ID": 0.4082, "OOD": 0.3300,
|
| 114 |
+
"Abst.(All)": 0.4866, "Know.(All)": 0.2556, "Perc.(All)": 0.3095, "Spat.(All)": 0.3504, "Trans.(All)": 0.4149,
|
| 115 |
+
"Abst.(ID)": 0.4647, "Know.(ID)": 0.3225, "Perc.(ID)": 0.3749, "Spat.(ID)": 0.3471, "Trans.(ID)": 0.5193,
|
| 116 |
+
"Abst.(OOD)": 0.5277, "Know.(OOD)": 0.1350, "Perc.(OOD)": 0.2717, "Spat.(OOD)": 0.3556, "Trans.(OOD)": 0.3588,
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"Model": "Veo 3.1",
|
| 120 |
+
"Type": "🔵 Proprietary",
|
| 121 |
+
"Overall": 0.4800, "ID": 0.5307, "OOD": 0.4288,
|
| 122 |
+
"Abst.(All)": 0.5991, "Know.(All)": 0.4225, "Perc.(All)": 0.4568, "Spat.(All)": 0.4430, "Trans.(All)": 0.4413,
|
| 123 |
+
"Abst.(ID)": 0.6109, "Know.(ID)": 0.5032, "Perc.(ID)": 0.5196, "Spat.(ID)": 0.4443, "Trans.(ID)": 0.5103,
|
| 124 |
+
"Abst.(OOD)": 0.5770, "Know.(OOD)": 0.2772, "Perc.(OOD)": 0.4204, "Spat.(OOD)": 0.4406, "Trans.(OOD)": 0.4041,
|
| 125 |
+
},
|
| 126 |
+
# ---- Data Scaling Strong Baseline ----
|
| 127 |
+
{
|
| 128 |
+
"Model": "VBVR-Wan2.2",
|
| 129 |
+
"Type": "⭐ Strong Baseline",
|
| 130 |
+
"Overall": 0.6848, "ID": 0.7599, "OOD": 0.6097,
|
| 131 |
+
"Abst.(All)": 0.7394, "Know.(All)": 0.6864, "Perc.(All)": 0.6333, "Spat.(All)": 0.6960, "Trans.(All)": 0.6909,
|
| 132 |
+
"Abst.(ID)": 0.7240, "Know.(ID)": 0.7500, "Perc.(ID)": 0.7817, "Spat.(ID)": 0.7446, "Trans.(ID)": 0.8327,
|
| 133 |
+
"Abst.(OOD)": 0.7682, "Know.(OOD)": 0.5720, "Perc.(OOD)": 0.5474, "Spat.(OOD)": 0.6182, "Trans.(OOD)": 0.6145,
|
| 134 |
+
},
|
| 135 |
+
]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def build_full_dataframe():
    """Assemble the leaderboard table from the static ``MODELS_DATA`` records.

    Columns are laid out as the always-visible columns followed by every
    column of every group, in the order ``COLUMN_GROUPS`` declares them.
    Rows are ranked by the "Overall" score, best first, with a fresh
    0-based index, and all numeric columns are rounded to three decimals.

    Returns:
        pandas.DataFrame: the complete, ordered and ranked leaderboard.
    """
    # Fixed columns first, then the flattened group columns in declared order.
    ordered_columns = [
        *ALWAYS_VISIBLE_COLS,
        *(column for members in COLUMN_GROUPS.values() for column in members),
    ]
    table = pd.DataFrame(MODELS_DATA)[ordered_columns]

    # Rank on the unrounded scores; rounding happens afterwards for display only.
    table = table.sort_values("Overall", ascending=False).reset_index(drop=True)

    score_columns = table.select_dtypes(include="number").columns
    table[score_columns] = table[score_columns].round(3)
    return table
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
FULL_DF = build_full_dataframe()
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def get_filtered_df(selected_groups):
    """Return the leaderboard restricted to the chosen column groups.

    Args:
        selected_groups: names of the column groups (keys of
            ``COLUMN_GROUPS``) to display. An empty or ``None`` selection
            falls back to the "Overall" group so the table is never
            reduced to just the identifying columns.

    Returns:
        pandas.DataFrame: ``FULL_DF`` limited to the always-visible columns
        plus the columns of each selected group, kept in the order the
        groups are declared in ``COLUMN_GROUPS`` (not the selection order).
    """
    chosen = selected_groups if selected_groups else ["Overall"]

    visible = [*ALWAYS_VISIBLE_COLS]
    # Walk the groups in their declared order so column layout stays stable
    # no matter the order in which the user ticked the checkboxes.
    visible += [
        column
        for group, members in COLUMN_GROUPS.items()
        if group in chosen
        for column in members
    ]
    return FULL_DF[visible]
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ============================================================
|
| 171 |
+
# Gradio Interface
|
| 172 |
+
# ============================================================
|
| 173 |
demo = gr.Blocks(css=custom_css)
|
| 174 |
with demo:
|
| 175 |
gr.HTML(TITLE)
|
| 176 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 177 |
|
| 178 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 179 |
+
with gr.TabItem("🏅 VBVR-Bench Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
with gr.Row():
|
| 181 |
+
column_selector = gr.CheckboxGroup(
|
| 182 |
+
choices=list(COLUMN_GROUPS.keys()),
|
| 183 |
+
value=DEFAULT_GROUPS,
|
| 184 |
+
label="Select Column Groups to Display:",
|
| 185 |
+
interactive=True,
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
leaderboard_table = gr.Dataframe(
|
| 189 |
+
value=get_filtered_df(DEFAULT_GROUPS),
|
| 190 |
+
interactive=False,
|
| 191 |
+
elem_id="leaderboard-table",
|
| 192 |
+
)
|
| 193 |
|
| 194 |
+
column_selector.change(
|
| 195 |
+
fn=get_filtered_df,
|
| 196 |
+
inputs=[column_selector],
|
| 197 |
+
outputs=[leaderboard_table],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
)
|
| 199 |
|
| 200 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
|
| 201 |
+
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 202 |
+
|
| 203 |
+
with gr.TabItem("🚀 Submit", elem_id="llm-benchmark-tab-submit", id=2):
|
| 204 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 205 |
+
|
| 206 |
with gr.Row():
|
| 207 |
with gr.Accordion("📙 Citation", open=False):
|
| 208 |
citation_button = gr.Textbox(
|
|
|
|
| 213 |
show_copy_button=True,
|
| 214 |
)
|
| 215 |
|
| 216 |
+
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -11,7 +11,7 @@ class Task:
|
|
| 11 |
# Select your tasks here
|
| 12 |
# ---------------------------------------------------
|
| 13 |
class Tasks(Enum):
|
| 14 |
-
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 15 |
task0 = Task("anli_r1", "acc", "ANLI")
|
| 16 |
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
| 17 |
|
|
@@ -21,52 +21,102 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
| 24 |
-
TITLE = """<h1 align="center" id="space-title">
|
| 25 |
|
| 26 |
# What does your leaderboard evaluate?
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 32 |
LLM_BENCHMARKS_TEXT = f"""
|
| 33 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
EVALUATION_QUEUE_TEXT = """
|
| 41 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
###
|
| 44 |
-
```python
|
| 45 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
| 46 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
| 47 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
| 48 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
| 49 |
-
```
|
| 50 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
|
| 65 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
| 66 |
-
Make sure you have followed the above steps first.
|
| 67 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
| 68 |
"""
|
| 69 |
|
| 70 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 71 |
CITATION_BUTTON_TEXT = r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
"""
|
|
|
|
| 11 |
# Select your tasks here
|
| 12 |
# ---------------------------------------------------
|
| 13 |
class Tasks(Enum):
|
| 14 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 15 |
task0 = Task("anli_r1", "acc", "ANLI")
|
| 16 |
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
| 17 |
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
| 24 |
+
TITLE = """<h1 align="center" id="space-title">VBVR-Bench Leaderboard</h1>"""
|
| 25 |
|
| 26 |
# What does your leaderboard evaluate?
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
+
**VBVR-Bench** is a comprehensive benchmark for evaluating **video reasoning capabilities**.
|
| 29 |
+
|
| 30 |
+
To systematically assess model reasoning capabilities, VBVR-Bench employs a **dual-split evaluation strategy** across **100 diverse tasks**:
|
| 31 |
+
- **In-Domain (ID)**: 50 tasks that overlap with training categories but differ in unseen parameter configurations and sample instances, testing *in-domain generalization*.
|
| 32 |
+
- **Out-of-Domain (OOD)**: 50 entirely novel tasks designed to measure *out-of-domain generalization*, testing whether models acquire transferable reasoning primitives rather than relying on task-specific memorization.
|
| 33 |
+
|
| 34 |
+
Each task consists of **5 test samples**, enabling statistically robust evaluation across diverse reasoning scenarios.
|
| 35 |
+
|
| 36 |
+
Use the column group selector below to customize which score groups are displayed.
|
| 37 |
"""
|
| 38 |
|
| 39 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 40 |
LLM_BENCHMARKS_TEXT = f"""
|
| 41 |
+
## About VBVR-Bench
|
| 42 |
+
|
| 43 |
+
### Rule-Based Evaluation Framework
|
| 44 |
+
|
| 45 |
+
A key feature of VBVR-Bench is its fully **rule-based evaluation framework**. Most test tasks have a unique, verifiable correct answer, allowing interpretable evaluation based on spatial position, color, object identity, path, or logical outcome. Geometric, physical, and deductive constraints are also considered in the scoring rubrics.
|
| 46 |
+
|
| 47 |
+
Each of the 100 test tasks is paired with a dedicated evaluation rule, with scores on multiple aspects to compute a weighted, comprehensive score. Sub-criteria include:
|
| 48 |
+
- **Spatial Accuracy**: Correctness of object positions and arrangements
|
| 49 |
+
- **Trajectory Correctness**: Validity of movement paths
|
| 50 |
+
- **Temporal Consistency**: Smooth frame-by-frame progression
|
| 51 |
+
- **Logical Validity**: Adherence to task-specific reasoning constraints
|
| 52 |
+
|
| 53 |
+
### Example: Task G-45 (Key Door Matching)
|
| 54 |
+
|
| 55 |
+
A green dot agent must first locate a color-specified key and then navigate to the matching door within a grid maze. Performance is scored across four weighted dimensions:
|
| 56 |
|
| 57 |
+
| Dimension | Weight | Description |
|
| 58 |
+
|-----------|--------|-------------|
|
| 59 |
+
| Target Identification | 30% | Correct key and door selection without color confusion |
|
| 60 |
+
| Path Validity | 30% | Following allowed paths without wall collisions |
|
| 61 |
+
| Path Efficiency | 20% | Comparison to optimal BFS path |
|
| 62 |
+
| Animation Quality | 20% | Smooth movement and precise object alignment |
|
| 63 |
|
| 64 |
+
A perfect score requires all four dimensions to be satisfied.
|
| 65 |
+
|
| 66 |
+
### Key Benefits
|
| 67 |
+
|
| 68 |
+
- **Reproducibility and Determinism**: Fully deterministic evaluation avoiding stochastic variability or hallucinations associated with LLM-based judgments.
|
| 69 |
+
- **Granular Verifiability**: Each task is decomposed into interpretable vectors, allowing precise measurement of spatial, temporal, and logical correctness at the pixel or object-property level.
|
| 70 |
+
- **Transparent Diagnosis**: By explicitly encoding reasoning constraints, the benchmark not only ranks models but also reveals systematic capability gaps and cross-domain performance trends.
|
| 71 |
+
|
| 72 |
+
### Model Categories
|
| 73 |
+
- 👤 **Reference**: Human performance baseline
|
| 74 |
+
- 🟢 **Open-source**: Publicly available models
|
| 75 |
+
- 🔵 **Proprietary**: Commercial/closed-source models
|
| 76 |
+
- ⭐ **Strong Baseline**: Data scaling strong baseline (VBVR-Wan2.2)
|
| 77 |
"""
|
| 78 |
|
| 79 |
EVALUATION_QUEUE_TEXT = """
|
| 80 |
+
## How to Submit Your Results
|
| 81 |
+
|
| 82 |
+
We welcome submissions from the research community! To submit your model's evaluation results to the VBVR-Bench leaderboard:
|
| 83 |
+
|
| 84 |
+
### Submission Process
|
| 85 |
+
|
| 86 |
+
📧 **Email your submission to: [C200210@e.ntu.edu.sg](mailto:C200210@e.ntu.edu.sg)**
|
| 87 |
+
|
| 88 |
+
Please include the following in your submission:
|
| 89 |
|
| 90 |
+
### Required Materials
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
1. **Model Information**
|
| 93 |
+
- Model name and version
|
| 94 |
+
- Model type (Open-source / Proprietary)
|
| 95 |
+
- Link to model (if publicly available)
|
| 96 |
+
- Brief model description
|
| 97 |
|
| 98 |
+
2. **Evaluation Results**
|
| 99 |
+
- Complete evaluation scores in JSON format
|
| 100 |
+
- Scores for all 100 tasks (50 ID + 50 OOD)
|
| 101 |
+
- Category-wise breakdown (Abstraction, Knowledge, Perception, Spatiality, Transformation)
|
| 102 |
|
| 103 |
+
3. **Evaluation Logs**
|
| 104 |
+
- Full evaluation logs for verification
|
| 105 |
+
- Generated videos for a subset of tasks (optional but recommended)
|
| 106 |
|
| 107 |
+
4. **Technical Details**
|
| 108 |
+
- Inference configuration (resolution, frame rate, etc.)
|
| 109 |
+
- Hardware used for generation
|
| 110 |
+
- Any preprocessing or postprocessing applied
|
| 111 |
|
| 112 |
+
We will review your submission and add it to the leaderboard within 1-2 weeks.
|
|
|
|
|
|
|
|
|
|
| 113 |
"""
|
| 114 |
|
| 115 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 116 |
CITATION_BUTTON_TEXT = r"""
|
| 117 |
+
@article{vbvr2026,
|
| 118 |
+
title={A Very Big Video Reasoning Suite},
|
| 119 |
+
author={Wang, Maijunxian and Wang, Ruisi and Lin, Juyi and Ji, Ran and Wiedemer, Thaddäus and Gao, Qingying and Luo, Dezhi and Qian, Yaoyao and Huang, Lianyu and Hong, Zelong and Ge, Jiahui and Ma, Qianli and He, Hang and Zhou, Yifan and Guo, Lingzi and Mei, Lantao and Li, Jiachen and Xing, Hanwen and Zhao, Tianqi and Yu, Fengyuan and Xiao, Weihang and Jiao, Yizheng and Hou, Jianheng and Zhang, Danyang and Xu, Pengcheng and Zhong, Boyang and Zhao, Zehong and Fang, Gaoyun and Kitaoka, John and Xu, Yile and Xu, Hua and Blacutt, Kenton and Nguyen, Tin and Song, Siyuan and Sun, Haoran and Wen, Shaoyue and He, Linyang and Wang, Runming and Wang, Yanzhi and Yang, Mengyue and Ma, Ziqiao and Millière, Raphaël and Shi, Freda and Vasconcelos, Nuno and Khashabi, Daniel and Yuille, Alan and Du, Yilun and Liu, Ziming and Lin, Dahua and Liu, Ziwei and Kumar, Vikash and Li, Yijiang and Yang, Lei and Cai, Zhongang and Deng, Hokin},
|
| 120 |
+
year={2026}
|
| 121 |
+
}
|
| 122 |
"""
|