wangruisi1 committed on
Commit
c49b672
·
1 Parent(s): ecf3e7e

Init Commit

Browse files
Files changed (2) hide show
  1. app.py +182 -170
  2. src/about.py +77 -27
app.py CHANGED
@@ -1,8 +1,6 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
@@ -13,181 +11,198 @@ from src.about import (
13
  TITLE,
14
  )
15
  from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
90
 
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
  with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
 
 
 
 
 
 
 
 
 
 
146
 
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
190
 
 
 
 
 
 
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
@@ -198,7 +213,4 @@ with demo:
198
  show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ from collections import OrderedDict
 
4
 
5
  from src.about import (
6
  CITATION_BUTTON_LABEL,
 
11
  TITLE,
12
  )
13
  from src.display.css_html_js import custom_css
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
# ============================================================
# Static Leaderboard Data for VBVR-Bench
# ============================================================

# Column group definitions (ordered for display).
# Keys are the labels offered in the column-group selector; values are the
# DataFrame column names belonging to that group. The ordering here also
# fixes the left-to-right column order of the leaderboard table.
COLUMN_GROUPS = OrderedDict([
    ("Overall", ["Overall"]),
    ("Overall by Category", [
        "Abst.(All)", "Know.(All)", "Perc.(All)", "Spat.(All)", "Trans.(All)",
    ]),
    ("In-Domain (ID)", ["ID"]),
    ("In-Domain by Category", [
        "Abst.(ID)", "Know.(ID)", "Perc.(ID)", "Spat.(ID)", "Trans.(ID)",
    ]),
    ("Out-of-Domain (OOD)", ["OOD"]),
    ("Out-of-Domain by Category", [
        "Abst.(OOD)", "Know.(OOD)", "Perc.(OOD)", "Spat.(OOD)", "Trans.(OOD)",
    ]),
])

# Default column groups to show (matching LaTeX table layout).
# "Overall by Category" is intentionally omitted from the default view.
DEFAULT_GROUPS = [
    "Overall",
    "In-Domain (ID)",
    "In-Domain by Category",
    "Out-of-Domain (OOD)",
    "Out-of-Domain by Category",
]

# Columns always shown regardless of group selection (model identity columns).
ALWAYS_VISIBLE_COLS = ["Model", "Type"]
47
+
48
# ============================================================
# Static model scores data
# ============================================================
# One dict per leaderboard row. Keys must match ALWAYS_VISIBLE_COLS plus the
# column names listed in COLUMN_GROUPS. Scores appear to be fractions in
# [0, 1] (higher is better) — presumably benchmark accuracy; confirm against
# the evaluation pipeline.
MODELS_DATA = [
    {
        "Model": "Human",
        "Type": "👤 Reference",
        "Overall": 0.974, "ID": 0.960, "OOD": 0.988,
        "Abst.(All)": 0.947, "Know.(All)": 0.972, "Perc.(All)": 0.994, "Spat.(All)": 0.969, "Trans.(All)": 0.981,
        "Abst.(ID)": 0.919, "Know.(ID)": 0.956, "Perc.(ID)": 1.000, "Spat.(ID)": 0.950, "Trans.(ID)": 1.000,
        "Abst.(OOD)": 1.000, "Know.(OOD)": 1.000, "Perc.(OOD)": 0.990, "Spat.(OOD)": 1.000, "Trans.(OOD)": 0.970,
    },
    # ---- Open-source Models ----
    {
        "Model": "CogVideoX1.5-5B-I2V",
        "Type": "🟢 Open-source",
        "Overall": 0.2727, "ID": 0.2831, "OOD": 0.2623,
        "Abst.(All)": 0.2548, "Know.(All)": 0.2952, "Perc.(All)": 0.2525, "Spat.(All)": 0.2996, "Trans.(All)": 0.2903,
        "Abst.(ID)": 0.2408, "Know.(ID)": 0.3285, "Perc.(ID)": 0.2567, "Spat.(ID)": 0.3281, "Trans.(ID)": 0.3051,
        "Abst.(OOD)": 0.2809, "Know.(OOD)": 0.2352, "Perc.(OOD)": 0.2501, "Spat.(OOD)": 0.2539, "Trans.(OOD)": 0.2824,
    },
    {
        "Model": "HunyuanVideo-I2V",
        "Type": "🟢 Open-source",
        "Overall": 0.2726, "ID": 0.2799, "OOD": 0.2653,
        "Abst.(All)": 0.1956, "Know.(All)": 0.3614, "Perc.(All)": 0.2910, "Spat.(All)": 0.2698, "Trans.(All)": 0.2733,
        "Abst.(ID)": 0.2068, "Know.(ID)": 0.3573, "Perc.(ID)": 0.2933, "Spat.(ID)": 0.2802, "Trans.(ID)": 0.3160,
        "Abst.(OOD)": 0.1747, "Know.(OOD)": 0.3688, "Perc.(OOD)": 0.2897, "Spat.(OOD)": 0.2530, "Trans.(OOD)": 0.2502,
    },
    {
        "Model": "Wan2.2-I2V-A14B",
        "Type": "🟢 Open-source",
        "Overall": 0.3714, "ID": 0.4125, "OOD": 0.3287,
        "Abst.(All)": 0.4212, "Know.(All)": 0.3556, "Perc.(All)": 0.3710, "Spat.(All)": 0.3397, "Trans.(All)": 0.3465,
        "Abst.(ID)": 0.4301, "Know.(ID)": 0.3823, "Perc.(ID)": 0.4147, "Spat.(ID)": 0.4043, "Trans.(ID)": 0.4192,
        "Abst.(OOD)": 0.4046, "Know.(OOD)": 0.3077, "Perc.(OOD)": 0.3427, "Spat.(OOD)": 0.2364, "Trans.(OOD)": 0.3073,
    },
    {
        "Model": "LTX-2",
        "Type": "🟢 Open-source",
        "Overall": 0.3129, "ID": 0.3287, "OOD": 0.2971,
        "Abst.(All)": 0.2908, "Know.(All)": 0.3531, "Perc.(All)": 0.3200, "Spat.(All)": 0.2980, "Trans.(All)": 0.3093,
        "Abst.(ID)": 0.3156, "Know.(ID)": 0.3621, "Perc.(ID)": 0.3257, "Spat.(ID)": 0.3399, "Trans.(ID)": 0.3060,
        "Abst.(OOD)": 0.2444, "Know.(OOD)": 0.3369, "Perc.(OOD)": 0.3167, "Spat.(OOD)": 0.2308, "Trans.(OOD)": 0.3110,
    },
    # ---- Proprietary Models ----
    {
        "Model": "Runway Gen-4 Turbo",
        "Type": "🔵 Proprietary",
        "Overall": 0.4031, "ID": 0.3920, "OOD": 0.4141,
        "Abst.(All)": 0.4370, "Know.(All)": 0.4165, "Perc.(All)": 0.4223, "Spat.(All)": 0.3357, "Trans.(All)": 0.3696,
        "Abst.(ID)": 0.3956, "Know.(ID)": 0.4094, "Perc.(ID)": 0.4288, "Spat.(ID)": 0.3409, "Trans.(ID)": 0.3629,
        "Abst.(OOD)": 0.5147, "Know.(OOD)": 0.4294, "Perc.(OOD)": 0.4185, "Spat.(OOD)": 0.3274, "Trans.(OOD)": 0.3733,
    },
    {
        "Model": "Sora 2",
        "Type": "🔵 Proprietary",
        "Overall": 0.5457, "ID": 0.5691, "OOD": 0.5225,
        "Abst.(All)": 0.5824, "Know.(All)": 0.4749, "Perc.(All)": 0.5458, "Spat.(All)": 0.5298, "Trans.(All)": 0.5640,
        "Abst.(ID)": 0.6023, "Know.(ID)": 0.4767, "Perc.(ID)": 0.5810, "Spat.(ID)": 0.5720, "Trans.(ID)": 0.5967,
        "Abst.(OOD)": 0.5462, "Know.(OOD)": 0.4715, "Perc.(OOD)": 0.5254, "Spat.(OOD)": 0.4623, "Trans.(OOD)": 0.5465,
    },
    {
        "Model": "Kling 2.6",
        "Type": "🔵 Proprietary",
        "Overall": 0.3691, "ID": 0.4082, "OOD": 0.3300,
        "Abst.(All)": 0.4866, "Know.(All)": 0.2556, "Perc.(All)": 0.3095, "Spat.(All)": 0.3504, "Trans.(All)": 0.4149,
        "Abst.(ID)": 0.4647, "Know.(ID)": 0.3225, "Perc.(ID)": 0.3749, "Spat.(ID)": 0.3471, "Trans.(ID)": 0.5193,
        "Abst.(OOD)": 0.5277, "Know.(OOD)": 0.1350, "Perc.(OOD)": 0.2717, "Spat.(OOD)": 0.3556, "Trans.(OOD)": 0.3588,
    },
    {
        "Model": "Veo 3.1",
        "Type": "🔵 Proprietary",
        "Overall": 0.4800, "ID": 0.5307, "OOD": 0.4288,
        "Abst.(All)": 0.5991, "Know.(All)": 0.4225, "Perc.(All)": 0.4568, "Spat.(All)": 0.4430, "Trans.(All)": 0.4413,
        "Abst.(ID)": 0.6109, "Know.(ID)": 0.5032, "Perc.(ID)": 0.5196, "Spat.(ID)": 0.4443, "Trans.(ID)": 0.5103,
        "Abst.(OOD)": 0.5770, "Know.(OOD)": 0.2772, "Perc.(OOD)": 0.4204, "Spat.(OOD)": 0.4406, "Trans.(OOD)": 0.4041,
    },
    # ---- Data Scaling Strong Baseline ----
    {
        "Model": "VBVR-Wan2.2",
        "Type": "⭐ Strong Baseline",
        "Overall": 0.6848, "ID": 0.7599, "OOD": 0.6097,
        "Abst.(All)": 0.7394, "Know.(All)": 0.6864, "Perc.(All)": 0.6333, "Spat.(All)": 0.6960, "Trans.(All)": 0.6909,
        "Abst.(ID)": 0.7240, "Know.(ID)": 0.7500, "Perc.(ID)": 0.7817, "Spat.(ID)": 0.7446, "Trans.(ID)": 0.8327,
        "Abst.(OOD)": 0.7682, "Know.(OOD)": 0.5720, "Perc.(OOD)": 0.5474, "Spat.(OOD)": 0.6182, "Trans.(OOD)": 0.6145,
    },
]
136
+
137
+
138
def build_full_dataframe(data=None, column_groups=None, leading_cols=None):
    """Build the complete leaderboard DataFrame, sorted by "Overall" descending.

    All arguments are optional and default to the module-level static tables,
    so the original zero-argument call ``build_full_dataframe()`` is unchanged.

    Args:
        data: List of per-model score dicts (one per row). Defaults to
            ``MODELS_DATA``.
        column_groups: Mapping of group label -> list of score column names;
            its iteration order fixes the score-column order. Defaults to
            ``COLUMN_GROUPS``.
        leading_cols: Identity columns placed before all score columns.
            Defaults to ``ALWAYS_VISIBLE_COLS``.

    Returns:
        pandas.DataFrame with the leading columns first, score columns in
        group order, rows sorted by "Overall" descending (index reset), and
        all numeric values rounded to 3 decimal places for clean display.
    """
    # Backward-compatible fallbacks to the module-level static tables.
    if data is None:
        data = MODELS_DATA
    if column_groups is None:
        column_groups = COLUMN_GROUPS
    if leading_cols is None:
        leading_cols = ALWAYS_VISIBLE_COLS

    df = pd.DataFrame(data)

    # Enforce column order: identity columns first, then groups in defined order.
    ordered_cols = list(leading_cols)
    for group_cols in column_groups.values():
        ordered_cols.extend(group_cols)
    df = df[ordered_cols]

    # Rank models by overall score (best first).
    df = df.sort_values("Overall", ascending=False).reset_index(drop=True)

    # Round numeric columns to 3 decimal places for clean display.
    numeric_cols = df.select_dtypes(include="number").columns
    df[numeric_cols] = df[numeric_cols].round(3)
    return df
152
+
153
+
154
# Materialize the static leaderboard once at import time; column filtering
# later slices views of this single shared frame.
FULL_DF = build_full_dataframe()
155
+
156
+
157
def get_filtered_df(selected_groups):
    """Return FULL_DF restricted to the selected column groups.

    The always-visible identity columns are kept unconditionally; score
    columns are included for each group whose label appears in
    *selected_groups*. An empty/None selection falls back to "Overall" so
    the table never loses its ranking column.
    """
    chosen = selected_groups or ["Overall"]  # always show at least Overall
    visible = list(ALWAYS_VISIBLE_COLS) + [
        col
        for label, group_cols in COLUMN_GROUPS.items()
        if label in chosen
        for col in group_cols
    ]
    return FULL_DF[visible]
168
+
169
+
170
+ # ============================================================
171
+ # Gradio Interface
172
+ # ============================================================
173
  demo = gr.Blocks(css=custom_css)
174
  with demo:
175
  gr.HTML(TITLE)
176
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
177
 
178
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
179
+ with gr.TabItem("🏅 VBVR-Bench Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  with gr.Row():
181
+ column_selector = gr.CheckboxGroup(
182
+ choices=list(COLUMN_GROUPS.keys()),
183
+ value=DEFAULT_GROUPS,
184
+ label="Select Column Groups to Display:",
185
+ interactive=True,
186
+ )
187
+
188
+ leaderboard_table = gr.Dataframe(
189
+ value=get_filtered_df(DEFAULT_GROUPS),
190
+ interactive=False,
191
+ elem_id="leaderboard-table",
192
+ )
193
 
194
+ column_selector.change(
195
+ fn=get_filtered_df,
196
+ inputs=[column_selector],
197
+ outputs=[leaderboard_table],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  )
199
 
200
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
201
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
202
+
203
+ with gr.TabItem("🚀 Submit", elem_id="llm-benchmark-tab-submit", id=2):
204
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
205
+
206
  with gr.Row():
207
  with gr.Accordion("📙 Citation", open=False):
208
  citation_button = gr.Textbox(
 
213
  show_copy_button=True,
214
  )
215
 
216
+ demo.queue(default_concurrency_limit=40).launch()
 
 
 
src/about.py CHANGED
@@ -11,7 +11,7 @@ class Task:
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  task0 = Task("anli_r1", "acc", "ANLI")
16
  task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
@@ -21,52 +21,102 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
 
 
 
 
 
 
 
 
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
 
 
 
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
 
 
 
 
 
 
 
 
42
 
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
 
 
 
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
 
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
72
  """
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  task0 = Task("anli_r1", "acc", "ANLI")
16
  task1 = Task("logiqa", "acc_norm", "LogiQA")
17
 
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">VBVR-Bench Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ **VBVR-Bench** is a comprehensive benchmark for evaluating **video reasoning capabilities**.
29
+
30
+ To systematically assess model reasoning capabilities, VBVR-Bench employs a **dual-split evaluation strategy** across **100 diverse tasks**:
31
+ - **In-Domain (ID)**: 50 tasks that overlap with training categories but differ in unseen parameter configurations and sample instances, testing *in-domain generalization*.
32
+ - **Out-of-Domain (OOD)**: 50 entirely novel tasks designed to measure *out-of-domain generalization*, testing whether models acquire transferable reasoning primitives rather than relying on task-specific memorization.
33
+
34
+ Each task consists of **5 test samples**, enabling statistically robust evaluation across diverse reasoning scenarios.
35
+
36
+ Use the column group selector below to customize which score groups are displayed.
37
  """
38
 
39
  # Which evaluations are you running? how can people reproduce what you have?
40
  LLM_BENCHMARKS_TEXT = f"""
41
+ ## About VBVR-Bench
42
+
43
+ ### Rule-Based Evaluation Framework
44
+
45
+ A key feature of VBVR-Bench is its fully **rule-based evaluation framework**. Most test tasks have a unique, verifiable correct answer, allowing interpretable evaluation based on spatial position, color, object identity, path, or logical outcome. Geometric, physical, and deductive constraints are also considered in the scoring rubrics.
46
+
47
+ Each of the 100 test tasks is paired with a dedicated evaluation rule, with scores on multiple aspects to compute a weighted, comprehensive score. Sub-criteria include:
48
+ - **Spatial Accuracy**: Correctness of object positions and arrangements
49
+ - **Trajectory Correctness**: Validity of movement paths
50
+ - **Temporal Consistency**: Smooth frame-by-frame progression
51
+ - **Logical Validity**: Adherence to task-specific reasoning constraints
52
+
53
+ ### Example: Task G-45 (Key Door Matching)
54
+
55
+ A green dot agent must first locate a color-specified key and then navigate to the matching door within a grid maze. Performance is scored across four weighted dimensions:
56
 
57
+ | Dimension | Weight | Description |
58
+ |-----------|--------|-------------|
59
+ | Target Identification | 30% | Correct key and door selection without color confusion |
60
+ | Path Validity | 30% | Following allowed paths without wall collisions |
61
+ | Path Efficiency | 20% | Comparison to optimal BFS path |
62
+ | Animation Quality | 20% | Smooth movement and precise object alignment |
63
 
64
+ A perfect score requires all four dimensions to be satisfied.
65
+
66
+ ### Key Benefits
67
+
68
+ - **Reproducibility and Determinism**: Fully deterministic evaluation avoiding stochastic variability or hallucinations associated with LLM-based judgments.
69
+ - **Granular Verifiability**: Each task is decomposed into interpretable vectors, allowing precise measurement of spatial, temporal, and logical correctness at the pixel or object-property level.
70
+ - **Transparent Diagnosis**: By explicitly encoding reasoning constraints, the benchmark not only ranks models but also reveals systematic capability gaps and cross-domain performance trends.
71
+
72
+ ### Model Categories
73
+ - 👤 **Reference**: Human performance baseline
74
+ - 🟢 **Open-source**: Publicly available models
75
+ - 🔵 **Proprietary**: Commercial/closed-source models
76
+ - ⭐ **Strong Baseline**: Data scaling strong baseline (VBVR-Wan2.2)
77
  """
78
 
79
  EVALUATION_QUEUE_TEXT = """
80
+ ## How to Submit Your Results
81
+
82
+ We welcome submissions from the research community! To submit your model's evaluation results to the VBVR-Bench leaderboard:
83
+
84
+ ### Submission Process
85
+
86
+ 📧 **Email your submission to: [C200210@e.ntu.edu.sg](mailto:C200210@e.ntu.edu.sg)**
87
+
88
+ Please include the following in your submission:
89
 
90
+ ### Required Materials
 
 
 
 
 
 
 
91
 
92
+ 1. **Model Information**
93
+ - Model name and version
94
+ - Model type (Open-source / Proprietary)
95
+ - Link to model (if publicly available)
96
+ - Brief model description
97
 
98
+ 2. **Evaluation Results**
99
+ - Complete evaluation scores in JSON format
100
+ - Scores for all 100 tasks (50 ID + 50 OOD)
101
+ - Category-wise breakdown (Abstraction, Knowledge, Perception, Spatiality, Transformation)
102
 
103
+ 3. **Evaluation Logs**
104
+ - Full evaluation logs for verification
105
+ - Generated videos for a subset of tasks (optional but recommended)
106
 
107
+ 4. **Technical Details**
108
+ - Inference configuration (resolution, frame rate, etc.)
109
+ - Hardware used for generation
110
+ - Any preprocessing or postprocessing applied
111
 
112
+ We will review your submission and add it to the leaderboard within 1-2 weeks.
 
 
 
113
  """
114
 
115
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
116
  CITATION_BUTTON_TEXT = r"""
117
+ @article{vbvr2026,
118
+ title={A Very Big Video Reasoning Suite},
119
+ author={Wang, Maijunxian and Wang, Ruisi and Lin, Juyi and Ji, Ran and Wiedemer, Thaddäus and Gao, Qingying and Luo, Dezhi and Qian, Yaoyao and Huang, Lianyu and Hong, Zelong and Ge, Jiahui and Ma, Qianli and He, Hang and Zhou, Yifan and Guo, Lingzi and Mei, Lantao and Li, Jiachen and Xing, Hanwen and Zhao, Tianqi and Yu, Fengyuan and Xiao, Weihang and Jiao, Yizheng and Hou, Jianheng and Zhang, Danyang and Xu, Pengcheng and Zhong, Boyang and Zhao, Zehong and Fang, Gaoyun and Kitaoka, John and Xu, Yile and Xu, Hua and Blacutt, Kenton and Nguyen, Tin and Song, Siyuan and Sun, Haoran and Wen, Shaoyue and He, Linyang and Wang, Runming and Wang, Yanzhi and Yang, Mengyue and Ma, Ziqiao and Millière, Raphaël and Shi, Freda and Vasconcelos, Nuno and Khashabi, Daniel and Yuille, Alan and Du, Yilun and Liu, Ziming and Lin, Dahua and Liu, Ziwei and Kumar, Vikash and Li, Yijiang and Yang, Lei and Cai, Zhongang and Deng, Hokin},
120
+ year={2026}
121
+ }
122
  """