Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -56,7 +56,7 @@ OMNI_MLLM_RENAME = {
|
|
| 56 |
}
|
| 57 |
|
| 58 |
AUDIO_RENAME = {
|
| 59 |
-
"Task1-3": "Audio
|
| 60 |
}
|
| 61 |
|
| 62 |
IMAGE_GEN_RENAME = {
|
|
@@ -218,6 +218,29 @@ with gr.Blocks(
|
|
| 218 |
min-width: 120px;
|
| 219 |
max-width: 120px;
|
| 220 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
""") as demo:
|
| 222 |
gr.Markdown(
|
| 223 |
"""
|
|
@@ -296,7 +319,7 @@ with gr.Blocks(
|
|
| 296 |
|
| 297 |
# ---------- Audio Reasoning ----------
|
| 298 |
with gr.Tab("🎵 Audio Reasoning"):
|
| 299 |
-
gr.Markdown("Evaluation results for
|
| 300 |
|
| 301 |
df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
|
| 302 |
df_aud = df_aud.rename(columns=AUDIO_RENAME)
|
|
@@ -317,5 +340,53 @@ with gr.Blocks(
|
|
| 317 |
),
|
| 318 |
outputs=[omni_table, image_table, video_table, audio_table],
|
| 319 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
-
demo.launch()
|
|
|
|
| 56 |
}
|
| 57 |
|
| 58 |
AUDIO_RENAME = {
|
| 59 |
+
"Task1-3": "Audio Reasoning"
|
| 60 |
}
|
| 61 |
|
| 62 |
IMAGE_GEN_RENAME = {
|
|
|
|
| 218 |
min-width: 120px;
|
| 219 |
max-width: 120px;
|
| 220 |
}
|
| 221 |
+
|
| 222 |
+
.overall-definition {
|
| 223 |
+
max-width: 900px;
|
| 224 |
+
margin: 30px auto 40px auto;
|
| 225 |
+
padding: 22px 28px;
|
| 226 |
+
background: #f9fafb;
|
| 227 |
+
border: 1px solid #e5e7eb;
|
| 228 |
+
border-radius: 14px;
|
| 229 |
+
font-size: 15px;
|
| 230 |
+
line-height: 1.7;
|
| 231 |
+
color: #1f2937;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
.overall-definition h3 {
|
| 235 |
+
text-align: center;
|
| 236 |
+
font-size: 22px;
|
| 237 |
+
margin-bottom: 16px;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.overall-definition strong {
|
| 241 |
+
color: #111827;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
""") as demo:
|
| 245 |
gr.Markdown(
|
| 246 |
"""
|
|
|
|
| 319 |
|
| 320 |
# ---------- Audio Reasoning ----------
|
| 321 |
with gr.Tab("🎵 Audio Reasoning"):
|
| 322 |
+
gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")
|
| 323 |
|
| 324 |
df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
|
| 325 |
df_aud = df_aud.rename(columns=AUDIO_RENAME)
|
|
|
|
| 340 |
),
|
| 341 |
outputs=[omni_table, image_table, video_table, audio_table],
|
| 342 |
)
|
| 343 |
+
|
| 344 |
+
gr.Markdown(
|
| 345 |
+
"""
|
| 346 |
+
<div class="overall-definition">
|
| 347 |
+
|
| 348 |
+
<h3>📊 Overall Score Definition</h3>
|
| 349 |
+
|
| 350 |
+
<p>
|
| 351 |
+
To facilitate clearer and more consistent comparison across models, we introduce an
|
| 352 |
+
<b>Overall</b> score for each leaderboard track. The aggregation strategy is tailored
|
| 353 |
+
to the evaluation protocol of each task category:
|
| 354 |
+
</p>
|
| 355 |
+
|
| 356 |
+
<p><b>1. OmniLLM / MLLM</b><br>
|
| 357 |
+
The <b>Overall</b> score is computed as the arithmetic mean of all reported task-specific scores.
|
| 358 |
+
</p>
|
| 359 |
+
|
| 360 |
+
<p><b>2. Image Generation</b><br>
|
| 361 |
+
The evaluation involves metrics defined on different numerical scales.
|
| 362 |
+
<b>WIScore</b> is used for image generation, while <b>VIEScore</b> (averaged over three dimensions)
|
| 363 |
+
is used for image editing.
|
| 364 |
+
</p>
|
| 365 |
+
|
| 366 |
+
<p>
|
| 367 |
+
The <b>Overall</b> score is defined as:
|
| 368 |
+
</p>
|
| 369 |
+
|
| 370 |
+
<p style="text-align:center; font-size:16px;">
|
| 371 |
+
\\[
|
| 372 |
+
\\text{Overall} = \\frac{(\\text{WIScore} \\times 10) + \\left(\\frac{\\sum \\text{VIEScore}}{3}\\right)}{2}
|
| 373 |
+
\\]
|
| 374 |
+
</p>
|
| 375 |
+
|
| 376 |
+
<p>
|
| 377 |
+
This normalization-based formulation ensures a balanced contribution from both image generation
|
| 378 |
+
and image editing performance.
|
| 379 |
+
</p>
|
| 380 |
+
|
| 381 |
+
<p><b>3. Video Generation</b><br>
|
| 382 |
+
The <b>Overall</b> score is calculated as the arithmetic mean of all evaluated dimensions,
|
| 383 |
+
including imaging quality, aesthetics, motion, and temporal consistency.
|
| 384 |
+
</p>
|
| 385 |
+
|
| 386 |
+
</div>
|
| 387 |
+
""",
|
| 388 |
+
unsafe_allow_html=True,
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
demo.launch()
|
| 392 |
|
|
|