FRENKIE-CHIANG committed on
Commit
affe8ae
·
verified ·
1 Parent(s): 7252a59

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +74 -3
app.py CHANGED
@@ -56,7 +56,7 @@ OMNI_MLLM_RENAME = {
56
  }
57
 
58
  AUDIO_RENAME = {
59
- "Task1-3": "Audio\nReasoning"
60
  }
61
 
62
  IMAGE_GEN_RENAME = {
@@ -218,6 +218,29 @@ with gr.Blocks(
218
  min-width: 120px;
219
  max-width: 120px;
220
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  """) as demo:
222
  gr.Markdown(
223
  """
@@ -296,7 +319,7 @@ with gr.Blocks(
296
 
297
  # ---------- Audio Reasoning ----------
298
  with gr.Tab("🎵 Audio Reasoning"):
299
- gr.Markdown("Evaluation results for audio reasoning models.")
300
 
301
  df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
302
  df_aud = df_aud.rename(columns=AUDIO_RENAME)
@@ -317,5 +340,53 @@ with gr.Blocks(
317
  ),
318
  outputs=[omni_table, image_table, video_table, audio_table],
319
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- demo.launch()
 
56
  }
57
 
58
  AUDIO_RENAME = {
59
+ "Task1-3": "Audio Reasoning"
60
  }
61
 
62
  IMAGE_GEN_RENAME = {
 
218
  min-width: 120px;
219
  max-width: 120px;
220
  }
221
+
222
+ .overall-definition {
223
+ max-width: 900px;
224
+ margin: 30px auto 40px auto;
225
+ padding: 22px 28px;
226
+ background: #f9fafb;
227
+ border: 1px solid #e5e7eb;
228
+ border-radius: 14px;
229
+ font-size: 15px;
230
+ line-height: 1.7;
231
+ color: #1f2937;
232
+ }
233
+
234
+ .overall-definition h3 {
235
+ text-align: center;
236
+ font-size: 22px;
237
+ margin-bottom: 16px;
238
+ }
239
+
240
+ .overall-definition strong {
241
+ color: #111827;
242
+ }
243
+
244
  """) as demo:
245
  gr.Markdown(
246
  """
 
319
 
320
  # ---------- Audio Reasoning ----------
321
  with gr.Tab("🎵 Audio Reasoning"):
322
+ gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")
323
 
324
  df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
325
  df_aud = df_aud.rename(columns=AUDIO_RENAME)
 
340
  ),
341
  outputs=[omni_table, image_table, video_table, audio_table],
342
  )
343
+
344
+ gr.Markdown(
345
+ """
346
+ <div class="overall-definition">
347
+
348
+ <h3>📊 Overall Score Definition</h3>
349
+
350
+ <p>
351
+ To facilitate clearer and more consistent comparison across models, we introduce an
352
+ <b>Overall</b> score for each leaderboard track. The aggregation strategy is tailored
353
+ to the evaluation protocol of each task category:
354
+ </p>
355
+
356
+ <p><b>1. OmniLLM / MLLM</b><br>
357
+ The <b>Overall</b> score is computed as the arithmetic mean of all reported task-specific scores.
358
+ </p>
359
+
360
+ <p><b>2. Image Generation</b><br>
361
+ The evaluation involves metrics defined on different numerical scales.
362
+ <b>WIScore</b> is used for image generation, while <b>VIEScore</b> (averaged over three dimensions)
363
+ is used for image editing.
364
+ </p>
365
+
366
+ <p>
367
+ The <b>Overall</b> score is defined as:
368
+ </p>
369
+
370
+ <p style="text-align:center; font-size:16px;">
371
+ \\[
372
+ \\text{Overall} = \\frac{(\\text{WIScore} \\times 10) + \\left(\\frac{\\sum \\text{VIEScore}}{3}\\right)}{2}
373
+ \\]
374
+ </p>
375
+
376
+ <p>
377
+ This normalization-based formulation ensures a balanced contribution from both image generation
378
+ and image editing performance.
379
+ </p>
380
+
381
+ <p><b>3. Video Generation</b><br>
382
+ The <b>Overall</b> score is calculated as the arithmetic mean of all evaluated dimensions,
383
+ including imaging quality, aesthetics, motion, and temporal consistency.
384
+ </p>
385
+
386
+ </div>
387
+ """,
388
+ unsafe_allow_html=True,
389
+ )
390
+
391
+ demo.launch()
392