Spaces:

Fysics-AI
/

FysicsWorld-LeaderBoard

Running

App Files Community

FRENKIE-CHIANG committed 25 days ago

Commit

affe8ae

verified ·

1 Parent(s): 7252a59

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +74 -3

app.py CHANGED Viewed

@@ -56,7 +56,7 @@ OMNI_MLLM_RENAME = {
 }
 AUDIO_RENAME = {
-    "Task1-3": "Audio\nReasoning"
 }
 IMAGE_GEN_RENAME = {
@@ -218,6 +218,29 @@ with gr.Blocks(
         min-width: 120px;
         max-width: 120px;
     }
     """) as demo:
     gr.Markdown(
         """
@@ -296,7 +319,7 @@ with gr.Blocks(
         # ---------- Audio Reasoning ----------
         with gr.Tab("🎵 Audio Reasoning"):
-            gr.Markdown("Evaluation results for audio reasoning models.")
             df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
             df_aud = df_aud.rename(columns=AUDIO_RENAME)
@@ -317,5 +340,53 @@ with gr.Blocks(
         ),
         outputs=[omni_table, image_table, video_table, audio_table],
     )
-demo.launch()

 }
 AUDIO_RENAME = {
+    "Task1-3": "Audio Reasoning"
 }
 IMAGE_GEN_RENAME = {
         min-width: 120px;
         max-width: 120px;
     }
+    .overall-definition {
+        max-width: 900px;
+        margin: 30px auto 40px auto;
+        padding: 22px 28px;
+        background: #f9fafb;
+        border: 1px solid #e5e7eb;
+        border-radius: 14px;
+        font-size: 15px;
+        line-height: 1.7;
+        color: #1f2937;
+    }
+    .overall-definition h3 {
+        text-align: center;
+        font-size: 22px;
+        margin-bottom: 16px;
+    }
+    .overall-definition strong {
+        color: #111827;
+    }
     """) as demo:
     gr.Markdown(
         """
         # ---------- Audio Reasoning ----------
         with gr.Tab("🎵 Audio Reasoning"):
+            gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")
             df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
             df_aud = df_aud.rename(columns=AUDIO_RENAME)
         ),
         outputs=[omni_table, image_table, video_table, audio_table],
     )
+    gr.Markdown(
+        """
+        <div class="overall-definition">
+        <h3>📊 Overall Score Definition</h3>
+        <p>
+        To facilitate clearer and more consistent comparison across models, we introduce an
+        <b>Overall</b> score for each leaderboard track. The aggregation strategy is tailored
+        to the evaluation protocol of each task category:
+        </p>
+        <p><b>1. OmniLLM / MLLM</b><br>
+        The <b>Overall</b> score is computed as the arithmetic mean of all reported task-specific scores.
+        </p>
+        <p><b>2. Image Generation</b><br>
+        The evaluation involves metrics defined on different numerical scales.
+        <b>WIScore</b> is used for image generation, while <b>VIEScore</b> (averaged over three dimensions)
+        is used for image editing.
+        </p>
+        <p>
+        The <b>Overall</b> score is defined as:
+        </p>
+        <p style="text-align:center; font-size:16px;">
+        \\[
+        \\text{Overall} = \\frac{(\\text{WIScore} \\times 10) + \\left(\\frac{\\sum \\text{VIEScore}}{3}\\right)}{2}
+        \\]
+        </p>
+        <p>
+        This normalization-based formulation ensures a balanced contribution from both image generation
+        and image editing performance.
+        </p>
+        <p><b>3. Video Generation</b><br>
+        The <b>Overall</b> score is calculated as the arithmetic mean of all evaluated dimensions,
+        including imaging quality, aesthetics, motion, and temporal consistency.
+        </p>
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+demo.launch()