Spaces:
Runtime error
Runtime error
Update tiers.
Browse files
app.py
CHANGED
|
@@ -203,9 +203,9 @@ def get_theme():
|
|
| 203 |
# --- Gradio-based tabs for examples (no JS in HTML) ---
|
| 204 |
def _select_example_tab(choice: str):
|
| 205 |
return (
|
| 206 |
-
gr.update(visible=(choice == "Warmup")),
|
| 207 |
-
gr.update(visible=(choice == "Tier 1")),
|
| 208 |
-
gr.update(visible=(choice == "Tier 2")),
|
| 209 |
)
|
| 210 |
|
| 211 |
|
|
@@ -219,25 +219,25 @@ MODEL_RELEASES = {
|
|
| 219 |
"o3 Pro": "2025-06-10",
|
| 220 |
}
|
| 221 |
|
| 222 |
-
TIER_TOTALS = {"Warmup": 100, "Tier 1": 100, "Tier 2": 20}
|
| 223 |
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
|
| 224 |
|
| 225 |
ACCURACY_PCT = {
|
| 226 |
-
"Warmup": {
|
| 227 |
"GPT-5": 49,
|
| 228 |
"Gemini 2.5 Pro": 30,
|
| 229 |
"Grok 4": 28,
|
| 230 |
"Claude Opus 4": 30,
|
| 231 |
"o3 Pro": 24,
|
| 232 |
},
|
| 233 |
-
"Tier 1": {
|
| 234 |
"GPT-5": 4,
|
| 235 |
"Gemini 2.5 Pro": 0,
|
| 236 |
"Grok 4": 0,
|
| 237 |
"Claude Opus 4": 0,
|
| 238 |
"o3 Pro": 0,
|
| 239 |
},
|
| 240 |
-
"Tier 2": {
|
| 241 |
"GPT-5": 0,
|
| 242 |
"Gemini 2.5 Pro": 0,
|
| 243 |
"Grok 4": 0,
|
|
@@ -301,7 +301,7 @@ def build_accuracy_figure(tier: str):
|
|
| 301 |
return fig
|
| 302 |
|
| 303 |
|
| 304 |
-
_initial_accuracy_fig = build_accuracy_figure("Tier 1")
|
| 305 |
|
| 306 |
# Force light theme even if HF user prefers dark
|
| 307 |
blocks = gr.Blocks(
|
|
@@ -369,7 +369,7 @@ with blocks:
|
|
| 369 |
with gr.Row(elem_id="f1-tier-select-row"):
|
| 370 |
tier_selector = gr.Radio(
|
| 371 |
choices=list(TIER_TOTALS.keys()),
|
| 372 |
-
value="Tier 1",
|
| 373 |
label=None,
|
| 374 |
show_label=False,
|
| 375 |
elem_id="f1-tier-select",
|
|
@@ -461,8 +461,8 @@ with blocks:
|
|
| 461 |
)
|
| 462 |
|
| 463 |
tab_radio = gr.Radio(
|
| 464 |
-
choices=["Warmup", "Tier 1", "Tier 2"],
|
| 465 |
-
value="Warmup",
|
| 466 |
label=None,
|
| 467 |
show_label=False,
|
| 468 |
elem_id="f1-example-radio",
|
|
@@ -492,7 +492,7 @@ with blocks:
|
|
| 492 |
# Evaluation: Warmup figure
|
| 493 |
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
|
| 494 |
gr.Image(
|
| 495 |
-
"assets/warmup_performance.png",
|
| 496 |
width=600,
|
| 497 |
show_label=False,
|
| 498 |
elem_classes=["f1-image"],
|
|
@@ -500,14 +500,16 @@ with blocks:
|
|
| 500 |
show_download_button=False,
|
| 501 |
show_fullscreen_button=False,
|
| 502 |
)
|
| 503 |
-
gr.HTML(
|
|
|
|
|
|
|
| 504 |
|
| 505 |
# Between warmup and tier1 figs
|
| 506 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
|
| 507 |
|
| 508 |
-
#
|
| 509 |
gr.Image(
|
| 510 |
-
"assets/tier1_performance.png",
|
| 511 |
width=600,
|
| 512 |
show_label=False,
|
| 513 |
elem_classes=["f1-image"],
|
|
@@ -516,10 +518,10 @@ with blocks:
|
|
| 516 |
show_fullscreen_button=False,
|
| 517 |
)
|
| 518 |
gr.HTML(
|
| 519 |
-
'<div class="f1-figcaption">Performance of frontier reasoning models on Tier 1 of FormulaOne.</div>'
|
| 520 |
)
|
| 521 |
|
| 522 |
-
# Tail after Tier 1 fig
|
| 523 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
|
| 524 |
|
| 525 |
# Rename tab to "Leaderboard" and cap at 800px width
|
|
|
|
| 203 |
# --- Gradio-based tabs for examples (no JS in HTML) ---
|
| 204 |
def _select_example_tab(choice: str):
|
| 205 |
return (
|
| 206 |
+
gr.update(visible=(choice == "Shallow")),
|
| 207 |
+
gr.update(visible=(choice == "Deeper")),
|
| 208 |
+
gr.update(visible=(choice == "Deepest")),
|
| 209 |
)
|
| 210 |
|
| 211 |
|
|
|
|
| 219 |
"o3 Pro": "2025-06-10",
|
| 220 |
}
|
| 221 |
|
| 222 |
+
TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
|
| 223 |
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
|
| 224 |
|
| 225 |
ACCURACY_PCT = {
|
| 226 |
+
"Shallow Tier": {
|
| 227 |
"GPT-5": 49,
|
| 228 |
"Gemini 2.5 Pro": 30,
|
| 229 |
"Grok 4": 28,
|
| 230 |
"Claude Opus 4": 30,
|
| 231 |
"o3 Pro": 24,
|
| 232 |
},
|
| 233 |
+
"Deeper Tier": {
|
| 234 |
"GPT-5": 4,
|
| 235 |
"Gemini 2.5 Pro": 0,
|
| 236 |
"Grok 4": 0,
|
| 237 |
"Claude Opus 4": 0,
|
| 238 |
"o3 Pro": 0,
|
| 239 |
},
|
| 240 |
+
"Deepest Tier": {
|
| 241 |
"GPT-5": 0,
|
| 242 |
"Gemini 2.5 Pro": 0,
|
| 243 |
"Grok 4": 0,
|
|
|
|
| 301 |
return fig
|
| 302 |
|
| 303 |
|
| 304 |
+
_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")
|
| 305 |
|
| 306 |
# Force light theme even if HF user prefers dark
|
| 307 |
blocks = gr.Blocks(
|
|
|
|
| 369 |
with gr.Row(elem_id="f1-tier-select-row"):
|
| 370 |
tier_selector = gr.Radio(
|
| 371 |
choices=list(TIER_TOTALS.keys()),
|
| 372 |
+
value="Deeper Tier",
|
| 373 |
label=None,
|
| 374 |
show_label=False,
|
| 375 |
elem_id="f1-tier-select",
|
|
|
|
| 461 |
)
|
| 462 |
|
| 463 |
tab_radio = gr.Radio(
|
| 464 |
+
choices=["Shallow", "Deeper", "Deepest"],
|
| 465 |
+
value="Shallow",
|
| 466 |
label=None,
|
| 467 |
show_label=False,
|
| 468 |
elem_id="f1-example-radio",
|
|
|
|
| 492 |
# Evaluation: Warmup figure
|
| 493 |
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
|
| 494 |
gr.Image(
|
| 495 |
+
"assets/shallow_tier_performance.png",
|
| 496 |
width=600,
|
| 497 |
show_label=False,
|
| 498 |
elem_classes=["f1-image"],
|
|
|
|
| 500 |
show_download_button=False,
|
| 501 |
show_fullscreen_button=False,
|
| 502 |
)
|
| 503 |
+
gr.HTML(
|
| 504 |
+
'<div class="f1-figcaption">Performance of frontier models on the FormulaOne-Shallow ("warmup") dataset.</div>'
|
| 505 |
+
)
|
| 506 |
|
| 507 |
# Between warmup and tier1 figs
|
| 508 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
|
| 509 |
|
| 510 |
+
# Deeper tier figure with corrected caption text
|
| 511 |
gr.Image(
|
| 512 |
+
"assets/deeper_tier_performance.png",
|
| 513 |
width=600,
|
| 514 |
show_label=False,
|
| 515 |
elem_classes=["f1-image"],
|
|
|
|
| 518 |
show_fullscreen_button=False,
|
| 519 |
)
|
| 520 |
gr.HTML(
|
| 521 |
+
'<div class="f1-figcaption">Performance of frontier reasoning models on the Deeper Tier of FormulaOne.</div>'
|
| 522 |
)
|
| 523 |
|
| 524 |
+
# Tail after Deeper Tier fig
|
| 525 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
|
| 526 |
|
| 527 |
# Rename tab to "Leaderboard" and cap at 800px width
|
assets/{tier1_performance.png → deeper_tier_performance.png}
RENAMED
|
File without changes
|
assets/{warmup_performance.png → shallow_tier_performance.png}
RENAMED
|
File without changes
|
src/about.py
CHANGED
|
@@ -23,17 +23,17 @@ WHAT_IS_F1_HTML_TOP = f"""
|
|
| 23 |
<div class="f1-grid-cell" role="columnheader">Description</div>
|
| 24 |
</div>
|
| 25 |
<div class="f1-grid-row" role="row">
|
| 26 |
-
<div class="f1-grid-cell" role="cell">Warmup</div>
|
| 27 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 28 |
<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
|
| 29 |
</div>
|
| 30 |
<div class="f1-grid-row" role="row">
|
| 31 |
-
<div class="f1-grid-cell" role="cell">Tier 1</div>
|
| 32 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 33 |
<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
|
| 34 |
</div>
|
| 35 |
<div class="f1-grid-row" role="row">
|
| 36 |
-
<div class="f1-grid-cell" role="cell">Tier 2</div>
|
| 37 |
<div class="f1-grid-cell" role="cell">20</div>
|
| 38 |
<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
|
| 39 |
</div>
|
|
@@ -83,7 +83,7 @@ WHAT_IS_F1_HTML_AFTER_VIDEO = """
|
|
| 83 |
<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
|
| 84 |
</ul>
|
| 85 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
|
| 86 |
-
<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems. Solutions submitted for evaluation on our benchmark are evaluated against a withheld comprehensive test-suite.</p>
|
| 87 |
"""
|
| 88 |
|
| 89 |
# Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
|
|
@@ -93,13 +93,13 @@ WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG = """
|
|
| 93 |
<!-- warmup_performance figure inserted via gr.Image in app.py -->
|
| 94 |
"""
|
| 95 |
|
| 96 |
-
# Between Warmup and Tier 1 figures
|
| 97 |
WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
|
| 98 |
-
<p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
|
| 99 |
<!-- tier1_performance figure inserted via gr.Image in app.py -->
|
| 100 |
"""
|
| 101 |
|
| 102 |
-
# Tail after Tier 1 figure
|
| 103 |
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
|
| 104 |
<p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
|
| 105 |
</section>
|
|
|
|
| 23 |
<div class="f1-grid-cell" role="columnheader">Description</div>
|
| 24 |
</div>
|
| 25 |
<div class="f1-grid-row" role="row">
|
| 26 |
+
<div class="f1-grid-cell" role="cell">Shallow</div>
|
| 27 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 28 |
<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
|
| 29 |
</div>
|
| 30 |
<div class="f1-grid-row" role="row">
|
| 31 |
+
<div class="f1-grid-cell" role="cell">Deeper</div>
|
| 32 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 33 |
<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
|
| 34 |
</div>
|
| 35 |
<div class="f1-grid-row" role="row">
|
| 36 |
+
<div class="f1-grid-cell" role="cell">Deepest</div>
|
| 37 |
<div class="f1-grid-cell" role="cell">20</div>
|
| 38 |
<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
|
| 39 |
</div>
|
|
|
|
| 83 |
<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
|
| 84 |
</ul>
|
| 85 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
|
| 86 |
+
<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Deeper and Deepest Tier problems. Solutions submitted to our benchmark are evaluated against a withheld, comprehensive test suite.</p>
|
| 87 |
"""
|
| 88 |
|
| 89 |
# Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
|
|
|
|
| 93 |
<!-- shallow_tier_performance figure inserted via gr.Image in app.py -->
|
| 94 |
"""
|
| 95 |
|
| 96 |
+
# Between Shallow and Deeper figures
|
| 97 |
WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
|
| 98 |
+
<p class="mb-4 f1-p">However, as the reasoning depth increases in the <strong>Deeper</strong> tier, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
|
| 99 |
<!-- deeper_tier_performance figure inserted via gr.Image in app.py -->
|
| 100 |
"""
|
| 101 |
|
| 102 |
+
# Tail after Deeper figure (closes evaluation section + container)
|
| 103 |
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
|
| 104 |
<p class="f1-p">This trend culminates in the <strong>Deepest</strong> tier, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
|
| 105 |
</section>
|
src/display/utils.py
CHANGED
|
@@ -24,8 +24,8 @@ class AutoEvalColumn:
|
|
| 24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
| 25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
| 26 |
success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
|
| 27 |
-
success_rate_tier1 = ColumnContent("Tier 1 Success (%)", "number", True)
|
| 28 |
-
success_rate_tier2 = ColumnContent("Tier 2 Success (%)", "number", True)
|
| 29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
| 30 |
|
| 31 |
|
|
|
|
| 24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
| 25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
| 26 |
success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
|
| 27 |
+
success_rate_tier1 = ColumnContent("Deeper Tier Success (%)", "number", True)
|
| 28 |
+
success_rate_tier2 = ColumnContent("Deepest Tier Success (%)", "number", True)
|
| 29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
| 30 |
|
| 31 |
|