Spaces:
Runtime error
Runtime error
Update tiers.
Browse files
app.py
CHANGED
|
@@ -203,9 +203,9 @@ def get_theme():
|
|
| 203 |
# --- Gradio-based tabs for examples (no JS in HTML) ---
|
| 204 |
def _select_example_tab(choice: str):
|
| 205 |
return (
|
| 206 |
-
gr.update(visible=(choice == "Warmup")),
|
| 207 |
-
gr.update(visible=(choice == "Tier 1")),
|
| 208 |
-
gr.update(visible=(choice == "Tier 2")),
|
| 209 |
)
|
| 210 |
|
| 211 |
|
|
@@ -219,25 +219,25 @@ MODEL_RELEASES = {
|
|
| 219 |
"o3 Pro": "2025-06-10",
|
| 220 |
}
|
| 221 |
|
| 222 |
-
TIER_TOTALS = {"Warmup": 100, "Tier 1": 100, "Tier 2": 20}
|
| 223 |
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
|
| 224 |
|
| 225 |
ACCURACY_PCT = {
|
| 226 |
-
"Warmup": {
|
| 227 |
"GPT-5": 49,
|
| 228 |
"Gemini 2.5 Pro": 30,
|
| 229 |
"Grok 4": 28,
|
| 230 |
"Claude Opus 4": 30,
|
| 231 |
"o3 Pro": 24,
|
| 232 |
},
|
| 233 |
-
"Tier 1": {
|
| 234 |
"GPT-5": 4,
|
| 235 |
"Gemini 2.5 Pro": 0,
|
| 236 |
"Grok 4": 0,
|
| 237 |
"Claude Opus 4": 0,
|
| 238 |
"o3 Pro": 0,
|
| 239 |
},
|
| 240 |
-
"Tier 2": {
|
| 241 |
"GPT-5": 0,
|
| 242 |
"Gemini 2.5 Pro": 0,
|
| 243 |
"Grok 4": 0,
|
|
@@ -301,7 +301,7 @@ def build_accuracy_figure(tier: str):
|
|
| 301 |
return fig
|
| 302 |
|
| 303 |
|
| 304 |
-
_initial_accuracy_fig = build_accuracy_figure("Tier 1")
|
| 305 |
|
| 306 |
# Force light theme even if HF user prefers dark
|
| 307 |
blocks = gr.Blocks(
|
|
@@ -369,7 +369,7 @@ with blocks:
|
|
| 369 |
with gr.Row(elem_id="f1-tier-select-row"):
|
| 370 |
tier_selector = gr.Radio(
|
| 371 |
choices=list(TIER_TOTALS.keys()),
|
| 372 |
-
value="Tier 1",
|
| 373 |
label=None,
|
| 374 |
show_label=False,
|
| 375 |
elem_id="f1-tier-select",
|
|
@@ -461,8 +461,8 @@ with blocks:
|
|
| 461 |
)
|
| 462 |
|
| 463 |
tab_radio = gr.Radio(
|
| 464 |
-
choices=["Warmup", "Tier 1", "Tier 2"],
|
| 465 |
-
value="Warmup",
|
| 466 |
label=None,
|
| 467 |
show_label=False,
|
| 468 |
elem_id="f1-example-radio",
|
|
@@ -492,7 +492,7 @@ with blocks:
|
|
| 492 |
# Evaluation: Warmup figure
|
| 493 |
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
|
| 494 |
gr.Image(
|
| 495 |
-
"assets/warmup_performance.png",
|
| 496 |
width=600,
|
| 497 |
show_label=False,
|
| 498 |
elem_classes=["f1-image"],
|
|
@@ -500,14 +500,16 @@ with blocks:
|
|
| 500 |
show_download_button=False,
|
| 501 |
show_fullscreen_button=False,
|
| 502 |
)
|
| 503 |
-
gr.HTML(
|
|
|
|
|
|
|
| 504 |
|
| 505 |
# Between warmup and tier1 figs
|
| 506 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
|
| 507 |
|
| 508 |
-
#
|
| 509 |
gr.Image(
|
| 510 |
-
"assets/tier1_performance.png",
|
| 511 |
width=600,
|
| 512 |
show_label=False,
|
| 513 |
elem_classes=["f1-image"],
|
|
@@ -516,10 +518,10 @@ with blocks:
|
|
| 516 |
show_fullscreen_button=False,
|
| 517 |
)
|
| 518 |
gr.HTML(
|
| 519 |
-
'<div class="f1-figcaption">Performance of frontier reasoning models on Tier 1 of FormulaOne.</div>'
|
| 520 |
)
|
| 521 |
|
| 522 |
-
# Tail after Tier 1 fig
|
| 523 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
|
| 524 |
|
| 525 |
# Rename tab to "Leaderboard" and cap at 800px width
|
|
|
|
| 203 |
# --- Gradio-based tabs for examples (no JS in HTML) ---
|
| 204 |
def _select_example_tab(choice: str):
|
| 205 |
return (
|
| 206 |
+
gr.update(visible=(choice == "Shallow")),
|
| 207 |
+
gr.update(visible=(choice == "Deeper")),
|
| 208 |
+
gr.update(visible=(choice == "Deepest")),
|
| 209 |
)
|
| 210 |
|
| 211 |
|
|
|
|
| 219 |
"o3 Pro": "2025-06-10",
|
| 220 |
}
|
| 221 |
|
| 222 |
+
TIER_TOTALS = {"Shallow Tier": 100, "Deeper Tier": 100, "Deepest Tier": 20}
|
| 223 |
MODELS_ORDER = ["GPT-5", "Gemini 2.5 Pro", "Grok 4", "Claude Opus 4", "o3 Pro"]
|
| 224 |
|
| 225 |
ACCURACY_PCT = {
|
| 226 |
+
"Shallow Tier": {
|
| 227 |
"GPT-5": 49,
|
| 228 |
"Gemini 2.5 Pro": 30,
|
| 229 |
"Grok 4": 28,
|
| 230 |
"Claude Opus 4": 30,
|
| 231 |
"o3 Pro": 24,
|
| 232 |
},
|
| 233 |
+
"Deeper Tier": {
|
| 234 |
"GPT-5": 4,
|
| 235 |
"Gemini 2.5 Pro": 0,
|
| 236 |
"Grok 4": 0,
|
| 237 |
"Claude Opus 4": 0,
|
| 238 |
"o3 Pro": 0,
|
| 239 |
},
|
| 240 |
+
"Deepest Tier": {
|
| 241 |
"GPT-5": 0,
|
| 242 |
"Gemini 2.5 Pro": 0,
|
| 243 |
"Grok 4": 0,
|
|
|
|
| 301 |
return fig
|
| 302 |
|
| 303 |
|
| 304 |
+
_initial_accuracy_fig = build_accuracy_figure("Deeper Tier")
|
| 305 |
|
| 306 |
# Force light theme even if HF user prefers dark
|
| 307 |
blocks = gr.Blocks(
|
|
|
|
| 369 |
with gr.Row(elem_id="f1-tier-select-row"):
|
| 370 |
tier_selector = gr.Radio(
|
| 371 |
choices=list(TIER_TOTALS.keys()),
|
| 372 |
+
value="Deeper Tier",
|
| 373 |
label=None,
|
| 374 |
show_label=False,
|
| 375 |
elem_id="f1-tier-select",
|
|
|
|
| 461 |
)
|
| 462 |
|
| 463 |
tab_radio = gr.Radio(
|
| 464 |
+
choices=["Shallow", "Deeper", "Deepest"],
|
| 465 |
+
value="Shallow",
|
| 466 |
label=None,
|
| 467 |
show_label=False,
|
| 468 |
elem_id="f1-example-radio",
|
|
|
|
| 492 |
# Evaluation: Warmup figure
|
| 493 |
gr.HTML(WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG, padding=False)
|
| 494 |
gr.Image(
|
| 495 |
+
"assets/shallow_tier_performance.png",
|
| 496 |
width=600,
|
| 497 |
show_label=False,
|
| 498 |
elem_classes=["f1-image"],
|
|
|
|
| 500 |
show_download_button=False,
|
| 501 |
show_fullscreen_button=False,
|
| 502 |
)
|
| 503 |
+
gr.HTML(
|
| 504 |
+
'<div class="f1-figcaption">Performance of frontier models on the FormulaOne-Shallow ("warmup") dataset.</div>'
|
| 505 |
+
)
|
| 506 |
|
| 507 |
# Between warmup and tier1 figs
|
| 508 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_WARMUPFIG)
|
| 509 |
|
| 510 |
+
# Deeper tier figure with corrected caption text
|
| 511 |
gr.Image(
|
| 512 |
+
"assets/deeper_tier_performance.png",
|
| 513 |
width=600,
|
| 514 |
show_label=False,
|
| 515 |
elem_classes=["f1-image"],
|
|
|
|
| 518 |
show_fullscreen_button=False,
|
| 519 |
)
|
| 520 |
gr.HTML(
|
| 521 |
+
'<div class="f1-figcaption">Performance of frontier reasoning models on the Deeper Tier of FormulaOne.</div>'
|
| 522 |
)
|
| 523 |
|
| 524 |
+
# Tail after Deeper Tier fig
|
| 525 |
gr.HTML(WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL)
|
| 526 |
|
| 527 |
# Rename tab to "Leaderboard" and cap at 800px width
|
assets/{tier1_performance.png → deeper_tier_performance.png}
RENAMED
|
File without changes
|
assets/{warmup_performance.png → shallow_tier_performance.png}
RENAMED
|
File without changes
|
src/about.py
CHANGED
|
@@ -23,17 +23,17 @@ WHAT_IS_F1_HTML_TOP = f"""
|
|
| 23 |
<div class="f1-grid-cell" role="columnheader">Description</div>
|
| 24 |
</div>
|
| 25 |
<div class="f1-grid-row" role="row">
|
| 26 |
-
<div class="f1-grid-cell" role="cell">Warmup</div>
|
| 27 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 28 |
<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
|
| 29 |
</div>
|
| 30 |
<div class="f1-grid-row" role="row">
|
| 31 |
-
<div class="f1-grid-cell" role="cell">Tier 1</div>
|
| 32 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 33 |
<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
|
| 34 |
</div>
|
| 35 |
<div class="f1-grid-row" role="row">
|
| 36 |
-
<div class="f1-grid-cell" role="cell">Tier 2</div>
|
| 37 |
<div class="f1-grid-cell" role="cell">20</div>
|
| 38 |
<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
|
| 39 |
</div>
|
|
@@ -83,7 +83,7 @@ WHAT_IS_F1_HTML_AFTER_VIDEO = """
|
|
| 83 |
<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
|
| 84 |
</ul>
|
| 85 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
|
| 86 |
-
<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Tier 1 and Tier 2 problems. Solutions submitted for evaluation on our benchmark are evaluated against a withheld comprehensive test-suite.</p>
|
| 87 |
"""
|
| 88 |
|
| 89 |
# Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
|
|
@@ -93,13 +93,13 @@ WHAT_IS_F1_HTML_EVAL_BEFORE_WARMUPFIG = """
|
|
| 93 |
<!-- warmup_performance figure inserted via gr.Image in app.py -->
|
| 94 |
"""
|
| 95 |
|
| 96 |
-
# Between Warmup and Tier 1 figures
|
| 97 |
WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
|
| 98 |
-
<p class="mb-4 f1-p">However, as the reasoning depth increases in <strong>Tier 1</strong>, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
|
| 99 |
<!-- tier1_performance figure inserted via gr.Image in app.py -->
|
| 100 |
"""
|
| 101 |
|
| 102 |
-
# Tail after Tier 1 figure
|
| 103 |
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
|
| 104 |
<p class="f1-p">This trend culminates in <strong>Tier 2</strong>, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
|
| 105 |
</section>
|
|
|
|
| 23 |
<div class="f1-grid-cell" role="columnheader">Description</div>
|
| 24 |
</div>
|
| 25 |
<div class="f1-grid-row" role="row">
|
| 26 |
+
<div class="f1-grid-cell" role="cell">Shallow</div>
|
| 27 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 28 |
<div class="f1-grid-cell" role="cell">A set of “easier” problems.</div>
|
| 29 |
</div>
|
| 30 |
<div class="f1-grid-row" role="row">
|
| 31 |
+
<div class="f1-grid-cell" role="cell">Deeper</div>
|
| 32 |
<div class="f1-grid-cell" role="cell">100</div>
|
| 33 |
<div class="f1-grid-cell" role="cell">A set of challenging problems.</div>
|
| 34 |
</div>
|
| 35 |
<div class="f1-grid-row" role="row">
|
| 36 |
+
<div class="f1-grid-cell" role="cell">Deepest</div>
|
| 37 |
<div class="f1-grid-cell" role="cell">20</div>
|
| 38 |
<div class="f1-grid-cell" role="cell">A set of highly challenging problems.</div>
|
| 39 |
</div>
|
|
|
|
| 83 |
<li class="f1-li"><strong>Efficiency:</strong> The solution must be truly <a href="https://en.wikipedia.org/wiki/Parameterized_complexity" target="_blank" rel="noopener noreferrer" class="f1-a">fixed-parameter linear</a>.</li>
|
| 84 |
</ul>
|
| 85 |
<p class="mb-4 f1-p">To support research and encourage community contributions, the <code>FormulaOne-Warmup</code> dataset is released as a public resource for training and fine-tuning models. The complete test suite for all 100 Warmup problems is available, alongside a standalone evaluation environment, in our <a href="https://github.com/double-ai/formulaone-dataset/tree/main" target="_blank" rel="noopener noreferrer" class="f1-a">GitHub repository</a>.</p>
|
| 86 |
+
<p class="f1-p">To maintain the integrity of the core benchmark, only a minimal subset of tests is released for the Deeper and Deepest Tier problems. Solutions submitted to our benchmark are evaluated against a withheld, comprehensive test suite.</p>
|
| 87 |
"""
|
| 88 |
|
| 89 |
# Evaluation: begins the "Model Accuracy" subsection and the Warmup paragraph, up to (but not including) the Warmup figure.
|
|
|
|
| 93 |
<!-- shallow_tier_performance figure inserted via gr.Image in app.py -->
|
| 94 |
"""
|
| 95 |
|
| 96 |
+
# Between Shallow and Deeper figures
|
| 97 |
WHAT_IS_F1_HTML_AFTER_WARMUPFIG = """
|
| 98 |
+
<p class="mb-4 f1-p">However, as the reasoning depth increases in the <strong>Deeper</strong> tier, and solutions require the discovery and integration of novel and more complex state representations, model performance drops off sharply.</p>
|
| 99 |
<!-- deeper_tier_performance figure inserted via gr.Image in app.py -->
|
| 100 |
"""
|
| 101 |
|
| 102 |
+
# Tail after Deeper figure (closes evaluation section + container)
|
| 103 |
WHAT_IS_F1_HTML_AFTER_TIER1FIG_TAIL = """
|
| 104 |
<p class="f1-p">This trend culminates in the <strong>Deepest</strong> tier, where the difficulty is characteristic of exploratory research problems. On this set of 20 problems, no current frontier model solves even a single one. This result starkly illustrates the gap that remains between high performance on existing benchmarks and the deep algorithmic reasoning required for truly complex problems.</p>
|
| 105 |
</section>
|
src/display/utils.py
CHANGED
|
@@ -24,8 +24,8 @@ class AutoEvalColumn:
|
|
| 24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
| 25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
| 26 |
success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
|
| 27 |
-
success_rate_tier1 = ColumnContent("Tier 1 Success (%)", "number", True)
|
| 28 |
-
success_rate_tier2 = ColumnContent("Tier 2 Success (%)", "number", True)
|
| 29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
| 30 |
|
| 31 |
|
|
|
|
| 24 |
system = ColumnContent("System Name", "markdown", True, never_hidden=True)
|
| 25 |
organization = ColumnContent("Organization", "str", True, never_hidden=True)
|
| 26 |
success_rate_overall = ColumnContent("Overall Success (%)", "number", True)
|
| 27 |
+
success_rate_tier1 = ColumnContent("Deeper Tier Success (%)", "number", True)
|
| 28 |
+
success_rate_tier2 = ColumnContent("Deepest Tier Success (%)", "number", True)
|
| 29 |
submitted_on = ColumnContent("Submitted On", "datetime", True)
|
| 30 |
|
| 31 |
|