Spaces:
Running
Running
Danny Liu committed on
Commit ·
26417d8
1
Parent(s): 77ec02d
Rearrange text per user request
Browse files
- app.py +2 -2
- src/about.py +2 -2
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import pandas as pd
|
|
| 5 |
from src.about import (
|
| 6 |
CITATION_BUTTON_LABEL,
|
| 7 |
CITATION_BUTTON_TEXT,
|
| 8 |
-
|
| 9 |
LLM_BENCHMARKS_TEXT,
|
| 10 |
TITLE,
|
| 11 |
)
|
|
@@ -38,13 +38,13 @@ def init_leaderboard(dataframe):
|
|
| 38 |
demo = gr.Blocks(css=custom_css)
|
| 39 |
with demo:
|
| 40 |
gr.HTML(TITLE)
|
| 41 |
-
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 42 |
|
| 43 |
gr.Image("taxonomy_overview.png", elem_id="taxonomy-img", show_label=False, show_download_button=False)
|
| 44 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 45 |
|
| 46 |
gr.Markdown("### Model evaluation on VerilogEval-Human V1 benchmark (156 problems, 10 rollouts each)")
|
| 47 |
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
|
|
|
| 48 |
|
| 49 |
gr.Markdown("### Transition Matrices")
|
| 50 |
gr.Markdown("The transition matrices below show how errors evolve during the SFT and RL phases, revealing the surface convergence gap where optimization reduces syntax errors but increases functional testbench failures.")
|
|
|
|
| 5 |
from src.about import (
|
| 6 |
CITATION_BUTTON_LABEL,
|
| 7 |
CITATION_BUTTON_TEXT,
|
| 8 |
+
CONCLUSION_TEXT,
|
| 9 |
LLM_BENCHMARKS_TEXT,
|
| 10 |
TITLE,
|
| 11 |
)
|
|
|
|
| 38 |
demo = gr.Blocks(css=custom_css)
|
| 39 |
with demo:
|
| 40 |
gr.HTML(TITLE)
|
|
|
|
| 41 |
|
| 42 |
gr.Image("taxonomy_overview.png", elem_id="taxonomy-img", show_label=False, show_download_button=False)
|
| 43 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 44 |
|
| 45 |
gr.Markdown("### Model evaluation on VerilogEval-Human V1 benchmark (156 problems, 10 rollouts each)")
|
| 46 |
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 47 |
+
gr.Markdown(CONCLUSION_TEXT, elem_classes="markdown-text")
|
| 48 |
|
| 49 |
gr.Markdown("### Transition Matrices")
|
| 50 |
gr.Markdown("The transition matrices below show how errors evolve during the SFT and RL phases, revealing the surface convergence gap where optimization reduces syntax errors but increases functional testbench failures.")
|
src/about.py
CHANGED
|
@@ -28,9 +28,9 @@ TITLE = """<h1 align="center" id="space-title">How LLMs Fail and Generalize in R
|
|
| 28 |
|
| 29 |
# What does your leaderboard evaluate?
|
| 30 |
INTRODUCTION_TEXT = """
|
| 31 |
-
|
| 32 |
-
We introduce a four-level error taxonomy—**L1 syntactic**, **L2 semantic**, **L3S functional-solvable**, and **L3U functional-unsolvable**—where the L3 split is determined by problem-level solvability: whether the model can solve the problem in any rollout.
|
| 33 |
|
|
|
|
| 34 |
Evaluations on the VerilogEval Human benchmark reveal a strict empirical ceiling, with frontier models plateauing at a 90.8% initial pass rate.
|
| 35 |
The solvability taxonomy exposes that L3U (Unsolvable) errors dominate across all model families, revealing persistent knowledge gaps that inference-time scaling cannot address.
|
| 36 |
Our analysis exposes a striking surface convergence gap: optimization drastically reduces syntax errors but concurrently increases functional testbench failures.
|
|
|
|
| 28 |
|
| 29 |
# What does your leaderboard evaluate?
|
| 30 |
INTRODUCTION_TEXT = """
|
| 31 |
+
"""
|
|
|
|
| 32 |
|
| 33 |
+
CONCLUSION_TEXT = """
|
| 34 |
Evaluations on the VerilogEval Human benchmark reveal a strict empirical ceiling, with frontier models plateauing at a 90.8% initial pass rate.
|
| 35 |
The solvability taxonomy exposes that L3U (Unsolvable) errors dominate across all model families, revealing persistent knowledge gaps that inference-time scaling cannot address.
|
| 36 |
Our analysis exposes a striking surface convergence gap: optimization drastically reduces syntax errors but concurrently increases functional testbench failures.
|