import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter
import pandas as pd
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
CONCLUSION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
AutoEvalColumn,
fields,
)
from src.populate import get_leaderboard_df
# Build the leaderboard table once at import time; it is passed to
# init_leaderboard() when the UI layout is constructed further down.
LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
def init_leaderboard(dataframe):
    """Create the non-interactive ``Leaderboard`` component for *dataframe*.

    Column datatypes, the searchable column, the hidden columns, and the
    model-type filter are all derived from ``AutoEvalColumn``.

    Args:
        dataframe: The table to display; must be non-``None`` and non-empty.

    Returns:
        The constructed ``Leaderboard`` component.

    Raises:
        ValueError: If *dataframe* is ``None`` or has no rows/columns.
    """
    has_data = dataframe is not None and not dataframe.empty
    if not has_data:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Built in the same order the keyword arguments are passed below.
    column_types = [column.type for column in fields(AutoEvalColumn)]
    searchable = [AutoEvalColumn.model.name]
    hidden = [column.name for column in fields(AutoEvalColumn) if column.hidden]
    model_type_filter = ColumnFilter(
        AutoEvalColumn.model_type.name,
        type="checkboxgroup",
        label="Model types",
    )

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        search_columns=searchable,
        hide_columns=hidden,
        filter_columns=[model_type_filter],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
# Page layout: title and overview, the benchmark leaderboard, the written
# conclusions, the two transition-matrix figures, and a collapsible citation.
with gr.Blocks(css=custom_css) as demo:
    # --- Header and overview ---
    gr.HTML(TITLE)
    gr.Image(
        "taxonomy_overview.png",
        elem_id="taxonomy-img",
        show_label=False,
        show_download_button=False,
    )
    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    # --- Leaderboard ---
    gr.Markdown("## Benchmark")
    gr.Markdown(
        "### Model evaluation on VerilogEval-Human V1 benchmark (156 problems, 10 rollouts each)"
    )
    with gr.Column(elem_id="leaderboard-container"):
        leaderboard = init_leaderboard(LEADERBOARD_DF)

    # --- Written results ---
    gr.Markdown("### Evaluation Results")
    gr.Markdown(CONCLUSION_TEXT, elem_classes="markdown-text")

    # --- Transition-matrix figures, side by side ---
    gr.Markdown("### Transition Matrices")
    gr.Markdown(
        "The transition matrices below show how errors evolve during the SFT "
        "and RL phases, revealing the surface convergence gap where "
        "optimization reduces syntax errors but increases functional "
        "testbench failures."
    )
    with gr.Row():
        gr.Image(
            "subq1_sft_transition_matrix.png",
            show_label=False,
            show_download_button=False,
        )
        gr.Image(
            "subq1_transition_matrix.png",
            show_label=False,
            show_download_button=False,
        )

    # --- Citation, collapsed by default ---
    with gr.Row(), gr.Accordion("📙 Citation", open=False):
        citation_button = gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=20,
            elem_id="citation-button",
            show_copy_button=True,
        )

# Enable the request queue, then start serving the app.
demo.queue(default_concurrency_limit=40)
demo.launch()
|