Commit b233f03 · Parent(s): 4ae29ac
Danny Liu committed

Add PNGs with LFS tracking
Files changed:
- .gitattributes +1 -0
- app.py +12 -13
- src/about.py +1 -1
- src/display/utils.py +1 -2
- src/populate.py +3 -4
- subq1_sft_transition_matrix.png +3 -0
- subq1_transition_matrix.png +3 -0
- taxonomy_overview.png +3 -0
.gitattributes
CHANGED

@@ -32,4 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
 scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
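The new `*.png` rule is the line that `git lfs track "*.png"` appends: every PNG committed from now on (including the three added below) is stored as an LFS object, with only a small pointer file kept in the git tree. The pre-existing `scale-hf-logo.png` rule becomes redundant but is harmless alongside the glob.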
app.py
CHANGED

@@ -1,5 +1,5 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from gradio_leaderboard import Leaderboard, ColumnFilter
 import pandas as pd
 
 from src.about import (
@@ -26,12 +26,6 @@ def init_leaderboard(dataframe):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
@@ -44,13 +38,18 @@ demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    gr.Image("taxonomy_overview.png", show_label=False, show_download_button=False)
+    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-
-
-
-
-
-
+    gr.Markdown("### Model evaluation on VerilogEval-Human V1 benchmark (156 problems, 10 rollouts each)")
+    leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+    gr.Markdown("### Transition Matrices")
+    gr.Markdown("The transition matrices below show how errors evolve during the SFT and RL phases, revealing the surface convergence gap where optimization reduces syntax errors but increases functional testbench failures.")
+    with gr.Row():
+        gr.Image("subq1_sft_transition_matrix.png", label="SFT Transition Matrix", show_label=True, show_download_button=False)
+        gr.Image("subq1_transition_matrix.png", label="RL Transition Matrix", show_label=True, show_download_button=False)
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
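For orientation, the new UI flow can be previewed in isolation. A minimal sketch, assuming the three PNGs sit in the working directory; the placeholder markdown stands in for TITLE, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, and the leaderboard component, which the real app imports from src.about and src.populate:

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("Intro placeholder")  # INTRODUCTION_TEXT in the real app
    # Full-width figure, no caption and no download button, as in app.py
    gr.Image("taxonomy_overview.png", show_label=False, show_download_button=False)
    gr.Markdown("### Transition Matrices")
    with gr.Row():  # two labeled figures side by side
        gr.Image("subq1_sft_transition_matrix.png", label="SFT Transition Matrix",
                 show_label=True, show_download_button=False)
        gr.Image("subq1_transition_matrix.png", label="RL Transition Matrix",
                 show_label=True, show_download_button=False)

if __name__ == "__main__":
    demo.launch()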
src/about.py
CHANGED

@@ -24,7 +24,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">RTL
+TITLE = """<h1 align="center" id="space-title">How LLMs Fail and Generalize in RTL Coding for Hardware Design?</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
src/display/utils.py
CHANGED

@@ -23,13 +23,12 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", True)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False, hidden=True)])
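Context for this edit, assumed from the stock Hugging Face leaderboard template rather than shown in the diff: the [attribute, type, default] triples are later collapsed into a frozen dataclass via make_dataclass, which is why deleting the model_type_symbol entry is enough to drop that column everywhere. A self-contained sketch:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Each triple becomes one attribute of the generated class.
auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["model_type", ColumnContent, ColumnContent("Type", "str", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model_type.name)  # -> Type (the display name app.py filters on)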
src/populate.py
CHANGED

@@ -10,10 +10,10 @@ def get_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
     df = pd.read_csv(csv_path)
 
     # Add model_type_symbol based on model_type
-    def
-        return ModelType.from_str(str(mtype)).
+    def get_type_with_symbol(mtype):
+        return ModelType.from_str(str(mtype)).to_str()
 
-    df["
+    df["model_type"] = df["model_type"].apply(get_type_with_symbol)
 
     # Sort by pass_rate
     if "pass_rate" in df.columns:
@@ -21,7 +21,6 @@ def get_leaderboard_df(cols: list, benchmark_cols: list) -> pd.DataFrame:
 
     # Rename columns to match the expected names in AutoEvalColumn
     rename_map = {
-        "model_type_symbol": AutoEvalColumn.model_type_symbol.name,
         "model": AutoEvalColumn.model.name,
         "model_type": AutoEvalColumn.model_type.name,
         "params": AutoEvalColumn.params.name,
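The net effect of this change: rather than keeping a separate model_type_symbol column, the type symbol is folded into the model_type string itself. A runnable mock of the transformation — the ModelType enum below is a hypothetical stand-in for the real one in src.display.utils, which this diff does not show:

import pandas as pd
from enum import Enum

class ModelType(Enum):
    # (name, symbol) pairs; stand-in values for illustration only
    PRETRAINED = ("pretrained", "🟢")
    FINETUNED = ("fine-tuned", "🔶")

    @classmethod
    def from_str(cls, s: str) -> "ModelType":
        for m in cls:
            if m.value[0] == s.lower():
                return m
        raise ValueError(f"unknown model type: {s}")

    def to_str(self) -> str:
        name, symbol = self.value
        return f"{symbol} {name}"

def get_type_with_symbol(mtype):
    return ModelType.from_str(str(mtype)).to_str()

df = pd.DataFrame({"model": ["a", "b"], "model_type": ["pretrained", "fine-tuned"]})
df["model_type"] = df["model_type"].apply(get_type_with_symbol)
print(df["model_type"].tolist())  # ['🟢 pretrained', '🔶 fine-tuned']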
subq1_sft_transition_matrix.png
ADDED
Git LFS Details

subq1_transition_matrix.png
ADDED
Git LFS Details

taxonomy_overview.png
ADDED
Git LFS Details
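Each image is listed as +3 -0 because only a three-line LFS pointer enters the git tree; the binary itself goes to LFS storage. A pointer file has this shape (the oid digest and size here are placeholders):

version https://git-lfs.github.com/spec/v1
oid sha256:<64-character-hex-digest>
size <file-size-in-bytes>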