leaderboard updates
- app.py +74 -25
- src/about.py +4 -4
app.py
CHANGED
@@ -10,62 +10,56 @@ from src.about import (
     TITLE,
 )
 
-# Simplified DataFrame for the leaderboard
 data = {
-    "
+    "Method": [
         "Handwritten TAG",
-        "Zero-shot Text2SQL",
-        "Zero-shot Text2SQL + LM Generation",
+        "Zero-shot Text2SQL (llama-3.1-70B)",
+        "Zero-shot Text2SQL + LM Generation (llama-3.1-70B)",
         "RAG (E5)",
         "RAG (E5) + LM Rerank",
     ],
-
+    # "Model": ["meta-llama/Llama-3.1-70B"] * 5,
+    "Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0],
 }
 
-# Create a DataFrame
 leaderboard_df = pd.DataFrame(data)
 
-# Convert Execution Accuracy to numeric for sorting
-leaderboard_df["Execution Accuracy (numeric)"] = (
-    leaderboard_df["Execution Accuracy"].str.rstrip("%").astype(float)
-)
 leaderboard_df = leaderboard_df.sort_values(
-    "Execution Accuracy
+    "Execution Accuracy", ascending=False
 ).reset_index(drop=True)
-
-# Add the Rank column
 leaderboard_df.insert(0, "Rank", leaderboard_df.index + 1)
 
-# Drop the numeric column for display
-leaderboard_df = leaderboard_df.drop(columns=["Execution Accuracy (numeric)"])
 
-
-def hyperlink_model(model):
+def hyperlink_method(method):
     base_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
+    return f'<a href="{base_url}" target="_blank">{method}</a>'
+
+def hyperlink_model(model):
+    base_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
     return f'<a href="{base_url}" target="_blank">{model}</a>'
 
-leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)
 
-
+leaderboard_df["Method"] = leaderboard_df["Method"].apply(hyperlink_method)
+# leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)
+
+
 with gr.Blocks() as demo:
     gr.HTML(
         """
         <div style="text-align: center;">
         <h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
-        <p style="font-size: 1.25rem; color: gray;">
+        <p style="font-size: 1.25rem; color: gray;">Evaluating complex natural language queries over structured data.</p>
         </div>
         """
     )
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            # Highlight the top row in green for "Handwritten TAG"
             with gr.Row():
                 gr.Dataframe(
                     value=leaderboard_df,
-                    headers=["
-                    datatype=["str", "html",
+                    headers=["Rank", "Method", "Execution Accuracy"],
+                    datatype=["str", "html", "float"],
                     row_count=(5, "dynamic"),
                     wrap=True,
                     elem_id="leaderboard",

@@ -76,7 +70,62 @@ with gr.Blocks() as demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submission Instructions ", elem_id="llm-benchmark-tab-table", id=3):
-            gr.
+            with gr.Accordion("1️⃣ Required Materials", open=True):
+                gr.Markdown(
+                    """
+                    Ensure the following files are included in your submission:
+                    - **output.json**: The evaluation outputs generated by your model. Please refer to [] for format instructions.
+                    - **requirements.txt**: A list of dependencies needed to run your model or script.
+                    - **README.md**: A detailed description of your submission, including:
+                        - Purpose and overview of the submission.
+                        - Instructions to reproduce the results.
+                        - Any additional notes for evaluators.
+                    - **Model/Keys**: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.
+
+                    **Note**: Submissions missing any of these materials will not be processed.
+                    """
+                )
+
+            # Section 2: Submission Frequency
+            with gr.Accordion("2️⃣ Submission Frequency", open=False):
+                gr.Markdown(
+                    """
+                    - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
+                    - Plan your submission timeline accordingly to avoid delays.
+                    """
+                )
+
+            # Section 3: How to Upload Materials
+            with gr.Accordion("3️⃣ How to Upload Materials", open=False):
+                gr.Markdown(
+                    """
+                    Follow these steps to upload your materials:
+                    1. Compress all of your code into a single `.zip` file, or provide a link to a public repository.
+                    2. Email the `.zip` file or repository link to [email].
+                    """
+                )
+
+            # Section 4: Submission Process
+            with gr.Accordion("4️⃣ Submission Process", open=False):
+                gr.Markdown(
+                    """
+                    After uploading your materials:
+                    -
+                    - Provide accurate contact information for follow-ups.
+                    - Double-check your materials for completeness to avoid processing delays.
+
+                    **Important:** Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
+                    """
+                )
+
+    # Footer
+    gr.Markdown(
+        """
+        <div style="text-align: center; margin-top: 2rem;">
+        For further assistance, reach out to [email] with questions.
+        </div>
+        """
+    )
 
 
 demo.launch()
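To sanity-check the new table logic outside the Space, here is a minimal standalone sketch that mirrors the updated app.py (pandas only; the Gradio rendering used by the Space is omitted, and the numbers are the ones shown in the diff):

import pandas as pd

# Same data as the updated app.py: methods and execution accuracy (percent).
data = {
    "Method": [
        "Handwritten TAG",
        "Zero-shot Text2SQL (llama-3.1-70B)",
        "Zero-shot Text2SQL + LM Generation (llama-3.1-70B)",
        "RAG (E5)",
        "RAG (E5) + LM Rerank",
    ],
    "Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0],
}

leaderboard_df = pd.DataFrame(data)

# Sort best-first on the numeric accuracy column, then add a 1-based Rank column.
leaderboard_df = leaderboard_df.sort_values(
    "Execution Accuracy", ascending=False
).reset_index(drop=True)
leaderboard_df.insert(0, "Rank", leaderboard_df.index + 1)

# Render each method name as an HTML link, matching the "html" datatype the
# gr.Dataframe column uses in the Space.
BASE_URL = "https://github.com/TAG-Research/TAG-Bench/tree/main"
leaderboard_df["Method"] = leaderboard_df["Method"].apply(
    lambda method: f'<a href="{BASE_URL}" target="_blank">{method}</a>'
)

print(leaderboard_df.to_string(index=False))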
src/about.py
CHANGED
@@ -30,11 +30,11 @@ Intro text
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-##
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
+## What does the TAG leaderboard evaluate?
+In this leaderboard, you'll find execution accuracy comparisons of table question answering approaches on [TAG-Bench](https://github.com/TAG-Research/TAG-Bench/tree/main). TAG-Bench contains complex queries that require world knowledge or semantic reasoning beyond the information explicitly available in the database.
 
+## How is accuracy measured?
+Execution accuracy is measured as the number of exact matches to our annotated ground-truth answers, which are hand-labeled by experts.
 """
 
 EVALUATION_QUEUE_TEXT = """
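The updated LLM_BENCHMARKS_TEXT describes execution accuracy as exact matches against expert-labeled ground truth. As a rough illustration only (the TAG-Bench evaluation script defines the authoritative matching and normalization rules, which are not shown in this diff), an exact-match accuracy over answer strings could be computed like this:

def execution_accuracy(predictions, ground_truths):
    """Percent of predictions that exactly match the hand-labeled answer.

    Illustrative only: the real TAG-Bench scorer may normalize answers
    (case, whitespace, value formatting) before comparing.
    """
    assert len(predictions) == len(ground_truths)
    matches = sum(pred == gold for pred, gold in zip(predictions, ground_truths))
    return 100.0 * matches / len(ground_truths)

# Example: 1 of 2 answers matches -> 50.0
print(execution_accuracy(["Lionel Messi", "42"], ["Lionel Messi", "41"]))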
|