leaderboard update
- app.py +17 -7
- src/about.py +1 -1
app.py
CHANGED
@@ -12,7 +12,7 @@ from src.about import (
 
 data = {
     "Method": [
-        "Handwritten
+        "Handwritten LOTUS (llama-3.1-70B)",
         "Zero-shot Text2SQL (llama-3.1-70B)",
         "Zero-shot Text2SQL + LM Generation (llama-3.1-70B)",
         "RAG (E5)",
@@ -27,7 +27,17 @@ leaderboard_df = pd.DataFrame(data)
 leaderboard_df = leaderboard_df.sort_values(
     "Execution Accuracy", ascending=False
 ).reset_index(drop=True)
-leaderboard_df.insert(0, "Rank", leaderboard_df.index +
+leaderboard_df.insert(0, "Rank", leaderboard_df.index + 2)
+leaderboard_df.loc[0, "Rank"] = None
+
+def highlight_row(row):
+    if pd.isna(row["Rank"]):  # First row
+        return ["background-color: #d4edda; font-weight: bold;" for _ in row]
+    return [""] * len(row)
+
+
+# Apply the style
+leaderboard_df = leaderboard_df.style.apply(highlight_row, axis=1)
 
 
 def hyperlink_method(method):
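The added ranking logic pins the top row (the handwritten LOTUS baseline) as an unranked, highlighted reference, while rows below it keep offset ranks. A minimal standalone sketch of the same pandas Styler pattern, with made-up methods and scores (only the column names mirror the leaderboard's):

```python
import pandas as pd

# Illustrative data only; column names mirror the leaderboard's.
df = pd.DataFrame({
    "Method": ["Reference pipeline", "Method A", "Method B"],
    "Execution Accuracy": [0.60, 0.40, 0.30],
})
df = df.sort_values("Execution Accuracy", ascending=False).reset_index(drop=True)

# Offset ranks by 2 (as in the change above), then blank out the top row's rank.
df.insert(0, "Rank", df.index + 2)
df.loc[0, "Rank"] = None

def highlight_row(row):
    # Styler.apply(axis=1) expects one CSS string per cell in the row.
    if pd.isna(row["Rank"]):  # the unranked reference row
        return ["background-color: #d4edda; font-weight: bold;"] * len(row)
    return [""] * len(row)

styled = df.style.apply(highlight_row, axis=1)  # render via styled.to_html()
```

Note that after `.style.apply(...)`, `leaderboard_df` is a `Styler`, not a `DataFrame`, so any later DataFrame operations would need to go through `styled.data`.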
@@ -87,7 +97,7 @@ with gr.Blocks() as demo:
         )
 
     # Section 2: Submission Frequency
-    with gr.Accordion("2️⃣ Submission Frequency", open=
+    with gr.Accordion("2️⃣ Submission Frequency", open=True):
         gr.Markdown(
             """
             - Submissions are accepted **once a month** to ensure sufficient evaluation bandwidth.
@@ -96,17 +106,17 @@ with gr.Blocks() as demo:
         )
 
     # Section 3: How to Upload Materials
-    with gr.Accordion("3️⃣ How to Upload Materials", open=
+    with gr.Accordion("3️⃣ How to Upload Materials", open=True):
         gr.Markdown(
             """
             Follow these steps to upload your materials:
             1. Compress all files in the code into a single `.zip` file, or provide a public repository to refer to.
-            2. Email the `.zip` file or repositoty link to our email
+            2. Email the `.zip` file or repository link to our email tagbenchmark@gmail.com.
             """
         )
 
     # Section 4: Submission Process
-    with gr.Accordion("4️⃣ Submission Process", open=
+    with gr.Accordion("4️⃣ Submission Process", open=True):
         gr.Markdown(
             """
             After uploading your materials:
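The accordion changes above set `open=True` so each submission section renders expanded on page load (the previous value is truncated in the diff). A minimal sketch of the pattern, with the section text abridged:

```python
import gradio as gr

with gr.Blocks() as demo:
    # open=True renders the accordion expanded on page load.
    with gr.Accordion("2️⃣ Submission Frequency", open=True):
        gr.Markdown("- Submissions are accepted **once a month**.")

demo.launch()
```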
@@ -122,7 +132,7 @@ with gr.Blocks() as demo:
     gr.Markdown(
         """
         <div style="text-align: center; margin-top: 2rem;">
-        For further assistance, reach out to
+        For further assistance, reach out to tagbenchmark@gmail.com with questions.
         </div>
         """
     )
|
src/about.py
CHANGED
@@ -31,7 +31,7 @@ Intro text
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## What does the TAG leaderboard evaluate?
-In this leaderboard, you'll find execution accuracy comparisons of table question answering approaches on [TAG-Bench]
+In this leaderboard, you'll find execution accuracy comparisons of table question answering approaches on [TAG-Bench](https://github.com/TAG-Research/TAG-Bench/tree/main). TAG-Bench contains complex queries requiring world knowledge or semantic reasoning that goes beyond the information explicitly available in the database.
 
 ## How is accuracy measured?
 Execution accuracy is measured as the number of exact matches to our annotated ground truth answers which are hand-labeled by experts.
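The "How is accuracy measured?" text defines execution accuracy as exact matches against expert-labeled ground truth. A minimal sketch of such a metric, assuming answers compare as normalized strings (the helper and its normalization are illustrative, not TAG-Bench's actual scoring code):

```python
def execution_accuracy(predictions: list[str], ground_truths: list[str]) -> float:
    """Fraction of predictions that exactly match the annotated answer."""
    # Whitespace/case normalization is an assumption for illustration.
    matches = sum(
        pred.strip().lower() == gold.strip().lower()
        for pred, gold in zip(predictions, ground_truths)
    )
    return matches / len(ground_truths)

print(execution_accuracy(["42", "Paris"], ["42", "London"]))  # 0.5
```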