TAG-Leaderboard

Running

App Files Files Community

TAG-Leaderboard / app.py

abiswal

update

83bd87e 10 months ago

raw

history blame contribute delete

6.38 kB

	import gradio as gr
	import pandas as pd

	from src.about import (
	CITATION_BUTTON_LABEL,
	CITATION_BUTTON_TEXT,
	EVALUATION_QUEUE_TEXT,
	INTRODUCTION_TEXT,
	LLM_BENCHMARKS_TEXT,
	TITLE,
	)

	data = {
	"Method": [
	"Human Performance (Handwritten LOTUS Llama-3.1-70B)",
	"Zero-shot Text2SQL (Llama-3.1-70B)",
	"Zero-shot Text2SQL + LM Generation (Llama-3.1-70B)",
	"RAG (E5 + Llama-3.1-70B)",
	"RAG (E5) + LM Rerank (Llama-3.1-70B)",
	"Human Performance (Handwritten LOTUS GPT-4o)",
	"Zero-shot Text2SQL (GPT-4o)",
	"Zero-shot Text2SQL + LM Generation (GPT-4o)",
	"RAG (E5 + GPT-4o)",
	"RAG (E5) + LM Rerank (GPT-4o)",
	"Human Performance (Handwritten LOTUS o3-mini)",
	"Zero-shot Text2SQL (o3-mini)",
	"Zero-shot Text2SQL + LM Generation (o3-mini)",
	"RAG (E5 + o3-mini)",
	"RAG (E5) + LM Rerank (o3-mini)",
	"Zero-shot Text2SQL (Deepseek-R1)",
	"Zero-shot Text2SQL + LM Generation (Deepseek-R1)",
	],
	# "Model": ["meta-llama/Llama-3.1-70B"] * 5,
	"Execution Accuracy": [55.0, 17.0, 13.0, 0.0, 2.0, 55.0, 18.0, 15.0, 3.0, 3.0, 65.0, 18.0, 30.0, 7.0, 7.0, 12.0, 0.0],
	# "Execution Accuracy": [0.0, 2.0, 55.0, 18.0, 3.0, 3.0, 65.0, 18.0, 7.0, 7.0, 12.0],
	}

	leaderboard_df = pd.DataFrame(data)

	leaderboard_df = leaderboard_df.sort_values(
	"Execution Accuracy", ascending=False
	).reset_index(drop=True)
	leaderboard_df.insert(0, "Rank", leaderboard_df.index - 2)
	leaderboard_df.loc[0, "Rank"] = ""
	leaderboard_df.loc[1, "Rank"] = ""
	leaderboard_df.loc[2, "Rank"] = ""

	def hyperlink_method(method):
	base_url = "https://github.com/TAG-Research/TAG-Bench/tree/main"
	return f'<a href="{base_url}" target="_blank">{method}</a>'

	def hyperlink_model(model):
	base_url = "https://huggingface.co/meta-llama/Llama-3.1-70B"
	return f'<a href="{base_url}" target="_blank">{model}</a>'

	leaderboard_df["Method"] = leaderboard_df["Method"].apply(hyperlink_method)

	def highlight_row(row):
	if row["Rank"] == "": # First row
	return ["background-color: #d4edda; font-weight: bold;" for _ in row]
	return [""] * len(row)


	# Apply the style
	leaderboard_df = leaderboard_df.style.apply(highlight_row, axis=1)



	# leaderboard_df["Model"] = leaderboard_df["Model"].apply(hyperlink_model)


	with gr.Blocks() as demo:
	gr.HTML(
	"""
	<div style="text-align: center;">
	<h1 style="font-size: 2.5rem; margin-bottom: 0.5rem;">TAG Leaderboard</h1>
	<p style="font-size: 1.25rem; color: gray;">A benchmark for natural language queries over data</p>
	</div>
	"""
	)

	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
	with gr.Row():
	gr.Dataframe(
	value=leaderboard_df,
	headers=["Rank", "Method", "Execution Accuracy"],
	datatype=["str", "html", "number"],
	row_count=(5, "dynamic"),
	wrap=True,
	elem_id="leaderboard",
	type="pandas"
	)

	with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
	gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

	with gr.TabItem("🚀 Submission Instructions ", elem_id="llm-benchmark-tab-table", id=3):
	with gr.Accordion("1️⃣ Required Materials", open=True):
	gr.Markdown(
	"""
	Ensure the following files are included in your submission:
	- output.json: File containing the evaluation outputs generated by your model. Please refer to [] for format instructions.
	- requirements.txt: A list of dependencies needed to run your model or script.
	- README.md: A detailed description of your submission, including:
	- Purpose and overview of the submission.
	- Instructions to reproduce the results.
	- Any additional notes for evaluators.
	- Model/Keys: Upload your models or API keys to [Hugging Face](https://huggingface.co/) if they are not publicly accessible.

	Note: Submissions missing any of these materials will not be processed.
	"""
	)

	# Section 2: Submission Frequency
	with gr.Accordion("2️⃣ Submission Frequency", open=True):
	gr.Markdown(
	"""
	- Submissions are accepted once a month to ensure sufficient evaluation bandwidth.
	- Plan your submission timeline accordingly to avoid delays.
	"""
	)

	# Section 3: How to Upload Materials
	with gr.Accordion("3️⃣ How to Upload Materials", open=True):
	gr.Markdown(
	"""
	Follow these steps to upload your materials:
	1. Compress all files in the code into a single `.zip` file, or provide a public repository to refer to.
	2. Email the `.zip` file or repositoty link to our email tagbenchmark@gmail.com.
	"""
	)

	# Section 4: Submission Process
	with gr.Accordion("4️⃣ Submission Process", open=True):
	gr.Markdown(
	"""
	After uploading your materials:
	-
	- Provide accurate contact information for follow-ups.
	- Double-check your materials for completeness to avoid processing delays.

	Important: Your submission will be added to the evaluation queue. Depending on the queue size, evaluations may take up to a few weeks.
	"""
	)

	# Footer
	gr.Markdown(
	"""
	<div style="text-align: center; margin-top: 2rem;">
	For further assistance, reach out to tagbenchmark@gmail.com with questions.
	</div>
	"""
	)


	demo.launch()