eval-leaderboard

Sleeping

App Files Files Community

eval-leaderboard / app.py

jwilles

Merge branch 'main' of hf.co:spaces/vector-institute/llm-eval-leaderboard

3159db8 10 months ago

raw

history blame

5.2 kB

	import gradio as gr
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import snapshot_download
	import pandas as pd

	from src.about import (
	REPRODUCIBILITY_TEXT,
	INTRODUCTION_TEXT,
	ABOUT_TEXT,
	TITLE,
	)
	from src.display.css_html_js import custom_css, custom_js
	from src.display.utils import (
	COLS,
	ST_BENCHMARK_COLS,
	AGENTIC_BENCHMARK_COLS,
	EVAL_COLS,
	AutoEvalColumn,
	fields,
	)
	from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
	from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
	from src.submission.submit import add_new_eval


	def restart_space():
	    """Restart this Space through the Hub API client.

	    Forwards to ``API.restart_space`` for the repository named by ``REPO_ID``.
	    """
	    API.restart_space(repo_id=REPO_ID)

	### Space initialisation
	# Download the request-queue dataset and the results dataset into their local
	# directories. A failed download still triggers a Space restart, but the
	# exception is printed first so the cause is not silently discarded.
	for _repo_id, _local_dir in (
	    (QUEUE_REPO, EVAL_REQUESTS_PATH),
	    (RESULTS_REPO, EVAL_RESULTS_PATH),
	):
	    try:
	        print(_local_dir)
	        snapshot_download(
	            repo_id=_repo_id,
	            local_dir=_local_dir,
	            repo_type="dataset",
	            tqdm_class=None,
	            etag_timeout=30,
	            token=TOKEN,
	        )
	    except Exception as exc:
	        print(f"Could not download dataset {_repo_id!r}: {exc!r}. Restarting the Space.")
	        restart_space()


	# One leaderboard frame per benchmark column group, both built from the same
	# results / requests directories and the shared COLS list.
	ST_LEADERBOARD_DF = get_leaderboard_df(
	    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS
	)
	AGENTIC_LEADERBOARD_DF = get_leaderboard_df(
	    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS
	)

	# The queue helper returns three values, unpacked here in
	# finished / running / pending order.
	finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
	    EVAL_REQUESTS_PATH, EVAL_COLS
	)

	def bold_max(s):
	    """Return one CSS string per element of ``s``, bolding the maximum.

	    Every element equal to ``s.max()`` maps to ``'font-weight: bold'``; all
	    other elements map to the empty string. Ties are all bolded.
	    """
	    peak = s.max()
	    bold, plain = "font-weight: bold", ""
	    return [bold if hit else plain for hit in (s == peak)]

	def init_leaderboard(df, benchmark_type):
	    """Build the Gradio table component for one leaderboard tab.

	    Args:
	        df: Leaderboard data to display. Must be a non-empty DataFrame.
	        benchmark_type: ``"agentic"`` keeps the "Agent" column alongside
	            "Model"; ``"base"`` removes the "Agent" column from the displayed
	            data. Any other value leaves ``df`` untouched and treats only
	            "Model" as a non-task column.

	    Returns:
	        A ``gr.components.Dataframe`` showing ``df``. Datatypes and column
	        widths come from the ``AutoEvalColumn`` fields that are either
	        non-task columns or whose ``TASK_NAME_INVERSE_MAP`` entry has a
	        ``"type"`` equal to ``benchmark_type``.

	    Raises:
	        ValueError: If ``df`` is ``None`` or empty.
	    """
	    if df is None or df.empty:
	        raise ValueError("Leaderboard DataFrame is empty or None.")

	    non_task_cols = ["Model"]
	    if benchmark_type == "agentic":
	        # Include agent column
	        non_task_cols.append("Agent")
	    elif benchmark_type == "base":
	        # Drop agent column from the frame that is actually displayed.
	        # drop() returns a new DataFrame, so the caller's object is not
	        # modified; errors="ignore" tolerates a frame without the column.
	        df = df.drop(columns=["Agent"], errors="ignore")

	    column_subset = [
	        c
	        for c in fields(AutoEvalColumn)
	        if c.name in non_task_cols
	        or TASK_NAME_INVERSE_MAP.get(c.name, {}).get("type", "") == benchmark_type
	    ]

	    return gr.components.Dataframe(
	        value=df,
	        datatype=[c.type for c in column_subset],
	        # The model name column gets extra room; every other column is 150px.
	        column_widths=["250px" if c.name == "Model" else "150px" for c in column_subset],
	        wrap=False,
	    )

	# Logo files referenced by the page header markup and later whitelisted via
	# launch(allowed_paths=...).
	black_logo_path = "src/assets/logo-icon-black.png"
	white_logo_path = "src/assets/logo-icon-white.png"

	# Default Gradio theme with pink as the primary hue.
	_app_theme = gr.themes.Default(primary_hue=gr.themes.colors.pink)

	# Root Blocks container; it is populated inside the `with demo:` section.
	demo = gr.Blocks(
	    css=custom_css,
	    js=custom_js,
	    theme=_app_theme,
	    fill_height=True,
	    fill_width=True,
	)
	# Page header markup: both logo variants on the left, title and tagline in
	# the centre, and an empty right-hand container.
	page_header_html = f"""
	<div id="page-header">
	<div id="header-container">
	<div id="left-container">
	<img id="black-logo" src="/gradio_api/file={black_logo_path}">
	<img id="white-logo" src="/gradio_api/file={white_logo_path}">
	</div>
	<div id="centre-container">
	<h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
	<p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results & Traces</p>
	</div>
	<div id="right-container">
	</div>
	</div>
	</div>
	"""

	with demo:
	    gr.HTML(page_header_html)
	    gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)

	    with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
	        # Tabs 0-1: one leaderboard table per benchmark type.
	        leaderboard_tabs = (
	            ("Base Benchmarks", ST_LEADERBOARD_DF, "base"),
	            ("Agentic Benchmarks", AGENTIC_LEADERBOARD_DF, "agentic"),
	        )
	        for tab_id, (tab_label, tab_df, tab_kind) in enumerate(leaderboard_tabs):
	            with gr.TabItem(tab_label, elem_classes="llm-benchmark-tab-table", id=tab_id):
	                leaderboard = init_leaderboard(tab_df, tab_kind)

	        # Tabs 2-3: static Markdown pages.
	        text_tabs = (
	            ("About", ABOUT_TEXT),
	            ("Reproducibility", REPRODUCIBILITY_TEXT),
	        )
	        for tab_id, (tab_label, tab_text) in enumerate(text_tabs, start=2):
	            with gr.TabItem(tab_label, elem_classes="llm-benchmark-tab-table", id=tab_id):
	                gr.Markdown(tab_text, elem_classes="markdown-text", sanitize_html=False)

	# Files passed to launch() as allowed paths: the two header logos.
	assets = [black_logo_path, white_logo_path]

	# Schedule restart_space on an interval trigger every 1800 seconds (30 minutes).
	scheduler = BackgroundScheduler()
	scheduler.add_job(restart_space, trigger="interval", seconds=1800)
	scheduler.start()

	# Enable the request queue (default concurrency limit 40), then start the app.
	queued_demo = demo.queue(default_concurrency_limit=40)
	queued_demo.launch(allowed_paths=assets)