| import sys |
| import time |
| from datetime import datetime, timezone |
|
|
| import gradio as gr |
| from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns |
| import pandas as pd |
| import plotly.express as px |
| from apscheduler.schedulers.background import BackgroundScheduler |
| from huggingface_hub import snapshot_download |
|
|
| from src.about import ( |
| CITATION_BUTTON_LABEL, |
| CITATION_BUTTON_TEXT, |
| EVALUATION_QUEUE_TEXT, |
| INTRODUCTION_TEXT, |
| LLM_BENCHMARKS_TEXT, |
| TITLE, |
| ) |
| from src.display.css_html_js import custom_css |
| from src.display.utils import ( |
| BENCHMARK_COLS, |
| COLS, |
| EVAL_COLS, |
| EVAL_TYPES, |
| AutoEvalColumn, |
| ModelType, |
| fields, |
| WeightType, |
| Precision, |
| ) |
| from src.envs import ( |
| API, |
| EVAL_REQUESTS_PATH, |
| EVAL_RESULTS_PATH, |
| QUEUE_REPO, |
| REPO_ID, |
| RESULTS_REPO, |
| TOKEN, |
| WORKFLOWS, |
| get_eval_results_path, |
| get_eval_requests_path, |
| ) |
| from src.leaderboard.read_evals import clear_eval_cache |
| from src.populate import ( |
| get_evaluation_queue_df, |
| get_leaderboard_df, |
| get_trend_summary_df, |
| get_trend_history_df, |
| get_combined_trend_history_df, |
| get_combined_trend_summary_df, |
| ) |
| from src.submission.submit import add_new_eval |
|
|
| |
# When launched as `python app.py --local`, skip all HF Hub downloads and the
# periodic restart scheduler and work purely from local directories.
LOCAL_MODE = "--local" in sys.argv
|
|
|
|
def restart_space():
    """Restart this Space via the HF Hub API (last-resort recovery after repeated failures)."""
    API.restart_space(repo_id=REPO_ID)
|
|
|
|
| |
|
|
# Number of snapshot_download attempts before giving up and restarting the Space.
MAX_DOWNLOAD_RETRIES = 3
|
|
|
|
def download_with_retry(repo_id: str, local_dir: str, label: str) -> None:
    """Download a HF Hub dataset snapshot, retrying on failure.

    Makes up to ``MAX_DOWNLOAD_RETRIES`` attempts, sleeping with a linearly
    growing back-off (10s, 20s, ...) between attempts. Only once every
    attempt has failed is the Space restarted as a last resort.
    """
    attempt = 0
    while attempt < MAX_DOWNLOAD_RETRIES:
        attempt += 1
        print(f"Downloading {label} ({repo_id}) β attempt {attempt}/{MAX_DOWNLOAD_RETRIES}")
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,
                etag_timeout=30,
                token=TOKEN,
            )
        except Exception as err:
            print(f"WARNING: Failed to download {label}: {err}")
            if attempt >= MAX_DOWNLOAD_RETRIES:
                # All retries exhausted — recover by restarting the Space.
                print(f"ERROR: All {MAX_DOWNLOAD_RETRIES} download attempts failed for {label}. Restarting space.")
                restart_space()
            else:
                wait = 10 * attempt
                print(f" Retrying in {wait}s...")
                time.sleep(wait)
        else:
            # Success: snapshot is on disk, nothing more to do.
            return
|
|
|
|
# On the hosted Space, sync both eval datasets from the Hub at startup.
# In local mode we rely on whatever is already present on disk.
if not LOCAL_MODE:
    download_with_retry(QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests")
    download_with_retry(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
else:
    print("LOCAL MODE: skipping HF Hub downloads, using local eval-results/ and eval-queue/")
|
|
| |
# Workflow-specific results/requests directories derived from the base paths.
SINGLE_AGENT_RESULTS = get_eval_results_path("single_agent")
SINGLE_AGENT_REQUESTS = get_eval_requests_path("single_agent")
MULTI_AGENT_RESULTS = get_eval_results_path("multi_agent")
MULTI_AGENT_REQUESTS = get_eval_requests_path("multi_agent")

# Single-agent leaderboard table. "T" is filled with a 1-based sequential
# index per row (presumably a display rank column — confirm against
# src.display.utils.AutoEvalColumn).
LEADERBOARD_DF = get_leaderboard_df(SINGLE_AGENT_RESULTS, SINGLE_AGENT_REQUESTS, COLS, BENCHMARK_COLS)
if not LEADERBOARD_DF.empty:
    LEADERBOARD_DF["T"] = range(1, len(LEADERBOARD_DF) + 1)

# Multi-agent leaderboard table, built the same way.
LEADERBOARD_DF_MULTI = get_leaderboard_df(MULTI_AGENT_RESULTS, MULTI_AGENT_REQUESTS, COLS, BENCHMARK_COLS)
if not LEADERBOARD_DF_MULTI.empty:
    LEADERBOARD_DF_MULTI["T"] = range(1, len(LEADERBOARD_DF_MULTI) + 1)
|
|
| |
# Initial trend data. Failures degrade to empty frames so the app still boots;
# the Trends tab then renders its "no data" placeholder.
try:
    TREND_SUMMARY_DF = get_combined_trend_summary_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
    TREND_HISTORY_DF = get_combined_trend_history_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
except Exception as e:
    print(f"WARNING: Failed to load trend data: {e}")
    TREND_SUMMARY_DF = pd.DataFrame()
    TREND_HISTORY_DF = pd.DataFrame()
|
|
# Load the three evaluation-queue views (finished / running / pending).
# Failures degrade to empty frames with the expected columns so the Submit
# tab's accordions still render.
try:
    (
        finished_eval_queue_df,
        running_eval_queue_df,
        pending_eval_queue_df,
    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
except Exception as e:
    print(f"WARNING: Failed to load evaluation queue: {e}")
    _empty_queue = pd.DataFrame(columns=EVAL_COLS)
    # Give each view its own copy — previously `finished_eval_queue_df`
    # aliased `_empty_queue` itself while the other two were copies, so a
    # later mutation of one frame could leak into the template.
    finished_eval_queue_df = _empty_queue.copy()
    running_eval_queue_df = _empty_queue.copy()
    pending_eval_queue_df = _empty_queue.copy()
|
|
|
|
def build_trend_chart(history_df: pd.DataFrame, workflow_filter: str = "All"):
    """Build a Plotly line chart showing model scores over time.

    Parameters
    ----------
    history_df : pd.DataFrame
        Combined history with an optional ``workflow`` column.
    workflow_filter : str
        One of "All", "single_agent", or "multi_agent".

    Returns
    -------
    plotly.graph_objects.Figure
        Always returns a figure; empty/filtered-out data yields a
        placeholder chart so the UI never shows a broken plot.
    """

    def _placeholder(title: str):
        # Shared empty-state figure (previously duplicated inline twice).
        fig = px.line(title=title)
        fig.update_layout(xaxis_title="Date", yaxis_title="Average Score (%)")
        return fig

    if history_df.empty:
        return _placeholder("No historical data available yet")

    df = history_df.copy()

    # Narrow to a single workflow when requested and the column exists.
    if workflow_filter != "All" and "workflow" in df.columns:
        df = df[df["workflow"] == workflow_filter]

    if df.empty:
        return _placeholder(f"No data available for workflow: {workflow_filter}")

    # NOTE: the original also built an unused `df["label"]` column here
    # (model + workflow); it was dead code — px.line below is given explicit
    # x/y/color/line_dash columns — so it has been removed.
    if "workflow" in df.columns and workflow_filter == "All":
        # Color encodes the model; dash style distinguishes the workflow
        # (solid vs dashed), matching the explanation in the Trends tab.
        fig = px.line(
            df,
            x="eval_date",
            y="average",
            color="model",
            line_dash="workflow",
            markers=True,
            title="Model Performance Over Time",
            labels={
                "eval_date": "Evaluation Date",
                "average": "Average Score (%)",
                "model": "Model",
                "workflow": "Workflow",
            },
        )
    else:
        fig = px.line(
            df,
            x="eval_date",
            y="average",
            color="model",
            markers=True,
            title="Model Performance Over Time",
            labels={
                "eval_date": "Evaluation Date",
                "average": "Average Score (%)",
                "model": "Model",
            },
        )

    fig.update_layout(
        xaxis_title="Date",
        yaxis_title="Average Score (%)",
        legend_title="Model",
        hovermode="x unified",
        # Fixed y-range keeps charts comparable across refreshes; 105 leaves
        # headroom so markers at 100% are not clipped.
        yaxis=dict(range=[0, 105]),
    )
    return fig
|
|
|
|
def filter_trend_data(workflow_filter: str = "All"):
    """Filter the already-loaded trend data in memory (no re-download).

    Used by the workflow dropdown so that switching filters is instant.

    Parameters
    ----------
    workflow_filter : str
        One of "All", "single_agent", or "multi_agent".

    Returns
    -------
    tuple
        ``(figure, summary_df, timestamp)`` matching the Gradio outputs
        (trend chart, summary table, "last updated" textbox).
    """
    # This function only *reads* the module-level frames, so the original
    # `global` declaration was unnecessary and has been dropped.
    summary_df = TREND_SUMMARY_DF.copy() if not TREND_SUMMARY_DF.empty else pd.DataFrame()
    history_df = TREND_HISTORY_DF

    # The summary table uses a capitalized "Workflow" column (display-facing),
    # unlike the lowercase "workflow" column in the history frame.
    if workflow_filter != "All" and not summary_df.empty and "Workflow" in summary_df.columns:
        summary_df = summary_df[summary_df["Workflow"] == workflow_filter]

    chart = build_trend_chart(history_df, workflow_filter)
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    return chart, summary_df, timestamp
|
|
|
|
def refresh_trend_data(workflow_filter: str = "All"):
    """Re-download eval results from HF Hub and recompute trend data.

    Returns updated values for the trend chart, summary table, and a
    last-updated timestamp string. Errors during download or computation
    are caught so the UI never crashes.
    """
    global TREND_HISTORY_DF, TREND_SUMMARY_DF

    try:
        # In local mode the on-disk results are used as-is.
        if not LOCAL_MODE:
            download_with_retry(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")

        # Drop memoized eval results so the recomputation below sees the
        # freshly downloaded files.
        clear_eval_cache()

        summary = get_combined_trend_summary_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
        history = get_combined_trend_history_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
    except Exception as e:
        print(f"WARNING: Failed to refresh trend data: {e}")
        summary = pd.DataFrame()
        history = pd.DataFrame()

    TREND_SUMMARY_DF = summary
    TREND_HISTORY_DF = history

    # Reuse the in-memory filter path to produce the three Gradio outputs.
    return filter_trend_data(workflow_filter)
|
|
|
|
def init_leaderboard(dataframe):
    """Build a Leaderboard component from *dataframe*.

    A ``None`` or empty frame is replaced by an empty frame with the
    expected columns so the component still renders.
    """
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=COLS)

    # Resolve the column metadata once and derive the display groupings.
    column_fields = fields(AutoEvalColumn)
    shown_by_default = [c.name for c in column_fields if c.displayed_by_default]
    always_shown = [c.name for c in column_fields if c.never_hidden]
    hidden_cols = [c.name for c in column_fields if c.hidden]

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in column_fields],
        select_columns=SelectColumns(
            default_selection=shown_by_default,
            cant_deselect=always_shown,
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_cols,
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name,
                type="boolean",
                label="Deleted/incomplete",
                default=False,
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Component creation order inside the `with` blocks determines
# layout, so the structure below is intentionally left untouched.
# ---------------------------------------------------------------------------
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text", elem_id="cg-intro-block")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: single-agent leaderboard table.
        with gr.TabItem("π Single-Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        # Tab 1: multi-agent leaderboard table.
        with gr.TabItem("π€ Multi-Agent Benchmark", elem_id="multi-agent-benchmark-tab-table", id=1):
            leaderboard_multi = init_leaderboard(LEADERBOARD_DF_MULTI)

        # Tab 2: performance-over-time charts with a workflow filter and a
        # manual refresh button.
        with gr.TabItem("π Trends", elem_id="llm-benchmark-tab-trends", id=2):
            gr.Markdown(
                "### Performance Trends\n"
                "Track how model scores change over time across workflows. "
                "Averages are computed over available evaluation days within each window. "
                "The *(N/M)* annotation shows how many days of data were available.\n\n"
                "When viewing **All** workflows, single-agent results are shown with solid lines "
                "and multi-agent results with dashed lines.",
                elem_classes="markdown-text",
                elem_id="cg-trends-header",
            )
            with gr.Row(elem_id="cg-trend-controls"):
                workflow_filter = gr.Dropdown(
                    choices=["All", "single_agent", "multi_agent"],
                    value="All",
                    label="Workflow",
                    interactive=True,
                    scale=0,
                    min_width=180,
                )
                refresh_btn = gr.Button("π Refresh", scale=0, min_width=120, elem_id="cg-refresh-btn")
                last_updated_box = gr.Textbox(
                    label="Last updated",
                    value=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
                    interactive=False,
                    scale=1,
                )
            trend_chart = gr.Plot(value=build_trend_chart(TREND_HISTORY_DF), elem_id="cg-trend-chart")
            gr.Markdown("### Summary: 1-Day / 3-Day / 7-Day Averages", elem_id="cg-trend-summary-label")
            trend_table = gr.Dataframe(
                value=TREND_SUMMARY_DF,
                interactive=False,
                elem_id="cg-trend-summary",
            )

            # Dropdown change filters the in-memory data (instant, no download).
            workflow_filter.change(
                fn=filter_trend_data,
                inputs=[workflow_filter],
                outputs=[trend_chart, trend_table, last_updated_box],
            )

            # Refresh button re-downloads results and recomputes everything.
            refresh_btn.click(
                fn=refresh_trend_data,
                inputs=[workflow_filter],
                outputs=[trend_chart, trend_table, last_updated_box],
            )

        # Tab 3: static documentation about the benchmarks.
        with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text", elem_id="cg-about-content")

        # Tab 4: submission form plus the three queue-status accordions.
        with gr.TabItem("π Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text", elem_id="cg-submit-guide")

                with gr.Column():
                    # Queue counts are computed once at startup; the tables are
                    # not live-updated until the app restarts.
                    with gr.Accordion(
                        f"β Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                        elem_classes="cg-queue-accordion",
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"π Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                        elem_classes="cg-queue-accordion",
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"β³ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                        elem_classes="cg-queue-accordion",
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# βοΈβ¨ Submit your model here!", elem_classes="markdown-text", elem_id="cg-submit-heading")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
                    workflow_type = gr.Dropdown(
                        choices=["Both", "single_agent", "multi_agent"],
                        label="Workflow",
                        multiselect=False,
                        value="Both",
                        interactive=True,
                    )

            submit_button = gr.Button("Submit Eval", elem_id="cg-submit-btn")
            submission_result = gr.Markdown()
            # Positional arg order must match add_new_eval's signature.
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                    workflow_type,
                ],
                submission_result,
            )

    # Footer: copyable citation block shared by all tabs.
    with gr.Row(elem_id="cg-citation-section"):
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
|
|
# On the hosted Space, restart every 21600s (6 hours) — presumably so the
# startup path re-downloads fresh eval data; confirm against Space docs.
if not LOCAL_MODE:
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=21600)
    scheduler.start()

# Queue limits concurrent event handlers (e.g. simultaneous refresh clicks).
demo.queue(default_concurrency_limit=4).launch()
|
|