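"""Gradio front end for the agent-evaluation leaderboard Space.

Loads single-agent and multi-agent benchmark results (from the HF Hub, or
from local eval-results/ and eval-queue/ when started with --local) and
renders leaderboard, trends, about, and submission tabs. Outside local mode,
a background scheduler restarts the Space every 6 hours so all data is
reloaded fresh.
"""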
import sys
import time
from datetime import datetime, timezone
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
import plotly.express as px
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision,
)
from src.envs import (
API,
EVAL_REQUESTS_PATH,
EVAL_RESULTS_PATH,
QUEUE_REPO,
REPO_ID,
RESULTS_REPO,
TOKEN,
WORKFLOWS,
get_eval_results_path,
get_eval_requests_path,
)
from src.leaderboard.read_evals import clear_eval_cache
from src.populate import (
get_evaluation_queue_df,
get_leaderboard_df,
get_trend_summary_df,
get_trend_history_df,
get_combined_trend_history_df,
get_combined_trend_summary_df,
)
from src.submission.submit import add_new_eval
# --local flag: skip HF Hub downloads and scheduler, use local data only.
LOCAL_MODE = "--local" in sys.argv
def restart_space():
    """Restart this Space via the Hub API so it reboots and reloads all data."""
    API.restart_space(repo_id=REPO_ID)
### Space initialisation
MAX_DOWNLOAD_RETRIES = 3
def download_with_retry(repo_id: str, local_dir: str, label: str) -> None:
"""Download a HF Hub dataset with retries. Restarts the Space only after all retries are exhausted."""
for attempt in range(1, MAX_DOWNLOAD_RETRIES + 1):
try:
print(f"Downloading {label} ({repo_id}) β€” attempt {attempt}/{MAX_DOWNLOAD_RETRIES}")
snapshot_download(
repo_id=repo_id,
local_dir=local_dir,
repo_type="dataset",
tqdm_class=None,
etag_timeout=30,
token=TOKEN,
)
return # success
except Exception as e:
print(f"WARNING: Failed to download {label}: {e}")
if attempt < MAX_DOWNLOAD_RETRIES:
wait = 10 * attempt
print(f" Retrying in {wait}s...")
time.sleep(wait)
else:
print(f"ERROR: All {MAX_DOWNLOAD_RETRIES} download attempts failed for {label}. Restarting space.")
restart_space()
if not LOCAL_MODE:
download_with_retry(QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests")
download_with_retry(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
else:
print("LOCAL MODE: skipping HF Hub downloads, using local eval-results/ and eval-queue/")
# Load leaderboard data for each workflow
SINGLE_AGENT_RESULTS = get_eval_results_path("single_agent")
SINGLE_AGENT_REQUESTS = get_eval_requests_path("single_agent")
MULTI_AGENT_RESULTS = get_eval_results_path("multi_agent")
MULTI_AGENT_REQUESTS = get_eval_requests_path("multi_agent")
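# Each non-empty leaderboard gets a sequential 1-based "T" column (assumption:
# "T" serves as a simple rank/order column per the display config in
# src.display.utils).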
LEADERBOARD_DF = get_leaderboard_df(SINGLE_AGENT_RESULTS, SINGLE_AGENT_REQUESTS, COLS, BENCHMARK_COLS)
if not LEADERBOARD_DF.empty:
LEADERBOARD_DF["T"] = range(1, len(LEADERBOARD_DF) + 1)
LEADERBOARD_DF_MULTI = get_leaderboard_df(MULTI_AGENT_RESULTS, MULTI_AGENT_REQUESTS, COLS, BENCHMARK_COLS)
if not LEADERBOARD_DF_MULTI.empty:
LEADERBOARD_DF_MULTI["T"] = range(1, len(LEADERBOARD_DF_MULTI) + 1)
# Load combined trend data for the Trends tab
try:
TREND_SUMMARY_DF = get_combined_trend_summary_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
TREND_HISTORY_DF = get_combined_trend_history_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
except Exception as e:
print(f"WARNING: Failed to load trend data: {e}")
TREND_SUMMARY_DF = pd.DataFrame()
TREND_HISTORY_DF = pd.DataFrame()
try:
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
except Exception as e:
print(f"WARNING: Failed to load evaluation queue: {e}")
_empty_queue = pd.DataFrame(columns=EVAL_COLS)
finished_eval_queue_df = _empty_queue
running_eval_queue_df = _empty_queue.copy()
pending_eval_queue_df = _empty_queue.copy()
def build_trend_chart(history_df: pd.DataFrame, workflow_filter: str = "All"):
"""Build a Plotly line chart showing model scores over time.
Parameters
----------
history_df : pd.DataFrame
Combined history with an optional ``workflow`` column.
workflow_filter : str
One of "All", "single_agent", or "multi_agent".
"""
if history_df.empty:
fig = px.line(title="No historical data available yet")
fig.update_layout(
xaxis_title="Date",
yaxis_title="Average Score (%)",
)
return fig
df = history_df.copy()
# Apply workflow filter
if workflow_filter != "All" and "workflow" in df.columns:
df = df[df["workflow"] == workflow_filter]
if df.empty:
fig = px.line(title=f"No data available for workflow: {workflow_filter}")
fig.update_layout(xaxis_title="Date", yaxis_title="Average Score (%)")
return fig
    # When viewing all workflows, let color encode the model and dash style
    # encode the workflow; a combined model+workflow label is unnecessary.
    if "workflow" in df.columns and workflow_filter == "All":
fig = px.line(
df,
x="eval_date",
y="average",
color="model",
line_dash="workflow",
markers=True,
title="Model Performance Over Time",
labels={
"eval_date": "Evaluation Date",
"average": "Average Score (%)",
"model": "Model",
"workflow": "Workflow",
},
)
else:
fig = px.line(
df,
x="eval_date",
y="average",
color="model",
markers=True,
title="Model Performance Over Time",
labels={
"eval_date": "Evaluation Date",
"average": "Average Score (%)",
"model": "Model",
},
)
fig.update_layout(
xaxis_title="Date",
yaxis_title="Average Score (%)",
legend_title="Model",
hovermode="x unified",
yaxis=dict(range=[0, 105]),
)
return fig
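# Illustrative usage (not executed; toy values, column names as produced by
# get_combined_trend_history_df):
#
#     demo_df = pd.DataFrame({
#         "model": ["m1", "m1", "m2", "m2"],
#         "workflow": ["single_agent"] * 2 + ["multi_agent"] * 2,
#         "eval_date": ["2024-01-01", "2024-01-02"] * 2,
#         "average": [61.2, 63.5, 58.0, 59.4],
#     })
#     build_trend_chart(demo_df, "All")  # solid vs dashed lines per workflow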
def filter_trend_data(workflow_filter: str = "All"):
"""Filter the already-loaded trend data in memory (no re-download).
Used by the workflow dropdown so that switching filters is instant.
"""
    # Read-only access to the module-level frames (no rebinding, so no `global`).
    summary_df = TREND_SUMMARY_DF.copy() if not TREND_SUMMARY_DF.empty else pd.DataFrame()
    history_df = TREND_HISTORY_DF
if workflow_filter != "All" and not summary_df.empty and "Workflow" in summary_df.columns:
summary_df = summary_df[summary_df["Workflow"] == workflow_filter]
chart = build_trend_chart(history_df, workflow_filter)
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
return chart, summary_df, timestamp
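# Both UI callbacks below (the workflow dropdown's .change and the Refresh
# button's .click) reuse this (chart, summary table, timestamp) return
# contract for their outputs.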
def refresh_trend_data(workflow_filter: str = "All"):
"""Re-download eval results from HF Hub and recompute trend data.
Returns updated values for the trend chart, summary table, and a
last-updated timestamp string. Errors during download or computation
are caught so the UI never crashes.
"""
global TREND_HISTORY_DF, TREND_SUMMARY_DF
try:
if not LOCAL_MODE:
download_with_retry(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
# Clear the evaluation cache so fresh data is loaded from disk.
clear_eval_cache()
TREND_SUMMARY_DF = get_combined_trend_summary_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
TREND_HISTORY_DF = get_combined_trend_history_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
except Exception as e:
print(f"WARNING: Failed to refresh trend data: {e}")
TREND_SUMMARY_DF = pd.DataFrame()
TREND_HISTORY_DF = pd.DataFrame()
# Delegate to the lightweight filter function for the current view.
return filter_trend_data(workflow_filter)
def init_leaderboard(dataframe):
if dataframe is None or dataframe.empty:
# Show an empty leaderboard instead of crashing.
dataframe = pd.DataFrame(columns=COLS)
return Leaderboard(
value=dataframe,
datatype=[c.type for c in fields(AutoEvalColumn)],
select_columns=SelectColumns(
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
label="Select Columns to Display:",
),
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
filter_columns=[
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
ColumnFilter(
AutoEvalColumn.params.name,
type="slider",
min=0.01,
max=150,
label="Select the number of parameters (B)",
),
ColumnFilter(
AutoEvalColumn.still_on_hub.name,
type="boolean",
label="Deleted/incomplete",
default=False,
),
],
bool_checkboxgroup_label="Hide models",
interactive=False,
)
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text", elem_id="cg-intro-block")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ… Single-Agent Benchmark", elem_id="llm-benchmark-tab-table", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("🀝 Multi-Agent Benchmark", elem_id="multi-agent-benchmark-tab-table", id=1):
leaderboard_multi = init_leaderboard(LEADERBOARD_DF_MULTI)
with gr.TabItem("πŸ“ˆ Trends", elem_id="llm-benchmark-tab-trends", id=2):
gr.Markdown(
"### Performance Trends\n"
"Track how model scores change over time across workflows. "
"Averages are computed over available evaluation days within each window. "
"The *(N/M)* annotation shows how many days of data were available.\n\n"
"When viewing **All** workflows, single-agent results are shown with solid lines "
"and multi-agent results with dashed lines.",
elem_classes="markdown-text",
elem_id="cg-trends-header",
)
with gr.Row(elem_id="cg-trend-controls"):
workflow_filter = gr.Dropdown(
choices=["All", "single_agent", "multi_agent"],
value="All",
label="Workflow",
interactive=True,
scale=0,
min_width=180,
)
                refresh_btn = gr.Button("🔄 Refresh", scale=0, min_width=120, elem_id="cg-refresh-btn")
last_updated_box = gr.Textbox(
label="Last updated",
value=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC"),
interactive=False,
scale=1,
)
trend_chart = gr.Plot(value=build_trend_chart(TREND_HISTORY_DF), elem_id="cg-trend-chart")
gr.Markdown("### Summary: 1-Day / 3-Day / 7-Day Averages", elem_id="cg-trend-summary-label")
trend_table = gr.Dataframe(
value=TREND_SUMMARY_DF,
interactive=False,
elem_id="cg-trend-summary",
)
# Workflow filter change β€” just re-filter in memory (no re-download)
workflow_filter.change(
fn=filter_trend_data,
inputs=[workflow_filter],
outputs=[trend_chart, trend_table, last_updated_box],
)
# Manual refresh on button click
refresh_btn.click(
fn=refresh_trend_data,
inputs=[workflow_filter],
outputs=[trend_chart, trend_table, last_updated_box],
)
# Auto-refresh removed: the BackgroundScheduler restarts the Space
# every 6 hours, which reloads all data fresh. The hourly
# gr.Timer was calling snapshot_download() which blocks a worker
# and can freeze the UI on resource-constrained Spaces. Users
# can still click the Refresh button for on-demand updates.
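            # If lightweight auto-refresh is ever wanted again, a safe pattern
            # (sketch, untested) is a timer that only re-filters the in-memory
            # data and never touches the network:
            #
            #     timer = gr.Timer(3600)  # hypothetical interval in seconds
            #     timer.tick(
            #         fn=filter_trend_data,
            #         inputs=[workflow_filter],
            #         outputs=[trend_chart, trend_table, last_updated_box],
            #     )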
with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=3):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text", elem_id="cg-about-content")
with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text", elem_id="cg-submit-guide")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
elem_classes="cg-queue-accordion",
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
elem_classes="cg-queue-accordion",
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
elem_classes="cg-queue-accordion",
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Row():
gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text", elem_id="cg-submit-heading")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
)
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
workflow_type = gr.Dropdown(
choices=["Both", "single_agent", "multi_agent"],
label="Workflow",
multiselect=False,
value="Both",
interactive=True,
)
submit_button = gr.Button("Submit Eval", elem_id="cg-submit-btn")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
workflow_type,
],
submission_result,
)
with gr.Row(elem_id="cg-citation-section"):
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
if not LOCAL_MODE:
scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", hours=6)  # restart the Space every 6 hours to reload data
scheduler.start()
demo.queue(default_concurrency_limit=4).launch()