# lzzzzy's picture
# Inline rank label with toggle buttons
# d0472dc
import os
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, SelectColumns
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import BENCHMARK_COLS, COLS, AutoEvalColumn, fields
from src.envs import API, LOCAL_DEV, REPO_ID
from src.populate import get_commit_results_df
from src.submission.commit_results import SCORE_GROUPS, submit_commit_result
def restart_space():
    """Ask the Hub API to restart this Space.

    When ``LOCAL_DEV`` is truthy no API call is made; a notice is printed
    instead so local runs never touch the remote Space.
    """
    if not LOCAL_DEV:
        API.restart_space(repo_id=REPO_ID)
        return

    print("[LOCAL_DEV] Skip space restart.")
# Results store: a JSONL file that sits in the same directory as this module.
COMMIT_RESULTS_PATH = os.path.join(os.path.dirname(__file__), "commit_results.jsonl")
# Ranking mode used when none is specified; the UI toggles between "hard" and "easy".
DEFAULT_RANK_BY = "hard"
# Leaderboard table computed once at import time; it seeds the initial leaderboard view.
LEADERBOARD_DF = get_commit_results_df(COMMIT_RESULTS_PATH, COLS, BENCHMARK_COLS, rank_by=DEFAULT_RANK_BY)
def load_leaderboard_df(rank_by: str = DEFAULT_RANK_BY):
    """Build the leaderboard table for the requested ranking mode.

    Args:
        rank_by: Ranking mode forwarded to ``get_commit_results_df``;
            defaults to ``DEFAULT_RANK_BY``.

    Returns:
        Whatever ``get_commit_results_df`` produces for the results file
        (presumably a pandas DataFrame, since it is fed to the leaderboard).
    """
    refreshed_table = get_commit_results_df(
        COMMIT_RESULTS_PATH,
        COLS,
        BENCHMARK_COLS,
        rank_by=rank_by,
    )
    return refreshed_table
def get_rank_button_updates(rank_by: str):
    """Return the style updates for the "Hard" and "Easy" toggle buttons.

    The button matching the active mode is rendered as ``"primary"``; the
    other one as ``"secondary"``.

    Args:
        rank_by: Active ranking mode. A falsy value falls back to
            ``DEFAULT_RANK_BY``; comparison is case-insensitive.

    Returns:
        A 2-tuple ``(hard_button_update, easy_button_update)`` of
        ``gr.update`` payloads.
    """
    active_mode = (rank_by if rank_by else DEFAULT_RANK_BY).lower()
    variants = [
        "primary" if active_mode == mode else "secondary"
        for mode in ("hard", "easy")
    ]
    return tuple(gr.update(variant=variant) for variant in variants)
def set_rank_by(rank_by: str):
    """Switch the leaderboard to a ranking mode.

    Args:
        rank_by: Ranking mode to activate.

    Returns:
        A 4-tuple ``(table, hard_button_update, easy_button_update, rank_by)``
        matching the outputs wired to the rank toggle buttons; the last item
        is the mode exactly as it was passed in.
    """
    # The tuple is evaluated left to right, so the table is loaded before the
    # button updates are computed.
    return (
        load_leaderboard_df(rank_by),
        *get_rank_button_updates(rank_by),
        rank_by,
    )
def submit_eval_result(rank_by: str, model_name: str, *score_values):
    """Record a submitted result, then rebuild the leaderboard table.

    Args:
        rank_by: Ranking mode currently selected in the UI; used only to
            rebuild the table after the submission.
        model_name: Name entered in the "Model name" textbox.
        *score_values: Raw score inputs, forwarded unchanged to
            ``submit_commit_result``.

    Returns:
        A 2-tuple ``(submission_message, table)``.
    """
    outcome_message = submit_commit_result(
        COMMIT_RESULTS_PATH,
        model_name,
        *score_values,
    )
    refreshed_table = load_leaderboard_df(rank_by)
    return outcome_message, refreshed_table
def init_leaderboard(dataframe: pd.DataFrame):
    """Create the ``Leaderboard`` component for the given table.

    Args:
        dataframe: Table to display. ``None`` or an empty frame is replaced
            by an empty frame whose columns are ``COLS`` (a warning is
            printed in that case).

    Returns:
        A non-interactive ``Leaderboard`` in which every column is typed as
        ``"str"`` and the model column is searchable.
    """

    def column_names_where(flag: str):
        # Names of the AutoEvalColumn fields whose `flag` attribute is truthy.
        return [column.name for column in fields(AutoEvalColumn) if getattr(column, flag)]

    if dataframe is not None and not dataframe.empty:
        table = dataframe
    else:
        print("[WARN] Leaderboard is empty. Showing an empty table with predefined columns.")
        table = pd.DataFrame(columns=COLS)

    column_selector = SelectColumns(
        default_selection=column_names_where("displayed_by_default"),
        cant_deselect=column_names_where("never_hidden"),
        label="Select Columns to Display:",
    )

    return Leaderboard(
        value=table,
        datatype=["str" for _ in table.columns],
        select_columns=column_selector,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=column_names_where("hidden"),
        filter_columns=[],
        interactive=False,
    )
# Build the Gradio UI: a title/intro header, three tabs (benchmark table,
# about text, submission form) and a collapsible citation box.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🏅 Benchmark", elem_id="benchmark-tab-table", id=0):
            # Holds the active ranking mode so that a submission refreshes the
            # table in the mode the user is currently viewing.
            rank_state = gr.State(DEFAULT_RANK_BY)
            with gr.Column(elem_id="benchmark-board-shell"):
                with gr.Row(elem_id="benchmark-rank-toolbar"):
                    gr.HTML('<span class="rank-by-inline-text">Rank By:</span>', elem_classes="rank-by-label")
                    hard_rank_button = gr.Button(
                        "Hard",
                        variant="primary",
                        elem_classes="rank-toggle-button",
                    )
                    easy_rank_button = gr.Button(
                        "Easy",
                        variant="secondary",
                        elem_classes="rank-toggle-button",
                    )
                leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 About", elem_id="benchmark-tab-about", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here!", elem_id="benchmark-tab-submit", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
            model_name_textbox = gr.Textbox(
                label="Model name",
                placeholder="e.g. Nano-Banana-Pro",
            )

            # Flat list of score textboxes, appended as (hard, easy) pairs in
            # SCORE_GROUPS order; this is the order in which the values reach
            # submit_eval_result as *score_values.
            score_inputs = []
            for section_title, _, _, labels in SCORE_GROUPS:
                with gr.Row(elem_classes="score-scroll-row"):
                    with gr.Column(scale=0, min_width=150, elem_classes="score-section-chip"):
                        gr.Markdown(section_title, elem_classes="score-section-chip-text")
                    for label in labels:
                        with gr.Column(scale=0, min_width=230, elem_classes="score-pair-card"):
                            gr.Markdown(label, elem_classes="score-pair-label")
                            with gr.Row(elem_classes="score-inline-inputs"):
                                with gr.Column(scale=0, min_width=88, elem_classes="score-inline-field"):
                                    gr.Markdown("Hard", elem_classes="score-inline-field-label")
                                    hard_input = gr.Textbox(
                                        show_label=False,
                                        container=False,
                                        placeholder="0.0",
                                        max_lines=1,
                                        scale=0,
                                        min_width=88,
                                        elem_classes="score-value-input",
                                    )
                                with gr.Column(scale=0, min_width=88, elem_classes="score-inline-field"):
                                    gr.Markdown("Easy", elem_classes="score-inline-field-label")
                                    easy_input = gr.Textbox(
                                        show_label=False,
                                        container=False,
                                        placeholder="0.0",
                                        max_lines=1,
                                        scale=0,
                                        min_width=88,
                                        elem_classes="score-value-input",
                                    )
                        score_inputs.extend([hard_input, easy_input])

            submit_button = gr.Button("Submit Result")
            submission_result = gr.Markdown()
            submit_button.click(
                submit_eval_result,
                [rank_state, model_name_textbox, *score_inputs],
                [submission_result, leaderboard],
            )

    # Rank toggles: each click reloads the table, restyles both buttons and
    # stores the chosen mode in rank_state.
    hard_rank_button.click(
        lambda: set_rank_by("hard"),
        outputs=[leaderboard, hard_rank_button, easy_rank_button, rank_state],
    )
    easy_rank_button.click(
        lambda: set_rank_by("easy"),
        outputs=[leaderboard, hard_rank_button, easy_rank_button, rank_state],
    )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
# When deployed, restart the Space every 1800 seconds (30 minutes) from a
# background scheduler; in local development the scheduler is never started.
scheduler = BackgroundScheduler()
if not LOCAL_DEV:
    scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()
else:
    print("[LOCAL_DEV] Disable periodic space restart scheduler.")

# In Spaces, Gradio must listen on a container-visible address.
# `or` is used instead of a getenv default so that a variable that is set but
# empty (e.g. `GRADIO_SERVER_PORT=`) also falls back, rather than crashing on
# int("") for the port.
server_name = os.getenv("GRADIO_SERVER_NAME") or ("127.0.0.1" if LOCAL_DEV else "0.0.0.0")
server_port = int(os.getenv("GRADIO_SERVER_PORT") or "7860")

demo.queue(default_concurrency_limit=40).launch(
    server_name=server_name,
    server_port=server_port,
)