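"""Gradio app for the TRUEBench leaderboard Space.

Downloads the evaluation queue and results datasets from the Hugging Face Hub,
renders per-category and per-language leaderboard tabs, and exposes two model
submission flows (with and without vLLM inference).
"""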
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.data_utils import get_dataframe_category, get_dataframe_language
import src.config as configs
from utils import get_profile_and_organizations, download_with_restart
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_OPTION1,
    EVALUATION_QUEUE_TEXT_OPTION2,
    EVALUATION_QUEUE_TEXT_OPTION3,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import EVAL_COLS, Precision
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df
from src.submission.submit import add_new_eval_option1, add_new_eval_option2
from handlers import (
    search_leaderboard,
    update_modelselector_group,
    update_leaderboard,
)
from ui import create_leaderboard_tab
from constants import TAB_KEYS, VLLM_VERSIONS


def restart_space():
    """Restart this Hugging Face Space via the Hub API."""
    API.restart_space(repo_id=REPO_ID)
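
# `download_with_restart` (imported from utils) is expected to behave roughly
# like this sketch: attempt the download, and if the local copy cannot be
# refreshed, restart the Space so it comes back up in a clean state. This is
# an illustration only; the actual implementation lives in utils.py.
#
#     def download_with_restart(download_func, restart_func=None, **kwargs):
#         try:
#             download_func(**kwargs)
#         except Exception:
#             restart_func()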
### Space initialisation: sync the evaluation queue and results datasets from the Hub.
download_with_restart(
    snapshot_download,
    repo_id=QUEUE_REPO,
    local_dir=EVAL_REQUESTS_PATH,
    repo_type="dataset",
    token=TOKEN,
    restart_func=restart_space,
)
download_with_restart(
    snapshot_download,
    repo_id=RESULTS_REPO,
    local_dir=EVAL_RESULTS_PATH,
    repo_type="dataset",
    token=TOKEN,
    restart_func=restart_space,
)
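
# Split the downloaded evaluation queue into finished / running / pending
# entries (these dataframes are prepared here but are not rendered by the
# UI below in this version of the app).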
(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    user_state = gr.State()
    organization_state = gr.State()

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # One leaderboard tab per entry in TAB_KEYS (category- or language-grouped).
        for key in TAB_KEYS:
            if key == "Category":
                df = get_dataframe_category()
                column_selector_value = configs.ON_LOAD_COLUMNS_CATEGORY[3:]
            else:
                df = get_dataframe_language()
                column_selector_value = configs.ON_LOAD_COLUMNS_LANG[3:]
            create_leaderboard_tab(
                df,
                key,
                search_leaderboard,
                update_modelselector_group,
                update_leaderboard,
                column_selector_value,
            )
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")

            # Option 1: the model can be served with vLLM, so only metadata and
            # sampling parameters are needed.
            with gr.Row():
                gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is available)", elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    benchmark_type = gr.Dropdown(
                        choices=["TRUEBench v0.1"],
                        label="The name of the benchmark to be evaluated",
                        multiselect=False,
                        value="TRUEBench v0.1",
                        interactive=True,
                    )
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
                    vllm_version_type = gr.Dropdown(
                        choices=VLLM_VERSIONS,
                        label="vLLM version",
                        multiselect=False,
                        value="v0.9.2",
                        interactive=True,
                    )
                with gr.Column():
                    temperature_textbox = gr.Textbox(label="Sampling temperature (default: 1.0)", placeholder="1.0")
                    top_p_textbox = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
                    top_k_textbox = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
                    presence_penalty_textbox = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
                    frequency_penalty_textbox = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
                    repetition_penalty_textbox = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")

            login_button = gr.LoginButton()
            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            # Resolve the logged-in user and their organizations first, then file the submission.
            event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
            event.then(
                add_new_eval_option1,
                [
                    benchmark_type,
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    temperature_textbox,
                    top_p_textbox,
                    top_k_textbox,
                    presence_penalty_textbox,
                    frequency_penalty_textbox,
                    repetition_penalty_textbox,
                    vllm_version_type,
                    user_state,
                    organization_state,
                ],
                submission_result,
            )
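
            # Note: all sampling-parameter fields above are plain text; they are
            # presumably parsed and validated inside add_new_eval_option1 /
            # add_new_eval_option2 (src/submission/submit.py) before being queued.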
            # Option 2: vLLM cannot serve the model, so the submitter also
            # provides code snippets for loading, inference, and termination.
            with gr.Row():
                gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION2, elem_classes="markdown-text")
            with gr.Row():
                gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is unavailable)", elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    benchmark_type2 = gr.Dropdown(
                        choices=["TRUEBench v0.1"],
                        label="The name of the benchmark to be evaluated",
                        multiselect=False,
                        value="TRUEBench v0.1",
                        interactive=True,
                    )
                    model_name_textbox2 = gr.Textbox(label="Model name")
                    revision_name_textbox2 = gr.Textbox(label="Revision commit", placeholder="main")
                    precision2 = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    base_model_name_textbox2 = gr.Textbox(label="Base model (for delta or adapter weights)")
                with gr.Column():
                    temperature_textbox2 = gr.Textbox(label="Sampling temperature (default: 1.0)", placeholder="1.0")
                    top_p_textbox2 = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
                    top_k_textbox2 = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
                    presence_penalty_textbox2 = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
                    frequency_penalty_textbox2 = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
                    repetition_penalty_textbox2 = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
            with gr.Row():
                with gr.Column():
                    model_load_code_snippet_textbox = gr.Textbox(
                        label="Code for model loading",
                        lines=15,
                        placeholder="model = AutoModel.from_pretrained('your model name', revision=revision)",
                    )
                with gr.Column():
                    inference_code_snippet_textbox = gr.Textbox(label="Code for inference", lines=15, placeholder="output = model(...)")
                with gr.Column():
                    terminate_code_snippet_textbox = gr.Textbox(label="Code for termination", lines=15)

            login_button2 = gr.LoginButton()
            submit_button2 = gr.Button("Submit Eval")
            submission_result2 = gr.Markdown()
            event2 = submit_button2.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
            event2.then(
                add_new_eval_option2,
                [
                    benchmark_type2,
                    model_name_textbox2,
                    base_model_name_textbox2,
                    revision_name_textbox2,
                    precision2,
                    temperature_textbox2,
                    top_p_textbox2,
                    top_k_textbox2,
                    presence_penalty_textbox2,
                    frequency_penalty_textbox2,
                    repetition_penalty_textbox2,
                    model_load_code_snippet_textbox,
                    inference_code_snippet_textbox,
                    terminate_code_snippet_textbox,
                    user_state,
                    organization_state,
                ],
                submission_result2,
            )
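
            # Option 3: a third submission path, described by
            # EVALUATION_QUEUE_TEXT_OPTION3 (defined in src/about.py).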
            with gr.Row():
                gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION3, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
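
# Restart the Space every 30 minutes; the queue and results datasets are
# re-downloaded at startup, so this keeps the leaderboard data fresh.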
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()