import gradio as gr from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler from huggingface_hub import snapshot_download from src.data_utils import get_dataframe_category, get_dataframe_language import src.config as configs from utils import get_profile, get_organizations, get_profile_and_organizations, download_with_restart from src.about import ( CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, EVALUATION_QUEUE_TEXT_OPTION1, EVALUATION_QUEUE_TEXT_OPTION2, EVALUATION_QUEUE_TEXT_OPTION3, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css from src.display.utils import ( BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, fields, WeightType, Precision ) from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN from src.populate import get_evaluation_queue_df, get_leaderboard_df from src.submission.submit import add_new_eval_option1, add_new_eval_option2 from handlers import ( search_leaderboard, update_modelselector_group, update_columnselector_group, update_leaderboard, get_models_by_group, ) from ui import create_leaderboard_tab from constants import TAB_KEYS, TAB_NAMES, VLLM_VERSIONS def restart_space(): API.restart_space(repo_id=REPO_ID) ### Space initialisation download_with_restart( snapshot_download, repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN, restart_func=restart_space ) download_with_restart( snapshot_download, repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN, restart_func=restart_space ) ( finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") user_state = gr.State() organization_state = gr.State() with gr.Tabs(elem_classes="tab-buttons") as tabs: for _, key in enumerate(TAB_KEYS): if key == "Category": df = get_dataframe_category() column_selector_value = configs.ON_LOAD_COLUMNS_CATEGORY[3:] else: df = get_dataframe_language() column_selector_value = configs.ON_LOAD_COLUMNS_LANG[3:] create_leaderboard_tab( df, key, search_leaderboard, update_modelselector_group, update_leaderboard, column_selector_value ) with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2): gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3): with gr.Column(): with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text") with gr.Row(): gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is available)", elem_classes="markdown-text") with gr.Row(): with gr.Column(): benchmark_type = gr.Dropdown( choices=["TRUEBench v0.1"], label="The name of the benchmark to be evaluated", multiselect=False, value="TRUEBench v0.1", interactive=True, ) model_name_textbox = gr.Textbox(label="Model name") revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") precision = gr.Dropdown( choices=[i.value.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") vllm_version_type = gr.Dropdown( choices=VLLM_VERSIONS, label="vLLM version", multiselect=False, value="v0.9.2", interactive=True, ) with gr.Column(): temperature_textbox = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0") top_p_textbox = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0") top_k_textbox = gr.Textbox(label="Top-k (default: -1)", placeholder="-1") presence_penalty_textbox = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0") frequency_penalty_textbox = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0") repetition_penalty_textbox = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0") login_button = gr.LoginButton() submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state]) event.then( add_new_eval_option1, [ benchmark_type, model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, temperature_textbox, top_p_textbox, top_k_textbox, presence_penalty_textbox, frequency_penalty_textbox, repetition_penalty_textbox, vllm_version_type, user_state, organization_state ], submission_result, ) with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION2, elem_classes="markdown-text") with gr.Row(): gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is unavailable)", elem_classes="markdown-text") with gr.Row(): with gr.Column(): benchmark_type2 = gr.Dropdown( choices=["TRUEBench v0.1"], label="The name of the benchmark to be evaluated", multiselect=False, value="TRUEBench v0.1", interactive=True, ) model_name_textbox2 = gr.Textbox(label="Model name") revision_name_textbox2 = gr.Textbox(label="Revision commit", placeholder="main") precision2 = gr.Dropdown( choices=[i.value.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) base_model_name_textbox2 = gr.Textbox(label="Base model (for delta or adapter weights)") with gr.Column(): temperature_textbox2 = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0") top_p_textbox2 = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0") top_k_textbox2 = gr.Textbox(label="Top-k (default: -1)", placeholder="-1") presence_penalty_textbox2 = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0") frequency_penalty_textbox2 = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0") repetition_penalty_textbox2 = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0") with gr.Row(): with gr.Column(): model_load_code_snippet_textbox = gr.Textbox(label="Code for model loading", lines=15, placeholder="model = AutoModel.from_pretrained('your model name', revision=revision)") with gr.Column(): inference_code_snippet_textbox = gr.Textbox(label="Code for inference", lines=15, placeholder="output = model(...)") with gr.Column(): terminate_code_snippet_textbox = gr.Textbox(label="Code for termination", lines=15) login_button2 = gr.LoginButton() submit_button2 = gr.Button("Submit Eval") submission_result2 = gr.Markdown() event2 = submit_button2.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state]) event2.then( add_new_eval_option2, [ benchmark_type2, model_name_textbox2, base_model_name_textbox2, revision_name_textbox2, precision2, temperature_textbox2, top_p_textbox2, top_k_textbox2, presence_penalty_textbox2, frequency_penalty_textbox2, repetition_penalty_textbox2, model_load_code_snippet_textbox, inference_code_snippet_textbox, terminate_code_snippet_textbox, user_state, organization_state ], submission_result2, ) with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION3, elem_classes="markdown-text") with gr.Row(): with gr.Accordion("📙 Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.start() demo.queue(default_concurrency_limit=40).launch()