test_space

Runtime error

App Files Files Community

[ADD_MODEL]

by Jongyoon-Song - opened Jul 8, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+314

-1269

Files changed (20) hide show

.gitattributes +1 -1
README.md +39 -7
app.py +194 -160
constants.py +0 -36
handlers.py +0 -86
src/about.py +10 -58
src/config.py +2 -1
src/data/export_category_250618.csv +11 -0
src/data/export_category_250709.csv +0 -0
src/data/export_lang_250618.csv +11 -0
src/data/export_lang_250709.csv +0 -0
src/data_utils.py +2 -2
src/display/css_html_js.py +1 -281
src/display/formatting.py +0 -125
src/display/utils.py +15 -22
src/envs.py +2 -2
src/submission/check_validity.py +2 -3
src/submission/submit.py +24 -215
ui.py +0 -228
utils.py +0 -42

.gitattributes CHANGED Viewed

@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,14 +1,46 @@
 ---
-title: test_space
-emoji: 👀
 colorFrom: green
-colorTo: red
 sdk: gradio
-sdk_version: 5.38.0
 app_file: app.py
-pinned: false
 license: apache-2.0
-hf_oauth: true
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Test Space
+emoji: 🥇
 colorFrom: green
+colorTo: indigo
 sdk: gradio
 app_file: app.py
+pinned: true
 license: apache-2.0
+short_description: Duplicate this leaderboard to initialize your own!
+sdk_version: 5.19.0
 ---
+# Start the configuration
+Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+Results files should have the following format and be stored as json files:
+```json
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+Request files are created automatically by this tool.
+If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+# Code logic for more complex edits
+You'll find
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`

app.py CHANGED Viewed

@@ -1,21 +1,15 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.data_utils import get_dataframe_category, get_dataframe_language
 import src.config as configs
-from utils import get_profile, get_organizations, get_profile_and_organizations, download_with_restart
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
-    EVALUATION_QUEUE_TEXT_OPTION1,
-    EVALUATION_QUEUE_TEXT_OPTION2,
-    EVALUATION_QUEUE_TEXT_OPTION3,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -27,45 +21,37 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
     fields,
     WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval_option1, add_new_eval_option2
-from handlers import (
-    search_leaderboard,
-    update_modelselector_group,
-    update_columnselector_group,
-    update_leaderboard,
-    get_models_by_group,
-)
-from ui import create_leaderboard_tab
-from constants import TAB_KEYS, TAB_NAMES, VLLM_VERSIONS
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 ### Space initialisation
-download_with_restart(
-    snapshot_download,
-    repo_id=QUEUE_REPO,
-    local_dir=EVAL_REQUESTS_PATH,
-    repo_type="dataset",
-    token=TOKEN,
-    restart_func=restart_space
-)
-download_with_restart(
-    snapshot_download,
-    repo_id=RESULTS_REPO,
-    local_dir=EVAL_RESULTS_PATH,
-    repo_type="dataset",
-    token=TOKEN,
-    restart_func=restart_space
-)
 (
     finished_eval_queue_df,
@@ -73,28 +59,132 @@ download_with_restart(
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    user_state = gr.State()
-    organization_state = gr.State()
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        for _, key in enumerate(TAB_KEYS):
-            if key == "Category":
-                df = get_dataframe_category()
-                column_selector_value = configs.ON_LOAD_COLUMNS_CATEGORY[3:]
-            else:
-                df = get_dataframe_language()
-                column_selector_value = configs.ON_LOAD_COLUMNS_LANG[3:]
-            create_leaderboard_tab(
-                df,
-                key,
-                search_leaderboard,
-                update_modelselector_group,
-                update_leaderboard,
-                column_selector_value
-            )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -103,23 +193,57 @@ with demo:
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
             with gr.Row():
-                gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is available)", elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
-                    benchmark_type = gr.Dropdown(
-                        choices=["TRUEBench v0.1"],
-                        label="The name of the benchmark to be evaluated",
                         multiselect=False,
-                        value="TRUEBench v0.1",
                         interactive=True,
                     )
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
@@ -127,118 +251,29 @@ with demo:
                         value="float16",
                         interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-                    vllm_version_type = gr.Dropdown(
-                        choices=VLLM_VERSIONS,
-                        label="vLLM version",
                         multiselect=False,
-                        value="v0.9.2",
                         interactive=True,
                     )
-                with gr.Column():
-                    temperature_textbox = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0")
-                    top_p_textbox = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
-                    top_k_textbox = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
-                    presence_penalty_textbox = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
-                    frequency_penalty_textbox = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
-                    repetition_penalty_textbox = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
-            login_button = gr.LoginButton()
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
-            event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
-            event.then(
-                add_new_eval_option1,
                 [
-                    benchmark_type,
                     model_name_textbox,
                     base_model_name_textbox,
                     revision_name_textbox,
                     precision,
-                    temperature_textbox,
-                    top_p_textbox,
-                    top_k_textbox,
-                    presence_penalty_textbox,
-                    frequency_penalty_textbox,
-                    repetition_penalty_textbox,
-                    vllm_version_type,
-                    user_state,
-                    organization_state
                 ],
                 submission_result,
             )
-            with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION2, elem_classes="markdown-text")
-            with gr.Row():
-                gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is unavailable)", elem_classes="markdown-text")
-            with gr.Row():
-                with gr.Column():
-                    benchmark_type2 = gr.Dropdown(
-                        choices=["TRUEBench v0.1"],
-                        label="The name of the benchmark to be evaluated",
-                        multiselect=False,
-                        value="TRUEBench v0.1",
-                        interactive=True,
-                    )
-                    model_name_textbox2 = gr.Textbox(label="Model name")
-                    revision_name_textbox2 = gr.Textbox(label="Revision commit", placeholder="main")
-                    precision2 = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    base_model_name_textbox2 = gr.Textbox(label="Base model (for delta or adapter weights)")
-                with gr.Column():
-                    temperature_textbox2 = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0")
-                    top_p_textbox2 = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
-                    top_k_textbox2 = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
-                    presence_penalty_textbox2 = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
-                    frequency_penalty_textbox2 = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
-                    repetition_penalty_textbox2 = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
-            with gr.Row():
-                with gr.Column():
-                    model_load_code_snippet_textbox = gr.Textbox(label="Code for model loading", lines=15, placeholder="model = AutoModel.from_pretrained('your model name', revision=revision)")
-                with gr.Column():
-                    inference_code_snippet_textbox = gr.Textbox(label="Code for inference", lines=15, placeholder="output = model(...)")
-                with gr.Column():
-                    terminate_code_snippet_textbox = gr.Textbox(label="Code for termination", lines=15)
-            login_button2 = gr.LoginButton()
-            submit_button2 = gr.Button("Submit Eval")
-            submission_result2 = gr.Markdown()
-            event2 = submit_button2.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
-            event2.then(
-                add_new_eval_option2,
-                [
-                    benchmark_type2,
-                    model_name_textbox2,
-                    base_model_name_textbox2,
-                    revision_name_textbox2,
-                    precision2,
-                    temperature_textbox2,
-                    top_p_textbox2,
-                    top_k_textbox2,
-                    presence_penalty_textbox2,
-                    frequency_penalty_textbox2,
-                    repetition_penalty_textbox2,
-                    model_load_code_snippet_textbox,
-                    inference_code_snippet_textbox,
-                    terminate_code_snippet_textbox,
-                    user_state,
-                    organization_state
-                ],
-                submission_result2,
-            )
-            with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION3, elem_classes="markdown-text")
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -250,8 +285,7 @@ with demo:
                 show_copy_button=True,
             )
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.data_utils import get_dataframe_category, get_dataframe_language
 import src.config as configs
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
+    ModelType,
     fields,
     WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 ### Space initialisation
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 (
     finished_eval_queue_df,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# def init_leaderboard(dataframe):
+#     if dataframe is None or dataframe.empty:
+#         raise ValueError("Leaderboard DataFrame is empty or None.")
+#     return Leaderboard(
+#         value=dataframe,
+#         datatype=[c.type for c in fields(AutoEvalColumn)],
+#         select_columns=SelectColumns(
+#             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+#             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+#             label="Select Columns to Display:",
+#         ),
+#         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+#         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+#         filter_columns=[
+#             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+#             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+#             ColumnFilter(
+#                 AutoEvalColumn.params.name,
+#                 type="slider",
+#                 min=0.01,
+#                 max=150,
+#                 label="Select the number of parameters (B)",
+#             ),
+#             ColumnFilter(
+#                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+#             ),
+#         ],
+#         bool_checkboxgroup_label="Hide models",
+#         interactive=False,
+#     )
+tab_keys = ["Category", "Language"]
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        def search_leaderboard(query, df):
+            if not query.strip():
+                return df
+            filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
+            return filtered
+        def update_modelselector_group(groups, df):
+            """
+            groups (gr.CheckboxGroup): List of currently selected models
+            df (DataFrame or gr.State): Current dataframe
+            """
+            print("groups:", groups)
+            if not groups:
+                return None
+            filtered_df = df[df["Group"].isin(groups)]
+            models = filtered_df["Model Name"].unique().tolist()
+            return models
+        def update_columnselector_group(columns, groups, df):
+            print("column groups:", groups)
+            columns = [c for c in columns if c in df.columns[:3]]
+            columns.extend(df.columns[3:])
+            print(columns)
+            return columns
+        def update_leaderboard(models, columns, df):
+            print("models:", models)
+            print("columns:", columns)
+            filtered_df = df[df["Model Name"].isin(models)]
+            filtered_columns = [c for c in df.columns if c in columns or c in ["Model Name"]]
+            filtered_df = filtered_df[filtered_columns]
+            for col in filtered_df.select_dtypes(include="number").columns:
+                filtered_df[col] = filtered_df[col].round(3)
+            return filtered_df
+        def get_models_by_group(df, groups):
+            return df[df["Group"].isin(groups)]["Model Name"].tolist()
+        for _, key in enumerate(tab_keys):
+            with gr.TabItem(key, visible=True):
+                if key == "Category":
+                    df = get_dataframe_category()
+                else:
+                    df = get_dataframe_language()
+                df_state = gr.State(df)
+                with gr.Row():
+                    with gr.Column():
+                        search_box = gr.Textbox(label="Search Model by Name")
+                        group_list = df["Group"].unique().tolist()
+                        group_selector = gr.CheckboxGroup(choices=df["Group"].unique().tolist(), value=group_list, label="Select Model Group")
+                        if key == "Category":
+                            column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_CATEGORY[3:], label="Select Columns")
+                        else:
+                            column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_LANG[3:], label="Select Columns")
+                    with gr.Column():
+                        with gr.Accordion("세부 사항", open=False):
+                            model_group = df["Model Name"].tolist()
+                            model_selector = gr.CheckboxGroup(choices=df["Model Name"].tolist(), value=model_group, label="Select Models")
+                ld = gr.DataFrame(
+                    value=df.round(3)
+                )
+                # Define change functions for user interaction
+                search_box.change(fn=search_leaderboard, inputs=[search_box, df_state], outputs=ld)
+                group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
+                model_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
+                column_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
+        # with gr.TabItem("Docs"):
+        #     gr.Markdown((Path(__file__).parent / "docs.md").read_text())
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
             with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
             with gr.Row():
                 with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
                         multiselect=False,
+                        value=None,
                         interactive=True,
                     )
+                with gr.Column():
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         value="float16",
                         interactive=True,
                     )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
                         multiselect=False,
+                        value="Original",
                         interactive=True,
                     )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
                 [
                     model_name_textbox,
                     base_model_name_textbox,
                     revision_name_textbox,
                     precision,
+                    weight_type,
+                    model_type,
                 ],
                 submission_result,
             )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
                 show_copy_button=True,
             )
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()

constants.py DELETED Viewed

@@ -1,36 +0,0 @@
-# constants.py
-TAB_KEYS = ["Category", "Language"]
-TAB_NAMES = {
-    "Category": "TRUEBench v0.1 (Category 🔧)",
-    "Language": "TRUEBench v0.1 (Language 🌎)"
-}
-VLLM_VERSIONS = [
-    "v0.9.2", "v0.9.2rc2", "v0.9.2rc1", "v0.9.1", "v0.9.1rc2", "v0.9.1rc1",
-    "v0.9.0.1", "v0.9.0", "v0.8.5", "v0.8.5.post1", "v0.8.4", "v0.8.3",
-    "v0.8.3rc1", "v0.8.2", "v0.8.1", "v0.8.0", "v0.8.0rc2", "v0.8.0rc1",
-    "v0.7.3", "v0.7.2", "v0.7.1", "v0.6.6", "v0.6.6.post1", "v0.6.5",
-    "v0.6.4.post1", "v0.6.4", "v0.6.3.post1", "v0.6.2", "v0.6.1",
-    "v0.6.1.post2", "v0.6.1.post1", "v0.6.0"
-]
-# 리더보드 필수 컬럼(항상 포함되어야 함)
-LEADERBOARD_REQUIRED_COLUMNS = [
-    "Model Name", "Group", "Overall", "Model Type", "Output Form", "Rank"
-]
-# Model badge mappings (centralized for both UI and backend)
-MODEL_TYPE_MAP = {
-    "deepseek_r1": "open",
-    "deepseek_r1_0528": "open",
-    "Qwen3-32B": "open",
-    "Gauss2.3-Think-250708": "closed"
-}
-OUTPUT_FORM_MAP = {
-    "deepseek_r1": "reasoning",
-    "deepseek_r1_0528": "normal",
-    "Qwen3-32B": "reasoning",
-    "Gauss2.3-Think-250708": "reasoning"
-}

handlers.py DELETED Viewed

@@ -1,86 +0,0 @@
-import pandas as pd
-def search_leaderboard(query, df, sort_col=None, sort_asc=True):
-    if not query.strip():
-        filtered = df
-    else:
-        filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
-    if sort_col and sort_col in filtered.columns:
-        filtered = filtered.sort_values(sort_col, ascending=sort_asc).reset_index(drop=True)
-    return filtered
-def update_modelselector_group(groups, df):
-    """
-    groups (gr.CheckboxGroup): List of currently selected models
-    df (DataFrame or gr.State): Current dataframe
-    """
-    print("groups:", groups)
-    if not groups:
-        return None
-    filtered_df = df[df["Group"].isin(groups)]
-    models = filtered_df["Model Name"].unique().tolist()
-    return models
-def update_columnselector_group(columns, groups, df):
-    print("column groups:", groups)
-    columns = [c for c in columns if c in df.columns[:3]]
-    columns.extend(df.columns[3:])
-    print(columns)
-    return columns
-from constants import LEADERBOARD_REQUIRED_COLUMNS, MODEL_TYPE_MAP, OUTPUT_FORM_MAP
-def update_leaderboard(models, columns, df, sort_col=None, sort_asc=True):
-    print("models:", models)
-    print("columns:", columns)
-    print("sort_col:", sort_col, "sort_asc:", sort_asc)
-    # 필수 컬럼 항상 포함
-    columns = list(dict.fromkeys(LEADERBOARD_REQUIRED_COLUMNS + list(columns)))
-    # 뱃지/랭크 렌더링에 필요한 컬럼 항상 포함
-    always_include = ["Model Name", "Model Type", "Output Form", "Rank"]
-    filtered_df = df[df["Model Name"].isin(models)].copy()
-    # Model Type, Output Form, Rank 컬럼이 없으면 생성
-    if "Model Type" not in filtered_df.columns:
-        filtered_df["Model Type"] = filtered_df["Model Name"].map(MODEL_TYPE_MAP).fillna("open")
-    if "Output Form" not in filtered_df.columns:
-        filtered_df["Output Form"] = filtered_df["Model Name"].map(OUTPUT_FORM_MAP).fillna("normal")
-    if "Rank" not in filtered_df.columns:
-        # 정렬 기준: sort_col이 있으면 해당 컬럼, 없으면 Overall
-        rank_col = sort_col if sort_col and sort_col in filtered_df.columns else ("Overall" if "Overall" in filtered_df.columns else None)
-        if rank_col:
-            filtered_df = filtered_df.sort_values(rank_col, ascending=not sort_asc).reset_index(drop=True)
-            filtered_df["Rank"] = filtered_df.index + 1
-        else:
-            filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-    # always_include 컬럼은 무조건 포함
-    filtered_columns = [c for c in df.columns if c in columns or c in always_include]
-    for col in always_include:
-        if col not in filtered_columns:
-            filtered_columns.append(col)
-    # 중복 제거 및 순서 보장
-    filtered_columns = list(dict.fromkeys(filtered_columns))
-    filtered_df = filtered_df[filtered_columns]
-    for col in filtered_df.select_dtypes(include="number").columns:
-        filtered_df[col] = filtered_df[col].round(3)
-    if sort_col and sort_col in filtered_df.columns:
-        filtered_df = filtered_df.sort_values(sort_col, ascending=sort_asc).reset_index(drop=True)
-        # Rank 재계산
-        filtered_df["Rank"] = filtered_df.index + 1
-    return filtered_df
-def get_models_by_group(df, groups):
-    return df[df["Group"].isin(groups)]["Model Name"].tolist()

src/about.py CHANGED Viewed

@@ -21,32 +21,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">🥇 Test Space</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Leaderboards for LLM evaluation.
-*TRUE(Trustworthy Real-world Usage Evaluation)Bench* is designed to evaluate LLMs for Productivity Assistants which stand for human's job productivity.
 """
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-We utilize LLM Judge with human-crafted criteria to assess AI response.
 """
 EVALUATION_QUEUE_TEXT = """
-## Submission Policy
-For each benchmark:
-1. Each model affiliation (individual or organization) can submit up to 3 times within 24 hours.
-2. The same model can only be submitted once within 24 hours.
-3. Criteria for determining duplicate submissions:
-    - Benchmark name
-    - Model full name
-    - Sampling parameters, dtype, vLLM version, etc. are not subject to duplicate checking.
-4. Submissions are only allowed if the model's organization or username matches that of the submitter.
 ## Some good practices before submitting a model
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
@@ -69,50 +60,11 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 ### 4) Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-"""
-EVALUATION_QUEUE_TEXT_OPTION1 = """
-# (Option 1) Submit HF model where vLLM inference is available
-1. Fill the information including model name, vLLM version, sampling hyperparameters.
-2. Sign in using the log-in button below.
-3. Press "Submit Eval" button to submit.
-"""
-EVALUATION_QUEUE_TEXT_OPTION2 = """
-# (Option 2) Submit HF model where vLLM inference is unavailable
-1. Fill the information same with Option 1 and code snippets of model loading, inference, and termination.
-2. Sign in using the log-in button below.
-3. Press "Submit Eval" button to submit.
-"""
-EVALUATION_QUEUE_TEXT_OPTION3 = """
-# (Option 3) Pull Request
-If Option 1 & 2 is unavailable, make [PR](https://huggingface.co/spaces/coms1580/test_space/discussions?new_pr=true) with [ADD_MODEL] prefix with contents as follows:
-```
-### Open-weight models:
-- Benchmark Name: [The name of benchmark to be evaluated]
-- HugingFace Model ID: [HF_MODEL_ID]
-- Pretty Name: [PRETTY_NAME]
-- Sampling parameters:
-    - Temperature
-    - Top-p
-    - Top-k
-    - Presence penalty
-    - Frequency penalty
-    - Repetition penalty
-- Supported by vLLM: [yes/no]
-- (If yes) Version of vLLM
-- (If no) Code snippets:
-    - Model loading
-    - Inference
-    - Termination
-### Misc.
-- Contact: [your email]
-- Description: [e.g.,  paper link, blog post, etc.]
-- Notes: [optional]
-```
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

 # Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">🥇 ProductivityBench (v1)</h1>"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
+ProductivityBench is designed to evaluate LLMs as Productivity Assistants, which are meant to support human job productivity.
 """
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
+## Reproducibility
+To reproduce our results, here are the commands you can run:
 """
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ### 4) Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

src/config.py CHANGED Viewed

@@ -24,9 +24,10 @@ ON_LOAD_COLUMNS_CATEGORY = [
     "Editing",
     "Data Analysis",
     "Reasoning",
     "Hallucination",
     "Safety",
-    "Repetition",
     "Summarization",
     "Translation",
     "Multi-Turn"

     "Editing",
     "Data Analysis",
     "Reasoning",
+    "Samsung Knowledge",
     "Hallucination",
     "Safety",
+    "Repeatition",
     "Summarization",
     "Translation",
     "Multi-Turn"

src/data/export_category_250618.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+"Model Name"	"Group"	"Overall"	"Content Generation"	"Editing"	"Data Analysis"	"Reasoning"	"Samsung Knowledge"	"Hallucination"	"Safety"	"Repeatition"	"Summarization"	"Translation"	"Multi-Turn"
+"claude-3-haiku-20240307"	"Claude"	"40.60"	"44.16"	"36.90"	"39.33"	"21.00"	"23.33"	"43.33"	"50.00"	"30.00"	"60.96"	"40.00"	"23.33"
+"claude-3-sonnet-20240229"	"Claude"	"44.47"	"48.05"	"42.26"	"45.33"	"32.00"	"23.33"	"45.00"	"56.25"	"36.67"	"60.96"	"46.33"	"22.78"
+"claude-3-5-sonnet-20240620"	"Claude"	"56.35"	"53.25"	"54.17"	"64.00"	"49.00"	"55.00"	"60.00"	"52.50"	"40.00"	"69.86"	"58.67"	"36.67"
+"claude-3-5-sonnet-20241022"	"Claude"	"58.45"	"61.04"	"55.36"	"66.00"	"54.00"	"40.00"	"63.33"	"42.50"	"40.00"	"73.97"	"62.33"	"38.33"
+"claude-3-7-sonnet-20250219"	"Claude"	"56.99"	"59.09"	"59.52"	"64.00"	"54.00"	"50.00"	"65.00"	"37.50"	"50.00"	"71.58"	"55.33"	"37.22"
+"claude-3-7-sonnet-20250219-thinking"	"Claude"	"58.70"	"63.64"	"58.33"	"71.52"	"68.00"	"55.00"	"62.71"	"37.50"	"50.00"	"72.60"	"55.00"	"33.33"
+"deepseek_r1"	"DeepSeek"	"55.27"	"61.69"	"54.76"	"68.67"	"68.00"	"46.67"	"51.67"	"20.00"	"46.67"	"67.81"	"49.00"	"43.33"
+"deepseek_r1_0528"	"DeepSeek"	"52.60"	"59.09"	"51.19"	"65.33"	"65.00"	"38.33"	"43.33"	"27.50"	"53.33"	"69.18"	"41.33"	"41.67"
+"deepseek_v3"	"DeepSeek"	"56.99"	"62.99"	"58.93"	"58.00"	"59.00"	"36.67"	"41.67"	"25.00"	"40.00"	"72.60"	"60.00"	"46.67"
+"deepseek_v3_0324"	"DeepSeek"	"54.51"	"55.84"	"48.21"	"63.33"	"70.00"	"43.33"	"50.00"	"20.00"	"46.67"	"72.95"	"49.67"	"43.33"

src/data/export_category_250709.csv DELETED Viewed

Binary file (1.26 kB)

src/data/export_lang_250618.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+"Model Name"	"Group"	"Overall"	"KO"	"EN"	"JA"	"ZH"	"PL"	"DE"	"PT"	"ES"	"FR"	"IT"	"RU"	"VI"
+"claude-3-haiku-20240307"	"Claude"	"40.60"	"31.87"	"30.99"	"41.54"	"36.92"	"52.24"	"55.22"	"56.72"	"55.22"	"68.66"	"74.63"	"50.75"	"38.46"
+"claude-3-sonnet-20240229"	"Claude"	"44.47"	"41.32"	"33.19"	"50.77"	"38.46"	"55.22"	"52.24"	"58.21"	"61.19"	"65.67"	"67.16"	"49.25"	"44.62"
+"claude-3-5-sonnet-20240620"	"Claude"	"56.35"	"55.60"	"43.30"	"73.85"	"47.69"	"64.18"	"65.67"	"70.15"	"67.16"	"76.12"	"71.64"	"65.67"	"55.38"
+"claude-3-5-sonnet-20241022"	"Claude"	"58.45"	"57.14"	"47.91"	"69.23"	"49.23"	"61.19"	"62.69"	"70.15"	"71.64"	"80.60"	"73.13"	"67.16"	"60.00"
+"claude-3-7-sonnet-20250219"	"Claude"	"56.99"	"55.82"	"46.59"	"63.08"	"56.92"	"68.66"	"59.70"	"64.18"	"64.18"	"74.63"	"67.16"	"64.18"	"66.15"
+"claude-3-7-sonnet-20250219-thinking"	"Claude"	"58.70"	"60.44"	"50.11"	"64.62"	"44.62"	"65.67"	"67.16"	"65.67"	"50.75"	"74.63"	"70.15"	"67.16"	"63.08"
+"deepseek_r1"	"DeepSeek"	"55.27"	"53.19"	"50.99"	"64.62"	"44.62"	"59.70"	"64.18"	"55.22"	"58.21"	"70.15"	"67.16"	"58.21"	"53.85"
+"deepseek_r1_0528"	"DeepSeek"	"52.60"	"48.79"	"47.25"	"58.46"	"43.08"	"52.24"	"61.19"	"68.66"	"58.21"	"62.69"	"65.67"	"61.19"	"56.92"
+"deepseek_v3"	"DeepSeek"	"56.99"	"53.41"	"49.01"	"66.15"	"43.08"	"59.70"	"70.15"	"67.16"	"65.67"	"79.10"	"74.63"	"58.21"	"64.62"
+"deepseek_v3_0324"	"DeepSeek"	"54.51"	"50.99"	"49.67"	"56.92"	"43.08"	"64.18"	"68.66"	"61.19"	"56.72"	"71.64"	"62.69"	"64.18"	"52.31"

src/data/export_lang_250709.csv DELETED Viewed

Binary file (958 Bytes)

src/data_utils.py CHANGED Viewed

@@ -3,12 +3,12 @@ from pathlib import Path
 def get_dataframe_category():
     abs_path = Path(__file__).parent
-    df = pd.read_csv(str(abs_path / "data/export_category_250709.csv"), encoding='utf-16', delimiter="	")
     df = df.sort_values("Overall", ascending=False)
     return df
 def get_dataframe_language():
     abs_path = Path(__file__).parent
-    df = pd.read_csv(str(abs_path / "data/export_lang_250709.csv"), encoding='utf-16', delimiter="	")
     df = df.sort_values("Overall", ascending=False)
     return df

 def get_dataframe_category():
     abs_path = Path(__file__).parent
+    df = pd.read_csv(str(abs_path / "data/export_category_250618.csv"), encoding='utf-8', delimiter="	")
     df = df.sort_values("Overall", ascending=False)
     return df
 def get_dataframe_language():
     abs_path = Path(__file__).parent
+    df = pd.read_csv(str(abs_path / "data/export_lang_250618.csv"), encoding='utf-8', delimiter="	")
     df = df.sort_values("Overall", ascending=False)
     return df

src/display/css_html_js.py CHANGED Viewed

@@ -1,128 +1,5 @@
 custom_css = """
-/* Sort arrow/button styles */
-.sort-arrow, .sort-btn {
-    display: inline-flex;
-    align-items: center;
-    justify-content: center;
-    background: #23244a;
-    color: #ffd700 !important; /* 항상 노란색 */
-    border: 1.5px solid #ffd700; /* 금색 테두리 */
-    border-radius: 6px;
-    font-size: 15px;
-    font-weight: 700;
-    margin-left: 6px;
-    margin-right: 2px;
-    padding: 2px 8px 2px 6px;
-    cursor: pointer;
-    transition: background 0.2s, color 0.2s, border 0.2s;
-    min-width: 28px;
-    min-height: 28px;
-    outline: none;
-}
-.sort-arrow.active, .sort-btn.active {
-    color: #ffd700 !important; /* 금색 */
-    border-color: #ffd700;
-    background: #1a237e;
-}
-.sort-arrow:hover, .sort-btn:hover {
-    background: #ffd700;
-    color: #23244a !important;
-    border-color: #ffd700;
-}
-.sort-arrow svg, .sort-btn svg {
-    margin-left: 2px;
-    margin-right: 0;
-    width: 1em;
-    height: 1em;
-    vertical-align: middle;
-}
-/* Enhanced leaderboard table styles */
-.pretty-leaderboard-table {
-    width: 100%;
-    border-collapse: separate;
-    border-spacing: 0;
-    background: rgba(30, 34, 54, 0.98);
-    border-radius: 16px;
-    box-shadow: 0 4px 24px 0 rgba(16, 152, 247, 0.10), 0 1.5px 6px 0 rgba(227, 84, 84, 0.08);
-    overflow: hidden;
-    margin-bottom: 24px;
-}
-.pretty-leaderboard-table th, .pretty-leaderboard-table td {
-    padding: 12px 16px;
-    text-align: left;
-    border-bottom: 1px solid #23244a;
-    font-size: 15px;
-}
-.pretty-leaderboard-table th {
-    background: linear-gradient(90deg, #23244a 0%, #1a237e 100%);
-    color: #F5F6F7;
-    font-weight: 700;
-    letter-spacing: 0.5px;
-    border-bottom: 2px solid #1098F7;
-}
-.pretty-leaderboard-table tr:nth-child(even) {
-    background: rgba(245, 246, 247, 0.03);
-}
-.pretty-leaderboard-table tr:hover {
-    background: rgba(16, 152, 247, 0.08);
-    transition: background 0.2s;
-}
-.pretty-leaderboard-table td {
-    color: #F5F6F7;
-    vertical-align: middle;
-}
-.pretty-leaderboard-table tr:last-child td {
-    border-bottom: none;
-}
-.pretty-leaderboard-table th:first-child, .pretty-leaderboard-table td:first-child {
-    border-top-left-radius: 16px;
-}
-.pretty-leaderboard-table th:last-child, .pretty-leaderboard-table td:last-child {
-    border-top-right-radius: 16px;
-}
-/* Enhanced score bar styles */
-.score-bar {
-    display: flex;
-    align-items: center;
-    gap: 12px;
-    width: 100%;
-}
-.score-bar-track {
-    flex-grow: 1;
-    height: 10px;
-    background: rgba(245, 246, 247, 0.12);
-    border-radius: 5px;
-    overflow: hidden;
-    max-width: 220px;
-    box-shadow: 0 1px 4px 0 rgba(16, 152, 247, 0.10);
-}
-.score-bar-fill {
-    height: 100%;
-    background: linear-gradient(90deg, #E35454, #1098F7);
-    border-radius: 5px;
-    transition: width 0.3s cubic-bezier(0.4,0,0.2,1);
-}
-.score-bar-value {
-    font-family: 'SF Mono', monospace;
-    font-weight: 600;
-    color: #F5F6F7;
-    min-width: 60px;
-    font-size: 14px;
-}
-body {
-    min-height: 100vh;
-    background: linear-gradient(135deg, #1a237e 0%, #311b92 100%);
-    background-image:
-      radial-gradient(rgba(255,255,255,0.12) 1.2px, transparent 1.2px),
-      radial-gradient(rgba(255,255,255,0.08) 1px, transparent 1px);
-    background-size: 40px 40px, 80px 80px;
-    background-position: 0 0, 20px 20px;
-}
 .markdown-text {
     font-size: 16px !important;
 }
@@ -145,15 +22,7 @@ body {
 }
 #leaderboard-table {
-    margin-top: 15px;
-    /* Space-themed background */
-    background: linear-gradient(135deg, #1a237e 0%, #311b92 100%);
-    position: relative;
-    background-image:
-      radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
-      radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
-    background-size: 40px 40px, 80px 80px;
-    background-position: 0 0, 20px 20px;
 }
 #leaderboard-table-lite {
@@ -225,53 +94,6 @@ body {
 #box-filter > .form{
     border: 0
 }
-/* Model type and output form badge styles */
-.badge {
-    display: inline-block;
-    border-radius: 12px;
-    padding: 2px 10px;
-    font-size: 0.85em;
-    font-weight: 700;
-    margin-left: 6px;
-    box-shadow: 0 1px 4px rgba(0,0,0,0.10);
-    vertical-align: middle;
-}
-.badge-open {
-    background: linear-gradient(90deg, #2196f3, #21cbf3);
-    color: #fff;
-}
-.badge-closed {
-    background: linear-gradient(90deg, #757575, #bdbdbd);
-    color: #fff;
-}
-.badge-normal {
-    background: linear-gradient(90deg, #43a047, #66bb6a);
-    color: #fff;
-}
-.badge-reasoning {
-    background: linear-gradient(90deg, #8e24aa, #d500f9);
-    color: #fff;
-}
-/* Sort button styles */
-.sort-btn {
-    background: #23244a;
-    color: #F5F6F7;
-    border: 1px solid #1098F7;
-    border-radius: 6px;
-    font-size: 13px;
-    font-weight: 700;
-    margin-left: 4px;
-    margin-right: 2px;
-    padding: 2px 7px;
-    cursor: pointer;
-    transition: background 0.2s, color 0.2s;
-}
-.sort-btn:hover {
-    background: #1098F7;
-    color: #fff;
-}
 """
 get_window_url_params = """
@@ -281,105 +103,3 @@ get_window_url_params = """
         return url_params;
     }
     """
-def get_rank_badge(rank: int) -> str:
-    """
-    Returns HTML for a rank badge (1st, 2nd, 3rd) with appropriate styling.
-    """
-    badge_styles = {
-        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
-        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
-        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
-    }
-    if rank in badge_styles:
-        label, gradient, text_color = badge_styles[rank]
-        return f'''
-            <div style="
-                display: inline-flex;
-                align-items: center;
-                justify-content: center;
-                min-width: 48px;
-                padding: 4px 12px;
-                background: {gradient};
-                color: {text_color};
-                border-radius: 6px;
-                font-weight: 700;
-                font-size: 1em;
-                box-shadow: 0 2px 4px rgba(0,0,0,0.18);
-                border: 1.5px solid #fff2;
-            ">
-                {label}
-            </div>
-        '''
-    return f'''
-        <div style="
-            display: inline-flex;
-            align-items: center;
-            justify-content: center;
-            min-width: 28px;
-            color: #a1a1aa;
-            font-weight: 500;
-        ">
-            {rank}
-        </div>
-    '''
-def get_score_gauge(score: float, max_score: float = 1.0) -> str:
-    """
-    Returns HTML for an overall score gauge (progress bar style).
-    """
-    percent = min(max(score / max_score, 0), 1) * 100
-    return f'''
-        <div class="score-bar" style="margin: 0.5em 0;">
-            <div class="score-bar-track">
-                <div class="score-bar-fill" style="width: {percent}%;"></div>
-            </div>
-            <span class="score-bar-value">{score:.3f}</span>
-        </div>
-    '''
-def get_leaderboard_table_html(df) -> str:
-    """
-    Returns HTML for a pretty leaderboard table using badge and gauge.
-    Expects df to have columns: 'Model', 'Score', 'Model Type', 'Output Form'.
-    """
-    def get_type_badge(model_type):
-        if model_type == "open":
-            return '<span class="badge badge-open">open</span>'
-        else:
-            return '<span class="badge badge-closed">closed</span>'
-    def get_output_badge(output_form):
-        if output_form == "reasoning":
-            return '<span class="badge badge-reasoning">reasoning</span>'
-        else:
-            return '<span class="badge badge-normal">normal</span>'
-    html = ['<table class="pretty-leaderboard-table">']
-    # Header
-    html.append(
-        "<thead><tr>"
-        "<th>Rank</th>"
-        "<th>Model</th>"
-        "<th>Overall Score</th>"
-        "</tr></thead>"
-    )
-    html.append("<tbody>")
-    for idx, row in enumerate(df.itertuples(index=False), 1):
-        model = getattr(row, "Model", "")
-        score = getattr(row, "Score", 0.0)
-        model_type = getattr(row, "Model_Type", getattr(row, "Model Type", "open"))
-        output_form = getattr(row, "Output_Form", getattr(row, "Output Form", "normal"))
-        badge = get_rank_badge(idx)
-        gauge = get_score_gauge(score)
-        type_badge = get_type_badge(model_type)
-        output_badge = get_output_badge(output_form)
-        html.append(
-            f"<tr>"
-            f"<td>{badge}</td>"
-            f"<td>{model} {type_badge} {output_badge}</td>"
-            f"<td>{gauge}</td>"
-            f"</tr>"
-        )
-    html.append("</tbody></table>")
-    return "\n".join(html)

 custom_css = """
 .markdown-text {
     font-size: 16px !important;
 }
 }
 #leaderboard-table {
+    margin-top: 15px
 }
 #leaderboard-table-lite {
 #box-filter > .form{
     border: 0
 }
 """
 get_window_url_params = """
         return url_params;
     }
     """

src/display/formatting.py CHANGED Viewed

@@ -25,128 +25,3 @@ def has_no_nan_values(df, columns):
 def has_nan_values(df, columns):
     return df[columns].isna().any(axis=1)
-def get_score_bar(score):
-    """
-    Generate HTML for a score bar with gradient styling.
-    Expects score in the range 0-100.
-    """
-    width = max(0, min(score, 100))  # Clamp to [0, 100]
-    return f"""
-        <div class="score-bar">
-            <div class="score-bar-track">
-                <div class="score-bar-fill" style="width: {width}%;"></div>
-            </div>
-            <span class="score-bar-value">{score:.3f}</span>
-        </div>
-    """
-def render_leaderboard_html(df, overall_col="average"):
-    """
-    Render a DataFrame as an HTML table, replacing the overall_col with a gauge bar.
-    """
-    from .formatting import get_score_bar
-    from src.display.css_html_js import get_rank_badge
-    def get_type_badge(model_type):
-        if model_type == "open":
-            return '<span class="badge badge-open">open</span>'
-        else:
-            return '<span class="badge badge-closed">closed</span>'
-    def get_output_badge(output_form):
-        if output_form == "reasoning":
-            return '<span class="badge badge-reasoning">reasoning</span>'
-        else:
-            return '<span class="badge badge-normal">normal</span>'
-    # 숨길 컬럼
-    hidden_cols = ["Model", "Model Type", "Output Form", "Rank"]
-    # Build table header
-    def get_sort_arrow(col, sort_col, sort_asc):
-        # "Model Name", "Group" 컬럼을 제외한 모든 컬럼에 정렬 버튼 노출
-        if col in {"Model Name", "Group"}:
-            return ""
-        # 하나의 버튼(▲ 또는 ▼)만 노출, 클릭 시 asc가 반전됨
-        if col == sort_col:
-            # 현재 정렬 상태에 따라 아이콘과 data-asc를 반전
-            if sort_asc:
-                # 오름차순 상태: ▼ 아이콘, 클릭 시 내림차순
-                svg = (
-                    '<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
-                    '<polygon points="3,5 11,5 7,11" fill="currentColor"/></svg>'
-                )
-                return (
-                    f'<span class="sort-arrow active" data-col="{col}" data-asc="false" aria-label="내림차순 정렬">{svg}</span>'
-                )
-            else:
-                # 내림차순 상태: ▲ 아이콘, 클릭 시 오름차순
-                svg = (
-                    '<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
-                    '<polygon points="7,3 11,9 3,9" fill="currentColor"/></svg>'
-                )
-                return (
-                    f'<span class="sort-arrow active" data-col="{col}" data-asc="true" aria-label="오름차순 정렬">{svg}</span>'
-                )
-        else:
-            # 정렬 중이 아닌 컬럼: ▲(오름차순) 아이콘, 클릭 시 오름차순
-            svg = (
-                '<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
-                '<polygon points="7,3 11,9 3,9" fill="currentColor"/></svg>'
-            )
-            return (
-                f'<span class="sort-arrow" data-col="{col}" data-asc="true" aria-label="오름차순 정렬">{svg}</span>'
-            )
-    # 정렬 상태 추출 (State에서 전달받거나 기본값)
-    sort_col = getattr(df, "_sort_col", None) or (df.columns[0] if len(df.columns) > 0 else None)
-    sort_asc = getattr(df, "_sort_asc", None)
-    if sort_asc is None:
-        sort_asc = True
-    html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
-    for col in df.columns:
-        if col in hidden_cols:
-            continue
-        html += f'<th>{col}{get_sort_arrow(col, sort_col, sort_asc)}</th>'
-    html += '</tr></thead>\n<tbody>\n'
-    # Build table rows
-    for idx, row in df.iterrows():
-        html += '<tr>'
-        for col in df.columns:
-            if col in hidden_cols:
-                continue
-            cell = row[col]
-            if col == overall_col:
-                try:
-                    cell_html = get_score_bar(float(cell))
-                except Exception:
-                    cell_html = str(cell)
-                html += f'<td>{cell_html}</td>'
-            elif col in ["Model Name"]:
-                # 1~3위 하이라이트 + 4등 이후 흰색 + 뱃지 항상 표시
-                rank = row.get("Rank", None)
-                model_type = row.get("Model Type", None) or row.get("Model_Type", None)
-                output_form = row.get("Output Form", None) or row.get("Output_Form", None)
-                highlight_style = ""
-                if rank == 1 or rank == "1":
-                    highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
-                elif rank == 2 or rank == "2":
-                    highlight_style = "color: #b0b0b0; font-weight: bold;"
-                elif rank == 3 or rank == "3":
-                    highlight_style = "color: #cd7f32; font-weight: bold;"
-                else:
-                    highlight_style = "color: #fff; font-weight: 600;"
-                badge_html = ""
-                if model_type:
-                    badge_html += " " + get_type_badge(model_type)
-                if output_form:
-                    badge_html += " " + get_output_badge(output_form)
-                html += f'<td><span style="{highlight_style}">{cell}</span>{badge_html}</td>'
-            else:
-                html += f'<td>{cell}</td>'
-        html += '</tr>\n'
-    html += '</tbody></table>'
-    return html


25
26	def has_nan_values(df, columns):
27	return df[columns].isna().any(axis=1)

src/display/utils.py CHANGED Viewed

@@ -21,26 +21,24 @@ class ColumnContent:
     never_hidden: bool = False
 ## Leaderboard columns
-from dataclasses import field
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
-auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
-# Scores
-auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda t=task: ColumnContent(t.value.col_name, "number", True))])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
-auto_eval_column_dict.append(["architecture", ColumnContent, field(default_factory=lambda: ColumnContent("Architecture", "str", False))])
-auto_eval_column_dict.append(["weight_type", ColumnContent, field(default_factory=lambda: ColumnContent("Weight type", "str", False, True))])
-auto_eval_column_dict.append(["precision", ColumnContent, field(default_factory=lambda: ColumnContent("Precision", "str", False))])
-auto_eval_column_dict.append(["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))])
-auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
-auto_eval_column_dict.append(["likes", ColumnContent, field(default_factory=lambda: ColumnContent("Hub ❤️", "number", False))])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False))])
-auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=lambda: ColumnContent("Model sha", "str", False, False))])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -93,8 +91,6 @@ class WeightType(Enum):
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
-    fp8 = ModelDetails("fp8")
-    int4 = ModelDetails("int4")
     Unknown = ModelDetails("?")
     def from_str(precision):
@@ -102,10 +98,6 @@ class Precision(Enum):
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
-        if precision == "fp8":
-            return Precision.fp8
-        if precision == "int4":
-            return Precision.int4
         return Precision.Unknown
 # Column selection
@@ -115,3 +107,4 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]

     never_hidden: bool = False
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     Unknown = ModelDetails("?")
     def from_str(precision):
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
         return Precision.Unknown
 # Column selection
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]

src/envs.py CHANGED Viewed

@@ -6,10 +6,10 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
-OWNER = "coms1580" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
-REPO_ID = f"{OWNER}/test_space"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
+REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

src/submission/check_validity.py CHANGED Viewed

@@ -88,13 +88,12 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                     continue
                 with open(os.path.join(root, file), "r") as f:
                     info = json.load(f)
-                    file_names.append(f"{info['benchmark']}_{info['model']}")
                     # Select organisation
                     if info["model"].count("/") == 0 or "submitted_time" not in info:
                         continue
                     organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].extend([{"benchmark": info['benchmark'], "model": info["model"], "submitted_time": info["submitted_time"]}])
     return set(file_names), users_to_submission_dates

                     continue
                 with open(os.path.join(root, file), "r") as f:
                     info = json.load(f)
+                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
                     # Select organisation
                     if info["model"].count("/") == 0 or "submitted_time" not in info:
                         continue
                     organisation, _ = info["model"].split("/")
+                    users_to_submission_dates[organisation].append(info["submitted_time"])
     return set(file_names), users_to_submission_dates

src/submission/submit.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import json
 import os
 from datetime import datetime, timezone
-from typing import Optional
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
@@ -10,26 +10,17 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
-import gradio as gr
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
-def add_new_eval_option1(
-    benchmark: str,
     model: str,
     base_model: str,
     revision: str,
     precision: str,
-    temperature: str,
-    top_p: str,
-    top_k: str,
-    presence_penalty: str,
-    frequency_penalty: str,
-    repetition_penalty: str,
-    vllm_version: str,
-    user_state: str,
-    organization_list: list
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -43,174 +34,25 @@ def add_new_eval_option1(
         model_path = model.split("/")[1]
     precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S %z")
-    # Check submitter qualification
-    if user_name != user_state and user_name not in organization_list:
-        return styled_error("The submitter does not have submission rights for this model.")
-    # Does the organization submit more than three times in a day?
-    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
-    submission_cnt = 0
-    for i in range(len(submission_times)):
-        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
-        if hours_diff <= 24:
-            submission_cnt += 1
-    if submission_cnt > 3:
-        return styled_error("The organization already submitted three times for this benchmark today.")
     # Does the model actually exist?
     if revision == "":
         revision = "main"
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model.")
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-    if temperature == "":
-        temperature = "1.0"
-    if top_p == "":
-        top_p = "1.0"
-    if top_k == "":
-        top_k = "-1"
-    if presence_penalty == "":
-        presence_penalty = "0.0"
-    if frequency_penalty == "":
-        frequency_penalty = "0.0"
-    if repetition_penalty == "":
-        repetition_penalty = "1.0"
-    # Seems good, creating the eval
-    print("Adding new eval")
-    eval_entry = {
-        "benchmark": benchmark,
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-        "temperature": float(temperature),
-        "top_p": float(top_p),
-        "top_k": float(top_k),
-        "vllm_version": vllm_version,
-        "presence_penalty": float(presence_penalty),
-        "frequency_penalty": float(frequency_penalty),
-        "repetition_penalty": float(repetition_penalty),
-        "load_model_code": "None",
-        "inference_code": "None",
-        "termination_code": "None",
-    }
-    # Check for duplicate submission
-    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
-    submission_cnt = 0
-    for i in range(len(submission_times)):
-        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
-        if hours_diff <= 24:
-            submission_cnt += 1
-    if submission_cnt > 1:
-        return styled_warning("This model has been already submitted within 24 hours.")
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{benchmark}_{model_path}_eval_request_False.json"
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-    # Remove the local file
-    os.remove(out_path)
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!"
-    )
-def add_new_eval_option2(
-    benchmark: str,
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    temperature: str,
-    top_p: str,
-    top_k: str,
-    presence_penalty: str,
-    frequency_penalty: str,
-    repetition_penalty: str,
-    load_model_code: str,
-    inference_code: str,
-    termination_code: str,
-    user_state: str,
-    organization_list: list
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S %z")
-    # Check submitter qualification
-    if user_name != user_state and user_name not in organization_list:
-        return styled_error("The submitter does not have submission rights for this model.")
-    # Does the organization submit more than three times in a day?
-    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
-    submission_cnt = 0
-    for i in range(len(submission_times)):
-        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
-        if hours_diff <= 24:
-            submission_cnt += 1
-    if submission_cnt > 3:
-        return styled_error("The organization already submitted three times for this benchmark today.")
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
     # Is the model info correctly filled?
     try:
@@ -224,71 +66,38 @@ def add_new_eval_option2(
     try:
         license = model_info.cardData["license"]
     except Exception:
-        return styled_error("Please select a license for your model.")
     modelcard_OK, error_msg = check_model_card(model)
     if not modelcard_OK:
         return styled_error(error_msg)
-    if temperature == "":
-        temperature = "1.0"
-    if top_p == "":
-        top_p = "1.0"
-    if top_k == "":
-        top_k = "-1"
-    if presence_penalty == "":
-        presence_penalty = "0.0"
-    if frequency_penalty == "":
-        frequency_penalty = "0.0"
-    if repetition_penalty == "":
-        repetition_penalty = "1.0"
     # Seems good, creating the eval
     print("Adding new eval")
     eval_entry = {
-        "benchmark": benchmark,
         "model": model,
         "base_model": base_model,
         "revision": revision,
         "precision": precision,
         "status": "PENDING",
         "submitted_time": current_time,
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
         "private": False,
-        "temperature": float(temperature),
-        "top_p": float(top_p),
-        "top_k": float(top_k),
-        "vllm_version": "None",
-        "presence_penalty": float(presence_penalty),
-        "frequency_penalty": float(frequency_penalty),
-        "repetition_penalty": float(repetition_penalty),
-        "load_model_code": load_model_code,
-        "inference_code": inference_code,
-        "termination_code": termination_code
     }
     # Check for duplicate submission
-    submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
-    submission_cnt = 0
-    for i in range(len(submission_times)):
-        hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
-        if hours_diff <= 24:
-            submission_cnt += 1
-    if submission_cnt > 1:
-        return styled_warning("This model has been already submitted within 24 hours.")
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{benchmark}_{model_path}_eval_request_False.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -306,5 +115,5 @@ def add_new_eval_option2(
     os.remove(out_path)
     return styled_message(
-        "Your request has been submitted to the evaluation queue!"
     )

 import json
 import os
 from datetime import datetime, timezone
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
+def add_new_eval(
     model: str,
     base_model: str,
     revision: str,
     precision: str,
+    weight_type: str,
+    model_type: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
         model_path = model.split("/")[1]
     precision = precision.split(" ")[0]
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    if model_type is None or model_type == "":
+        return styled_error("Please select a model type.")
     # Does the model actually exist?
     if revision == "":
         revision = "main"
+    # Is the model on the hub?
+    if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+    if not weight_type == "Adapter":
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not model_on_hub:
+            return styled_error(f'Model "{model}" {error}')
     # Is the model info correctly filled?
     try:
     try:
         license = model_info.cardData["license"]
     except Exception:
+        return styled_error("Please select a license for your model")
     modelcard_OK, error_msg = check_model_card(model)
     if not modelcard_OK:
         return styled_error(error_msg)
     # Seems good, creating the eval
     print("Adding new eval")
     eval_entry = {
         "model": model,
         "base_model": base_model,
         "revision": revision,
         "precision": precision,
+        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
+        "model_type": model_type,
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
         "private": False,
     }
     # Check for duplicate submission
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
     os.remove(out_path)
     return styled_message(
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
     )

ui.py DELETED Viewed

@@ -1,228 +0,0 @@
-import gradio as gr
-import src.config as configs
-from constants import TAB_NAMES, MODEL_TYPE_MAP, OUTPUT_FORM_MAP
-from src.display.formatting import render_leaderboard_html
-from src.display.css_html_js import get_leaderboard_table_html, custom_css
-import pandas as pd
-from constants import LEADERBOARD_REQUIRED_COLUMNS
-def render_pretty_leaderboard_html(df):
-    """
-    Renders a pretty leaderboard table using badge and gauge.
-    Supports both ['Model', 'Score'] and ['Model Name', 'Overall'] columns.
-    Sorts by score descending and rounds for display.
-    """
-    # Flexible column mapping
-    col_map = {}
-    if "Model" in df.columns:
-        col_map["Model"] = "Model"
-    elif "Model Name" in df.columns:
-        col_map["Model"] = "Model Name"
-    else:
-        return "<div style='color:red'>DataFrame must have a 'Model' or 'Model Name' column.</div>"
-    if "Score" in df.columns:
-        col_map["Score"] = "Score"
-    elif "Overall" in df.columns:
-        col_map["Score"] = "Overall"
-    else:
-        return "<div style='color:red'>DataFrame must have a 'Score' or 'Overall' column.</div>"
-    # Example mappings for demonstration (expand as needed)
-    model_type_map = MODEL_TYPE_MAP
-    output_form_map = OUTPUT_FORM_MAP
-    # Copy and rename for uniformity
-    df2 = df.copy()
-    df2 = df2.rename(columns={col_map["Model"]: "Model", col_map["Score"]: "Score"})
-    # 매핑 전후로 누락된 모델명을 출력 (디버깅용)
-    missing_type = set(df2["Model"]) - set(model_type_map.keys())
-    missing_output = set(df2["Model"]) - set(output_form_map.keys())
-    if missing_type:
-        print("Model Type 매핑 누락:", missing_type)
-    if missing_output:
-        print("Output Form 매핑 누락:", missing_output)
-    # Add badge columns
-    df2["Model Type"] = df2["Model"].map(model_type_map).fillna("open")
-    df2["Output Form"] = df2["Model"].map(output_form_map).fillna("normal")
-    # Drop NA, sort, round
-    df2 = df2[["Model", "Score", "Model Type", "Output Form"]].dropna()
-    df2["Score"] = pd.to_numeric(df2["Score"], errors="coerce").round(2)
-    df2 = df2.sort_values("Score", ascending=False).reset_index(drop=True)
-    return get_leaderboard_table_html(df2)
-def create_leaderboard_tab(df, key, search_leaderboard, update_modelselector_group, update_leaderboard, column_selector_value):
-    """
-    df: DataFrame to display
-    key: "Category" or "Language"
-    search_leaderboard, update_modelselector_group, update_leaderboard: handler functions
-    column_selector_value: default columns to select
-    """
-    with gr.TabItem(
-        TAB_NAMES[key],
-        visible=True
-    ):
-        df_state = gr.State(df)
-        with gr.Row():
-            with gr.Column():
-                search_box = gr.Textbox(label="Search Model by Name")
-                group_list = df["Group"].unique().tolist()
-                group_selector = gr.CheckboxGroup(
-                    choices=df["Group"].unique().tolist(),
-                    value=group_list,
-                    label="Select Model Group"
-                )
-                # 필수 컬럼 항상 포함, 체크 해제 불가(disabled)
-                # 선택지에서 "Model Name", "Group", "Overall" 제외
-                exclude_cols = {"Model Name", "Group", "Overall"}
-                selectable_columns = [col for col in df.columns.tolist()[3:] if col not in exclude_cols]
-                all_columns = list(dict.fromkeys(LEADERBOARD_REQUIRED_COLUMNS + selectable_columns))
-                column_selector = gr.CheckboxGroup(
-                    choices=selectable_columns,
-                    value=[col for col in column_selector_value if col in selectable_columns],
-                    label="Select Columns"
-                )
-            with gr.Column():
-                with gr.Accordion("Model List", open=False):
-                    model_group = df["Model Name"].tolist()
-                    model_selector = gr.CheckboxGroup(
-                        choices=df["Model Name"].tolist(),
-                        value=model_group,
-                        label="Select Models"
-                    )
-        # badge 정보 포함 DataFrame 생성 (위쪽 테이블용)
-        df_badge = df.copy()
-        # Model 컬럼명 통일
-        if "Model Name" in df_badge.columns:
-            df_badge["Model"] = df_badge["Model Name"]
-        # 예시 매핑 (아래쪽과 동일하게 확장)
-        model_type_map = MODEL_TYPE_MAP
-        output_form_map = OUTPUT_FORM_MAP
-        df_badge["Model Type"] = df_badge["Model"].map(model_type_map).fillna("open")
-        df_badge["Output Form"] = df_badge["Model"].map(output_form_map).fillna("normal")
-        df_badge = df_badge.sort_values("Overall" if "Overall" in df_badge.columns else "Score", ascending=False).reset_index(drop=True)
-        df_badge["Rank"] = df_badge.index + 1
-        # 정렬 상태 관리용 State (한 번만 생성, 이후 재사용)
-        default_sort_col = "Overall" if "Overall" in df_badge.columns else "Score"
-        sort_col_state = gr.State(default_sort_col)
-        sort_asc_state = gr.State(False)  # 내림차순이 기본값
-        # 정렬 함수 (JS에서 넘긴 asc 값을 그대로 사용)
-        def sort_and_render(col, asc, models, columns, df_):
-            print(f"[sort_and_render] called: col={col}, asc={asc}, models={models}, columns={columns}")
-            filtered_df = update_leaderboard(models, columns, df_, col, asc)
-            # 정렬 상태를 DataFrame에 임시로 저장해 헤더에 반영
-            filtered_df._sort_col = col
-            filtered_df._sort_asc = asc
-            return render_leaderboard_html(filtered_df.round(3)), col, asc
-        leaderboard_html = render_leaderboard_html(df_badge.round(3))
-        leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")
-        # 정렬 트리거용 hidden textbox 추가
-        sort_trigger = gr.Textbox(visible=False, elem_id="sort-leaderboard-trigger")
-        # sort-arrow 클릭 시 항상 새로운 값으로 value를 변경하는 JS 삽입 (정렬 방향 포함)
-        sort_js = """
-        <script>
-        (function() {
-            document.addEventListener('DOMContentLoaded', function() {
-                const table = document.getElementById('leaderboard-table');
-                if (!table) return;
-                table.addEventListener('click', function(e) {
-                    const arrow = e.target.closest('.sort-arrow');
-                    if (arrow) {
-                        const col = arrow.getAttribute('data-col');
-                        const asc = arrow.getAttribute('data-asc');
-                        // 항상 새로운 값으로 value를 변경하여 change 이벤트 강제 발생
-                        const trigger = document.querySelector('#sort-leaderboard-trigger input');
-                        if (trigger) {
-                            trigger.value = col + '|' + asc + '|' + Date.now();
-                            trigger.dispatchEvent(new Event('input', { bubbles: true }));
-                            trigger.dispatchEvent(new Event('change', { bubbles: true }));
-                        }
-                    }
-                });
-            });
-        })();
-        </script>
-        """
-        # 정렬 버튼 클릭 시에도 update_leaderboard를 호출하도록 wiring
-        def sort_trigger_change(col_val, models, columns, df_, prev_col, prev_asc):
-            print(f"[sort_trigger.change] col_val={col_val}, prev_col={prev_col}, prev_asc={prev_asc}")
-            col, asc = col_val.split('|')[0], col_val.split('|')[1].lower() == "true"
-            return sort_and_render(col, asc, models, columns, df_)
-        sort_trigger.change(
-            fn=sort_trigger_change,
-            inputs=[sort_trigger, model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
-            outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
-        )
-        # 커스텀 JS를 상단 테이블에 삽입
-        leaderboard_html_comp.style = None  # gr.HTML에는 style 파라미터가 없으므로, 아래에서 삽입
-        leaderboard_html_comp.value += sort_js
-        # Pretty leaderboard preview (uses only 'Model' and 'Score' columns)
-        pretty_html = gr.HTML(value=render_pretty_leaderboard_html(df.round(3)))
-        # Define change functions for user interaction
-        # 모든 UI 이벤트에서 update_leaderboard → sort_and_render → render_leaderboard_html 순으로 갱신
-        def filter_and_sort_search(query, df, sort_col, sort_asc):
-            print(f"[filter_and_sort_search] sort_col={sort_col}, sort_asc={sort_asc}")
-            filtered_df = search_leaderboard(query, df, sort_col, sort_asc)
-            # 정렬 상태를 DataFrame에 임시로 저장해 헤더에 반영
-            filtered_df._sort_col = sort_col
-            filtered_df._sort_asc = sort_asc
-            return render_leaderboard_html(filtered_df), sort_col, sort_asc
-        def filter_and_sort_model(models, columns, df, sort_col, sort_asc):
-            print(f"[filter_and_sort_model] sort_col={sort_col}, sort_asc={sort_asc}")
-            filtered_df = update_leaderboard(models, columns, df, sort_col, sort_asc)
-            filtered_df._sort_col = sort_col
-            filtered_df._sort_asc = sort_asc
-            return render_leaderboard_html(filtered_df), sort_col, sort_asc
-        def filter_and_sort_column(models, columns, df, sort_col, sort_asc):
-            print(f"[filter_and_sort_column] sort_col={sort_col}, sort_asc={sort_asc}")
-            filtered_df = update_leaderboard(models, columns, df, sort_col, sort_asc)
-            filtered_df._sort_col = sort_col
-            filtered_df._sort_asc = sort_asc
-            return render_leaderboard_html(filtered_df), sort_col, sort_asc
-        search_box.change(
-            fn=filter_and_sort_search,
-            inputs=[search_box, df_state, sort_col_state, sort_asc_state],
-            outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
-        )
-        group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
-        model_selector.change(
-            fn=filter_and_sort_model,
-            inputs=[model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
-            outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
-        )
-        # column_selector 변경 시에도 항상 최신 sort_col, sort_asc를 유지
-        column_selector.change(
-            fn=filter_and_sort_column,
-            inputs=[model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
-            outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
-        )
-        return {
-            "search_box": search_box,
-            "group_selector": group_selector,
-            "column_selector": column_selector,
-            "model_selector": model_selector,
-            "leaderboard_html_comp": leaderboard_html_comp,
-            "sort_trigger": sort_trigger,
-            "df_state": df_state,
-            "pretty_html": pretty_html
-        }

utils.py DELETED Viewed

@@ -1,42 +0,0 @@
-from __future__ import annotations
-import gradio as gr
-from huggingface_hub import whoami
-def get_profile(profile: gr.OAuthProfile | None) -> str:
-    if profile is None:
-        return "Anonymous"
-    return profile.username
-def get_organizations(oauth_token: gr.OAuthToken | None) -> str:
-    if oauth_token is None:
-        return "No Organization"
-    org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
-    return org_names
-def get_profile_and_organizations(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> tuple[str, str]:
-    if profile is None:
-        output_profile = "Anonymous"
-    else:
-        output_profile = profile.username
-    if oauth_token is None:
-        output_org = "No Organization"
-    else:
-        output_org = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
-    return output_profile, output_org
-def download_with_restart(snapshot_download_func, repo_id, local_dir, repo_type, token, restart_func):
-    try:
-        print(local_dir)
-        snapshot_download_func(
-            repo_id=repo_id,
-            local_dir=local_dir,
-            repo_type=repo_type,
-            tqdm_class=None,
-            etag_timeout=30,
-            token=token
-        )
-    except Exception:
-        restart_func()