Spaces:

Snowflake
/

MADQA-Leaderboard

Running

App Files Files

Borchmann committed on Nov 15, 2025

Commit

87993b5

verified ·

1 Parent(s): 7658988

Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

app.py +38 -5
requirements.txt +2 -2
src/display/css_html_js.py +26 -6
src/display/formatting.py +7 -3
src/display/utils.py +11 -10
src/leaderboard/read_evals.py +25 -9
src/populate.py +12 -6
src/submission/submit.py +10 -7

app.py CHANGED Viewed

@@ -255,6 +255,15 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -263,8 +272,26 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.submitted_by.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
@@ -399,8 +426,8 @@ with demo:
                     model_name_textbox = gr.Textbox(
                         label="Model Name", placeholder="e.g., GPT-4-Turbo-Agent, Claude-3-Opus-Agent"
                     )
-                    submitted_by_textbox = gr.Textbox(
-                        label="Submitted By", placeholder="e.g., your name, organization, or team name"
                     )
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
@@ -409,6 +436,11 @@ with demo:
                         value=None,
                         interactive=True,
                     )
                 with gr.Column():
                     predictions_file = gr.File(label="Predictions JSONL File", file_types=[".jsonl"], type="filepath")
@@ -434,9 +466,10 @@ with demo:
                 add_new_eval,
                 [
                     model_name_textbox,
-                    submitted_by_textbox,
                     model_type,
                     predictions_file,
                 ],
                 submission_result,
             )

 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    # Calculate dynamic filter ranges from actual data
+    max_agent_steps = int(dataframe[AutoEvalColumn.agent_steps.name].max()) if len(dataframe) > 0 else 1000
+    max_cost = float(dataframe[AutoEvalColumn.cost_usd.name].max()) if len(dataframe) > 0 else 10.0
+    # Add some headroom to max values
+    max_agent_steps = max(max_agent_steps + 100, 1000)
+    max_cost = max(max_cost + 1.0, 10.0)
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.organization.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden] + ["Type"],
+        filter_columns=[
+            ColumnFilter(
+                AutoEvalColumn.agent_steps.name,
+                type="slider",
+                min=0,
+                max=max_agent_steps,
+                default=[0, max_agent_steps],
+                label="Agent Steps",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.cost_usd.name,
+                type="slider",
+                min=0.0,
+                max=max_cost,
+                default=[0.0, max_cost],
+                label="Cost (USD)",
+            ),
+        ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
                     model_name_textbox = gr.Textbox(
                         label="Model Name", placeholder="e.g., GPT-4-Turbo-Agent, Claude-3-Opus-Agent"
                     )
+                    organization_textbox = gr.Textbox(
+                        label="Organization", placeholder="e.g., OpenAI, Anthropic, Meta, or your organization name"
                     )
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         value=None,
                         interactive=True,
                     )
+                    link_textbox = gr.Textbox(
+                        label="Link (Optional)",
+                        placeholder="e.g., https://arxiv.org/abs/... or https://github.com/...",
+                        info="Link to paper, code repository, or model card (optional)"
+                    )
                 with gr.Column():
                     predictions_file = gr.File(label="Predictions JSONL File", file_types=[".jsonl"], type="filepath")
                 add_new_eval,
                 [
                     model_name_textbox,
+                    organization_textbox,
                     model_type,
                     predictions_file,
+                    link_textbox,
                 ],
                 submission_result,
             )

requirements.txt CHANGED Viewed

@@ -4,10 +4,10 @@ datasets
 gradio
 gradio[oauth]
 gradio_client
-gradio_leaderboard>=0.0.13
 huggingface-hub>=0.18.0
 matplotlib
-numpy
 pandas
 plotly
 python-dateutil

 gradio
 gradio[oauth]
 gradio_client
+gradio_leaderboard==0.0.13
 huggingface-hub>=0.18.0
 matplotlib
+numpy<2.0
 pandas
 plotly
 python-dateutil

src/display/css_html_js.py CHANGED Viewed

@@ -54,12 +54,32 @@ table a:hover {
     padding: 0px;
 }
-/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
-#leaderboard-table td:nth-child(2),
-#leaderboard-table th:nth-child(2) {
-    max-width: 400px;
-    overflow: auto;
-    white-space: nowrap;
 }
 /* Style for inline SVG icons in table */

     padding: 0px;
 }
+/* Set width for the Model column (now first column after reordering) */
+/* NOTE(review): the bare `table td:first-child` / `table th:first-child`
+   selectors in this rule are not scoped to the leaderboard, so the sizing
+   (with !important) also applies to the first column of every other table
+   this stylesheet is loaded for. Confirm that is intended. */
+#leaderboard-table td:nth-child(1),
+#leaderboard-table th:nth-child(1),
+.leaderboard table td:first-child,
+.leaderboard table th:first-child,
+table td:first-child,
+table th:first-child {
+    min-width: 350px !important;
+    width: 400px !important;
+    max-width: 500px !important;
+    overflow: visible !important;
+    white-space: normal !important;
+    word-break: break-word !important;
+}
+/* Also target the gradio leaderboard specifically with highest priority */
+/* This rule repeats the width and wrapping declarations of the rule above
+   (it omits `overflow`) and additionally sets `overflow-wrap`.
+   `[class*="leaderboard"]` matches any element whose class attribute
+   contains the substring "leaderboard". */
+.gradio-container .gradio-leaderboard table td:first-child,
+.gradio-container .gradio-leaderboard table th:first-child,
+[class*="leaderboard"] table td:first-child,
+[class*="leaderboard"] table th:first-child {
+    min-width: 350px !important;
+    width: 400px !important;
+    max-width: 500px !important;
+    white-space: normal !important;
+    word-break: break-word !important;
+    overflow-wrap: break-word !important;
 }
 /* Style for inline SVG icons in table */

src/display/formatting.py CHANGED Viewed

@@ -2,9 +2,13 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
 def styled_error(error):

     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+def make_clickable_model(model_name, link=None):
+    """Return the model name, hyperlinked when a safe http(s) link is given.
+
+    Args:
+        model_name: Display name of the model.
+        link: Optional URL (paper, code repository or model card). Defaults
+            to ``None``; anything that is not a non-empty string is treated
+            as "no link".
+
+    Returns:
+        The ``<a>`` markup produced by ``model_hyperlink`` when ``link`` is
+        an ``http``/``https`` URL with a host; otherwise ``model_name``
+        unchanged (plain text).
+    """
+    # Function-scope imports so this definition does not depend on the
+    # module's import block.
+    import html
+    from urllib.parse import urlparse
+
+    # Guard against None and other non-string values before calling .strip().
+    if not isinstance(link, str):
+        return model_name
+    url = link.strip()
+    if not url:
+        # No link provided, just return plain text
+        return model_name
+    # The link comes from the submission form, so only plain web URLs are
+    # allowed: schemes such as ``javascript:`` or ``data:`` must never reach
+    # the rendered href.
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
+        return model_name
+    if parsed.scheme not in ("http", "https") or not parsed.netloc:
+        return model_name
+    # Escape quotes and angle brackets so the URL cannot break out of the
+    # href="..." attribute that model_hyperlink builds.
+    return model_hyperlink(html.escape(url, quote=True), model_name)
 def styled_error(error):

src/display/utils.py CHANGED Viewed

@@ -22,11 +22,15 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
-# Init
 auto_eval_column_dict.append(
-    ("model_type_symbol", ColumnContent, ColumnContent("T", "markdown", True, never_hidden=True))
 )
-auto_eval_column_dict.append(("model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)))
 # Scores
 for idx, task in enumerate(Tasks):
     # Only show overall ANLS (first task) by default
@@ -34,13 +38,10 @@ for idx, task in enumerate(Tasks):
     auto_eval_column_dict.append(
         (task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default))
     )
-# Cost/Efficiency metrics
-auto_eval_column_dict.append(("agent_steps", ColumnContent, ColumnContent("Agent Steps", "number", True)))
-auto_eval_column_dict.append(("cost_usd", ColumnContent, ColumnContent("Cost (USD)", "number", True)))
-# Model information
-auto_eval_column_dict.append(("model_type", ColumnContent, ColumnContent("Model Type", "str", True)))
-auto_eval_column_dict.append(("submitted_by", ColumnContent, ColumnContent("Submitted By", "str", False)))
 auto_eval_column_dict.append(("submission_date", ColumnContent, ColumnContent("Submission Date", "str", False)))
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -50,7 +51,7 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     model_type = ColumnContent("model_type", "str", True)
-    submitted_by = ColumnContent("submitted_by", "str", True)
     status = ColumnContent("status", "str", True)

 ## Leaderboard columns
 auto_eval_column_dict = []
+# Main columns (displayed by default, in order)
+auto_eval_column_dict.append(("model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)))
+auto_eval_column_dict.append(("organization", ColumnContent, ColumnContent("Organization", "str", True)))
 auto_eval_column_dict.append(
+    ("model_type_symbol", ColumnContent, ColumnContent("Model Type", "markdown", True, never_hidden=True))
 )
+# Cost/Efficiency metrics
+auto_eval_column_dict.append(("agent_steps", ColumnContent, ColumnContent("Agent Steps", "number", True)))
+auto_eval_column_dict.append(("cost_usd", ColumnContent, ColumnContent("Cost (USD)", "number", True)))
 # Scores
 for idx, task in enumerate(Tasks):
     # Only show overall ANLS (first task) by default
     auto_eval_column_dict.append(
         (task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default))
     )
+# Model information (will be hidden in display but needed for filtering)
+auto_eval_column_dict.append(("model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=False)))
 auto_eval_column_dict.append(("submission_date", ColumnContent, ColumnContent("Submission Date", "str", False)))
+auto_eval_column_dict.append(("link", ColumnContent, ColumnContent("Link", "str", False, hidden=True)))
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
     model_type = ColumnContent("model_type", "str", True)
+    organization = ColumnContent("organization", "str", True)
     status = ColumnContent("status", "str", True)

src/leaderboard/read_evals.py CHANGED Viewed

@@ -20,8 +20,9 @@ class EvalResult:
     agent_steps: int = 0
     cost_usd: float = 0.0
     model_type: ModelType = ModelType.Unknown  # API or open-weight
-    submitted_by: str = ""
     submission_date: str = ""
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -52,8 +53,9 @@ class EvalResult:
         model_type = ModelType.from_str(model_type_str)
         # Extract submission info (will be updated from request file)
-        submitted_by = data.get("submitted_by", "")
         submission_date = data.get("submission_date", "")
         # Create unique eval name
         eval_name = model_name.replace("/", "_").replace(" ", "_")
@@ -65,8 +67,9 @@ class EvalResult:
             agent_steps=agent_steps,
             cost_usd=cost_usd,
             model_type=model_type,
-            submitted_by=submitted_by,
             submission_date=submission_date,
         )
     def update_with_request_file(self, requests_path):
@@ -77,8 +80,9 @@ class EvalResult:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.submitted_by = request.get("submitted_by", "")
             self.submission_date = request.get("submitted_time", "")
         except Exception as e:
             print(f"Could not find request file for {self.model_name}: {e}")
@@ -87,12 +91,13 @@ class EvalResult:
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name
             AutoEvalColumn.model_type_symbol.name: get_model_type_icon(self.model_type),
-            AutoEvalColumn.model.name: make_clickable_model(self.model_name),
             AutoEvalColumn.agent_steps.name: self.agent_steps,
             AutoEvalColumn.cost_usd.name: self.cost_usd,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.submitted_by.name: self.submitted_by,
             AutoEvalColumn.submission_date.name: self.submission_date,
         }
         # Add individual task scores
@@ -155,11 +160,22 @@ MODEL_TYPE_ICON_MAP = {
 def get_model_type_icon(model_type: ModelType) -> str:
     data_uri = MODEL_TYPE_ICON_MAP.get(model_type)
     if data_uri:
-        alt_text = model_type.value.display_name or model_type.value.name or "model"
-        return f'<img src="{data_uri}" alt="{alt_text} icon" class="table-icon-img" />'
-    return model_type.value.symbol
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

     agent_steps: int = 0
     cost_usd: float = 0.0
     model_type: ModelType = ModelType.Unknown  # API or open-weight
+    organization: str = ""
     submission_date: str = ""
+    link: str = ""  # Optional link to paper or code
     @classmethod
     def init_from_json_file(self, json_filepath):
         model_type = ModelType.from_str(model_type_str)
         # Extract submission info (will be updated from request file)
+        organization = data.get("organization", data.get("submitted_by", ""))  # Backward compatibility
         submission_date = data.get("submission_date", "")
+        link = data.get("link", "")
         # Create unique eval name
         eval_name = model_name.replace("/", "_").replace(" ", "_")
             agent_steps=agent_steps,
             cost_usd=cost_usd,
             model_type=model_type,
+            organization=organization,
             submission_date=submission_date,
+            link=link,
         )
     def update_with_request_file(self, requests_path):
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.organization = request.get("organization", request.get("submitted_by", ""))  # Backward compatibility
             self.submission_date = request.get("submitted_time", "")
+            self.link = request.get("link", "")
         except Exception as e:
             print(f"Could not find request file for {self.model_name}: {e}")
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name
             AutoEvalColumn.model_type_symbol.name: get_model_type_icon(self.model_type),
+            AutoEvalColumn.model.name: make_clickable_model(self.model_name, self.link),
             AutoEvalColumn.agent_steps.name: self.agent_steps,
             AutoEvalColumn.cost_usd.name: self.cost_usd,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.organization.name: self.organization,
             AutoEvalColumn.submission_date.name: self.submission_date,
+            AutoEvalColumn.link.name: self.link,
         }
         # Add individual task scores
 def get_model_type_icon(model_type: ModelType) -> str:
+    """Returns icon + colored text for model type"""
+    # Builds the HTML snippet for a model type: an inline-flex <div> holding
+    # either an <img> icon or the type's symbol, followed by the type name in
+    # a <span> colored with the type's color.
+    # NOTE(review): type_name, type_color, alt_text and symbol are
+    # interpolated into the markup unescaped; presumably they are fixed
+    # values from the ModelType enum -- confirm.
     data_uri = MODEL_TYPE_ICON_MAP.get(model_type)
+    type_name = model_type.value.name
+    type_color = model_type.value.color
     if data_uri:
+        # Icon + colored text in a flex container to keep them inline
+        # Alt text prefers display_name, then the type name, then "model".
+        alt_text = model_type.value.display_name or type_name or "model"
+        icon_html = f'<img src="{data_uri}" alt="{alt_text} icon" class="table-icon-img" style="vertical-align: middle;" />'
+        text_html = f'<span style="color: {type_color}; font-weight: 500; margin-left: 6px; vertical-align: middle;">{type_name}</span>'
+        return f'<div style="display: inline-flex; align-items: center; white-space: nowrap;">{icon_html}{text_html}</div>'
+    # Fallback: emoji + colored text
+    # Reached when MODEL_TYPE_ICON_MAP has no (truthy) entry for model_type.
+    symbol = model_type.value.symbol
+    text_html = f'<span style="color: {type_color}; font-weight: 500; margin-left: 4px;">{type_name}</span>'
+    return f'<div style="display: inline-flex; align-items: center; white-space: nowrap;">{symbol}{text_html}</div>'
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:

src/populate.py CHANGED Viewed

@@ -35,12 +35,15 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
             # Ensure all required columns are present
             if EvalQueueColumn.model_type.name not in data:
                 data[EvalQueueColumn.model_type.name] = "unknown"
-            if EvalQueueColumn.submitted_by.name not in data:
-                data[EvalQueueColumn.submitted_by.name] = "unknown"
             all_evals.append(data)
         elif ".md" not in entry:
@@ -53,12 +56,15 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                 # Ensure all required columns are present
                 if EvalQueueColumn.model_type.name not in data:
                     data[EvalQueueColumn.model_type.name] = "unknown"
-                if EvalQueueColumn.submitted_by.name not in data:
-                    data[EvalQueueColumn.submitted_by.name] = "unknown"
                 all_evals.append(data)
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]

             with open(file_path) as fp:
                 data = json.load(fp)
+            # Handle organization (backward compatible with submitted_by)
+            org = data.get("organization", data.get("submitted_by", "unknown"))
+            link = data.get("link", "")
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"], link)
             # Ensure all required columns are present
             if EvalQueueColumn.model_type.name not in data:
                 data[EvalQueueColumn.model_type.name] = "unknown"
+            data[EvalQueueColumn.organization.name] = org
             all_evals.append(data)
         elif ".md" not in entry:
                 with open(file_path) as fp:
                     data = json.load(fp)
+                # Handle organization (backward compatible with submitted_by)
+                org = data.get("organization", data.get("submitted_by", "unknown"))
+                link = data.get("link", "")
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"], link)
                 # Ensure all required columns are present
                 if EvalQueueColumn.model_type.name not in data:
                     data[EvalQueueColumn.model_type.name] = "unknown"
+                data[EvalQueueColumn.organization.name] = org
                 all_evals.append(data)
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]

src/submission/submit.py CHANGED Viewed

@@ -66,9 +66,10 @@ def validate_jsonl_submission(file_path):
 def add_new_eval(
     model_name: str,
-    submitted_by: str,
     model_type: str,
     predictions_file,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -81,8 +82,8 @@ def add_new_eval(
     if not model_name or model_name.strip() == "":
         return styled_error("Please provide a model name.")
-    if not submitted_by or submitted_by.strip() == "":
-        return styled_error("Please provide your name/organization.")
     if model_type is None or model_type == "":
         return styled_error("Please select a model type (API or Open-weight).")
@@ -107,8 +108,8 @@ def add_new_eval(
     print("Adding new eval")
     # Prepare directories
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{submitted_by}"
-    PREDICTIONS_DIR = f"{EVAL_RESULTS_PATH}/{submitted_by}"
     os.makedirs(OUT_DIR, exist_ok=True)
     os.makedirs(PREDICTIONS_DIR, exist_ok=True)
@@ -139,9 +140,10 @@ def add_new_eval(
             "cost_usd": 0.0,  # Placeholder
             "model_type": model_type.lower(),
         },
-        "submitted_by": submitted_by,
         "submission_date": current_time,
         "num_predictions": num_predictions,
     }
     # Save results file
@@ -152,10 +154,11 @@ def add_new_eval(
     # Create request entry for queue
     eval_request = {
         "model": model_name,
-        "submitted_by": submitted_by,
         "model_type": model_type,
         "status": "PENDING",  # Will be set to FINISHED after evaluation
         "submitted_time": current_time,
     }
     # Save request file

 def add_new_eval(
     model_name: str,
+    organization: str,
     model_type: str,
     predictions_file,
+    link: str = "",
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not model_name or model_name.strip() == "":
         return styled_error("Please provide a model name.")
+    if not organization or organization.strip() == "":
+        return styled_error("Please provide your organization name.")
     if model_type is None or model_type == "":
         return styled_error("Please select a model type (API or Open-weight).")
     print("Adding new eval")
     # Prepare directories
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{organization}"
+    PREDICTIONS_DIR = f"{EVAL_RESULTS_PATH}/{organization}"
     os.makedirs(OUT_DIR, exist_ok=True)
     os.makedirs(PREDICTIONS_DIR, exist_ok=True)
             "cost_usd": 0.0,  # Placeholder
             "model_type": model_type.lower(),
         },
+        "organization": organization,
         "submission_date": current_time,
         "num_predictions": num_predictions,
+        "link": link.strip() if link else "",
     }
     # Save results file
     # Create request entry for queue
     eval_request = {
         "model": model_name,
+        "organization": organization,
         "model_type": model_type,
         "status": "PENDING",  # Will be set to FINISHED after evaluation
         "submitted_time": current_time,
+        "link": link.strip() if link else "",
     }
     # Save request file