deepmage121 committed on
Commit
a92080e
·
1 Parent(s): 149ce10

moving to EEE hf org

Browse files
Files changed (7) hide show
  1. .gitignore +9 -0
  2. app.py +499 -0
  3. data_loader.py +386 -0
  4. eval.schema.json +282 -0
  5. hf_operations.py +202 -0
  6. pyproject.toml +10 -0
  7. ui_components.py +1374 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ .secrets
3
+ .actrc
4
+ __pycache__/
5
+ *.pyc
6
+ parquet_output/
7
+ *.venv*
8
+ *.md
9
+ *.ipynb_checkpoints
app.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation Leaderboard - Gradio Interface
3
+ Displays model evaluation results from HuggingFace datasets.
4
+ """
5
+ import gradio as gr
6
+ import pandas as pd
7
+ from pathlib import Path
8
+
9
+ from data_loader import (
10
+ load_hf_dataset_on_startup,
11
+ get_available_leaderboards,
12
+ get_eval_metadata,
13
+ build_leaderboard_table,
14
+ clear_cache,
15
+ search_model_across_leaderboards,
16
+ get_all_model_names,
17
+ DATA_DIR
18
+ )
19
+ from ui_components import (
20
+ get_theme,
21
+ get_custom_css,
22
+ format_leaderboard_header,
23
+ format_metric_details,
24
+ format_model_card,
25
+ format_model_comparison,
26
+ )
27
+
28
+ PAGE_SIZE = 50
29
+
30
+
31
def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, selected_columns=None, progress=gr.Progress()):
    """Loads and aggregates data for the selected leaderboard.

    Returns a 9-tuple consumed positionally by the Gradio event wiring:
    (paginated DataFrame, header HTML, metric-details HTML, page-dropdown
    update, prev-button update, next-button update, sort-dropdown update,
    "page / total" string, column-selector update). The output order must
    match the `outputs=[...]` lists registered on the events.

    NOTE(review): `progress=gr.Progress()` as a default is the documented
    Gradio pattern for progress tracking, not a mutable-default bug.
    """
    # No leaderboard chosen: emit a fully blank/disabled UI state.
    if not selected_leaderboard:
        return (
            pd.DataFrame(),
            format_leaderboard_header(None, {}),
            format_metric_details(None, {}),
            gr.update(choices=[], value=None),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(choices=[], value=None),
            "0 / 0",
            gr.update(choices=[], value=[]),
        )

    metadata = get_eval_metadata(selected_leaderboard)

    # Adapter: gr.Progress is called with desc as a keyword.
    def progress_callback(value, desc):
        progress(value, desc=desc)

    # Search is applied locally below, so the builder gets an empty query
    # (its search_query parameter is unused anyway — see data_loader).
    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)

    # Get all available columns BEFORE filtering (for column selector)
    all_available_columns = list(df.columns) if not df.empty else []

    # Filter columns if selected (if None or empty, show all columns)
    if selected_columns is not None and len(selected_columns) > 0:
        # Ensure Model column is always included
        base_cols = ["Model"]
        available_cols = list(df.columns)
        cols_to_show = [col for col in base_cols if col in available_cols]
        # Add Developer and other selected columns
        cols_to_show.extend([col for col in selected_columns if col in available_cols and col not in cols_to_show])
        if cols_to_show:
            df = df[cols_to_show]

    # Row filter: a row matches when ANY cell contains the query,
    # case-insensitively. Note this runs AFTER column filtering, so
    # hidden columns are not searched.
    if search_query and not df.empty:
        mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
        df = df[mask]

    filtered_count = len(df)

    # Sort is always descending; NaNs sink to the bottom. The column may
    # have been hidden by the selector, in which case sorting is skipped.
    if sort_column and sort_column in df.columns and not df.empty:
        df = df.sort_values(by=sort_column, ascending=False, na_position='last')

    # Ceiling division for page count; clamp the requested page into range.
    total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE) if filtered_count > 0 else 1
    current_page = max(1, min(current_page, total_pages))

    start_idx = (current_page - 1) * PAGE_SIZE
    end_idx = start_idx + PAGE_SIZE
    df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df

    # Pagination widgets: the hidden dropdown holds string page numbers.
    page_choices = [str(i) for i in range(1, total_pages + 1)]
    page_dropdown = gr.update(choices=page_choices, value=str(current_page))
    prev_btn = gr.update(interactive=(current_page > 1))
    next_btn = gr.update(interactive=(current_page < total_pages))
    page_info = f"{current_page} / {total_pages}"

    # Sort choices track the *visible* columns; prefer the current sort,
    # then "Average", then the first column.
    sort_choices = list(df.columns) if not df.empty else []
    default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
    sort_column_update = gr.update(choices=sort_choices, value=default_sort)

    # Get all available columns for column selector (use full list, not filtered)
    # Include all columns except Model in the selector (Model is always shown)
    column_choices = [col for col in all_available_columns if col != "Model"]
    # Preserve current selection, or default to all columns if None or empty
    if selected_columns is None or len(selected_columns) == 0:
        column_value = column_choices
    else:
        # Preserve user's selection, filtering out any invalid choices
        column_value = [col for col in selected_columns if col in column_choices]
    column_selector_update = gr.update(choices=column_choices, value=column_value)

    return (
        df_paginated,
        format_leaderboard_header(selected_leaderboard, metadata),
        format_metric_details(selected_leaderboard, metadata),
        page_dropdown,
        prev_btn,
        next_btn,
        sort_column_update,
        page_info,
        column_selector_update,
    )
115
+
116
+
117
def search_model(model_query):
    """Render an HTML card for the first model matching *model_query*.

    Queries shorter than two characters return a prompt placeholder;
    unmatched queries return a "no results" notice.
    """
    # Too short to search meaningfully — show the instructional placeholder.
    if not model_query or len(model_query) < 2:
        return """
        <div class="no-results">
            <h3>Search for a model</h3>
            <p>Enter a model name to see its benchmarks across all leaderboards</p>
        </div>
        """

    matched, _ = search_model_across_leaderboards(model_query)

    if not matched:
        return f"""
        <div class="no-results">
            <h3>No results for "{model_query}"</h3>
            <p>Try a different model name or check the spelling</p>
        </div>
        """

    # The search helper puts exact matches first; render the top hit.
    top_name = next(iter(matched))
    return format_model_card(top_name, matched[top_name])
142
+
143
+
144
def compare_models(selected_models):
    """Render HTML comparing the given models across all leaderboards.

    One resolved model yields a single card; several yield a side-by-side
    comparison; none yields a placeholder message.
    """
    if not selected_models:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Resolve each requested name to its best (first) match.
    resolved = {}
    for requested in selected_models:
        found, _ = search_model_across_leaderboards(requested)
        if found:
            best = next(iter(found))
            resolved[best] = found[best]

    if not resolved:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try selecting different models</p>
        </div>
        """

    if len(resolved) == 1:
        # Exactly one model resolved — a card is clearer than a 1-column table.
        only_name = next(iter(resolved))
        return format_model_card(only_name, resolved[only_name])

    return format_model_comparison(list(resolved.keys()), resolved)
177
+
178
+
179
def get_model_suggestions(query):
    """Autocomplete helper: propose up to 15 model names matching *query*."""
    # Queries under two characters are too noisy — clear the suggestion list.
    if query and len(query) >= 2:
        _, suggestions = search_model_across_leaderboards(query)
        return gr.update(choices=suggestions[:15])
    return gr.update(choices=[])
186
+
187
+
188
+ # Load data at startup
189
+ load_hf_dataset_on_startup()
190
+
191
+ # Build interface
192
+ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
193
+
194
+ # Header
195
+ gr.HTML("""
196
+ <div class="app-header">
197
+ <div class="logo-mark">EΒ³</div>
198
+ <div class="brand">
199
+ <h1>Every Eval Ever</h1>
200
+ <span class="tagline">Browse and compare model benchmarks</span>
201
+ </div>
202
+ <div class="header-right">
203
+ <span class="version-badge">beta</span>
204
+ </div>
205
+ </div>
206
+ """)
207
+
208
+ with gr.Tabs():
209
+ # === TAB 1: Leaderboard View ===
210
+ with gr.TabItem("πŸ“Š Leaderboards"):
211
+ with gr.Row(elem_classes="controls-bar"):
212
+ initial_choices = get_available_leaderboards()
213
+ initial_value = initial_choices[0] if initial_choices else None
214
+
215
+ with gr.Column(scale=2, min_width=200):
216
+ leaderboard_selector = gr.Dropdown(
217
+ choices=initial_choices,
218
+ value=initial_value,
219
+ label="Leaderboard",
220
+ interactive=True
221
+ )
222
+ with gr.Column(scale=3, min_width=250):
223
+ search_box = gr.Textbox(
224
+ label="Filter",
225
+ placeholder="Filter models...",
226
+ show_label=True
227
+ )
228
+ with gr.Column(scale=1, min_width=100):
229
+ refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
230
+
231
+ init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info, init_column_selector = update_leaderboard_table(initial_value, "", 1, "Average", None)
232
+
233
+ header_view = gr.HTML(value=init_header)
234
+
235
+ # Hidden sort state (default to Average)
236
+ sort_column_dropdown = gr.Dropdown(
237
+ choices=init_sort_cols.get("choices", []) if hasattr(init_sort_cols, 'get') else [],
238
+ value=init_sort_cols.get("value") if hasattr(init_sort_cols, 'get') else None,
239
+ visible=False,
240
+ )
241
+
242
+ # Column selector
243
+ with gr.Row(elem_classes="controls-bar"):
244
+ column_selector = gr.CheckboxGroup(
245
+ choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
246
+ value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
247
+ label="Columns to Display",
248
+ interactive=True,
249
+ show_label=True,
250
+ )
251
+
252
+ leaderboard_table = gr.Dataframe(
253
+ value=init_df,
254
+ label=None,
255
+ interactive=False,
256
+ wrap=False,
257
+ elem_classes="dataframe",
258
+ )
259
+
260
+ # Pagination below table - centered
261
+ with gr.Row(elem_classes="pagination-bar"):
262
+ prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
263
+ page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
264
+ next_btn = gr.Button("β†’", variant="secondary", size="sm", min_width=60)
265
+ # Extract choices and value from gr.update() dict, ensuring value is in choices
266
+ if isinstance(init_page_dropdown, dict):
267
+ page_choices = init_page_dropdown.get("choices", ["1"])
268
+ page_value = str(init_page_dropdown.get("value", "1")) if init_page_dropdown.get("value") is not None else "1"
269
+ # Ensure value exists in choices
270
+ if page_value not in page_choices:
271
+ page_value = page_choices[0] if page_choices else "1"
272
+ if not page_choices:
273
+ page_choices = ["1"]
274
+ else:
275
+ page_choices = ["1"]
276
+ page_value = "1"
277
+ page_dropdown = gr.Dropdown(
278
+ choices=page_choices,
279
+ value=page_value,
280
+ visible=False,
281
+ )
282
+
283
+ metrics_view = gr.HTML(value=init_metrics)
284
+
285
+ # === TAB 2: Model View ===
286
+ with gr.TabItem("πŸ” Model Lookup"):
287
+ gr.Markdown("### Find and compare models across all leaderboards")
288
+
289
+ selected_models_state = gr.State(value=[])
290
+ default_compare_html = """
291
+ <div class="no-results">
292
+ <h3>Search for models to compare</h3>
293
+ <p>Type in the dropdown above, then click a model to add it</p>
294
+ </div>
295
+ """
296
+
297
+ with gr.Row(elem_classes="controls-bar"):
298
+ with gr.Column(scale=4):
299
+ all_models = get_all_model_names()
300
+ model_dropdown = gr.Dropdown(
301
+ choices=all_models,
302
+ label="Search models to add",
303
+ interactive=True,
304
+ allow_custom_value=False,
305
+ filterable=True,
306
+ )
307
+ with gr.Column(scale=1, min_width=100):
308
+ clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")
309
+
310
+ selected_models_group = gr.CheckboxGroup(
311
+ choices=[],
312
+ value=[],
313
+ label="Selected Models (click to remove)",
314
+ interactive=True,
315
+ elem_classes="selected-models-group"
316
+ )
317
+
318
+ model_card_view = gr.HTML(value=default_compare_html)
319
+
320
+ # Submission guide
321
+ with gr.Accordion("πŸ“€ How to Submit Data", open=False):
322
+ gr.Markdown("""
323
+ **Submit via GitHub Pull Request:**
324
+
325
+ 1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
326
+ 2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
327
+ 3. Open a PR β€” automated validation runs on submission
328
+ 4. After merge, data syncs to HuggingFace automatically
329
+
330
+ [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) Β· [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
331
+ """)
332
+
333
+ # === State ===
334
+ current_page_state = gr.State(value=1)
335
+ sort_column_state = gr.State(value="Average")
336
+
337
+ def go_prev(current):
338
+ return max(1, current - 1)
339
+
340
+ def go_next(current):
341
+ return current + 1
342
+
343
+ def reset_page():
344
+ return 1
345
+
346
+ def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
347
+ """Update table without modifying column selector (for column changes)."""
348
+ result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
349
+ # Return all outputs except the last one (column_selector)
350
+ return result[:-1]
351
+
352
+ # === Leaderboard Events ===
353
+ leaderboard_selector.change(
354
+ fn=reset_page, outputs=[current_page_state]
355
+ ).then(
356
+ fn=lambda: "Average", outputs=[sort_column_state]
357
+ ).then(
358
+ fn=lambda: None, outputs=[column_selector]
359
+ ).then(
360
+ fn=update_leaderboard_table,
361
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
362
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
363
+ )
364
+
365
+ search_box.input(
366
+ fn=reset_page, outputs=[current_page_state]
367
+ ).then(
368
+ fn=update_table_only,
369
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
370
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
371
+ )
372
+
373
+ sort_column_dropdown.change(
374
+ fn=lambda col: col,
375
+ inputs=[sort_column_dropdown],
376
+ outputs=[sort_column_state]
377
+ ).then(
378
+ fn=reset_page, outputs=[current_page_state]
379
+ ).then(
380
+ fn=update_table_only,
381
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
382
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
383
+ )
384
+
385
+ column_selector.change(
386
+ fn=reset_page, outputs=[current_page_state]
387
+ ).then(
388
+ fn=update_table_only,
389
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
390
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
391
+ )
392
+
393
+ page_dropdown.change(
394
+ fn=lambda p: int(p) if p else 1,
395
+ inputs=[page_dropdown],
396
+ outputs=[current_page_state]
397
+ ).then(
398
+ fn=update_table_only,
399
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
400
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
401
+ )
402
+
403
+ prev_btn.click(
404
+ fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
405
+ ).then(
406
+ fn=update_table_only,
407
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
408
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
409
+ )
410
+
411
+ next_btn.click(
412
+ fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
413
+ ).then(
414
+ fn=update_table_only,
415
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
416
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
417
+ )
418
+
419
+ refresh_btn.click(
420
+ fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
421
+ outputs=[leaderboard_selector]
422
+ ).then(
423
+ fn=lambda: clear_cache()
424
+ ).then(
425
+ fn=reset_page, outputs=[current_page_state]
426
+ ).then(
427
+ fn=lambda: "Average", outputs=[sort_column_state]
428
+ ).then(
429
+ fn=lambda: None, outputs=[column_selector]
430
+ ).then(
431
+ fn=update_leaderboard_table,
432
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
433
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
434
+ )
435
+
436
+ # === Model Search Events ===
437
+ def add_model_and_compare(selected_model, current_selected):
438
+ """Add a model and auto-compare."""
439
+ if not selected_model:
440
+ comparison_html = compare_models(current_selected) if current_selected else default_compare_html
441
+ return (
442
+ current_selected,
443
+ gr.update(value=None),
444
+ gr.update(choices=current_selected, value=current_selected),
445
+ comparison_html
446
+ )
447
+
448
+ if current_selected is None:
449
+ current_selected = []
450
+
451
+ if selected_model not in current_selected:
452
+ current_selected = current_selected + [selected_model]
453
+
454
+ comparison_html = compare_models(current_selected)
455
+
456
+ return (
457
+ current_selected,
458
+ gr.update(value=None),
459
+ gr.update(choices=current_selected, value=current_selected),
460
+ comparison_html
461
+ )
462
+
463
+ def update_selection(selected_list):
464
+ """Update selection from checkbox changes."""
465
+ selected_list = selected_list or []
466
+ comparison_html = compare_models(selected_list) if selected_list else default_compare_html
467
+ return selected_list, comparison_html
468
+
469
+ def clear_all_models():
470
+ """Clear all selected models."""
471
+ return (
472
+ [],
473
+ gr.update(value=None),
474
+ gr.update(choices=[], value=[]),
475
+ default_compare_html
476
+ )
477
+
478
+ # Select from dropdown adds model and auto-compares
479
+ model_dropdown.select(
480
+ fn=add_model_and_compare,
481
+ inputs=[model_dropdown, selected_models_state],
482
+ outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
483
+ )
484
+
485
+ selected_models_group.change(
486
+ fn=update_selection,
487
+ inputs=[selected_models_group],
488
+ outputs=[selected_models_state, model_card_view]
489
+ )
490
+
491
+ clear_models_btn.click(
492
+ fn=clear_all_models,
493
+ outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
494
+ )
495
+
496
+ DATA_DIR.mkdir(exist_ok=True)
497
+
498
+ if __name__ == "__main__":
499
+ demo.launch()
data_loader.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Loader: Load from HuggingFace, parse JSON files, and build tables.
3
+ """
4
+ import json
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from datasets import load_dataset
8
+
9
+
10
+ # Global caches
11
+ HF_DATASET_CACHE = {}
12
+ LEADERBOARD_CACHE = {}
13
+ DATA_DIR = Path("leaderboard_data")
14
+
15
+
16
def load_hf_dataset_on_startup():
    """Load all splits from HuggingFace dataset at startup.

    Downloads "evaleval/every_eval_ever", parses every row of every split
    into the in-memory record format used throughout the app, and stores
    the result in HF_DATASET_CACHE keyed by split name (one split per
    leaderboard). Returns True on success, False on any failure — callers
    then fall back to the local file system.

    NOTE(review): several string columns ('evaluation_results',
    'additional_details', 'source_data') are assumed to hold JSON-encoded
    text — confirm against the dataset schema.
    """
    print("Loading dataset from HuggingFace...")
    try:
        dataset = load_dataset("evaleval/every_eval_ever")

        for split_name, split_data in dataset.items():
            print(f"Loading split: {split_name} ({len(split_data)} rows)")

            df = split_data.to_pandas()
            parsed_items = []

            for _, row in df.iterrows():
                # Per-row JSON payload with the individual metric scores.
                evaluation_results = json.loads(row['evaluation_results'])

                # Flatten to {metric name: score}, skipping unnamed or
                # score-less entries.
                results = {}
                for eval_result in evaluation_results:
                    eval_name = eval_result.get("evaluation_name")
                    score = eval_result.get("score_details", {}).get("score")
                    if eval_name and score is not None:
                        results[eval_name] = score

                # Optional JSON column; NaN/None means "no extra details".
                additional_details = {}
                if pd.notna(row.get('additional_details')):
                    additional_details = json.loads(row['additional_details'])

                # "raw_data" reconstructs the original eval.schema.json shape
                # so downstream code (get_eval_metadata, model cards) can
                # treat HF rows and local JSON files identically.
                parsed_item = {
                    "leaderboard": row['_leaderboard'],
                    "provider": row['source_organization_name'],
                    "model": row['model_id'],
                    "developer": row['model_developer'],
                    "params": additional_details.get('params_billions'),
                    "architecture": additional_details.get('architecture', 'Unknown'),
                    "precision": additional_details.get('precision', 'Unknown'),
                    "results": results,
                    "raw_data": {
                        "schema_version": row['schema_version'],
                        "evaluation_id": row['evaluation_id'],
                        "retrieved_timestamp": row['retrieved_timestamp'],
                        "source_data": json.loads(row['source_data']),
                        "evaluation_source": {
                            "evaluation_source_name": row['evaluation_source_name'],
                            "evaluation_source_type": row['evaluation_source_type']
                        },
                        "source_metadata": {
                            "source_organization_name": row['source_organization_name'],
                            "evaluator_relationship": row['evaluator_relationship'],
                        },
                        "model_info": {
                            "name": row['model_name'],
                            "id": row['model_id'],
                            "developer": row['model_developer'],
                        },
                        "evaluation_results": evaluation_results,
                        "additional_details": additional_details
                    }
                }

                # Optional columns: only attach when present (non-NaN).
                if pd.notna(row.get('source_organization_url')):
                    parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
                if pd.notna(row.get('source_organization_logo_url')):
                    parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
                if pd.notna(row.get('model_inference_platform')):
                    parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']

                parsed_items.append(parsed_item)

            HF_DATASET_CACHE[split_name] = parsed_items

        print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
        return True
    except Exception as e:
        # Deliberately broad: any network/schema/parse failure degrades to
        # the local-file path instead of crashing app startup.
        print(f"Warning: Could not load HuggingFace dataset: {e}")
        print("Falling back to local file system...")
        return False
91
+
92
+
93
def parse_eval_json(file_path):
    """Parses a single JSON file to extract model, provider, and results.

    Args:
        file_path: Path (str or pathlib.Path) to an eval.schema.json-shaped file.

    Returns:
        A dict with keys leaderboard/provider/model/developer/params/
        architecture/precision/results/raw_data, or None when the file
        cannot be read or parsed (the error is printed, not raised, so a
        single bad file never aborts a directory scan).
    """
    try:
        # Explicit encoding: eval files are UTF-8; relying on the locale
        # default breaks on non-UTF-8 systems (e.g. Windows cp1252).
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
        provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
        model_id = data.get("model_info", {}).get("id", "Unknown Model")
        developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")

        params = data.get("model_info", {}).get("params_billions", None)
        architecture = data.get("model_info", {}).get("architecture", "Unknown")
        # Precision may live in additional_details (newer files) or
        # model_info (older files); prefer the former.
        precision = data.get("additional_details", {}).get("precision", "Unknown")
        if precision == "Unknown":
            precision = data.get("model_info", {}).get("precision", "Unknown")

        # Flatten scored metrics to {metric name: score}.
        results = {}
        if "evaluation_results" in data:
            for res in data["evaluation_results"]:
                eval_name = res.get("evaluation_name", "Unknown Metric")
                score = res.get("score_details", {}).get("score", None)
                if score is not None:
                    results[eval_name] = score

        return {
            "leaderboard": leaderboard_name,
            "provider": provider_name,
            "model": model_id,
            "developer": developer_name,
            "params": params,
            "architecture": architecture,
            "precision": precision,
            "results": results,
            "raw_data": data
        }
    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        return None
132
+
133
+
134
def get_available_leaderboards():
    """List leaderboard names, preferring the in-memory HF cache over disk."""
    # The startup loader populates HF_DATASET_CACHE (one key per split).
    if HF_DATASET_CACHE:
        return list(HF_DATASET_CACHE)

    # Fallback: every sub-directory of DATA_DIR counts as a leaderboard.
    if DATA_DIR.exists():
        return [entry.name for entry in DATA_DIR.iterdir() if entry.is_dir()]
    return []
142
+
143
+
144
def walk_eval_files(leaderboard_name):
    """Yield every *.json file under the leaderboard's directory (recursive)."""
    root = DATA_DIR / leaderboard_name
    # A missing directory simply yields nothing.
    if root.exists():
        yield from root.rglob("*.json")
150
+
151
+
152
+ def get_eval_metadata(selected_leaderboard):
153
+ """Extracts evaluation metadata from the leaderboard data."""
154
+ if not selected_leaderboard:
155
+ return {}
156
+
157
+ eval_metadata = {"evals": {}, "source_info": {}}
158
+
159
+ if selected_leaderboard in HF_DATASET_CACHE:
160
+ parsed_items = HF_DATASET_CACHE[selected_leaderboard]
161
+ if parsed_items:
162
+ parsed = parsed_items[0]
163
+
164
+ source_meta = parsed["raw_data"].get("source_metadata", {})
165
+ source_data_list = parsed["raw_data"].get("source_data", [])
166
+ url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
167
+
168
+ eval_metadata["source_info"] = {
169
+ "organization": source_meta.get("source_organization_name", "Unknown"),
170
+ "relationship": source_meta.get("evaluator_relationship", "Unknown"),
171
+ "url": url
172
+ }
173
+
174
+ if "evaluation_results" in parsed["raw_data"]:
175
+ for res in parsed["raw_data"]["evaluation_results"]:
176
+ eval_name = res.get("evaluation_name", "Unknown Metric")
177
+ if eval_name not in eval_metadata["evals"]:
178
+ metric_config = res.get("metric_config", {})
179
+ eval_metadata["evals"][eval_name] = {
180
+ "description": metric_config.get("evaluation_description", "No description available"),
181
+ "score_type": metric_config.get("score_type", "unknown"),
182
+ "lower_is_better": metric_config.get("lower_is_better", False),
183
+ "min_score": metric_config.get("min_score"),
184
+ "max_score": metric_config.get("max_score"),
185
+ "level_names": metric_config.get("level_names", []),
186
+ "level_metadata": metric_config.get("level_metadata", []),
187
+ "has_unknown_level": metric_config.get("has_unknown_level", False)
188
+ }
189
+ return eval_metadata
190
+
191
+ # Fall back to file system
192
+ for json_file in walk_eval_files(selected_leaderboard):
193
+ parsed = parse_eval_json(json_file)
194
+ if parsed:
195
+ if not eval_metadata["source_info"]:
196
+ source_meta = parsed["raw_data"].get("source_metadata", {})
197
+ source_data_list = parsed["raw_data"].get("source_data", [])
198
+ url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
199
+
200
+ eval_metadata["source_info"] = {
201
+ "organization": source_meta.get("source_organization_name", "Unknown"),
202
+ "relationship": source_meta.get("evaluator_relationship", "Unknown"),
203
+ "url": url
204
+ }
205
+
206
+ if "evaluation_results" in parsed["raw_data"]:
207
+ for res in parsed["raw_data"]["evaluation_results"]:
208
+ eval_name = res.get("evaluation_name", "Unknown Metric")
209
+ if eval_name not in eval_metadata["evals"]:
210
+ metric_config = res.get("metric_config", {})
211
+ eval_metadata["evals"][eval_name] = {
212
+ "description": metric_config.get("evaluation_description", "No description available"),
213
+ "score_type": metric_config.get("score_type", "unknown"),
214
+ "lower_is_better": metric_config.get("lower_is_better", False),
215
+ "min_score": metric_config.get("min_score"),
216
+ "max_score": metric_config.get("max_score"),
217
+ "level_names": metric_config.get("level_names", []),
218
+ "level_metadata": metric_config.get("level_metadata", []),
219
+ "has_unknown_level": metric_config.get("has_unknown_level", False)
220
+ }
221
+ break
222
+
223
+ return eval_metadata
224
+
225
+
226
+ def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
227
+ """Builds the leaderboard DataFrame from cache or files."""
228
+ if not selected_leaderboard:
229
+ return pd.DataFrame()
230
+
231
+ if selected_leaderboard in LEADERBOARD_CACHE:
232
+ df, _ = LEADERBOARD_CACHE[selected_leaderboard]
233
+ else:
234
+ rows = []
235
+
236
+ if selected_leaderboard in HF_DATASET_CACHE:
237
+ if progress_callback:
238
+ progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")
239
+
240
+ parsed_items = HF_DATASET_CACHE[selected_leaderboard]
241
+
242
+ for i, parsed in enumerate(parsed_items):
243
+ if i % 100 == 0 and progress_callback:
244
+ progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
245
+
246
+ row = {
247
+ "Model": parsed["model"],
248
+ "Developer": parsed["developer"],
249
+ "Params (B)": parsed["params"],
250
+ "Arch": parsed["architecture"],
251
+ "Precision": parsed["precision"]
252
+ }
253
+ row.update(parsed["results"])
254
+ rows.append(row)
255
+ else:
256
+ # Fall back to file system
257
+ if progress_callback:
258
+ progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
259
+
260
+ all_files = list(walk_eval_files(selected_leaderboard))
261
+ total_files = len(all_files)
262
+
263
+ for i, json_file in enumerate(all_files):
264
+ if i % 100 == 0 and progress_callback:
265
+ progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
266
+
267
+ parsed = parse_eval_json(json_file)
268
+ if parsed:
269
+ row = {
270
+ "Model": parsed["model"],
271
+ "Developer": parsed["developer"],
272
+ "Params (B)": parsed["params"],
273
+ "Arch": parsed["architecture"],
274
+ "Precision": parsed["precision"]
275
+ }
276
+ row.update(parsed["results"])
277
+ rows.append(row)
278
+
279
+ if not rows:
280
+ df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
281
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
282
+ return df
283
+
284
+ df = pd.DataFrame(rows)
285
+ df = df.dropna(axis=1, how='all')
286
+
287
+ if df.empty:
288
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
289
+ return df
290
+
291
+ numeric_cols = df.select_dtypes(include=['float', 'int']).columns
292
+ df[numeric_cols] = df[numeric_cols].round(2)
293
+
294
+ # Add Average Score
295
+ eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
296
+ if len(eval_only_cols) > 0:
297
+ df["Average"] = df[eval_only_cols].mean(axis=1).round(2)
298
+
299
+ # Base columns: Model, Developer, Params, Average
300
+ # Eval columns: all evaluation scores
301
+ # Model detail columns: Arch, Precision (moved to end)
302
+ base_cols = ["Model", "Developer", "Params (B)", "Average"]
303
+ model_detail_cols = ["Arch", "Precision"]
304
+ eval_cols = [c for c in df.columns if c not in base_cols and c not in model_detail_cols]
305
+ base_cols = [c for c in base_cols if c in df.columns]
306
+ model_detail_cols = [c for c in model_detail_cols if c in df.columns]
307
+
308
+ final_cols = base_cols + sorted(eval_cols) + model_detail_cols
309
+ df = df[final_cols]
310
+
311
+ if "Average" in df.columns:
312
+ df = df.sort_values("Average", ascending=False)
313
+
314
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
315
+
316
+ return df
317
+
318
+
319
def clear_cache():
    """Clear the derived leaderboard-table cache.

    NOTE(review): despite the original "all caches" wording, only
    LEADERBOARD_CACHE is cleared here; HF_DATASET_CACHE (the raw records
    loaded at startup) is kept, so a UI refresh rebuilds tables without
    re-downloading the dataset — confirm this is intended.
    """
    LEADERBOARD_CACHE.clear()
322
+
323
+
324
+ def search_model_across_leaderboards(model_query):
325
+ """Search for a model across all leaderboards and return aggregated results."""
326
+ if not model_query or not HF_DATASET_CACHE:
327
+ return {}, []
328
+
329
+ model_query_lower = model_query.lower().strip()
330
+ results = {}
331
+ all_matches = []
332
+
333
+ for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
334
+ for item in parsed_items:
335
+ model_id = item.get("model", "")
336
+ # Check if query matches model name (case insensitive, partial match)
337
+ if model_query_lower in model_id.lower():
338
+ all_matches.append(model_id)
339
+
340
+ # Exact match gets priority
341
+ if model_id.lower() == model_query_lower or model_id == model_query:
342
+ if model_id not in results:
343
+ results[model_id] = {}
344
+ results[model_id][leaderboard_name] = {
345
+ "developer": item.get("developer"),
346
+ "params": item.get("params"),
347
+ "architecture": item.get("architecture"),
348
+ "precision": item.get("precision"),
349
+ "results": item.get("results", {})
350
+ }
351
+
352
+ # If no exact match, use partial matches
353
+ if not results and all_matches:
354
+ # Get the first partial match
355
+ for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
356
+ for item in parsed_items:
357
+ model_id = item.get("model", "")
358
+ if model_query_lower in model_id.lower():
359
+ if model_id not in results:
360
+ results[model_id] = {}
361
+ results[model_id][leaderboard_name] = {
362
+ "developer": item.get("developer"),
363
+ "params": item.get("params"),
364
+ "architecture": item.get("architecture"),
365
+ "precision": item.get("precision"),
366
+ "results": item.get("results", {})
367
+ }
368
+
369
+ # Return unique matches for autocomplete
370
+ unique_matches = sorted(set(all_matches))[:20] # Limit to 20 suggestions
371
+
372
+ return results, unique_matches
373
+
374
+
375
def get_all_model_names():
    """Get all unique model names across all leaderboards.

    Returns a sorted list of model ids from the in-memory HF cache, or []
    when the cache is empty. Records with a missing or empty model id are
    skipped (the original implementation could emit an empty-string
    "model" into the dropdown choices).
    """
    if not HF_DATASET_CACHE:
        return []

    models = set()
    for parsed_items in HF_DATASET_CACHE.values():
        for item in parsed_items:
            name = item.get("model")
            if name:  # skip records without a usable model id
                models.add(name)

    return sorted(models)
386
+
eval.schema.json ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "version": "0.0.1",
4
+ "type": "object",
5
+ "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
6
+ "required": [
7
+ "schema_version",
8
+ "evaluation_id",
9
+ "evaluation_source",
10
+ "retrieved_timestamp",
11
+ "source_data",
12
+ "source_metadata",
13
+ "model_info",
14
+ "evaluation_results"
15
+ ],
16
+ "properties": {
17
+ "schema_version": {
18
+ "type": "string",
19
+ "description": "Version of the schema used for this evaluation data"
20
+ },
21
+ "evaluation_id": {
22
+ "type": "string",
23
+ "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
24
+ },
25
+ "retrieved_timestamp": {
26
+ "type": "string",
27
+ "description": "Timestamp for when this record was created"
28
+ },
29
+ "source_data": {
30
+ "type": "array",
31
+ "description": "URLs for the source of the evaluation data",
32
+ "items": {
33
+ "type": "string"
34
+ }
35
+ },
36
+ "evaluation_source": {
37
+ "type": "object",
38
+ "description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).",
39
+ "required": [
40
+ "evaluation_source_name",
41
+ "evaluation_source_type"
42
+ ],
43
+ "properties": {
44
+ "evaluation_source_name": {
45
+ "type": "string",
46
+ "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
47
+ },
48
+ "evaluation_source_type": {
49
+ "type": "string",
50
+ "enum": [
51
+ "leaderboard",
52
+ "evaluation_platform"
53
+ ],
54
+ "description": "Type of evaluation source, e.g., leaderboard or evaluation platform"
55
+ }
56
+ }
57
+ },
58
+ "source_metadata": {
59
+ "type": "object",
60
+ "description": "Metadata about the source of the leaderboard data",
61
+ "required": [
62
+ "source_organization_name",
63
+ "evaluator_relationship"
64
+ ],
65
+ "properties": {
66
+ "source_organization_name": {
67
+ "type": "string",
68
+ "description": "Name of the organization that provides the data"
69
+ },
70
+ "source_organization_url": {
71
+ "type": "string",
72
+ "description": "URL for the organization that provides the data"
73
+ },
74
+ "source_organization_logo_url": {
75
+ "type": "string",
76
+ "description": "URL for the logo of the organization that provides the data"
77
+ },
78
+ "evaluator_relationship": {
79
+ "type": "string",
80
+ "description": "Relationship between the evaluator and the model",
81
+ "enum": [
82
+ "first_party",
83
+ "third_party",
84
+ "collaborative",
85
+ "other"
86
+ ]
87
+ }
88
+ }
89
+ },
90
+ "model_info": {
91
+ "type": "object",
92
+ "description": "Complete model specification including basic information, technical configuration and inference settings",
93
+ "required": [
94
+ "name",
95
+ "id"
96
+ ],
97
+ "properties": {
98
+ "name": {
99
+ "type": "string",
100
+ "description": "Model name provided by evaluation source"
101
+ },
102
+ "id": {
103
+ "type": "string",
104
+ "description": "Model name standardized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
105
+ },
106
+ "developer": {
107
+ "type": "string",
108
+ "description": "Name of organization that provides the model (e.g. 'OpenAI')"
109
+ },
110
+ "inference_platform": {
111
+ "type": "string",
112
+ "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
113
+ }
114
+ }
115
+ },
116
+ "evaluation_results": {
117
+ "type": "array",
118
+ "description": "Array of evaluation results",
119
+ "items": {
120
+ "type": "object",
121
+ "required": [
122
+ "evaluation_name",
123
+ "metric_config",
124
+ "score_details"
125
+ ],
126
+ "properties": {
127
+ "evaluation_name": {
128
+ "type": "string",
129
+ "description": "Name of the evaluation"
130
+ },
131
+ "evaluation_timestamp": {
132
+ "type": "string",
133
+ "description": "Timestamp for when the evaluations were run"
134
+ },
135
+ "metric_config": {
136
+ "type": "object",
137
+ "description": "Details about the metric",
138
+ "required": [
139
+ "lower_is_better"
140
+ ],
141
+ "properties": {
142
+ "evaluation_description": {
143
+ "type": "string",
144
+ "description": "Description of the evaluation"
145
+ },
146
+ "lower_is_better": {
147
+ "type": "boolean",
148
+ "description": "Whether a lower score is better"
149
+ },
150
+ "score_type": {
151
+ "type": "string",
152
+ "description": "Type of score",
153
+ "enum": [
154
+ "binary",
155
+ "continuous",
156
+ "levels"
157
+ ]
158
+ },
159
+ "level_names": {
160
+ "type": "array",
161
+ "description": "Names of the score levels",
162
+ "items": {
163
+ "type": "string"
164
+ }
165
+ },
166
+ "level_metadata": {
167
+ "type": "array",
168
+ "description": "Additional Description for each Score Level",
169
+ "items": {
170
+ "type": "string"
171
+ }
172
+ },
173
+ "has_unknown_level": {
174
+ "type": "boolean",
175
+ "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
176
+ },
177
+ "min_score": {
178
+ "type": "number",
179
+ "description": "Minimum possible score for continuous metric"
180
+ },
181
+ "max_score": {
182
+ "type": "number",
183
+ "description": "Maximum possible score for continuous metric"
184
+ }
185
+ },
186
+ "if": {
187
+ "properties": {
188
+ "score_type": {
189
+ "const": "levels"
190
+ }
191
+ }
192
+ },
193
+ "then": {
194
+ "required": [
195
+ "level_names",
196
+ "has_unknown_level"
197
+ ]
198
+ },
199
+ "else": {
200
+ "if": {
201
+ "properties": {
202
+ "score_type": {
203
+ "const": "continuous"
204
+ }
205
+ }
206
+ },
207
+ "then": {
208
+ "required": [
209
+ "min_score",
210
+ "max_score"
211
+ ]
212
+ }
213
+ }
214
+ },
215
+ "score_details": {
216
+ "type": "object",
217
+ "description": "The score for the evaluation and related details",
218
+ "required": [
219
+ "score"
220
+ ],
221
+ "properties": {
222
+ "score": {
223
+ "type": "number",
224
+ "description": "The score for the evaluation"
225
+ },
226
+ "details": {
227
+ "type": "object",
228
+ "description": "Any additional details about the score",
229
+ "additionalProperties": true
230
+ }
231
+ }
232
+ },
233
+ "detailed_evaluation_results_url": {
234
+ "type": "string",
235
+ "description": "Link to detailed evaluation data"
236
+ },
237
+ "generation_config": {
238
+ "type": "object",
239
+ "generation_args": {
240
+ "type": "object",
241
+ "description": "Parameters used to generate results - properties may vary by model type",
242
+ "properties": {
243
+ "temperature": {
244
+ "type": [
245
+ "null",
246
+ "number"
247
+ ],
248
+ "description": "Sampling temperature"
249
+ },
250
+ "top_p": {
251
+ "type": [
252
+ "null",
253
+ "number"
254
+ ],
255
+ "description": "Nucleus sampling parameter"
256
+ },
257
+ "top_k": {
258
+ "type": [
259
+ "null",
260
+ "number"
261
+ ],
262
+ "description": "Top-k sampling parameter"
263
+ },
264
+ "max_tokens": {
265
+ "type": "integer",
266
+ "minimum": 1,
267
+ "description": "Maximum number of tokens to generate"
268
+ }
269
+ },
270
+ "additionalProperties": true
271
+ },
272
+ "additional_details": {
273
+ "type": "string",
274
+ "description": "Additional details about how the results for this metric were generated."
275
+ }
276
+ }
277
+ }
278
+ }
279
+
280
+ }
281
+ }
282
+ }
hf_operations.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Operations: Upload data, create PRs, validate schemas.
3
+ """
4
+ from huggingface_hub import HfApi, login
5
+ import pandas as pd
6
+ import json
7
+ from pathlib import Path
8
+ from jsonschema import validate, ValidationError, Draft7Validator
9
+
10
+
11
# Load the evaluation schema once at import time; reused by every validator
# in this module. Explicit UTF-8 avoids depending on the locale's default
# encoding (the previous open() used the platform default).
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, 'r', encoding='utf-8') as f:
    EVAL_SCHEMA = json.load(f)
15
+
16
+
17
def validate_json_against_schema(json_data):
    """
    Check a single evaluation record against the module-level eval schema.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, human-readable message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
    except ValidationError as e:
        # Build a readable path down to the offending element (or "root"
        # when the failure is at the top level of the document).
        where = " β†’ ".join(str(part) for part in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{where}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"
    return True, "Schema validation passed"
36
+
37
+
38
def upload_to_hf_dataset(parquet_file, split_name, repo_id):
    """
    Upload a parquet file as a new split to the HF dataset.

    NOTE: currently a no-op placeholder — see the TODO below.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: Implement upload logic
    pass
49
+
50
+
51
def check_hf_authentication():
    """
    Determine whether the current environment is logged in to HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        whoami_info = HfApi().whoami()
    except Exception:
        # Any failure (no token, network error, ...) is treated as
        # "not authenticated" with a hint on how to log in.
        return False, "Not authenticated. Run: huggingface-cli login"
    return True, whoami_info['name']
64
+
65
+
66
def check_duplicate_pr_exists(leaderboard_name, repo_id):
    """
    Look for an already-open PR that adds this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        open_discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")

        # A duplicate is an OPEN pull request whose title mentions this
        # leaderboard (case-insensitive match).
        wanted_title = f"add new leaderboard: {leaderboard_name.lower()}"
        for disc in open_discussions:
            if not (disc.is_pull_request and disc.status == "open"):
                continue
            if wanted_title in disc.title.lower():
                return True, f"https://huggingface.co/datasets/{repo_id}/discussions/{disc.num}"

        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None
94
+
95
+
96
def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # Step 1: must be logged in before anything else.
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # Step 2: avoid opening a second PR for the same leaderboard.
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # Step 3: sanity-check the payload before touching the Hub.
    if not Path(parquet_file).exists():
        return False, "❌ Parquet file not found"

    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # Step 4: upload the file and open the PR in one call.
    try:
        commit_info = HfApi().upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Add new leaderboard: {leaderboard_name}",
            create_pr=True,
        )
    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"

    # Older hub versions may not expose pr_url on the commit info; fall back
    # to the repo's discussions page.
    pr_url = getattr(
        commit_info,
        'pr_url',
        f"https://huggingface.co/datasets/{repo_id}/discussions",
    )
    return True, f"PR created ({len(df)} rows): {pr_url}"
151
+
152
+
153
def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected flat leaderboard schema:
    all required columns present and every column stored as a string.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    required_cols = [
        '_leaderboard', '_developer', '_model', '_uuid',
        'schema_version', 'evaluation_id', 'retrieved_timestamp',
        'source_data', 'evaluation_source_name', 'evaluation_source_type',
        'source_organization_name', 'evaluator_relationship',
        'model_name', 'model_id', 'model_developer',
        'evaluation_results'
    ]

    try:
        df = pd.read_parquet(parquet_file)

        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Every column must be a pandas object/string dtype — the dataset
        # stores all values (including nested JSON) as serialized strings.
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"

    except Exception as e:
        return False, f"Validation error: {str(e)}"
189
+
190
+
191
def export_to_json(parquet_file, output_dir):
    """
    Re-materialize JSON files from a parquet export.

    Delegates to ``parquet_to_folder`` from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    # Imported lazily so this module can be loaded without json_to_parquet
    # being importable (only needed for the export path).
    from json_to_parquet import parquet_to_folder

    parquet_to_folder(parquet_file, output_dir)
202
+
pyproject.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "eee-test"
3
+ version = "0.1.0"
4
+ description = "Gradio app for browsing and comparing LLM evaluation leaderboards"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "gradio>=5.49.1",
9
+ "pandas>=2.3.2",
10
+ ]
ui_components.py ADDED
@@ -0,0 +1,1374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI Components: Themes, CSS, and HTML formatters for the Gradio interface.
3
+ Nord color theme with balanced contrast.
4
+ """
5
+ import gradio as gr
6
+
7
+
8
def get_theme():
    """Returns the Nord-themed Gradio theme, locked to dark mode."""
    # Base typography and hues; colors are then overridden wholesale with
    # the Nord palette (Polar Night / Snow Storm / Frost).
    base = gr.themes.Base(
        primary_hue="blue",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("DM Sans"), "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
    )
    nord_overrides = {
        "body_background_fill": "#2E3440",
        "body_background_fill_dark": "#2E3440",
        "body_text_color": "#ECEFF4",
        "body_text_color_dark": "#ECEFF4",
        "body_text_color_subdued": "#4C566A",
        "body_text_color_subdued_dark": "#4C566A",
        "block_background_fill": "#3B4252",
        "block_background_fill_dark": "#3B4252",
        "block_border_width": "1px",
        "block_border_color": "#434C5E",
        "block_border_color_dark": "#434C5E",
        "block_label_text_color": "#D8DEE9",
        "block_label_text_color_dark": "#D8DEE9",
        "block_title_text_color": "#ECEFF4",
        "block_title_text_color_dark": "#ECEFF4",
        "input_background_fill": "#2E3440",
        "input_background_fill_dark": "#2E3440",
        "input_border_color": "#4C566A",
        "input_border_color_dark": "#4C566A",
        "button_primary_background_fill": "#88C0D0",
        "button_primary_background_fill_dark": "#88C0D0",
        "button_primary_text_color": "#2E3440",
        "button_primary_text_color_dark": "#2E3440",
        "button_secondary_background_fill": "#434C5E",
        "button_secondary_background_fill_dark": "#434C5E",
        "button_secondary_text_color": "#ECEFF4",
        "button_secondary_text_color_dark": "#ECEFF4",
    }
    return base.set(**nord_overrides)
44
+
45
+
46
+ def get_custom_css():
47
+ """Returns custom CSS with Nord colors."""
48
+ return """
49
+ /* === Nord Theme ===
50
+ Polar Night: #2E3440 (bg), #3B4252 (surface), #434C5E, #4C566A
51
+ Snow Storm: #D8DEE9, #E5E9F0, #ECEFF4
52
+ Frost: #8FBCBB, #88C0D0, #81A1C1, #5E81AC
53
+ Aurora: #BF616A, #D08770, #EBCB8B, #A3BE8C, #B48EAD
54
+ */
55
+
56
+ /* Lock the UI to dark Nord regardless of OS preference */
57
+ :root {
58
+ color-scheme: dark;
59
+ background-color: #2E3440;
60
+ }
61
+
62
+ body {
63
+ background: #2E3440 !important;
64
+ color: #ECEFF4 !important;
65
+ }
66
+
67
+ /* === Base === */
68
+ .gradio-container {
69
+ max-width: 100% !important;
70
+ margin: 0 !important;
71
+ padding: 1.25rem 2.5rem 2rem !important;
72
+ background: #2E3440 !important;
73
+ color: #ECEFF4 !important;
74
+ font-family: 'DM Sans', system-ui, sans-serif !important;
75
+ font-size: 16px !important;
76
+ }
77
+
78
+ /* === Header === */
79
+ .app-header {
80
+ display: flex;
81
+ align-items: center;
82
+ gap: 1rem;
83
+ margin-bottom: 1.5rem;
84
+ padding: 1.25rem 1.5rem;
85
+ background: #3B4252;
86
+ border: 1px solid #434C5E;
87
+ border-radius: 12px;
88
+ }
89
+
90
+ .app-header .logo-mark {
91
+ width: 48px;
92
+ height: 48px;
93
+ background: linear-gradient(135deg, #88C0D0 0%, #81A1C1 100%);
94
+ border-radius: 12px;
95
+ display: flex;
96
+ align-items: center;
97
+ justify-content: center;
98
+ font-weight: 800;
99
+ font-size: 1.1rem;
100
+ color: #2E3440;
101
+ }
102
+
103
+ .app-header .brand {
104
+ display: flex;
105
+ flex-direction: column;
106
+ gap: 0.125rem;
107
+ }
108
+
109
+ .app-header h1 {
110
+ margin: 0;
111
+ font-size: 1.5rem;
112
+ font-weight: 700;
113
+ color: #ECEFF4;
114
+ letter-spacing: -0.02em;
115
+ }
116
+
117
+ .app-header .tagline {
118
+ color: #D8DEE9;
119
+ font-size: 0.85rem;
120
+ }
121
+
122
+ .app-header .header-right {
123
+ margin-left: auto;
124
+ display: flex;
125
+ align-items: center;
126
+ gap: 0.75rem;
127
+ }
128
+
129
+ .app-header .version-badge {
130
+ background: rgba(136, 192, 208, 0.2);
131
+ border: 1px solid rgba(136, 192, 208, 0.4);
132
+ border-radius: 6px;
133
+ padding: 0.25rem 0.625rem;
134
+ font-size: 0.7rem;
135
+ font-family: 'JetBrains Mono', monospace;
136
+ color: #88C0D0;
137
+ }
138
+
139
+ /* === Tabs === */
140
+ .tabs {
141
+ border: none !important;
142
+ background: transparent !important;
143
+ }
144
+
145
+ .tab-nav {
146
+ background: #3B4252 !important;
147
+ border: 1px solid #434C5E !important;
148
+ border-radius: 10px !important;
149
+ padding: 0.25rem !important;
150
+ gap: 0.25rem !important;
151
+ margin-bottom: 1.25rem !important;
152
+ display: inline-flex !important;
153
+ }
154
+
155
+ .tab-nav button {
156
+ background: transparent !important;
157
+ border: none !important;
158
+ color: #D8DEE9 !important;
159
+ padding: 0.75rem 1.5rem !important;
160
+ font-size: 0.95rem !important;
161
+ font-weight: 500 !important;
162
+ border-radius: 8px !important;
163
+ transition: all 0.15s ease !important;
164
+ }
165
+
166
+ .tab-nav button.selected {
167
+ color: #2E3440 !important;
168
+ background: #88C0D0 !important;
169
+ }
170
+
171
+ .tab-nav button:hover:not(.selected) {
172
+ background: #434C5E !important;
173
+ color: #ECEFF4 !important;
174
+ }
175
+
176
+ .tabitem {
177
+ background: transparent !important;
178
+ border: none !important;
179
+ padding: 0 !important;
180
+ }
181
+
182
+ /* === Controls bar === */
183
+ .controls-bar {
184
+ background: #3B4252 !important;
185
+ border: 1px solid #434C5E !important;
186
+ border-radius: 10px !important;
187
+ padding: 0.75rem 1.25rem !important;
188
+ margin-bottom: 1rem !important;
189
+ gap: 0.75rem !important;
190
+ }
191
+
192
+ .controls-bar label {
193
+ font-size: 0.75rem !important;
194
+ text-transform: uppercase !important;
195
+ letter-spacing: 0.04em !important;
196
+ color: #D8DEE9 !important;
197
+ font-weight: 500 !important;
198
+ }
199
+
200
+ /* === Info banner === */
201
+ .info-banner {
202
+ background: #3B4252 !important;
203
+ border: 1px solid #434C5E !important;
204
+ border-left: 3px solid #88C0D0 !important;
205
+ border-radius: 0 10px 10px 0 !important;
206
+ padding: 0.75rem 1rem !important;
207
+ margin-bottom: 1rem !important;
208
+ }
209
+
210
+ .info-banner h3 {
211
+ margin: 0;
212
+ font-size: 1.1rem;
213
+ font-weight: 600;
214
+ color: #ECEFF4;
215
+ }
216
+
217
+ .info-banner .eval-tags {
218
+ display: flex;
219
+ flex-wrap: wrap;
220
+ gap: 0.375rem;
221
+ }
222
+
223
+ .info-banner .eval-tag {
224
+ background: rgba(143, 188, 187, 0.15);
225
+ border: 1px solid rgba(143, 188, 187, 0.3);
226
+ border-radius: 4px;
227
+ padding: 0.3rem 0.6rem;
228
+ font-size: 0.8rem;
229
+ font-family: 'JetBrains Mono', monospace;
230
+ color: #8FBCBB;
231
+ }
232
+
233
+ /* === Dataframe - seamless styling === */
234
+ .dataframe,
235
+ .dataframe > div,
236
+ .dataframe > div > div,
237
+ .dataframe .table-wrap,
238
+ .dataframe .svelte-1gfkn6j {
239
+ background: #2E3440 !important;
240
+ border: none !important;
241
+ box-shadow: none !important;
242
+ border-radius: 0 !important;
243
+ }
244
+
245
+ .dataframe table {
246
+ width: 100% !important;
247
+ border-collapse: collapse !important;
248
+ font-size: 0.95rem !important;
249
+ table-layout: auto !important;
250
+ background: #2E3440 !important;
251
+ }
252
+
253
+ .dataframe thead,
254
+ .dataframe thead tr {
255
+ background: #2E3440 !important;
256
+ position: sticky;
257
+ top: 0;
258
+ z-index: 10;
259
+ }
260
+
261
+ .dataframe thead th {
262
+ padding: 0.875rem 1rem !important;
263
+ font-weight: 600 !important;
264
+ font-size: 0.75rem !important;
265
+ text-transform: uppercase !important;
266
+ letter-spacing: 0.05em !important;
267
+ color: #81A1C1 !important;
268
+ border-bottom: 1px solid #434C5E !important;
269
+ border-top: none !important;
270
+ text-align: left !important;
271
+ background: #2E3440 !important;
272
+ }
273
+
274
+ .dataframe tbody,
275
+ .dataframe tbody tr {
276
+ background: #2E3440 !important;
277
+ }
278
+
279
+ .dataframe tbody tr {
280
+ border-bottom: 1px solid #3B4252 !important;
281
+ }
282
+
283
+ .dataframe tbody tr:hover {
284
+ background: rgba(136, 192, 208, 0.04) !important;
285
+ }
286
+
287
+ .dataframe tbody td {
288
+ padding: 0.75rem 1rem !important;
289
+ color: #E5E9F0 !important;
290
+ background: #2E3440 !important;
291
+ overflow: hidden !important;
292
+ text-overflow: ellipsis !important;
293
+ border: none !important;
294
+ }
295
+
296
+ /* === Pagination bar === */
297
+ .pagination-bar {
298
+ margin-top: 1rem !important;
299
+ padding: 1rem 0 !important;
300
+ border-top: 1px solid #3B4252 !important;
301
+ display: flex !important;
302
+ justify-content: center !important;
303
+ align-items: center !important;
304
+ gap: 1rem !important;
305
+ }
306
+
307
+ .page-info {
308
+ font-family: 'JetBrains Mono', monospace !important;
309
+ font-size: 1rem !important;
310
+ color: #D8DEE9 !important;
311
+ min-width: 80px !important;
312
+ text-align: center !important;
313
+ }
314
+
315
+ /* Model name - white, readable */
316
+ .dataframe tbody td:first-child {
317
+ font-weight: 500 !important;
318
+ color: #ECEFF4 !important;
319
+ white-space: nowrap !important;
320
+ }
321
+
322
+ /* All other columns - use monospace for numbers */
323
+ .dataframe tbody td:not(:first-child) {
324
+ font-family: 'JetBrains Mono', monospace !important;
325
+ color: #8FBCBB !important;
326
+ text-align: left !important;
327
+ }
328
+
329
+ .dataframe tbody td:nth-child(2) {
330
+ color: #88C0D0 !important;
331
+ white-space: nowrap !important;
332
+ }
333
+
334
+ .dataframe tbody td:nth-child(3) {
335
+ color: #D08770 !important;
336
+ }
337
+
338
+ .dataframe tbody td:nth-child(4) {
339
+ font-weight: 600 !important;
340
+ color: #A3BE8C !important;
341
+ }
342
+
343
+ .dataframe tbody td:nth-child(n+5) {
344
+ white-space: nowrap !important;
345
+ }
346
+
347
+ /* === Status text === */
348
+ .status-text {
349
+ font-size: 0.9rem !important;
350
+ color: #D8DEE9 !important;
351
+ padding: 0.5rem 0 !important;
352
+ font-family: 'JetBrains Mono', monospace !important;
353
+ }
354
+
355
+ /* === Model Card === */
356
+ .model-card-container {
357
+ display: flex;
358
+ flex-direction: column;
359
+ gap: 1.25rem;
360
+ }
361
+
362
+ .model-card-header {
363
+ background: #3B4252;
364
+ border: 1px solid #434C5E;
365
+ border-radius: 12px;
366
+ padding: 1.5rem 2rem;
367
+ }
368
+
369
+ .model-card-header h2 {
370
+ margin: 0 0 0.5rem 0;
371
+ font-size: 1.5rem;
372
+ font-weight: 600;
373
+ color: #ECEFF4;
374
+ }
375
+
376
+ .model-card-header .model-meta {
377
+ display: flex;
378
+ gap: 1.5rem;
379
+ color: #D8DEE9;
380
+ font-size: 0.95rem;
381
+ }
382
+
383
+ .model-card-header .model-meta strong {
384
+ color: #8FBCBB;
385
+ }
386
+
387
+ .leaderboard-section {
388
+ background: #3B4252;
389
+ border: 1px solid #434C5E;
390
+ border-radius: 10px;
391
+ overflow: hidden;
392
+ }
393
+
394
+ .leaderboard-section-header {
395
+ background: #434C5E;
396
+ padding: 1rem 1.25rem;
397
+ border-bottom: 1px solid #4C566A;
398
+ display: flex;
399
+ justify-content: space-between;
400
+ align-items: center;
401
+ }
402
+
403
+ .leaderboard-section-header h3 {
404
+ margin: 0;
405
+ font-size: 1rem;
406
+ font-weight: 600;
407
+ color: #88C0D0;
408
+ }
409
+
410
+ .leaderboard-section-header .lb-avg {
411
+ background: rgba(163, 190, 140, 0.15);
412
+ border: 1px solid rgba(163, 190, 140, 0.3);
413
+ border-radius: 8px;
414
+ padding: 0.5rem 1rem;
415
+ font-size: 0.85rem;
416
+ color: #D8DEE9;
417
+ }
418
+
419
+ .leaderboard-section-header .lb-avg strong {
420
+ color: #A3BE8C;
421
+ font-family: 'JetBrains Mono', monospace;
422
+ font-size: 1.1rem;
423
+ font-weight: 700;
424
+ }
425
+
426
+ .scores-grid {
427
+ display: grid;
428
+ grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
429
+ gap: 1px;
430
+ background: #434C5E;
431
+ }
432
+
433
+ .score-item {
434
+ background: #3B4252;
435
+ padding: 1rem 1.25rem;
436
+ }
437
+
438
+ .score-item .score-label {
439
+ font-size: 0.8rem;
440
+ text-transform: uppercase;
441
+ letter-spacing: 0.05em;
442
+ color: #D8DEE9;
443
+ margin-bottom: 0.375rem;
444
+ }
445
+
446
+ .score-item .score-value {
447
+ font-size: 1.5rem;
448
+ font-weight: 600;
449
+ font-family: 'JetBrains Mono', monospace;
450
+ color: #A3BE8C;
451
+ }
452
+
453
+ .score-item.highlight .score-value {
454
+ color: #88C0D0;
455
+ }
456
+
457
+ .no-results {
458
+ text-align: center;
459
+ padding: 3rem 1rem;
460
+ color: #D8DEE9;
461
+ }
462
+
463
+ .no-results h3 {
464
+ color: #ECEFF4;
465
+ margin-bottom: 0.5rem;
466
+ }
467
+
468
+
469
+ /* === New Comparison View === */
470
+ .comparison-container {
471
+ display: flex;
472
+ flex-direction: column;
473
+ gap: 1.5rem;
474
+ }
475
+
476
+ .comparison-summary {
477
+ background: #3B4252;
478
+ border: 1px solid #434C5E;
479
+ border-radius: 12px;
480
+ padding: 1.5rem;
481
+ }
482
+
483
+ .comparison-summary h2 {
484
+ margin: 0 0 1rem 0;
485
+ color: #ECEFF4;
486
+ font-size: 1.25rem;
487
+ }
488
+
489
+ .summary-cards {
490
+ display: flex;
491
+ gap: 1rem;
492
+ flex-wrap: wrap;
493
+ }
494
+
495
+ .summary-card {
496
+ flex: 1;
497
+ min-width: 200px;
498
+ background: #2E3440;
499
+ border-radius: 8px;
500
+ padding: 1rem;
501
+ }
502
+
503
+ .summary-card-header {
504
+ display: flex;
505
+ align-items: center;
506
+ gap: 0.5rem;
507
+ margin-bottom: 0.75rem;
508
+ }
509
+
510
+ .model-dot {
511
+ width: 10px;
512
+ height: 10px;
513
+ border-radius: 50%;
514
+ }
515
+
516
+ .model-name {
517
+ font-weight: 600;
518
+ color: #ECEFF4;
519
+ font-size: 0.9rem;
520
+ overflow: hidden;
521
+ text-overflow: ellipsis;
522
+ white-space: nowrap;
523
+ }
524
+
525
+ .summary-card-body {
526
+ display: flex;
527
+ flex-direction: column;
528
+ gap: 0.5rem;
529
+ }
530
+
531
+ .summary-stat {
532
+ display: flex;
533
+ justify-content: space-between;
534
+ align-items: center;
535
+ }
536
+
537
+ .summary-stat .stat-label {
538
+ font-size: 0.75rem;
539
+ color: #D8DEE9;
540
+ text-transform: uppercase;
541
+ letter-spacing: 0.05em;
542
+ }
543
+
544
+ .summary-stat .stat-value {
545
+ font-family: 'JetBrains Mono', monospace;
546
+ color: #8FBCBB;
547
+ }
548
+
549
+ .summary-stat.primary .stat-value.large {
550
+ font-size: 1.5rem;
551
+ font-weight: 700;
552
+ color: #A3BE8C;
553
+ }
554
+
555
+ .leaderboard-comparison-card {
556
+ background: #3B4252;
557
+ border: 1px solid #434C5E;
558
+ border-radius: 12px;
559
+ overflow: hidden;
560
+ }
561
+
562
+ .lb-card-header {
563
+ background: #434C5E;
564
+ padding: 0.875rem 1.25rem;
565
+ }
566
+
567
+ .lb-card-header h3 {
568
+ margin: 0;
569
+ color: #88C0D0;
570
+ font-size: 1rem;
571
+ font-weight: 600;
572
+ }
573
+
574
+ .lb-card-body {
575
+ padding: 1rem 1.25rem;
576
+ display: flex;
577
+ flex-direction: column;
578
+ gap: 0.75rem;
579
+ }
580
+
581
+ .metric-comparison {
582
+ display: flex;
583
+ flex-direction: column;
584
+ gap: 0.375rem;
585
+ }
586
+
587
+ .metric-name-row {
588
+ margin-bottom: 0.25rem;
589
+ }
590
+
591
+ .metric-title {
592
+ font-size: 0.85rem;
593
+ font-weight: 600;
594
+ color: #ECEFF4;
595
+ }
596
+
597
+ .metric-title.sub {
598
+ font-size: 0.75rem;
599
+ font-weight: 500;
600
+ color: #D8DEE9;
601
+ }
602
+
603
+ .model-score-row {
604
+ display: flex;
605
+ align-items: center;
606
+ gap: 0.5rem;
607
+ padding: 0.375rem 0;
608
+ }
609
+
610
+ .model-score-row.compact {
611
+ padding: 0.25rem 0;
612
+ }
613
+
614
+ .model-score-row.best-score {
615
+ background: rgba(163, 190, 140, 0.1);
616
+ border-radius: 4px;
617
+ padding-left: 0.5rem;
618
+ margin-left: -0.5rem;
619
+ }
620
+
621
+ .model-score-row.no-data {
622
+ opacity: 0.5;
623
+ }
624
+
625
+ .model-indicator {
626
+ width: 8px;
627
+ height: 8px;
628
+ border-radius: 2px;
629
+ flex-shrink: 0;
630
+ }
631
+
632
+ .model-indicator.small {
633
+ width: 6px;
634
+ height: 6px;
635
+ }
636
+
637
+ .score-bar-container {
638
+ flex: 1;
639
+ display: flex;
640
+ align-items: center;
641
+ gap: 0.75rem;
642
+ height: 24px;
643
+ background: #2E3440;
644
+ border-radius: 4px;
645
+ padding: 0 0.5rem;
646
+ position: relative;
647
+ }
648
+
649
+ .score-bar {
650
+ position: absolute;
651
+ left: 0;
652
+ top: 0;
653
+ bottom: 0;
654
+ border-radius: 4px;
655
+ opacity: 0.3;
656
+ }
657
+
658
+ .score-bar.thin {
659
+ opacity: 0.2;
660
+ }
661
+
662
+ .score-value {
663
+ position: relative;
664
+ font-family: 'JetBrains Mono', monospace;
665
+ font-size: 0.9rem;
666
+ font-weight: 600;
667
+ color: #ECEFF4;
668
+ z-index: 1;
669
+ }
670
+
671
+ .score-value.small {
672
+ font-size: 0.8rem;
673
+ font-weight: 500;
674
+ }
675
+
676
+ .score-value.dim {
677
+ color: #4C566A;
678
+ }
679
+
680
+ /* === Selected Models Chips === */
681
+ .selected-models-group label {
682
+ display: inline-flex !important;
683
+ align-items: center !important;
684
+ background: #434C5E;
685
+ border: 1px solid #4C566A;
686
+ border-radius: 16px;
687
+ padding: 0.35rem 0.85rem;
688
+ font-size: 0.85rem;
689
+ color: #ECEFF4;
690
+ gap: 0.4rem;
691
+ cursor: pointer;
692
+ margin: 0.15rem 0.3rem 0.15rem 0 !important;
693
+ }
694
+
695
+ .selected-models-group label span::before {
696
+ content: "Γ—";
697
+ font-size: 0.75rem;
698
+ color: #EBCB8B;
699
+ opacity: 0;
700
+ transition: opacity 0.15s ease;
701
+ }
702
+
703
+ .selected-models-group label:hover span::before {
704
+ opacity: 1;
705
+ }
706
+
707
+ .selected-models-group input[type="checkbox"] {
708
+ display: none;
709
+ }
710
+
711
+ /* === Heat Map Table === */
712
+ .heatmap-table-wrapper {
713
+ overflow-x: auto;
714
+ margin-top: 1rem;
715
+ }
716
+
717
+ .heatmap-table {
718
+ width: 100%;
719
+ border-collapse: collapse;
720
+ font-size: 0.85rem;
721
+ }
722
+
723
+ .heatmap-table thead {
724
+ position: sticky;
725
+ top: 0;
726
+ z-index: 10;
727
+ }
728
+
729
+ .heatmap-table th {
730
+ background: #434C5E;
731
+ padding: 0.625rem 0.75rem;
732
+ font-weight: 600;
733
+ font-size: 0.7rem;
734
+ text-transform: uppercase;
735
+ letter-spacing: 0.05em;
736
+ color: #81A1C1;
737
+ text-align: left;
738
+ border-bottom: 2px solid #4C566A;
739
+ white-space: nowrap;
740
+ }
741
+
742
+ .heatmap-table th.metric-header {
743
+ min-width: 120px;
744
+ }
745
+
746
+ .heatmap-table th.model-header {
747
+ text-align: center;
748
+ max-width: 150px;
749
+ overflow: hidden;
750
+ text-overflow: ellipsis;
751
+ }
752
+
753
+ .heatmap-table td {
754
+ padding: 0.5rem 0.75rem;
755
+ border-bottom: 1px solid #3B4252;
756
+ }
757
+
758
+ .heatmap-table td.metric-name {
759
+ font-weight: 500;
760
+ color: #D8DEE9;
761
+ background: #2E3440;
762
+ }
763
+
764
+ .heatmap-table td.score-cell {
765
+ text-align: center;
766
+ font-family: 'JetBrains Mono', monospace;
767
+ font-weight: 500;
768
+ transition: all 0.15s ease;
769
+ }
770
+
771
+ .heatmap-table td.score-cell.best {
772
+ background: rgba(163, 190, 140, 0.25);
773
+ color: #A3BE8C;
774
+ font-weight: 700;
775
+ }
776
+
777
+ .heatmap-table td.score-cell.good {
778
+ background: rgba(163, 190, 140, 0.12);
779
+ color: #A3BE8C;
780
+ }
781
+
782
+ .heatmap-table td.score-cell.mid {
783
+ background: rgba(235, 203, 139, 0.12);
784
+ color: #EBCB8B;
785
+ }
786
+
787
+ .heatmap-table td.score-cell.low {
788
+ background: rgba(208, 135, 112, 0.12);
789
+ color: #D08770;
790
+ }
791
+
792
+ .heatmap-table td.score-cell.worst {
793
+ background: rgba(191, 97, 106, 0.15);
794
+ color: #BF616A;
795
+ }
796
+
797
+ .heatmap-table td.score-cell.na {
798
+ color: #4C566A;
799
+ font-style: italic;
800
+ }
801
+
802
+ .heatmap-table tr.avg-row {
803
+ background: rgba(136, 192, 208, 0.08);
804
+ }
805
+
806
+ .heatmap-table tr.avg-row td.metric-name {
807
+ font-weight: 700;
808
+ color: #88C0D0;
809
+ background: rgba(136, 192, 208, 0.08);
810
+ }
811
+
812
+ /* === Buttons === */
813
+ button {
814
+ border-radius: 8px !important;
815
+ font-weight: 500 !important;
816
+ font-size: 0.95rem !important;
817
+ transition: all 0.15s ease !important;
818
+ }
819
+
820
+ button.primary {
821
+ background: #88C0D0 !important;
822
+ color: #2E3440 !important;
823
+ border: none !important;
824
+ }
825
+
826
+ button.primary:hover:not(:disabled) {
827
+ background: #8FBCBB !important;
828
+ }
829
+
830
+ button.secondary,
831
+ button[variant="secondary"] {
832
+ background: #434C5E !important;
833
+ color: #ECEFF4 !important;
834
+ border: 1px solid #4C566A !important;
835
+ }
836
+
837
+ button.secondary:hover:not(:disabled),
838
+ button[variant="secondary"]:hover:not(:disabled) {
839
+ background: #4C566A !important;
840
+ }
841
+
842
+ button:disabled {
843
+ opacity: 0.35 !important;
844
+ }
845
+
846
+ /* === Inputs === */
847
+ input[type="text"],
848
+ select {
849
+ background: #2E3440 !important;
850
+ border: 1px solid #4C566A !important;
851
+ border-radius: 8px !important;
852
+ color: #ECEFF4 !important;
853
+ font-size: 1rem !important;
854
+ }
855
+
856
+ input[type="text"]:focus,
857
+ select:focus {
858
+ border-color: #88C0D0 !important;
859
+ box-shadow: 0 0 0 3px rgba(136, 192, 208, 0.15) !important;
860
+ outline: none !important;
861
+ }
862
+
863
+ input::placeholder {
864
+ color: #4C566A !important;
865
+ }
866
+
867
+ /* === Accordion === */
868
+ .accordion {
869
+ background: #3B4252 !important;
870
+ border: 1px solid #434C5E !important;
871
+ border-radius: 10px !important;
872
+ margin-top: 1.5rem !important;
873
+ }
874
+
875
+ .accordion > .label-wrap {
876
+ background: transparent !important;
877
+ padding: 1rem 1.25rem !important;
878
+ color: #D8DEE9 !important;
879
+ font-size: 0.95rem !important;
880
+ }
881
+
882
+ .accordion > .wrap {
883
+ padding: 0.5rem 1.25rem 1.25rem !important;
884
+ color: #D8DEE9 !important;
885
+ font-size: 0.95rem !important;
886
+ line-height: 1.6 !important;
887
+ }
888
+
889
+ .accordion code {
890
+ background: #434C5E !important;
891
+ padding: 0.125rem 0.375rem !important;
892
+ border-radius: 4px !important;
893
+ font-family: 'JetBrains Mono', monospace !important;
894
+ font-size: 0.8rem !important;
895
+ color: #8FBCBB !important;
896
+ }
897
+
898
+ /* === Metrics section === */
899
+ .metrics-section {
900
+ margin-top: 1.5rem;
901
+ padding-top: 1.5rem;
902
+ border-top: 1px solid #434C5E;
903
+ }
904
+
905
+ .metrics-section h3 {
906
+ font-size: 0.85rem;
907
+ font-weight: 600;
908
+ color: #D8DEE9;
909
+ margin: 0 0 1rem 0;
910
+ text-transform: uppercase;
911
+ letter-spacing: 0.05em;
912
+ }
913
+
914
+ .metrics-grid {
915
+ display: grid;
916
+ grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
917
+ gap: 0.75rem;
918
+ }
919
+
920
+ .metric-card {
921
+ background: #3B4252;
922
+ border: 1px solid #434C5E;
923
+ border-radius: 8px;
924
+ overflow: hidden;
925
+ }
926
+
927
+ .metric-card-header {
928
+ display: flex;
929
+ justify-content: space-between;
930
+ align-items: center;
931
+ padding: 0.75rem 1rem;
932
+ cursor: pointer;
933
+ list-style: none;
934
+ }
935
+
936
+ .metric-card-header::-webkit-details-marker {
937
+ display: none;
938
+ }
939
+
940
+ .metric-card-name {
941
+ font-weight: 500;
942
+ font-size: 0.95rem;
943
+ color: #ECEFF4;
944
+ }
945
+
946
+ .metric-card-direction {
947
+ font-size: 0.8rem;
948
+ color: #D8DEE9;
949
+ }
950
+
951
+ .metric-card-direction .arrow {
952
+ color: #A3BE8C;
953
+ font-weight: 600;
954
+ }
955
+
956
+ .metric-card-body {
957
+ padding: 0.875rem 1.25rem;
958
+ border-top: 1px solid #434C5E;
959
+ font-size: 0.9rem;
960
+ color: #D8DEE9;
961
+ line-height: 1.5;
962
+ }
963
+
964
+ .metric-type-badge {
965
+ font-size: 0.65rem;
966
+ text-transform: uppercase;
967
+ letter-spacing: 0.05em;
968
+ padding: 0.15rem 0.4rem;
969
+ background: rgba(180, 142, 173, 0.2);
970
+ border: 1px solid rgba(180, 142, 173, 0.35);
971
+ border-radius: 4px;
972
+ color: #B48EAD;
973
+ font-family: 'JetBrains Mono', monospace;
974
+ }
975
+
976
+ /* === Scrollbar === */
977
+ ::-webkit-scrollbar {
978
+ width: 8px;
979
+ height: 8px;
980
+ }
981
+
982
+ ::-webkit-scrollbar-track {
983
+ background: #2E3440;
984
+ }
985
+
986
+ ::-webkit-scrollbar-thumb {
987
+ background: #4C566A;
988
+ border-radius: 4px;
989
+ }
990
+
991
+ ::-webkit-scrollbar-thumb:hover {
992
+ background: #5E81AC;
993
+ }
994
+
995
+ /* === Responsive === */
996
+ @media (max-width: 768px) {
997
+ .gradio-container {
998
+ padding: 1rem !important;
999
+ }
1000
+
1001
+ .scores-grid {
1002
+ grid-template-columns: repeat(2, 1fr);
1003
+ }
1004
+ }
1005
+
1006
+ /* === Overrides === */
1007
+ .gradio-container footer {
1008
+ display: none !important;
1009
+ }
1010
+
1011
+ .block {
1012
+ background: #3B4252 !important;
1013
+ }
1014
+
1015
+ .gradio-radio label {
1016
+ background: #434C5E !important;
1017
+ border: 1px solid #4C566A !important;
1018
+ color: #ECEFF4 !important;
1019
+ border-radius: 8px !important;
1020
+ font-size: 0.85rem !important;
1021
+ }
1022
+
1023
+ .gradio-radio label.selected {
1024
+ background: #88C0D0 !important;
1025
+ border-color: #88C0D0 !important;
1026
+ color: #2E3440 !important;
1027
+ }
1028
+ """
1029
+
1030
+
1031
def format_leaderboard_header(selected_leaderboard, metadata):
    """Render the info banner for the currently selected leaderboard."""
    # Nothing selected yet: show a neutral prompt instead of a banner.
    if not selected_leaderboard:
        return """
        <div style="text-align: center; padding: 2rem 1rem; color: #D8DEE9;">
            <div style="font-size: 1.1rem;">Select a leaderboard to explore</div>
        </div>
        """

    evals = (metadata or {}).get("evals")
    if not evals:
        # Metadata missing or empty: fall back to a bare title banner.
        return f"""
        <div class="info-banner">
            <h3>{selected_leaderboard}</h3>
        </div>
        """

    src = metadata.get("source_info", {})
    organization = src.get("organization", "Unknown")
    source_url = src.get("url", "#")

    # One small tag per eval available on this leaderboard.
    tag_markup = "".join(f'<span class="eval-tag">{name}</span>' for name in evals)

    return f"""
    <div class="info-banner">
        <div style="display: flex; justify-content: space-between; align-items: center; gap: 1rem;">
            <div style="display: flex; align-items: center; gap: 1rem; flex-wrap: wrap;">
                <h3 style="margin: 0;">{selected_leaderboard}</h3>
                <span style="color: #D8DEE9; font-size: 0.8rem;">by {organization}</span>
                <div class="eval-tags" style="margin: 0;">{tag_markup}</div>
            </div>
            <a href="{source_url}" target="_blank" style="
                font-size: 0.75rem;
                color: #88C0D0;
                text-decoration: none;
                padding: 0.375rem 0.75rem;
                border: 1px solid rgba(136, 192, 208, 0.4);
                border-radius: 6px;
                white-space: nowrap;
            ">Source →</a>
        </div>
    </div>
    """
1074
+
1075
+
1076
def format_metric_details(selected_leaderboard, metadata):
    """Render an expandable reference card for each metric of the leaderboard."""
    if not selected_leaderboard:
        return ""
    if not metadata or not metadata.get("evals"):
        return ""

    # Assemble the markup as a list of fragments and join once at the end.
    parts = ["""
    <div class="metrics-section">
        <h3>Metric Reference</h3>
        <div class="metrics-grid">
    """]

    for name, meta in metadata.get("evals", {}).items():
        if meta.get('score_type'):
            badge = meta['score_type'].upper()
        else:
            badge = "—"

        if meta.get('lower_is_better'):
            direction, arrow = "Lower is better", "↓"
        else:
            direction, arrow = "Higher is better", "↑"

        # Extra detail line: numeric range for continuous metrics,
        # level names for leveled ones, otherwise empty.
        range_note = ""
        kind = meta.get('score_type')
        if kind == "continuous" and meta.get('min_score') is not None:
            range_note = f"Range: [{meta['min_score']} – {meta['max_score']}]"
        elif kind == "levels" and meta.get('level_names'):
            range_note = f"Levels: {', '.join(str(l) for l in meta['level_names'])}"

        parts.append(f"""
        <details class="metric-card">
            <summary class="metric-card-header">
                <span class="metric-card-name">{name}</span>
                <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
            </summary>
            <div class="metric-card-body">
                <div>{meta.get('description', 'No description')}</div>
                <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
                    <span style="font-size: 0.75rem; color: #D8DEE9;">{range_note}</span>
                    <span class="metric-type-badge">{badge}</span>
                </div>
            </div>
        </details>
        """)

    parts.append("</div></div>")
    return "".join(parts)
1118
+
1119
+
1120
def format_model_card(model_name, model_data):
    """Formats a model card showing all evals across leaderboards.

    Args:
        model_name: Display name of the model.
        model_data: Mapping of leaderboard name -> data dict carrying model
            metadata ("developer", "params", "architecture") plus a "results"
            mapping of metric name -> score (None when the eval is missing).

    Returns:
        An HTML fragment (str); a "no results" placeholder when model_data
        is empty.
    """
    if not model_data:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try searching for a different model name</p>
        </div>
        """

    # Model metadata is assumed identical across leaderboards, so read it
    # from the first entry. TODO confirm the loaders guarantee this.
    first = next(iter(model_data.values()))
    developer = first.get("developer", "Unknown")
    params = first.get("params")
    arch = first.get("architecture", "Unknown")

    params_str = f"{params}B" if params else "—"

    html = f"""
    <div class="model-card-container">
        <div class="model-card-header">
            <h2>{model_name}</h2>
            <div class="model-meta">
                <span><strong>Developer:</strong> {developer}</span>
                <span><strong>Parameters:</strong> {params_str}</span>
                <span><strong>Architecture:</strong> {arch}</span>
            </div>
        </div>
    """

    for leaderboard_name, data in model_data.items():
        results = data.get("results", {})
        if not results:
            continue

        scores = [v for v in results.values() if v is not None]
        avg = sum(scores) / len(scores) if scores else None
        # Fix: compare against None rather than truthiness so a legitimate
        # average of 0.0 renders as "0.00" instead of an em dash (matches
        # how format_model_comparison handles the same case).
        avg_str = f"{avg:.2f}" if avg is not None else "—"

        html += f"""
        <div class="leaderboard-section">
            <div class="leaderboard-section-header">
                <h3>{leaderboard_name}</h3>
                <span class="lb-avg">Avg: <strong>{avg_str}</strong></span>
            </div>
            <div class="scores-grid">
        """

        # Sort best-first; metrics with no score sink to the bottom (below
        # negative scores) instead of masquerading as a score of 0.
        sorted_results = sorted(
            results.items(),
            key=lambda item: item[1] if item[1] is not None else float("-inf"),
            reverse=True,
        )

        for i, (metric_name, score) in enumerate(sorted_results):
            score_display = f"{score:.2f}" if score is not None else "—"
            # The top-scoring metric gets a visual highlight.
            highlight_class = "highlight" if i == 0 else ""

            html += f"""
            <div class="score-item {highlight_class}">
                <div class="score-label">{metric_name}</div>
                <div class="score-value">{score_display}</div>
            </div>
            """

        html += "</div></div>"

    html += "</div>"
    return html
1184
+
1185
+
1186
def format_model_comparison(selected_models, all_results):
    """Formats a comparison view showing multiple models with visual indicators.

    Args:
        selected_models: Model names picked in the UI; their order drives both
            column order in the heat-map tables and color assignment.
        all_results: Mapping of model name -> {leaderboard name -> data dict},
            where each data dict holds model metadata plus a "results" mapping
            of metric name -> score (None when the eval is missing).

    Returns:
        An HTML fragment (str): per-model summary cards followed by one
        heat-map table per leaderboard; a placeholder when nothing matches.
    """
    if not selected_models or not all_results:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Get all unique leaderboards across selected models
    all_leaderboards = set()
    model_data_dict = {}

    for model_name in selected_models:
        if model_name in all_results:
            model_data_dict[model_name] = all_results[model_name]
            for leaderboard_name in all_results[model_name].keys():
                all_leaderboards.add(leaderboard_name)

    if not model_data_dict:
        return """
        <div class="no-results">
            <h3>No data found for selected models</h3>
            <p>Try selecting different models</p>
        </div>
        """

    all_leaderboards = sorted(all_leaderboards)
    # Nord palette; colors cycle when more than 8 models are selected.
    model_colors = ['#88C0D0', '#A3BE8C', '#EBCB8B', '#D08770', '#B48EAD', '#8FBCBB', '#81A1C1', '#BF616A']

    # Calculate overall averages for summary.
    # NOTE(review): this averages raw scores across every metric of every
    # leaderboard, mixing scales and ignoring any lower-is-better metrics —
    # confirm this blend is the intended headline number.
    overall_avgs = {}
    for model_name in selected_models:
        if model_name in model_data_dict:
            all_scores = []
            for lb_data in model_data_dict[model_name].values():
                all_scores.extend([v for v in lb_data.get("results", {}).values() if v is not None])
            overall_avgs[model_name] = sum(all_scores) / len(all_scores) if all_scores else None

    html = """
    <div class="comparison-container">
        <div class="comparison-summary">
            <h2>Model Comparison</h2>
            <div class="summary-cards">
    """

    # Summary cards for each model (selected models without data still get a
    # card, showing "Unknown" / an em dash).
    for i, model_name in enumerate(selected_models):
        color = model_colors[i % len(model_colors)]
        avg = overall_avgs.get(model_name)
        avg_str = f"{avg:.2f}" if avg is not None else "—"

        # Get model info; developer is read from the first leaderboard entry.
        model_info = list(model_data_dict.get(model_name, {}).values())
        developer = model_info[0].get("developer", "Unknown") if model_info else "Unknown"

        html += f"""
        <div class="summary-card" style="border-left: 4px solid {color};">
            <div class="summary-card-header">
                <span class="model-dot" style="background: {color};"></span>
                <span class="model-name">{model_name}</span>
            </div>
            <div class="summary-card-body">
                <div class="summary-stat">
                    <span class="stat-label">Developer</span>
                    <span class="stat-value">{developer}</span>
                </div>
                <div class="summary-stat primary">
                    <span class="stat-label">Overall Avg</span>
                    <span class="stat-value large">{avg_str}</span>
                </div>
            </div>
        </div>
        """

    html += """
            </div>
        </div>
    """

    # Leaderboard comparison cards: one heat-map table per leaderboard.
    for leaderboard_name in all_leaderboards:
        # Union of metric names any selected model has on this leaderboard.
        leaderboard_metrics = set()
        for model_data in model_data_dict.values():
            if leaderboard_name in model_data:
                results = model_data[leaderboard_name].get("results", {})
                leaderboard_metrics.update(results.keys())

        leaderboard_metrics = sorted(leaderboard_metrics)
        if not leaderboard_metrics:
            continue

        # Calculate per-model averages for the leaderboard's "Average" row.
        model_avgs = {}
        for model_name in selected_models:
            if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]:
                results = model_data_dict[model_name][leaderboard_name].get("results", {})
                scores = [v for v in results.values() if v is not None]
                model_avgs[model_name] = sum(scores) / len(scores) if scores else None

        html += f"""
        <div class="leaderboard-comparison-card">
            <div class="lb-card-header">
                <h3>{leaderboard_name}</h3>
            </div>
            <div class="lb-card-body">
        """

        # Compact heat-map table
        html += '<div class="heatmap-table-wrapper">'
        html += '<table class="heatmap-table">'

        # Header with model names (full name preserved in the title tooltip).
        html += '<thead><tr><th class="metric-header">Metric</th>'
        for i, model_name in enumerate(selected_models):
            # Truncate long names
            short_name = model_name if len(model_name) <= 20 else model_name[:18] + "…"
            html += f'<th class="model-header" title="{model_name}">{short_name}</th>'
        html += '</tr></thead>'

        html += '<tbody>'

        # Average row first; only highlight "best" when >1 model has data.
        html += '<tr class="avg-row"><td class="metric-name">Average</td>'
        valid_avgs_list = [model_avgs.get(m) for m in selected_models if model_avgs.get(m) is not None]
        max_avg_val = max(valid_avgs_list) if valid_avgs_list else None

        for model_name in selected_models:
            avg = model_avgs.get(model_name)
            if avg is not None:
                cell_class = "best" if avg == max_avg_val and len(valid_avgs_list) > 1 else ""
                html += f'<td class="score-cell {cell_class}">{avg:.2f}</td>'
            else:
                html += '<td class="score-cell na">—</td>'
        html += '</tr>'

        # Individual metric rows
        for metric_name in leaderboard_metrics:
            html += f'<tr><td class="metric-name">{metric_name}</td>'

            # Get all scores for this metric
            metric_scores = {}
            for model_name in selected_models:
                if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]:
                    results = model_data_dict[model_name][leaderboard_name].get("results", {})
                    metric_scores[model_name] = results.get(metric_name)

            # Normalization span for the color buckets; a degenerate range
            # (all scores equal) uses 1 so pct lands on the top bucket.
            valid_scores = [v for v in metric_scores.values() if v is not None]
            if valid_scores:
                max_score = max(valid_scores)
                min_score = min(valid_scores)
                score_range = max_score - min_score if max_score > min_score else 1
            else:
                max_score = min_score = score_range = None

            for model_name in selected_models:
                score = metric_scores.get(model_name)
                if score is not None and score_range is not None:
                    # Determine color class based on relative position.
                    # NOTE(review): buckets assume higher is better; the
                    # lower_is_better metadata is not consulted here — confirm.
                    # "best" relies on exact float equality with max_score,
                    # which holds because the value comes from the same list.
                    if len(valid_scores) > 1:
                        pct = (score - min_score) / score_range if score_range > 0 else 1
                        if score == max_score:
                            cell_class = "best"
                        elif pct >= 0.75:
                            cell_class = "good"
                        elif pct >= 0.5:
                            cell_class = "mid"
                        elif pct >= 0.25:
                            cell_class = "low"
                        else:
                            cell_class = "worst"
                    else:
                        cell_class = ""
                    html += f'<td class="score-cell {cell_class}">{score:.2f}</td>'
                else:
                    html += '<td class="score-cell na">—</td>'

            html += '</tr>'

        html += '</tbody></table></div>'

        html += """
            </div>
        </div>
        """

    html += "</div>"
    return html