nmmursit commited on
Commit
bc37111
·
1 Parent(s): 28277a5

Refactor codebase structure

Browse files
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Environment variables
37
+ .env
38
+ .env.local
39
+ .env.*.local
40
+
41
+ # Logs
42
+ *.log
43
+ logs/
44
+
45
+ # Docker
46
+ .docker/
47
+
48
+ # OS
49
+ .DS_Store
50
+ Thumbs.db
51
+
52
+ # Gradio
53
+ flagged/
api_client.py DELETED
@@ -1,103 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- API Client module for MTEB Turkish Leaderboard
4
- """
5
-
6
- from typing import Optional, Dict, Any
7
- import traceback
8
- import requests
9
-
10
- from config import API_BASE_URL, API_TIMEOUT, API_URL, USERNAME, PASSWORD
11
-
12
-
13
- def check_api_health() -> bool:
14
- """Check if API is available"""
15
- try:
16
- response = requests.get(f"{API_BASE_URL}/api/v1/health", timeout=5)
17
- return response.status_code == 200
18
- except:
19
- return False
20
-
21
-
22
- def send_evaluation_request_to_api(model_name: str, batch_size: int = 32, email: str = "user@example.com") -> Optional[Dict[str, Any]]:
23
- """
24
- Send an evaluation request to the API for the specified model.
25
- Returns the API response as a dictionary if successful, otherwise None.
26
- """
27
- try:
28
- payload = {
29
- "model_name": model_name,
30
- "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
31
- "batch_size": batch_size,
32
- "email": email,
33
- "model_type": "sentence-transformer"
34
- }
35
-
36
- # Authentication credentials
37
- auth = (USERNAME, PASSWORD)
38
-
39
- response = requests.post(
40
- f"{API_URL}/api/mteb/request",
41
- json=payload,
42
- timeout=API_TIMEOUT,
43
- auth=auth
44
- )
45
-
46
- print(f"Response Status: {response.status_code}")
47
-
48
- if response.status_code == 200:
49
- result = response.json()
50
- return result
51
- else:
52
- print(f"API Error: {response.status_code}")
53
- try:
54
- error_detail = response.json()
55
- print(f" Error Detail: {error_detail}")
56
- except:
57
- print(f" Raw Response: {response.text}")
58
- return None
59
-
60
- except Exception as e:
61
- print(f"API Call Error: {e}")
62
- traceback.print_exc()
63
- return None
64
-
65
-
66
- def get_evaluation_status(request_id: str) -> Optional[Dict[str, Any]]:
67
- """Get evaluation status from"""
68
- try:
69
- auth = (USERNAME, PASSWORD)
70
-
71
- response = requests.get(
72
- f"{API_URL}/api/mteb/status/{request_id}",
73
- timeout=API_TIMEOUT,
74
- auth=auth
75
- )
76
-
77
- if response.status_code == 200:
78
- return response.json()
79
- else:
80
- print(f"Status check error: {response.status_code}")
81
- return None
82
-
83
- except Exception as e:
84
- print(f"Status check error: {e}")
85
- return None
86
-
87
-
88
- def cancel_evaluation_request(request_id: str) -> bool:
89
- """Cancel an evaluation request"""
90
- try:
91
- auth = (USERNAME, PASSWORD)
92
-
93
- response = requests.delete(
94
- f"{API_URL}/api/mteb/request/{request_id}",
95
- timeout=API_TIMEOUT,
96
- auth=auth
97
- )
98
-
99
- return response.status_code == 200
100
-
101
- except Exception as e:
102
- print(f"Cancel request error: {e}")
103
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,136 +1,104 @@
1
  #!/usr/bin/env python3
2
  """
3
- Mizan Leaderboard - Enhanced Version with Submit Functionality
4
- Includes leaderboard display, model submission, and evaluation tracking
 
5
  """
6
 
 
 
 
7
  import gradio as gr
8
 
9
- from ui_components import (
10
- create_leaderboard_tab, create_dataset_tab, create_submit_evaluation_tab
11
- )
12
- from data_processor import load_leaderboard_from_csv
13
- from evaluation_service import submit_evaluation
14
 
15
- # Global data storage
16
- current_data = None
 
 
 
 
 
 
17
 
 
18
 
19
- def create_leaderboard_demo():
20
- """Create enhanced leaderboard demo interface with submit functionality"""
21
-
22
- global current_data
23
-
24
- # Setup directories
25
 
 
 
 
26
 
27
- # Load data from CSV file
28
- current_data = load_leaderboard_from_csv()
29
 
30
- with gr.Blocks(
31
- title="Mizan",
32
- theme=gr.themes.Soft()
33
- ) as demo:
34
 
35
- gr.Markdown("""
36
- # Mizan Leaderboard
 
 
37
 
38
- Performance comparison for Turkish embedding models
39
- """)
 
 
 
40
 
41
- with gr.Tabs():
42
- # Tab 1: Leaderboard
43
- with gr.Tab("📊 Leaderboard"):
44
- leaderboard_table = create_leaderboard_tab(current_data)
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Tab 2: Submit
47
- with gr.Tab("🚀 Submit"):
48
- (model_input, email_input, submit_btn, login_button, result_output) = create_submit_evaluation_tab()
 
 
49
 
50
- # Submit evaluation functionality with authentication
51
- def handle_submit_evaluation(model_name, email, profile, progress=gr.Progress()):
52
- import logging
53
-
54
- # Authentication check
55
- if profile is None:
56
- logging.warning("Unauthorized submission attempt with no profile")
57
- return "<p style='color: red; font-weight: bold;'>Authentication required. Please log in with your Hugging Face account.</p>"
58
-
59
- # IMPORTANT: In local development, Gradio returns "Sign in with Hugging Face" string
60
- # This is NOT a real authentication, just a placeholder for local testing
61
- if isinstance(profile, str) and profile == "Sign in with Hugging Face":
62
- # Block submission in local dev with mock auth
63
- return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"
64
-
65
- # Email is required
66
- if not email or email.strip() == "":
67
- return "<p style='color: red; font-weight: bold;'>Email address is required to receive benchmark results.</p>"
68
-
69
- global current_data
70
- batch_size = 32 # Always use default batch size
71
- result_msg, updated_data = submit_evaluation(model_name, email, batch_size, current_data, progress)
72
- # Note: For now, we don't update the leaderboard since evaluation is async
73
- # The leaderboard will be updated manually when results are available
74
- logging.info(f"Submission processed for model: {model_name} by user: {profile}")
75
- return result_msg
76
 
77
- submit_btn.click(
78
- fn=handle_submit_evaluation,
79
- inputs=[model_input, email_input, login_button],
80
- outputs=[result_output]
81
- )
82
-
83
- # Tab 3: Dataset Information
84
- with gr.Tab("📊 Dataset Information"):
85
- dataset_table = create_dataset_tab()
86
- gr.Markdown("""
87
- ---
88
- ### 📊 Metrics Explanation:
89
- - **Mean (Task)**: Average performance across all individual tasks
90
- - **Mean (TaskType)**: Average performance by task categories
91
- - **Classification**: Performance on Turkish classification tasks
92
- - **Clustering**: Performance on Turkish clustering tasks
93
- - **Pair Classification**: Performance on pair classification tasks (like NLI)
94
- - **Retrieval**: Performance on information retrieval tasks
95
- - **STS**: Performance on Semantic Textual Similarity tasks
96
- - **Correlation**: Weighted average of correlation metrics for NLI and STSB datasets
97
- - **Parameters**: Number of model parameters
98
- - **Embed Dim**: Embedding dimension size
99
- - **Max Seq Length**: Maximum sequence length the model can process (0 = infinite/unlimited)
100
- - **Vocab Size**: Size of the model's vocabulary
101
-
102
- ### 📖 About Mizan:
103
- This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
104
- on Turkish language tasks across multiple domains including:
105
- - Text classification and sentiment analysis
106
- - Information retrieval and search
107
- - Semantic textual similarity
108
- - Text clustering and pair classification
109
-
110
- ### 🚀 Submit Your Model:
111
- Use the **Submit** tab to submit your Turkish embedding model for evaluation.
112
- Your request will be reviewed by administrators and you'll receive email notifications about the progress.
113
-
114
- ### Contact:
115
- For any questions or feedback, please contact info@newmind.ai
116
-
117
- ### Links:
118
- - **GitHub**: [mteb/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
119
- """)
120
 
121
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
  def main():
125
- """Main entry point"""
126
- print("🚀 Starting Mizan Leaderboard...")
127
-
128
- demo = create_leaderboard_demo()
129
- demo.launch(
130
- server_name="0.0.0.0",
131
- server_port=7860,
132
- share=False
133
- )
134
 
135
 
136
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """
3
+ Mizan Turkish Leaderboard - HuggingFace Space Version
4
+
5
+ Clean entry point that wires together all components.
6
  """
7
 
8
+ import logging
9
+ import sys
10
+
11
  import gradio as gr
12
 
13
+ from src.core.config import settings
14
+ from src.data import DataTransformer
15
+ from src.components import LeaderboardTab, DatasetTab, SubmitTab
 
 
16
 
17
+ # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.DEBUG if settings.ui.debug else logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
21
+ handlers=[
22
+ logging.StreamHandler(sys.stdout),
23
+ ]
24
+ )
25
 
26
+ logger = logging.getLogger(__name__)
27
 
 
 
 
 
 
 
28
 
29
+ class MizanApp:
30
+ """
31
+ Main application class.
32
 
33
+ Orchestrates all components and creates the Gradio interface.
34
+ """
35
 
36
+ def __init__(self):
37
+ # Load data
38
+ self.transformer = DataTransformer()
39
+ self.data = self.transformer.load_from_csv()
40
 
41
+ # UI components (will be initialized during build)
42
+ self._leaderboard_tab: LeaderboardTab = None
43
+ self._dataset_tab: DatasetTab = None
44
+ self._submit_tab: SubmitTab = None
45
 
46
+ logger.info(f"Application initialized with {len(self.data)} models")
47
+
48
+ def build_interface(self) -> gr.Blocks:
49
+ """
50
+ Build the complete Gradio interface.
51
 
52
+ Returns:
53
+ Gradio Blocks application.
54
+ """
55
+ with gr.Blocks(
56
+ title="🇹🇷 Mizan Turkish Leaderboard",
57
+ theme=gr.themes.Soft()
58
+ ) as demo:
59
+
60
+ # Header
61
+ gr.Markdown("""
62
+ # 🇹🇷 Mizan Turkish Evaluation Leaderboard
63
+
64
+ Performance comparison for Turkish embedding models
65
+ """)
66
 
67
+ with gr.Tabs():
68
+ # Tab 1: Leaderboard
69
+ with gr.Tab("Leaderboard"):
70
+ self._leaderboard_tab = LeaderboardTab(data=self.data)
71
+ self._leaderboard_tab.build()
72
 
73
+ # Tab 2: Submit
74
+ with gr.Tab("Submit"):
75
+ self._submit_tab = SubmitTab()
76
+ self._submit_tab.build()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Tab 3: Dataset Information
79
+ with gr.Tab("Dataset Information"):
80
+ self._dataset_tab = DatasetTab()
81
+ self._dataset_tab.build()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ return demo
84
+
85
+ def run(self):
86
+ """Run the application."""
87
+ logger.info("Starting Mizan Turkish Leaderboard...")
88
+
89
+ # Build and launch
90
+ demo = self.build_interface()
91
+ demo.launch(
92
+ server_name="0.0.0.0",
93
+ server_port=settings.ui.port,
94
+ share=False
95
+ )
96
 
97
 
98
  def main():
99
+ """Main entry point."""
100
+ app = MizanApp()
101
+ app.run()
 
 
 
 
 
 
102
 
103
 
104
  if __name__ == "__main__":
config.py DELETED
@@ -1,28 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Configuration module for MTEB Turkish Leaderboard
4
- Centralizes environment variables and configuration settings
5
- """
6
-
7
- import os
8
- from dotenv import load_dotenv
9
-
10
- # Load environment variables from .env file
11
- load_dotenv()
12
-
13
- # API Configuration
14
- API_URL = os.environ.get("API_URL")
15
- USERNAME = os.environ.get("API_USERNAME")
16
- PASSWORD = os.environ.get("API_PASSWORD")
17
-
18
- # API Configuration (public settings)
19
- API_BASE_URL = "http://localhost:8000"
20
- API_TIMEOUT = 30
21
-
22
- # Polling and refresh intervals (public settings)
23
- POLL_INTERVAL = 5 # seconds
24
- LEADERBOARD_REFRESH_INTERVAL = 30 # seconds
25
-
26
- # CSV file path for leaderboard data
27
- CSV_FILE_PATH = "leaderboard_data.csv"
28
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_processor.py DELETED
@@ -1,208 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
4
- Simplified version for loading and processing CSV data
5
- """
6
-
7
- import os
8
- import pandas as pd
9
- from pandas.io.formats.style import Styler
10
- from matplotlib.colors import LinearSegmentedColormap
11
- import html
12
-
13
- # CSV file path
14
- CSV_FILE_PATH = "leaderboard_data.csv"
15
-
16
-
17
- def load_leaderboard_from_csv() -> pd.DataFrame:
18
- """Load leaderboard data from CSV file"""
19
- try:
20
- if not os.path.exists(CSV_FILE_PATH):
21
- print(f"❌ CSV file not found: {CSV_FILE_PATH}")
22
- return create_empty_leaderboard_dataframe()
23
-
24
- df = pd.read_csv(CSV_FILE_PATH)
25
- print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")
26
-
27
- # Convert to leaderboard format
28
- leaderboard_df = csv_to_leaderboard_format(df)
29
-
30
- # Sort by Mean (Task) score and add rankings
31
- leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True)
32
- leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)
33
-
34
- return leaderboard_df
35
-
36
- except Exception as e:
37
- print(f"❌ Error loading CSV: {e}")
38
- return create_empty_leaderboard_dataframe()
39
-
40
-
41
- def create_empty_leaderboard_dataframe() -> pd.DataFrame:
42
- """Create an empty DataFrame with proper leaderboard column structure"""
43
- return pd.DataFrame(columns=[
44
- "Rank",
45
- "Model",
46
- "Mean (Task)",
47
- "Mean (TaskType)",
48
- "Classification",
49
- "Clustering",
50
- "Pair Classification",
51
- "Retrieval",
52
- "STS",
53
- "Correlation",
54
- "Parameters",
55
- "Embed Dim",
56
- "Max Sequence Length",
57
- "Vocab Size",
58
- ])
59
-
60
-
61
- def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
62
- """Convert CSV data to leaderboard format"""
63
- data = []
64
- for idx, row in df.iterrows():
65
- model_name = row['Model']
66
-
67
- # Prepare model name for display
68
- model_name_clean = html.escape(model_name)
69
-
70
- # Create clickable HuggingFace link for model name
71
- hf_link = f"https://huggingface.co/{model_name_clean}"
72
- clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'
73
-
74
- # Handle different column name variations
75
- embedding_dim_col = 'Embedding Dim'
76
- max_seq_col = 'Max Seq Length'
77
- pair_classification_col = 'Pair Classification'
78
-
79
- data_row = {
80
- "Rank": idx + 1, # Initial ranking, will be recalculated
81
- "Model": clickable_model,
82
- "Mean (Task)": round(float(row['Mean (Task)']), 2),
83
- "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
84
- "Classification": round(float(row['Classification']), 2),
85
- "Clustering": round(float(row['Clustering']), 2),
86
- "Pair Classification": round(float(row[pair_classification_col]), 2),
87
- "Retrieval": round(float(row['Retrieval']), 2),
88
- "STS": round(float(row['STS']), 2),
89
- "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
90
- "Parameters": row['Number of Parameters'],
91
- "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
92
- "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
93
- "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0
94
- }
95
- data.append(data_row)
96
-
97
- result_df = pd.DataFrame(data)
98
- return result_df
99
-
100
-
101
- def create_excel_like_cmap():
102
- """Create Excel-like colormap for score visualization"""
103
- colors = [
104
- (0.9, 0.1, 0.2), # Red
105
- (1.0, 1.0, 0.0), # Yellow
106
- (0/255, 176/255, 80/255) # Excel-style Green
107
- ]
108
-
109
- return LinearSegmentedColormap.from_list("excel_like", colors, N=256)
110
-
111
-
112
- def rgb_to_hex(rgb_tuple):
113
- """Convert RGB tuple to hex color"""
114
- r, g, b = [int(x * 255) for x in rgb_tuple[:3]]
115
- return f"#{r:02x}{g:02x}{b:02x}"
116
-
117
-
118
- def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
119
- """Create colored cell HTML for score visualization"""
120
- if pd.isna(value) or value == "N/A":
121
- return str(value)
122
-
123
- try:
124
- # Normalize value to 0-1 range
125
- if max_val > min_val:
126
- normalized = (float(value) - min_val) / (max_val - min_val)
127
- else:
128
- normalized = 0.5
129
-
130
- # Get color from colormap
131
- color_rgba = colormap(normalized)
132
- color_hex = rgb_to_hex(color_rgba)
133
-
134
- # Create colored cell HTML with data-sort attribute for proper numeric sorting
135
- return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
136
-
137
- except (ValueError, TypeError):
138
- return str(value)
139
-
140
-
141
- def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
142
- """Create styled leaderboard dataframe with color coding and clickable model names using pandas Styler
143
-
144
- Returns a pandas Styler object that Gradio Dataframe can render with both colors AND correct sorting.
145
- """
146
- if df.empty:
147
- return df.style
148
-
149
- colormap = create_excel_like_cmap()
150
-
151
- # Score columns to colorize
152
- score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
153
- "Pair Classification", "Retrieval", "STS", "Correlation"]
154
-
155
- # Calculate min/max for each score column for normalization
156
- color_ranges = {}
157
- for col in score_columns:
158
- if col in df.columns:
159
- numeric_values = pd.to_numeric(df[col], errors='coerce')
160
- if not numeric_values.isna().all():
161
- color_ranges[col] = {
162
- 'min': numeric_values.min(),
163
- 'max': numeric_values.max()
164
- }
165
-
166
- # Create styler with background colors for score columns
167
- def apply_color_gradient(val, col_name):
168
- """Apply background color based on value"""
169
- if col_name not in color_ranges:
170
- return ''
171
-
172
- if pd.isna(val) or val == "N/A":
173
- return ''
174
-
175
- try:
176
- min_val = color_ranges[col_name]['min']
177
- max_val = color_ranges[col_name]['max']
178
-
179
- # Normalize value to 0-1 range
180
- if max_val > min_val:
181
- normalized = (float(val) - min_val) / (max_val - min_val)
182
- else:
183
- normalized = 0.5
184
-
185
- # Get color from colormap
186
- color_rgba = colormap(normalized)
187
- color_hex = rgb_to_hex(color_rgba)
188
-
189
- return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
190
- except (ValueError, TypeError):
191
- return ''
192
-
193
- # Apply styling to score columns using map (applymap is deprecated)
194
- styler = df.style
195
- for col in score_columns:
196
- if col in df.columns:
197
- styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])
198
-
199
- # Format score columns to 2 decimal places
200
- format_dict = {}
201
- for col in score_columns:
202
- if col in df.columns:
203
- format_dict[col] = '{:.2f}'
204
-
205
- if format_dict:
206
- styler = styler.format(format_dict, na_rep='N/A')
207
-
208
- return styler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation_service.py DELETED
@@ -1,190 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Evaluation Service module for MTEB Turkish Leaderboard
4
- Handles evaluation submissions and status tracking
5
- """
6
-
7
- import time
8
- import re
9
- from typing import Optional, Tuple, List
10
- import traceback
11
- import pandas as pd
12
- import gradio as gr
13
-
14
- from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request
15
-
16
- # Global state management for active evaluations
17
- active_evaluations = {} # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}
18
-
19
-
20
- def get_active_evaluations_status() -> str:
21
- """Show status of active evaluations"""
22
- if not active_evaluations:
23
- return "🟢 No active evaluation requests"
24
-
25
- status_lines = []
26
- for request_id, info in active_evaluations.items():
27
- model_name = info["model_name"]
28
- email = info["email"]
29
- elapsed = int(time.time() - info["start_time"])
30
- status = info.get("status", "PENDING")
31
- status_lines.append(f"🔄 {model_name} ({email}) - {request_id} [{status}] ({elapsed}s)")
32
-
33
- return "\n".join(status_lines)
34
-
35
-
36
- def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
37
- """Get active evaluations status and cancellation options"""
38
- status_text = get_active_evaluations_status()
39
-
40
- cancel_options = []
41
- for request_id, info in active_evaluations.items():
42
- model_name = info["model_name"]
43
- cancel_options.append(f"{request_id} - {model_name}")
44
-
45
- return status_text, cancel_options
46
-
47
-
48
- def clear_active_evaluations() -> str:
49
- """Clear all active evaluations from tracking"""
50
- global active_evaluations
51
- count = len(active_evaluations)
52
- active_evaluations.clear()
53
- return f"✅ Cleared {count} active evaluation(s) from tracking"
54
-
55
-
56
- def cancel_active_evaluation(selection: str) -> str:
57
- """Cancel a selected active evaluation"""
58
- if not selection:
59
- return "❌ No evaluation selected for cancellation"
60
-
61
- try:
62
- request_id = selection.split(" - ")[0]
63
-
64
- if request_id not in active_evaluations:
65
- return f"❌ Evaluation {request_id} not found in active evaluations"
66
-
67
- # Try to cancel via API
68
- success = cancel_evaluation_request(request_id)
69
-
70
- if success:
71
- model_name = active_evaluations[request_id]["model_name"]
72
- del active_evaluations[request_id]
73
- return f"✅ Successfully cancelled evaluation for {model_name} (ID: {request_id})"
74
- else:
75
- return f"❌ Failed to cancel evaluation {request_id}. Check API connection."
76
-
77
- except Exception as e:
78
- return f"❌ Error cancelling evaluation: {str(e)}"
79
-
80
-
81
- def _validate_evaluation_request(model_name: str, email: str = None) -> Optional[str]:
82
- """Validate evaluation request parameters"""
83
- # Model name validation
84
- if not model_name or not model_name.strip():
85
- return "❌ Model name cannot be empty!"
86
-
87
- model_name = model_name.strip()
88
-
89
- # Check model name length (format: org/model-name)
90
- if len(model_name) < 3:
91
- return "❌ Model name too short!"
92
-
93
- if len(model_name) > 256:
94
- return "❌ Model name too long (maximum 256 characters)!"
95
-
96
- # Check for valid HuggingFace model name format (must be org/model)
97
- if '/' not in model_name:
98
- return "❌ Invalid model name format! Must include organization (e.g., organization/model-name)"
99
-
100
- if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
101
- return "❌ Invalid model name format! Use format: organization/model-name"
102
-
103
- # Email validation
104
- if not email or not email.strip():
105
- return "❌ Email address cannot be empty!"
106
-
107
- email = email.strip()
108
-
109
- if len(email) > 254:
110
- return "❌ Email address too long!"
111
-
112
- email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
113
- if not re.match(email_pattern, email):
114
- return "❌ Invalid email address format!"
115
-
116
- return None
117
-
118
-
119
- def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
120
- try:
121
- # Input validation
122
- error_msg = _validate_evaluation_request(model_name, email)
123
- if error_msg:
124
- return error_msg, None
125
-
126
- # Show progress
127
- progress(0.1, desc="Sending evaluation request to API...")
128
-
129
- # Send request to API - regardless of backend response, show success to user
130
- api_response = send_evaluation_request_to_api(model_name, batch_size, email)
131
-
132
- # Always show success message to user
133
- # Backend errors (like duplicate requests) are handled by API and communicated via email
134
- progress(1.0, desc="Request submitted successfully!")
135
-
136
- # Return success message regardless of backend response
137
- success_msg = f"""
138
- ✅ Evaluation request submitted successfully!
139
-
140
- 🤖 Model: {model_name}
141
- 📧 Email: {email}
142
-
143
- 📋 Next Steps:
144
- ⏱️ Your request will be reviewed by our system
145
- 📧 You will receive email notifications about the status of your evaluation
146
- 🔄 If you've submitted this model before, you'll be notified via email
147
-
148
- Thank you for contributing to the Mizan Leaderboard!
149
- """
150
-
151
- return success_msg.strip(), current_data
152
-
153
- except Exception as e:
154
- # Log error for debugging
155
- print(f"❌ Error submitting evaluation: {str(e)}")
156
- traceback.print_exc()
157
-
158
- error_msg = f"""
159
- ❌ Failed to submit evaluation request
160
-
161
- 🤖 Model: {model_name}
162
- 📧 Email: {email}
163
-
164
- ⚠️ Error: Unable to connect to the evaluation service.
165
-
166
- Please try again later or contact support if the problem persists.
167
- """
168
- return error_msg.strip(), None
169
-
170
-
171
- def refresh_evaluation_status() -> str:
172
- """Refresh status of all active evaluations"""
173
- if not active_evaluations:
174
- return "🟢 No active evaluations to refresh"
175
-
176
- updated_count = 0
177
- for request_id, info in active_evaluations.items():
178
- try:
179
- status_data = get_evaluation_status(request_id)
180
- if status_data and "status" in status_data:
181
- old_status = info.get("status", "UNKNOWN")
182
- new_status = status_data["status"]
183
- if old_status != new_status:
184
- info["status"] = new_status
185
- updated_count += 1
186
- print(f"Status updated for {request_id}: {old_status} -> {new_status}")
187
- except Exception as e:
188
- print(f"Error refreshing status for {request_id}: {e}")
189
-
190
- return f"🔄 Refreshed status for {len(active_evaluations)} evaluation(s). {updated_count} status change(s) detected."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard_data.csv CHANGED
@@ -1,33 +1,52 @@
1
- Model,Number of Parameters,Embedding Dim,Max Seq Length,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Correlation,Vocab Size
2
- BAAI/bge-m3,567M,1024,8192,69.39,63.51,75.68,35.26,78.88,57.89,69.83,0.61,250002
3
- intfloat/multilingual-e5-large,559M,1024,512,66.61,62.08,71.8,41.2,72.76,57.17,67.49,0.58,250002
4
- newmindai/TurkEmbed4STS,305M,768,8192,65.66,62.03,69.69,44.29,81.77,47.6,66.79,0.68,250048
5
- ytu-ce-cosmos/turkish-e5-large,559M,1024,512,64.93,59.73,72.42,38.51,70.86,47.6,69.24,0.56,250002
6
- intfloat/multilingual-e5-large-instruct,559M,1024,512,64.33,58.57,72.25,33.16,72.92,44.95,69.56,0.57,250002
7
- nomic-ai/nomic-embed-text-v2-moe,475M,768,512,64.28,60.15,70.07,41.28,63.87,56.4,69.16,0.53,250048
8
- Alibaba-NLP/gte-multilingual-base,305M,768,32768,63.86,60.04,68.0,39.16,76.0,50.12,66.94,0.62,250048
9
- sentence-transformers/paraphrase-multilingual-mpnet-base-v2,278M,768,512,63.33,57.63,70.88,41.35,83.6,33.81,58.51,0.65,250002
10
- newmindai/modernbert-base-tr-uncased-allnli-stsb,134M,768,8192,61.29,54.09,71.47,35.46,82.83,24.81,55.89,0.66,32000
11
- numind/NuSentiment-multilingual,278M,768,512,60.52,49.65,73.67,14.96,76.89,32.76,49.96,0.52,250002
12
- newmindai/TurkEmbed4Retrieval,305M,768,512,60.5,58.04,64.78,47.47,64.04,47.82,66.1,0.57,250048
13
- Qwen/Qwen3-Embedding-0.6B,595M,1024,131072,60.18,56.53,64.68,33.36,66.02,50.06,68.55,0.48,151669
14
- sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,117M,384,512,59.95,54.8,67.21,42.31,79.3,29.95,55.24,0.6,250037
15
- newmindai/TurkEmbed4STS-HD,305M,768,8192,59.94,53.06,67.61,34.24,80.08,35.88,47.47,0.65,250048
16
- emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,110M,768,512,59.92,52.65,68.38,24.61,74.94,39.0,56.3,0.62,32000
17
- ibm-granite/granite-embedding-278m-multilingual,278M,768,512,55.9,54.48,58.64,41.98,60.13,45.08,66.57,0.41,250002
18
- newmindai/ModernBERT-tr-uncased-stsb-HD,134M,768,8192,54.51,43.94,67.17,17.96,82.51,16.08,35.98,0.61,32000
19
- ibm-granite/granite-embedding-107m-multilingual,106M,384,512,52.68,50.72,55.75,34.17,59.86,39.97,63.85,0.38,250002
20
- minishlab/potion-multilingual-128M,128M,256,N/A,50.39,44.47,58.34,23.47,59.76,30.84,49.93,0.43,500358
21
- google/embeddinggemma-300m,307M,768,2048,49.08,44.98,55.23,22.84,61.02,26.92,58.91,0.27,262144
22
- nomic-ai/nomic-embed-text-v1,136M,768,8192,45.12,41.46,48.3,9.45,59.75,32.9,56.88,0.42,30528
23
- nomic-ai/nomic-embed-text-v1.5,136M,768,8192,44.63,40.04,48.92,9.69,58.53,32.19,50.89,0.41,30528
24
- mixedbread-ai/mxbai-embed-large-v1,335M,1024,512,44.0,39.23,49.49,15.99,56.66,27.75,46.25,0.37,30522
25
- sentence-transformers/multi-qa-MiniLM-L6-cos-v1,22M,384,512,38.82,32.39,44.08,5.55,58.29,25.16,28.88,0.34,30522
26
- boun-tabi-LMG/TURNA,495M,1024,1024,38.36,30.96,47.17,10.26,56.62,13.04,27.73,0.22,32128
27
- sentence-transformers/all-MiniLM-L12-v2,33M,384,512,38.28,31.13,44.77,7.82,58.2,21.64,23.24,0.36,30522
28
- nielsr/lilt-xlm-roberta-base,284M,768,512,38.01,29.57,50.1,12.79,55.35,2.45,27.14,0.22,250002
29
- sentence-transformers/all-MiniLM-L6-v2,22M,384,512,37.95,31.97,44.46,6.58,56.75,16.48,35.55,0.31,30522
30
- sentence-transformers/all-mpnet-base-v2,109M,768,512,37.21,31.31,43.75,10.56,55.99,15.16,31.08,0.31,30527
31
- minishlab/potion-base-8M,7M,256,N/A,36.85,30.01,42.51,2.26,57.86,21.75,25.64,0.36,29528
32
- sentence-transformers/paraphrase-MiniLM-L6-v2,22M,384,512,36.26,28.19,44.02,4.53,56.62,17.47,18.29,0.33,30522
33
- newmindai/lettucedect-210m-eurobert-tr-v1,211M,768,8192,27.66,21.55,34.32,1.54,52.34,0.22,19.34,0.1,128256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank (Borda),Model,Model Architecture,Tokenizer Type,Unique Token Count,Turkish Token Count,Turkish Token %,Pure Token Count,Pure Token %,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Contracts,Regulation,Caselaw,Score(Legal),Memory Usage (MB),Number of Parameters,Embed Dim,Vocab Size,Max Seq Length,Correlation,Model Type
2
+ 1,google/embeddinggemma-300m,Gemma3TextModel,GemmaTokenizer,13697.0,5910.0,43.15,3980.0,29.06,67.23,65.42,77.74,45.05,80.02,55.06,69.22,83.97,39.56,28.38,50.63,1173.0,307M,768.0,262144.0,2048,0.51,Embedding
3
+ 2,newmindai/bge-m3-stsb,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.46,63.53068666666667,74.24768333333334,43.9295,78.50975,50.142,70.8245,82.609,38.141000000000005,29.167,49.97233333333333,2165.0,567M,1024.0,250002.0,8194,0.6350506506291465,Embedding
4
+ 3,BAAI/bge-m3,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,64.75,62.87,75.35,35.86,78.88,54.42,69.83,86.08,38.09,29.3,51.16,2165.0,567M,1024.0,250002.0,8194,0.61,Embedding
5
+ 4,Lajavaness/bilingual-embedding-large,BilingualModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.62,62.468826666666665,74.15278333333333,42.2467,73.0609,52.248250000000006,70.6355,82.14099999999999,35.399,24.551,47.36366666666666,2135.0,559M,1024.0,250002.0,514,0.611419419101738,Embedding
6
+ 5,newmindai/TurkEmbed4STS,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,62.67,62.41829666666666,69.69163333333334,44.2897,81.76675,49.135,67.2084,78.877,35.18,27.635,47.23066666666666,1164.0,305M,768.0,250048.0,8192,0.6839028854791485,Embedding
7
+ 6,intfloat/multilingual-e5-large,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.14,61.50873666666666,71.79943333333334,41.1967,72.76185000000001,54.29849999999999,67.4872,85.38,33.178000000000004,22.299,46.952333333333335,2135.0,559M,1024.0,250002.0,514,0.5844910512151045,Embedding
8
+ 7,ytu-ce-cosmos/turkish-e5-large,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,62.0,60.36150666666667,72.41818333333333,38.1709,70.86345,51.114,69.241,80.729,37.384,26.476,48.196333333333335,2135.0,559M,1024.0,250002.0,514,0.5608614724386807,Embedding
9
+ 8,Alibaba-NLP/gte-multilingual-base,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,61.18,60.12285333333333,67.99526666666667,39.1645,75.99780000000001,50.516000000000005,66.94069999999999,76.012,36.391,27.066000000000003,46.489666666666665,1164.0,305M,768.0,250048.0,8192,0.6170556873432124,Embedding
10
+ 9,nomic-ai/nomic-embed-text-v2-moe,NomicBertModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,60.63,59.54449333333332,68.53571666666666,43.3523,64.42945,52.6895,68.71549999999999,84.466,39.939,27.849,50.75133333333333,1813.0,475M,768.0,250048.0,2048,0.530989593067926,Embedding
11
+ 10,magibu/embeddingmagibu-200m,Gemma3TextModel,GemmaTokenizer,29799.0,18946.0,63.58,8515.0,28.57,59.989025,59.247110000000006,66.4086,40.1472,74.98685,48.2505,66.4424,75.745,33.984,27.033,45.587,789.0,206M,768.0,131072.0,8192,0.585573508421718,Embedding
12
+ 11,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.62,58.92842666666667,70.87778333333333,41.799,83.59875000000001,39.8555,58.511100000000006,65.403,7.61,1.289,24.767333333333337,1060.0,278M,768.0,250002.0,514,0.6495769869027372,Embedding
13
+ 12,intfloat/multilingual-e5-large-instruct,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.91,58.85126,72.24580000000002,31.5179,72.91635,48.01275,69.5635,78.985,35.735,25.351000000000003,46.690333333333335,2135.0,559M,1024.0,250002.0,514,0.5663941110812728,Embedding
14
+ 13,newmindai/TurkEmbed4Retrieval,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.1,58.36369333333333,64.78041666666665,47.468700000000005,64.0415,48.86425,66.6636,74.626,36.121,28.898000000000003,46.54833333333334,1164.0,305M,768.0,250048.0,512,0.5743432298546475,Embedding
15
+ 14,newmindai/Mursit-Embed-Qwen3-1.7B-TR,Qwen3ForCausalLM,Qwen2TokenizerFast,10226.0,4128.0,40.37,2865.0,28.02,58.08,56.84,68.46,42.22,59.67,50.1,63.77,70.22,17.94,16.11,34.76,6563.0,1.7B,2048.0,151936.0,40960,0.44,CLM-Embedding
16
+ 15,newmindai/Mursit-Large-TR-Retrieval,ModernBertModel,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,58.57,56.43,67.47,38.76,59.88,51.59,64.44,81.63,32.39,25.24,46.42,1539.0,403M,1024.0,59008.0,2048,0.49,Embedding
17
+ 16,newmindai/modernbert-base-tr-uncased-allnli-stsb,ModernBertModel,PreTrainedTokenizerFast,20502.0,16007.0,78.08,6077.0,29.64,56.35,56.31918666666665,71.45993333333332,35.4615,82.83494999999999,35.11075,56.7288,62.937,15.297,17.466,31.899999999999995,514.0,134M,768.0,32000.0,8192,0.6637952581670423,Embedding
18
+ 17,newmindai/Mursit-Base-TR-Retrieval,ModernBertModel,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,58.01,55.86,66.25,39.75,61.31,50.07,61.9,80.4,34.1,28.07,47.52,593.0,155M,768.0,59008.0,1024,0.49,Embedding
19
+ 18,emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,BertModel,BertTokenizerFast,21076.0,17028.0,80.79,7263.0,34.46,56.03,54.33,68.42,23.64,74.94,42.29,62.39,72.83,22.88,20.78,38.83,421.0,110M,768.0,32000.0,512,0.62,Embedding
20
+ 19,newmindai/TurkEmbed4STS-HD,NewForTokenClassification,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,56.14,54.25491999999999,67.61245,36.856100000000005,80.07815000000001,39.2535,47.4744,70.233,4.837000000000001,6.1690000000000005,27.079666666666668,1164.0,305M,768.0,250048.0,8192,0.6504462482545317,Embedding
21
+ 20,ibm-granite/granite-embedding-278m-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,53.68,53.93412333333333,58.37791666666666,39.4453,60.1335,45.139,66.5749,67.254,24.53,16.229,36.004333333333335,1060.0,278M,768.0,250002.0,514,0.4137480806327822,Embedding
22
+ 21,newmindai/Mursit-Embed-Qwen3-4B-TR,Qwen3ForCausalLM,Qwen2TokenizerFast,10226.0,4128.0,40.37,2865.0,28.02,56.47,53.65,67.29,36.68,58.36,51.12,54.77,69.25,24.21,17.56,37.0,15344.0,4B,2560.0,151936.0,40960,0.34,CLM-Embedding
23
+ 22,nvidia/llama-embed-nemotron-8b,LlamaBidirectionalModel,PreTrainedTokenizerFast,12041.0,5485.0,45.55,3507.0,29.13,51.06448333333333,53.52449666666666,68.51398333333334,39.8189,58.1497,30.656,70.4839,52.095,28.802,16.756999999999998,32.55133333333333,28629.0,8B,4096.0,128256.0,131072,0.3817553384080386,CLM-Embedding
24
+ 23,KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5,Qwen2Model,Qwen2TokenizerFast,10262.0,3234.0,31.51,2294.0,22.35,52.71,52.83622666666668,64.64263333333334,37.6148,57.5669,35.511500000000005,68.8453,32.014,35.608000000000004,30.239,32.62033333333334,1884.0,494M,896.0,151936.0,131072,0.4053257465375148,CLM-Embedding
25
+ 24,ibm-granite/granite-embedding-107m-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,50.52,51.07249,55.654500000000006,34.6266,59.86395,41.3655,63.85189999999999,60.72,20.033,11.705,30.819333333333333,408.0,106M,384.0,250002.0,514,0.3807947055039975,Embedding
26
+ 25,sentence-transformers/LaBSE,BertModel,BertTokenizerFast,19595.0,11061.0,56.45,5800.0,29.6,51.83,50.72844,63.18349999999999,25.5499,64.0111,38.4625,62.4352,63.809000000000005,15.122,13.838,30.923,1798.0,471M,768.0,501153.0,512,0.4794392790632775,Embedding
27
+ 26,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,BertModel,BertTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,53.99,50.29915666666667,67.18508333333332,42.3102,79.30365,35.82925,26.867600000000003,56.875,0.8410000000000001,0.713,19.476333333333333,448.0,117M,384.0,250037.0,512,0.6043096711195243,Embedding
28
+ 27,numind/NuSentiment-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,54.0,50.16527553055566,73.67280773306626,14.960431297201202,76.8943051047361,35.343,49.95583351777477,64.037,10.431,10.38,28.282666666666668,1060.0,278M,768.0,250002.0,514,0.5183345151582207,Embedding
29
+ 28,dbmdz/bert-base-turkish-uncased,BertModel,BertTokenizerFast,14807.0,10953.0,73.97,5876.0,39.68,51.99,46.44,67.93,34.76,60.54,31.98,37.01,52.48,12.02,10.09,24.86,421.0,110M,768.0,32000.0,512,0.36,MLM
30
+ 29,minishlab/potion-multilingual-128M,StaticModel,XLMRobertaTokenizerFast,18943.0,12657.0,66.82,5986.0,31.6,47.96,45.95582333333334,58.34376666666668,25.4021,59.76105,36.3395,49.9327,65.022,21.481,14.031,33.51133333333334,488.0,128M,256.0,500358.0,∞,0.4306555947403001,Embedding
31
+ 30,ytu-ce-cosmos/turkish-large-bert-cased,BertForPreTraining,BertTokenizerFast,21076.0,16830.0,79.85,8670.0,41.14,50.7,45.3,67.43,34.24,60.11,28.68,36.04,47.57,5.93,3.85,19.12,1286.0,337M,1024.0,32000.0,1024,0.33,MLM
32
+ 31,dbmdz/bert-base-turkish-cased,BertModel,BertTokenizerFast,21076.0,17028.0,80.79,7263.0,34.46,47.89,45.17,66.39,35.28,60.05,30.52,33.62,54.03,10.13,9.07,24.41,421.0,110M,768.0,32000.0,512,0.33,MLM
33
+ 32,newmindai/TurkEmbed4STS-Static,StaticModel,Tokenizer,13258.0,7304.0,55.09,4075.0,30.74,45.45,43.05512,57.01745,19.3065,65.30815000000001,32.834500000000006,40.809,63.33800000000001,19.964,12.687,31.996333333333336,244.0,64M,256.0,250002.0,∞,0.4254717954565192,Embedding
34
+ 33,KocLab-Bilkent/BERTurk-Legal,BertForMaskedLM,BertTokenizerFast,27482.0,19590.0,71.28,8228.0,29.94,46.44,42.02,60.61,26.24,59.51,25.8,37.94,61.4,15.51,20.99,32.63,703.0,184M,768.0,128000.0,512,0.34,MLM
35
+ 34,newmindai/Mursit-Large,ModernBertForMaskedLM,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,44.65,41.75,62.95,25.34,58.04,27.4,35.01,42.74,11.29,17.1,23.71,1539.0,403M,1024.0,59008.0,2048,0.28,MLM
36
+ 35,nomic-ai/nomic-embed-text-v1,NomicBertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,41.75,41.66643666666667,47.90213333333333,9.1279,60.08205,34.3415,56.8786,58.672,23.771,15.572,32.67166666666667,521.0,136M,768.0,30528.0,8192,0.426704518889946,Embedding
37
+ 36,ytu-ce-cosmos/turkish-base-bert-uncased,BertForPreTraining,BertTokenizerFast,17128.0,14329.0,83.66,6062.0,35.39,50.54,40.95,66.2,25.68,58.21,20.46,34.2,45.94,10.21,6.28,20.81,421.0,110M,768.0,32000.0,512,0.3,MLM
38
+ 37,nomic-ai/nomic-embed-text-v1.5,NomicBertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,41.21,40.30043666666667,48.92313333333334,9.3571,58.52505,33.8085,50.8884,56.711,13.358,5.783,25.284,521.0,136M,768.0,30528.0,8192,0.4147406606805225,Embedding
39
+ 38,newmindai/Mursit-Base,ModernBertForMaskedLM,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,41.34,40.23,59.78,25.48,58.65,20.82,36.45,36.0,7.4,10.4,17.93,593.0,155M,768.0,59008.0,1024,0.28,MLM
40
+ 39,mixedbread-ai/mxbai-embed-large-v1,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,40.92,40.03663,49.5437,15.9903,56.6587,31.74075,46.2497,43.591,10.564,9.052,21.069,1278.0,335M,1024.0,30522.0,512,0.3720971359650719,Embedding
41
+ 40,jhu-clsp/mmBERT-base,ModernBertForMaskedLM,PreTrainedTokenizerFast,13585.0,5611.0,41.3,5710.0,42.03,43.87,39.65,61.84,26.77,59.25,15.83,34.56,34.45,1.33,0.68,12.15,1170.0,306M,768.0,256000.0,8192,0.34,MLM
42
+ 41,boun-tabilab/TabiBERT,ModernBertForMaskedLM,PreTrainedTokenizerFast,32444.0,20388.0,62.84,12186.0,37.56,42.15,37.77,59.63,25.75,58.19,14.96,30.32,32.02,1.86,0.63,11.5,567.0,148M,768.0,50176.0,8192,0.32,MLM
43
+ 42,sentence-transformers/all-MiniLM-L12-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,33.56,33.19119,44.84295,7.693999999999999,58.1998,20.928,34.2912,38.948,2.771,2.557,14.758666666666668,127.0,33M,384.0,30522.0,512,0.3620264346982647,Embedding
44
+ 43,sentence-transformers/multi-qa-MiniLM-L6-cos-v1,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,33.81,32.343716666666666,44.079283333333336,5.5512,58.2895,24.92,28.8786,36.243,4.816,5.283,15.447333333333336,86.0,22M,384.0,30522.0,512,0.3352620465291676,Embedding
45
+ 44,boun-tabi-LMG/TURNA,T5ForConditionalGeneration,T5TokenizerFast,21630.0,18600.0,85.99,7923.0,36.63,31.74,31.622866666666663,47.17373333333333,10.2619,56.6155,16.333,27.7302,34.89,8.883000000000001,4.55,16.107666666666667,1889.0,495M,1024.0,32128.0,1024,0.2188615462224458,Seq2Seq
46
+ 45,sentence-transformers/all-mpnet-base-v2,MPNetForMaskedLM,MPNetTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,31.51,31.580113333333333,43.75221666666667,10.0253,55.9924,17.051750000000002,31.0789,32.477000000000004,2.243,3.31,12.67666666666667,417.0,109M,768.0,30527.0,514,0.3072208676420578,Embedding
47
+ 46,sentence-transformers/all-MiniLM-L6-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,30.84,30.223826666666668,44.49228333333334,6.576,56.7533,16.46825,26.8293,32.039,3.052,3.514,12.868333333333334,86.0,22M,384.0,30522.0,512,0.3117993950335187,Embedding
48
+ 47,minishlab/potion-base-8M,StaticModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,31.26,30.1419,42.5097,2.2195,57.8614,22.4745,25.6444,46.72,13.243,9.77,23.244333333333334,28.0,7M,256.0,29528.0,∞,0.363850332504128,Embedding
49
+ 48,sentence-transformers/paraphrase-MiniLM-L6-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,29.68,28.88314666666667,44.08553333333333,5.963100000000001,56.6191,14.424,23.324,22.977,4.347,2.266,9.863333333333337,86.0,22M,384.0,30522.0,512,0.3273012423895394,Embedding
50
+ 49,answerdotai/ModernBERT-base,ModernBertForMaskedLM,PreTrainedTokenizerFast,8170.0,3329.0,40.75,2188.0,26.78,22.33,23.8,39.06,2.01,53.95,2.1,21.91,7.92,0.62,0.43,2.99,568.0,149M,768.0,50368.0,8192,0.23,MLM
51
+ 50,answerdotai/ModernBERT-large,ModernBertForMaskedLM,PreTrainedTokenizerFast,8170.0,3329.0,40.75,2188.0,26.78,22.46,23.74,39.44,3.9,53.73,1.8,19.85,6.12,0.62,0.59,2.44,1505.0,394M,1024.0,50368.0,8192,0.2,MLM
52
+ 51,google-bert/bert-base-uncased,BertForMaskedLM,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,22.86,23.49519,40.2581,2.7069,53.06465,2.8455,18.6008,8.535,0.393,0.912,3.2800000000000007,417.0,109M,768.0,30522.0,512,0.1652374209194242,MLM
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
- gradio>=5.49.1
2
  pandas>=2.3.3
3
  numpy>=2.3.4
4
- matplotlib>=3.10.7
 
5
  requests>=2.32.5
6
  python-dotenv>=1.1.1
7
  itsdangerous>=2.2.0
 
1
+ gradio==5.50.0
2
  pandas>=2.3.3
3
  numpy>=2.3.4
4
+ plotly>=6.5.0
5
+ matplotlib>=3.10.0
6
  requests>=2.32.5
7
  python-dotenv>=1.1.1
8
  itsdangerous>=2.2.0
src/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mizan Turkish Leaderboard - HuggingFace Space Version
3
+
4
+ Clean, modular architecture for the public leaderboard.
5
+ """
6
+
7
+ from .core import column_registry, settings
8
+ from .data import DataTransformer, LeaderboardStyler
9
+ from .components import LeaderboardTab, DatasetTab, SubmitTab
10
+
11
+ __all__ = [
12
+ "column_registry",
13
+ "settings",
14
+ "DataTransformer",
15
+ "LeaderboardStyler",
16
+ "LeaderboardTab",
17
+ "DatasetTab",
18
+ "SubmitTab",
19
+ ]
src/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/__init__.cpython-312.pyc and b/src/__pycache__/__init__.cpython-312.pyc differ
 
src/api/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """API client modules."""
2
+
3
+ from .client import EvaluationApiClient
4
+
5
+ __all__ = [
6
+ "EvaluationApiClient",
7
+ ]
src/api/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/api/__pycache__/__init__.cpython-312.pyc and b/src/api/__pycache__/__init__.cpython-312.pyc differ
 
src/api/__pycache__/client.cpython-312.pyc CHANGED
Binary files a/src/api/__pycache__/client.cpython-312.pyc and b/src/api/__pycache__/client.cpython-312.pyc differ
 
src/api/client.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API Client Module
3
+
4
+ Handles communication with the evaluation backend.
5
+ """
6
+
7
+ import logging
8
+ from typing import Optional
9
+ import requests
10
+
11
+ from ..core.config import settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class EvaluationApiClient:
17
+ """
18
+ Client for evaluation API operations.
19
+
20
+ Handles submission of evaluation requests to the backend.
21
+ """
22
+
23
+ def __init__(self):
24
+ self.api_url = settings.api.url
25
+ self.auth = (settings.api.username, settings.api.password)
26
+ self.timeout = settings.api.timeout
27
+
28
+ def submit_evaluation(
29
+ self,
30
+ model_name: str,
31
+ email: str,
32
+ batch_size: int = 32
33
+ ) -> bool:
34
+ """
35
+ Submit an evaluation request to the API.
36
+
37
+ Args:
38
+ model_name: HuggingFace model identifier.
39
+ email: Email for notifications.
40
+ batch_size: Batch size for evaluation.
41
+
42
+ Returns:
43
+ True if submission was successful.
44
+ """
45
+ if not settings.api.is_configured:
46
+ logger.error("API not configured - cannot submit evaluation")
47
+ return False
48
+
49
+ try:
50
+ payload = {
51
+ "model_name": model_name,
52
+ "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
53
+ "batch_size": batch_size,
54
+ "email": email,
55
+ "model_type": "sentence-transformer"
56
+ }
57
+
58
+ response = requests.post(
59
+ f"{self.api_url}/api/mteb/request",
60
+ json=payload,
61
+ timeout=self.timeout,
62
+ auth=self.auth
63
+ )
64
+
65
+ if response.status_code == 200:
66
+ logger.info(f"Evaluation submitted successfully for {model_name}")
67
+ return True
68
+ else:
69
+ logger.error(f"API returned status {response.status_code}")
70
+ return False
71
+
72
+ except Exception as e:
73
+ logger.error(f"Error submitting evaluation: {e}")
74
+ return False
src/components/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """UI Components for Gradio interface."""
2
+
3
+ from .leaderboard import LeaderboardTab
4
+ from .dataset import DatasetTab
5
+ from .submit import SubmitTab
6
+
7
+ __all__ = [
8
+ "LeaderboardTab",
9
+ "DatasetTab",
10
+ "SubmitTab",
11
+ ]
src/components/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/__init__.cpython-312.pyc and b/src/components/__pycache__/__init__.cpython-312.pyc differ
 
src/components/__pycache__/dataset.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/dataset.cpython-312.pyc and b/src/components/__pycache__/dataset.cpython-312.pyc differ
 
src/components/__pycache__/leaderboard.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/leaderboard.cpython-312.pyc and b/src/components/__pycache__/leaderboard.cpython-312.pyc differ
 
src/components/__pycache__/submit.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/submit.cpython-312.pyc and b/src/components/__pycache__/submit.cpython-312.pyc differ
 
src/components/dataset.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset Tab Component
3
+
4
+ Displays task and dataset information.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import html
10
+
11
+
12
+ class DatasetTab:
13
+ """
14
+ Dataset information tab component.
15
+
16
+ Shows details about the evaluation tasks and datasets.
17
+ """
18
+
19
+ def build(self) -> None:
20
+ """Build the dataset tab UI."""
21
+ gr.Markdown("### MTEB Turkish + Turkish Legal Dataset Overview")
22
+
23
+ # Task name to dataset path mapping
24
+ task_to_dataset = {
25
+ 'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
26
+ 'XQuADRetrieval': 'google/xquad',
27
+ 'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
28
+ 'MKQARetrieval': 'apple/mkqa',
29
+ 'MassiveIntentClassification': 'mteb/amazon_massive_intent',
30
+ 'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
31
+ 'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
32
+ 'SIB200Classification': 'mteb/sib200',
33
+ 'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
34
+ 'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
35
+ 'SIB200ClusteringS2S': 'mteb/sib200',
36
+ 'XNLI': 'mteb/xnli',
37
+ 'XNLIV2': 'mteb/xnli2.0-multi-pair',
38
+ 'STS22.v2': 'mteb/sts22-crosslingual-sts'
39
+ }
40
+
41
+ # Create clickable task names
42
+ clickable_task_names = []
43
+ task_list = [
44
+ 'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
45
+ 'MassiveIntentClassification', 'MassiveScenarioClassification',
46
+ 'MultilingualSentimentClassification', 'SIB200Classification',
47
+ 'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
48
+ 'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
49
+ ]
50
+
51
+ for task_name in task_list:
52
+ dataset_path = task_to_dataset[task_name]
53
+ hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
54
+ clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{html.escape(task_name)}</a>'
55
+ clickable_task_names.append(clickable_name)
56
+
57
+ # Create dataset information table
58
+ dataset_data = pd.DataFrame({
59
+ 'Task Name': clickable_task_names,
60
+ 'Task Type': [
61
+ 'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
62
+ 'Classification', 'Classification',
63
+ 'Classification', 'Classification',
64
+ 'Classification', 'Classification',
65
+ 'Clustering', 'PairClassification', 'PairClassification', 'STS'
66
+ ],
67
+ 'Description': [
68
+ 'Turkish FAQ retrieval task',
69
+ 'Turkish question answering retrieval',
70
+ 'Historical Turkish document retrieval',
71
+ 'Multilingual knowledge QA retrieval',
72
+ 'Intent classification for Turkish',
73
+ 'Scenario classification for Turkish',
74
+ 'Multilingual sentiment classification',
75
+ 'SIB200 language identification',
76
+ 'Turkish movie review sentiment',
77
+ 'Turkish product review sentiment',
78
+ 'SIB200 clustering task',
79
+ 'Turkish natural language inference',
80
+ 'Enhanced Turkish NLI task',
81
+ 'Turkish semantic textual similarity'
82
+ ],
83
+ 'Domain': [
84
+ 'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
85
+ 'Intent', 'Scenario',
86
+ 'Sentiment', 'Language ID',
87
+ 'Movies', 'Products',
88
+ 'Language ID', 'NLI', 'NLI', 'STS'
89
+ ],
90
+ 'Samples': [
91
+ '~145K', '~1.19K', '~1.33K', '~10K',
92
+ '~5K', '~5K',
93
+ '211', '~899',
94
+ '~2.64K', '800',
95
+ '99', '~7.5K', '~5.01K', '~208'
96
+ ]
97
+ })
98
+
99
+ gr.Dataframe(
100
+ value=dataset_data,
101
+ label="MTEB Turkish Task Details",
102
+ interactive=False,
103
+ wrap=True,
104
+ datatype=["html", "str", "str", "str", "str"]
105
+ )
106
+
107
+ # Turkish Legal Tasks Section
108
+ self._build_legal_tasks_section()
109
+
110
+ # Task distribution
111
+ self._build_task_distribution_section()
112
+
113
+ # Metrics explanation
114
+ self._build_metrics_explanation_section()
115
+
116
+ def _build_legal_tasks_section(self):
117
+ """Build the Turkish Legal Tasks section."""
118
+ gr.Markdown("---")
119
+ gr.Markdown("### Turkish Legal Tasks")
120
+
121
+ legal_task_to_dataset = {
122
+ 'TurkishLegalQA': 'newmindai/contract-retrieval',
123
+ 'TurkishTaxRulings': 'newmindai/regulation-retrieval',
124
+ 'TurkishCourtOfCassation': 'newmindai/caselaw-retrieval'
125
+ }
126
+
127
+ clickable_legal_task_names = []
128
+ for task_name in ['TurkishLegalQA', 'TurkishTaxRulings', 'TurkishCourtOfCassation']:
129
+ dataset_path = legal_task_to_dataset[task_name]
130
+ hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
131
+ clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{html.escape(task_name)}</a>'
132
+ clickable_legal_task_names.append(clickable_name)
133
+
134
+ legal_task_data = pd.DataFrame({
135
+ 'Task Name': clickable_legal_task_names,
136
+ 'Task Type': ['Contracts', 'Regulation', 'Case Law'],
137
+ 'Description': [
138
+ 'Turkish legal question answering retrieval',
139
+ 'Turkish legal tax rulings retrieval',
140
+ 'Turkish Court of Cassation caselaw retrieval'
141
+ ],
142
+ 'Domain': ['Contracts', 'Regulation', 'Caselaw'],
143
+ 'Samples': ['272', '~120K', '~1.39K']
144
+ })
145
+
146
+ gr.Dataframe(
147
+ value=legal_task_data,
148
+ label="Turkish Legal Task Details",
149
+ interactive=False,
150
+ wrap=True,
151
+ datatype=["html", "str", "str", "str", "str"]
152
+ )
153
+
154
+ def _build_task_distribution_section(self):
155
+ """Build the task distribution section."""
156
+ gr.Markdown("""
157
+ ### Task Distribution:
158
+
159
+ **Turkish Tasks (14):**
160
+ - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
161
+ - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
162
+ - **Pair Classification**: 2 tasks (natural language inference)
163
+ - **Clustering**: 1 task (language clustering)
164
+ - **STS**: 1 task (semantic textual similarity)
165
+
166
+ **Turkish Legal Tasks (3):**
167
+ - **Contracts**: 1 task (Turkish legal QA retrieval)
168
+ - **Regulation**: 1 task (Turkish tax rulings retrieval)
169
+ - **Caselaw**: 1 task (Turkish Court of Cassation case law retrieval)
170
+
171
+ **Total: 17 tasks across 8 categories**
172
+ """)
173
+
174
+ # Statistics summary
175
+ stats_data = pd.DataFrame({
176
+ 'Metric': [
177
+ 'Total Tasks',
178
+ 'Turkish Tasks',
179
+ 'Legal Tasks',
180
+ 'Task Categories',
181
+ 'Languages',
182
+ 'Avg. Tokens per Sample'
183
+ ],
184
+ 'Value': [
185
+ '17 tasks',
186
+ '14 tasks',
187
+ '3 tasks',
188
+ '8 categories',
189
+ 'Turkish',
190
+ '~150 tokens'
191
+ ],
192
+ 'Notes': [
193
+ 'Comprehensive evaluation: Turkish NLP + Legal',
194
+ 'Classification, Retrieval, STS, NLI, Clustering',
195
+ 'Contracts, Regulation, Caselaw',
196
+ 'Turkish: 5 types, Legal: 3 types',
197
+ 'Turkish-focused',
198
+ 'Varies by task type and domain'
199
+ ]
200
+ })
201
+
202
+ gr.Dataframe(
203
+ value=stats_data,
204
+ label="Dataset Statistics Summary",
205
+ interactive=False
206
+ )
207
+
208
+ def _build_metrics_explanation_section(self):
209
+ """Build the metrics explanation section."""
210
+ gr.Markdown("""
211
+ ---
212
+ ### Metrics Explanation:
213
+
214
+ **Task Categories:**
215
+ - **MTEB Score**: Average performance by task categories (refers to Mean (TaskType))
216
+ - **Mean (Task)**: Average performance across all individual tasks
217
+ - **Classification**: Performance on Turkish classification tasks
218
+ - **Clustering**: Performance on Turkish clustering tasks
219
+ - **Pair Classification**: Performance on pair classification tasks (like NLI)
220
+ - **Retrieval**: Performance on Turkish information retrieval tasks
221
+ - **STS**: Performance on Semantic Textual Similarity tasks
222
+
223
+ **Turkish Legal Categories:**
224
+ - **Contracts**: Performance on Turkish legal contract analysis tasks
225
+ - **Regulation**: Performance on Turkish legal regulation analysis tasks
226
+ - **Caselaw**: Performance on Turkish Court of Cassation case law retrieval tasks
227
+
228
+ ### Tokenizer Quality Metrics:
229
+ - **Unique Token Count**: Number of unique tokens generated by the tokenizer on Turkish MMLU dataset
230
+ - **Turkish Token Count**: How many unique tokens are valid Turkish words/morphemes
231
+ - **Turkish Token %**: Percentage of unique tokens that are linguistically valid Turkish
232
+ - **Pure Token Count**: How many unique tokens are morphologically pure (root words)
233
+ - **Pure Token %**: Percentage of unique tokens that are root words without suffixes
234
+
235
+ ### Model Information:
236
+ - **Parameters**: Number of model parameters
237
+ - **Embed Dim**: Embedding dimension size
238
+ - **Max Seq Length**: Maximum sequence length the model can process
239
+ - **Vocab Size**: Size of the model's vocabulary
240
+ - **Model Architecture**: The underlying model architecture
241
+ - **Tokenizer Type**: The tokenizer implementation used
242
+ """)
243
+
244
+ # About, Contact, and Links section
245
+ self._build_about_section()
246
+
247
+ def _build_about_section(self):
248
+ """Build the about, contact, and links section."""
249
+ gr.Markdown("""
250
+ ---
251
+ ### About Mizan:
252
+ This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
253
+ on Turkish language tasks across multiple domains including:
254
+ - Text classification and sentiment analysis
255
+ - Information retrieval and search
256
+ - Semantic textual similarity
257
+ - Text clustering and pair classification
258
+ - **Turkish Legal**: Contract analysis, regulation, and case law retrieval
259
+
260
+ ### Submit Your Model:
261
+ Use the **Submit** tab to submit your Turkish embedding model for evaluation.
262
+ Your request will be reviewed by administrators and you'll receive email notifications about the progress.
263
+
264
+ ### Contact:
265
+ For any questions or feedback, please contact info@newmind.ai
266
+
267
+ ### Links:
268
+ - **GitHub**: [embeddings-benchmark/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
269
+ - **Github**: [malibayram/tokenizer_benchmark](https://github.com/malibayram/tokenizer_benchmark) - Tokenizer evaluation is done with code from this repository, developed by Mehmet Ali Bayram, which utilizes ITU NLP tools for Turkish linguistic analysis.
270
+ """)
src/components/leaderboard.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Leaderboard Tab Component
3
+
4
+ Main leaderboard display with column filtering.
5
+ """
6
+
7
+ import logging
8
+ from typing import Dict, List, Optional
9
+ import gradio as gr
10
+ import pandas as pd
11
+ import numpy as np
12
+ import plotly.graph_objects as go
13
+
14
+ from ..core.columns import column_registry, ColumnGroup
15
+ from ..core.config import settings
16
+ from ..data import DataTransformer, LeaderboardStyler
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class LeaderboardTab:
    """
    Leaderboard tab component.

    Displays the main ranking table with:
    - Color-coded scores
    - Column filtering via checkbox groups
    - Clickable model links

    Optional-column selection is tracked in click order through a gr.State
    list so newly enabled columns are appended after the default columns in
    the order the user selected them.
    """

    def __init__(self, data: pd.DataFrame):
        # Full, unfiltered leaderboard data; per-event filtered copies are
        # derived from it in _get_styled_data.
        self.data = data
        self.transformer = DataTransformer()
        self.styler = LeaderboardStyler()

        # UI components (will be set during build)
        self.leaderboard: Optional[gr.Dataframe] = None
        self._column_checkboxes: Dict[str, gr.CheckboxGroup] = {}
        self._selected_columns_state: Optional[gr.State] = None
        self._model_type_filter_state: Optional[gr.State] = None
        # NOTE(review): _search_state is never assigned in build() — appears unused.
        self._search_state: Optional[gr.State] = None

    def _get_styled_data(
        self,
        columns: List[str],
        model_type_filter: str = "All"
    ) -> "pd.io.formats.style.Styler":
        """Get styled DataFrame restricted to `columns`, optionally filtered by model type.

        Falls back to an empty styled frame when no data is loaded.
        """
        if self.data is None or self.data.empty:
            empty = self.transformer.create_empty_dataframe()
            return empty.style

        # Apply model type filter ("All" disables filtering)
        filtered_data = self.data.copy()
        if model_type_filter != "All" and "Model Type" in filtered_data.columns:
            filtered_data = filtered_data[filtered_data["Model Type"] == model_type_filter]

        filtered = self.transformer.prepare_for_display(filtered_data, columns, add_links=False)
        return self.styler.apply_styling(filtered)

    def _get_column_groups(self) -> Dict[str, List[str]]:
        """Get optional columns organized by group (exclude default columns).

        Returns a mapping of UI group label -> column display names; groups
        whose columns are all defaults are omitted entirely.
        """
        groups = {}

        # Get default column names to exclude
        default_cols = set(column_registry.default_columns)

        # MTEB Task Scores (only optional ones)
        mteb_cols = [col for col in column_registry.get_group_names(ColumnGroup.MTEB) if col not in default_cols]
        if mteb_cols:
            groups["MTEB Scores"] = mteb_cols

        # Legal Task Scores (only optional ones)
        legal_cols = [col for col in column_registry.get_group_names(ColumnGroup.LEGAL) if col not in default_cols]
        if legal_cols:
            groups["Legal Scores"] = legal_cols

        # Correlation
        corr_cols = [col for col in column_registry.get_group_names(ColumnGroup.CORRELATION) if col not in default_cols]
        if corr_cols:
            groups["Correlation"] = corr_cols

        # Tokenizer Quality (only optional ones)
        tok_cols = [col for col in column_registry.get_group_names(ColumnGroup.TOKENIZER) if col not in default_cols]
        if tok_cols:
            groups["Tokenizer Quality"] = tok_cols

        # Additional Model Info (only optional ones)
        model_info_cols = [col for col in column_registry.get_group_names(ColumnGroup.MODEL_INFO) if col not in default_cols]
        if model_info_cols:
            groups["Model Info"] = model_info_cols

        return groups

    def _filter_columns_handler(
        self,
        previous_selected: List[str],
        model_type_filter: str,
        *checkbox_values
    ) -> tuple:
        """Handle checkbox group changes with click-order tracking.

        Returns (dataframe update, new ordered selection list) so the gr.State
        stays in sync with the table.
        """
        # Collect all currently selected columns from all checkbox groups
        currently_selected = set()
        for selected_list in checkbox_values:
            if selected_list:
                for col_name in selected_list:
                    currently_selected.add(col_name)

        previous_set = set(previous_selected)

        # Find newly added columns (in current but not in previous)
        newly_added = currently_selected - previous_set

        # Find removed columns (in previous but not in current)
        removed = previous_set - currently_selected

        # Update the ordered list: keep previous order, remove deselected, append new
        # NOTE(review): newly_added is a set, so if one event adds several
        # columns their relative order is arbitrary; per-click events add one
        # at a time, where this doesn't matter.
        updated_selected = [col for col in previous_selected if col not in removed]
        for col in newly_added:
            updated_selected.append(col)

        # Build final column list: defaults + selected optional in order
        ordered_columns = list(column_registry.default_columns) + updated_selected

        # Get styled data with model type filter
        styled = self._get_styled_data(ordered_columns, model_type_filter)
        datatypes = self.styler.get_datatypes(ordered_columns)
        widths = self.styler.get_column_widths(ordered_columns)

        return gr.update(value=styled, datatype=datatypes, column_widths=widths), updated_selected

    def _model_type_filter_handler(self, previous_selected: List[str], model_type_filter: str) -> tuple:
        """Handle model type filter changes (table only; plots untouched).

        NOTE(review): not wired in _setup_events, which uses
        _model_type_and_plots_handler instead — possibly dead code.
        """
        # Build final column list: defaults + selected optional in order
        ordered_columns = list(column_registry.default_columns) + previous_selected

        # Get styled data with model type filter
        styled = self._get_styled_data(ordered_columns, model_type_filter)
        datatypes = self.styler.get_datatypes(ordered_columns)
        widths = self.styler.get_column_widths(ordered_columns)

        return gr.update(value=styled, datatype=datatypes, column_widths=widths), model_type_filter

    def _model_type_and_plots_handler(
        self,
        previous_selected: List[str],
        model_type_filter: str
    ) -> tuple:
        """Handle model type filter changes and update both leaderboard and plots."""
        # Build final column list: defaults + selected optional in order
        ordered_columns = list(column_registry.default_columns) + previous_selected

        # Get styled data with model type filter
        styled = self._get_styled_data(ordered_columns, model_type_filter)
        datatypes = self.styler.get_datatypes(ordered_columns)
        widths = self.styler.get_column_widths(ordered_columns)

        # Update plots with filtered data
        plot1 = self._get_pure_vs_mean_task_plot(model_type_filter)
        plot2 = self._get_pure_vs_legal_score_plot(model_type_filter)

        return gr.update(value=styled, datatype=datatypes, column_widths=widths), model_type_filter, plot1, plot2


    def _create_bubble_plot(self, x_col: str, y_col: str, size_col: str,
                            title: str, xlabel: str, ylabel: str, model_type_filter: str = "All") -> Optional[go.Figure]:
        """
        Create an interactive Plotly bubble plot for tokenizer visualization.

        Features:
        - Interactive hover, zoom, pan
        - Text annotations on bubbles
        - Viridis colormap
        - Matches matplotlib styling
        - Model type filtering

        Returns None (rather than raising) when the CSV is missing, required
        columns are absent, no rows survive filtering, or any error occurs —
        gr.Plot renders an empty plot for None.

        NOTE(review): re-reads the CSV from disk on every call instead of
        reusing self.data — confirm this is intentional (e.g. to include
        columns dropped from the display frame).
        """
        try:
            # Load leaderboard summary
            file_path = settings.data.csv_file
            if not file_path.exists():
                logger.warning(f"Leaderboard data not found: {file_path}")
                return None

            df = pd.read_csv(file_path)

            # Apply column name mappings from CSV to display names
            csv_mapping = column_registry.get_csv_mapping()
            df = df.rename(columns=csv_mapping)

            # Apply model type filter
            if model_type_filter != "All" and "Model Type" in df.columns:
                df = df[df["Model Type"] == model_type_filter]

            # Filter rows that have the required columns
            required_cols = [x_col, y_col, size_col, 'Model']
            if not all(col in df.columns for col in required_cols):
                logger.warning(f"Missing required columns for plot")
                return None

            # Filter out rows with missing data
            plot_df = df[required_cols].copy()
            plot_df = plot_df.dropna(subset=[x_col, y_col, size_col])

            if plot_df.empty:
                logger.warning(f"No data available for plotting {x_col} vs {y_col}")
                return None

            # Prepare data
            x = plot_df[x_col]
            y = plot_df[y_col]
            sizes = plot_df[size_col]
            models = plot_df['Model']

            # Normalize sizes for bubble plot (smaller bubbles for cleaner look)
            # Linear map of size_col into the 8..43 px marker range; constant
            # 20 px when all sizes are equal (avoids division by zero).
            size_min, size_max = sizes.min(), sizes.max()
            if size_max > size_min:
                normalized_sizes = 8 + (sizes - size_min) / (size_max - size_min) * 35
            else:
                normalized_sizes = np.full(len(sizes), 20)

            # Create Plotly figure
            fig = go.Figure()

            # Add scatter trace with bubbles
            fig.add_trace(go.Scatter(
                x=x,
                y=y,
                mode='markers',
                marker=dict(
                    size=normalized_sizes,
                    color=sizes,  # Color by Turkish Token Count
                    colorscale='Viridis',
                    showscale=True,
                    colorbar=dict(
                        title=dict(text="Turkish<br>Token<br>Count", font=dict(size=12, family='Arial, sans-serif')),
                        thickness=12,
                        len=1
                    ),
                    line=dict(width=0.5, color='rgba(0,0,0,0.3)'),
                    opacity=0.7
                ),
                text=models,
                # %{...} placeholders are resolved by Plotly at hover time;
                # {{ }} escapes them inside the f-strings.
                hovertemplate='<b>%{text}</b><br>' +
                              f'{xlabel}: %{{x:.2f}}<br>' +
                              f'{ylabel}: %{{y:.0f}}<br>' +
                              f'{size_col}: %{{marker.color:.0f}}<br>' +
                              '<extra></extra>',
                name='',
                showlegend=False
            ))

            # Get top 5 models by Pure Token Count for custom legend
            top_5_df = plot_df.nlargest(5, y_col)
            top_5_models = top_5_df['Model'].tolist()

            # Build custom legend text using annotations (pixel-perfect control)
            legend_lines = ["<b>Top 5 Models</b>"] + [f"{i}. {name}" for i, name in enumerate(top_5_models, 1)]
            legend_text = "<br>".join(legend_lines)

            # Update layout for responsive, clean display
            fig.update_layout(
                title=dict(
                    text=title,
                    font=dict(size=14, family='Arial, sans-serif', color='black'),
                    x=0.5,
                    xanchor='center',
                    y=0.98,
                    yanchor='top'
                ),
                xaxis=dict(
                    title=dict(text=xlabel, font=dict(size=12, family='Arial, sans-serif')),
                    gridcolor='rgba(128,128,128,0.2)',
                    gridwidth=0.5,
                    showgrid=True,
                    zeroline=False
                ),
                yaxis=dict(
                    title=dict(text=ylabel, font=dict(size=12, family='Arial, sans-serif')),
                    gridcolor='rgba(128,128,128,0.2)',
                    gridwidth=0.5,
                    showgrid=True,
                    zeroline=False
                ),
                plot_bgcolor='white',
                paper_bgcolor='white',
                autosize=True,
                hovermode='closest',
                showlegend=False,
                margin=dict(l=60, r=60, t=80, b=60)
            )

            # Add custom legend as annotation (paper coordinates place it
            # just outside the top-right of the plotting area)
            fig.add_annotation(
                text=legend_text,
                xref='paper',
                yref='paper',
                x=1.14,
                y=1.255,
                xanchor='right',
                yanchor='top',
                showarrow=False,
                font=dict(size=9, family='Arial, sans-serif', color='#333'),
                align='left',
                bgcolor='rgba(255,255,255,0.9)',
                bordercolor='rgba(0,0,0,0.15)',
                borderwidth=1,
                borderpad=4
            )

            # Expand x-axis range for better spacing (5% padding each side)
            x_min, x_max = x.min(), x.max()
            x_range = x_max - x_min
            fig.update_xaxes(range=[x_min - x_range * 0.05, x_max + x_range * 0.05])

            return fig

        except Exception as e:
            logger.error(f"Error creating bubble plot: {e}")
            return None

    def _get_pure_vs_mean_task_plot(self, model_type_filter: str = "All") -> Optional[go.Figure]:
        """Get Plotly figure for Pure Token Count vs MTEB Score plot."""
        return self._create_bubble_plot(
            x_col='MTEB Score',
            y_col='Pure Token Count',
            size_col='Turkish Token Count',
            title='Pure Token Count vs MTEB Score',
            xlabel='MTEB Score (%)',
            ylabel='Pure Token Count',
            model_type_filter=model_type_filter
        )

    def _get_pure_vs_legal_score_plot(self, model_type_filter: str = "All") -> Optional[go.Figure]:
        """Get Plotly figure for Pure Token Count vs Legal Score plot."""
        return self._create_bubble_plot(
            x_col='Legal Score',
            y_col='Pure Token Count',
            size_col='Turkish Token Count',
            title='Pure Token Count vs Legal Score',
            xlabel='Legal Score (%)',
            ylabel='Pure Token Count',
            model_type_filter=model_type_filter
        )

    def build(self) -> gr.Dataframe:
        """
        Build the leaderboard tab UI.

        Must be called inside an active gr.Blocks context; creates states,
        filters, the table, both plots, and wires all event handlers.

        Returns:
            The main leaderboard Dataframe component.
        """
        # Initial styled data (filter to All by default)
        initial_columns = column_registry.default_columns
        initial_styled = self._get_styled_data(initial_columns, "All")
        initial_datatypes = self.styler.get_datatypes(initial_columns)
        initial_widths = self.styler.get_column_widths(initial_columns)

        # State to track selected columns in click order
        self._selected_columns_state = gr.State([])

        # State to track model type filter
        self._model_type_filter_state = gr.State("All")

        # Get column groups
        column_groups = self._get_column_groups()

        # Model Type Filter (Radio buttons)
        model_type_choices = ["All", "CLM-Embedding", "Embedding", "MLM", "Seq2Seq"]
        model_type_radio = gr.Radio(
            choices=model_type_choices,
            value="All",
            label="Filter by Model Type",
            container=True,
        )

        # Create checkbox groups in a compact accordion layout
        checkbox_components = []

        with gr.Accordion("Optional Columns", open=False):
            with gr.Row():
                for group_name, columns in column_groups.items():
                    checkbox = gr.CheckboxGroup(
                        choices=columns,
                        value=[],
                        label=group_name,
                        container=True,
                    )
                    self._column_checkboxes[group_name] = checkbox
                    checkbox_components.append(checkbox)

        # Main leaderboard
        self.leaderboard = gr.Dataframe(
            value=initial_styled,
            datatype=initial_datatypes,
            column_widths=initial_widths,
            interactive=False,
            wrap=True,
            max_height=settings.ui.max_table_height,
            show_search=True,
            show_copy_button=True,
            show_fullscreen_button=True,
        )

        # Tokenizer visualizations
        gr.Markdown("### Tokenizer Quality Visualizations")
        gr.Markdown("""
        Interactive bubble plots showing tokenizer quality metrics vs model performance.
        Bubble size and color represent Turkish Token Count. Hover for details, zoom, and pan.
        """)

        with gr.Row():
            # Plot 1: Pure Token Count vs Mean Task
            self.plot_mean_task = gr.Plot(
                value=self._get_pure_vs_mean_task_plot("All"),
                label="Pure Token Count vs Mean Task (MTEB)",
                show_label=False,
            )

            # Plot 2: Pure Token Count vs Legal Score
            self.plot_legal_score = gr.Plot(
                value=self._get_pure_vs_legal_score_plot("All"),
                label="Pure Token Count vs Score(Legal)",
                show_label=False,
            )

        # Usage instructions
        gr.Markdown("""
        ### How to Use:
        - **Search**: Use the search box to find specific models
        - **Color Coding**: Scores are color-coded from red (low) to green (high)
        - **Sorting**: Click on column headers to sort
        - **Rankings**: Models ranked by MTEB Score
        - **Toggle Columns**: Use the checkboxes above to show/hide additional metrics
        - **Filter by Model Type**: Use the radio buttons to filter models by their type
        """)

        # Wire up events
        self._setup_events(checkbox_components, model_type_radio)

        return self.leaderboard

    def _setup_events(
        self,
        checkbox_components: List[gr.CheckboxGroup],
        model_type_radio: gr.Radio
    ):
        """Set up event handlers.

        Every checkbox group receives ALL groups as inputs so the handler can
        rebuild the full selection regardless of which group fired.
        """
        # Each checkbox group triggers column filtering with state tracking
        for checkbox in checkbox_components:
            checkbox.change(
                fn=self._filter_columns_handler,
                inputs=[self._selected_columns_state, self._model_type_filter_state] + checkbox_components,
                outputs=[self.leaderboard, self._selected_columns_state]
            )

        # Model type radio triggers filtering and plot updates
        model_type_radio.change(
            fn=self._model_type_and_plots_handler,
            inputs=[self._selected_columns_state, model_type_radio],
            outputs=[self.leaderboard, self._model_type_filter_state, self.plot_mean_task, self.plot_legal_score]
        )
src/components/submit.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Submit Tab Component
3
+
4
+ Model evaluation submission with HuggingFace authentication.
5
+ """
6
+
7
+ import logging
8
+ import re
9
+ from typing import Optional, Tuple
10
+ import gradio as gr
11
+
12
+ from ..api import EvaluationApiClient
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class SubmitTab:
    """
    Submit evaluation tab component.

    Provides:
    - HuggingFace OAuth login
    - Model submission form
    - Email notification setup

    All user feedback is returned as inline HTML rendered in a gr.HTML
    component; validation failures never raise.
    """

    def __init__(self):
        self.api_client = EvaluationApiClient()

        # UI components (will be set during build)
        self.model_input: Optional[gr.Textbox] = None
        self.email_input: Optional[gr.Textbox] = None
        self.submit_btn: Optional[gr.Button] = None
        self.login_button: Optional[gr.LoginButton] = None
        self.result_output: Optional[gr.HTML] = None

    def _validate_model_name(self, model_name: str) -> Optional[str]:
        """Validate model name format.

        Returns an error message string, or None when the name is valid.
        Expected format: "organization/model-name" (HF repo id).
        """
        if not model_name or not model_name.strip():
            return "Model name cannot be empty!"

        model_name = model_name.strip()

        if len(model_name) < 3:
            return "Model name too short!"

        if len(model_name) > 256:
            return "Model name too long (maximum 256 characters)!"

        if '/' not in model_name:
            return "Invalid format! Must include organization (e.g., organization/model-name)"

        # Exactly one org segment and one model segment, each limited to
        # alphanumerics plus . _ -
        if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
            return "Invalid format! Use format: organization/model-name"

        return None

    def _validate_email(self, email: str) -> Optional[str]:
        """Validate email format.

        Returns an error message string, or None when the address passes the
        basic pattern check (not a full RFC 5322 validation).
        """
        if not email or not email.strip():
            return "Email address cannot be empty!"

        email = email.strip()

        if len(email) > 254:
            return "Email address too long!"

        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        if not re.match(email_pattern, email):
            return "Invalid email address format!"

        return None

    def _handle_submit(self, model_name: str, email: str, profile) -> str:
        """Handle evaluation submission; returns an HTML status fragment.

        `profile` comes from the LoginButton input: presumably an OAuth
        profile object when logged in, None when absent, and the button's
        label string in local dev — TODO confirm against the Gradio version
        in use.
        """
        # Authentication check
        if profile is None:
            return "<p style='color: red; font-weight: bold;'>⚠️ Authentication required. Please log in with your Hugging Face account.</p>"

        # Check for local dev mock auth
        if isinstance(profile, str) and profile == "Sign in with Hugging Face":
            return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"

        # Validate model name
        model_error = self._validate_model_name(model_name)
        if model_error:
            return f"<p style='color: red; font-weight: bold;'>❌ {model_error}</p>"

        # Validate email
        email_error = self._validate_email(email)
        if email_error:
            return f"<p style='color: red; font-weight: bold;'>❌ {email_error}</p>"

        # Submit to API
        model_name = model_name.strip()
        email = email.strip()

        try:
            success = self.api_client.submit_evaluation(model_name, email)

            if success:
                return f"""
                <div style='padding: 16px; background: #d4edda; border-radius: 8px; border: 1px solid #c3e6cb; color: #155724;'>
                    <h3 style='color: #155724; margin: 0 0 12px 0;'>✅ Evaluation Request Submitted!</h3>
                    <p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Model:</strong> {model_name}</p>
                    <p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Email:</strong> {email}</p>
                    <hr style='margin: 12px 0; border-color: #c3e6cb;'>
                    <p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Next Steps:</strong></p>
                    <ul style='color: #155724; margin: 8px 0; padding-left: 20px;'>
                        <li style='color: #155724;'>Your request will be reviewed by our system</li>
                        <li style='color: #155724;'>You will receive email notifications about the status</li>
                        <li style='color: #155724;'>Results will appear on the leaderboard when complete</li>
                    </ul>
                    <p style='color: #155724; margin-top: 12px; font-style: italic;'>Thank you for contributing to the Mizan Leaderboard!</p>
                </div>
                """
            else:
                return """
                <div style='padding: 16px; background: #f8d7da; border-radius: 8px; border: 1px solid #f5c6cb;'>
                    <h3 style='color: #721c24; margin: 0 0 8px 0;'>❌ Submission Failed</h3>
                    <p>Unable to connect to the evaluation service. Please try again later.</p>
                </div>
                """
        except Exception as e:
            # Boundary handler: never surface raw exceptions to the UI.
            logger.error(f"Error submitting evaluation: {e}")
            return f"""
            <div style='padding: 16px; background: #f8d7da; border-radius: 8px; border: 1px solid #f5c6cb;'>
                <h3 style='color: #721c24; margin: 0 0 8px 0;'>❌ Error</h3>
                <p>An unexpected error occurred. Please try again later.</p>
            </div>
            """

    def build(self) -> None:
        """Build the submit tab UI.

        Must be called inside an active gr.Blocks context.
        """
        gr.Markdown("### Submit Model for Evaluation")
        gr.Markdown("""
        Submit your Turkish embedding model for evaluation on the Mizan benchmark.
        **Authentication with Hugging Face is required to submit evaluations.**
        """)

        # OAuth login button
        self.login_button = gr.LoginButton(value="Sign in with Hugging Face")

        self.model_input = gr.Textbox(
            label="Model Name",
            placeholder="sentence-transformers/your-model",
            info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
        )

        self.email_input = gr.Textbox(
            label="Email Address",
            placeholder="your.email@example.com",
            info="Email for notifications about evaluation status and results"
        )

        self.submit_btn = gr.Button(
            "Submit",
            variant="primary",
            size="lg"
        )

        # Result output
        self.result_output = gr.HTML(label="Status")

        # Wire up submit button; the login button as an input supplies the
        # `profile` argument of _handle_submit.
        self.submit_btn.click(
            fn=self._handle_submit,
            inputs=[self.model_input, self.email_input, self.login_button],
            outputs=[self.result_output]
        )

        # Information about the evaluation process
        gr.Markdown("""
        ### Evaluation Process:
        1. **Sign In**: First, sign in with your Hugging Face account using the button above
        2. **Submit Request**: Fill out the form with your model details and email
        3. **Admin Review**: Your request will be reviewed by administrators
        4. **Evaluation**: If approved, your model will be evaluated on Mizan benchmark
        5. **Results**: You'll receive email notifications and results will appear on the leaderboard

        ### Important Notes:
        - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
        - You'll receive email updates about your request status
        - Make sure your model is publicly available on HuggingFace
        - Valid email address is required for receiving results
        """)
src/core/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core modules - configuration and column definitions."""
2
+
3
+ from .columns import column_registry, ColumnType, ColumnGroup, ColumnDefinition
4
+ from .config import settings
5
+
6
+ __all__ = [
7
+ "column_registry",
8
+ "ColumnType",
9
+ "ColumnGroup",
10
+ "ColumnDefinition",
11
+ "settings",
12
+ ]
src/core/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/core/__pycache__/__init__.cpython-312.pyc and b/src/core/__pycache__/__init__.cpython-312.pyc differ
 
src/core/__pycache__/columns.cpython-312.pyc CHANGED
Binary files a/src/core/__pycache__/columns.cpython-312.pyc and b/src/core/__pycache__/columns.cpython-312.pyc differ
 
src/core/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/src/core/__pycache__/config.cpython-312.pyc and b/src/core/__pycache__/config.cpython-312.pyc differ
 
src/core/columns.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized Column Definitions
3
+
4
+ Single source of truth for all leaderboard columns.
5
+ Add new columns here and they propagate everywhere automatically.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from enum import Enum, auto
10
+ from typing import List, Dict, Optional
11
+
12
+
13
class ColumnType(Enum):
    """Column data types for Gradio.

    Values are the datatype strings the gr.Dataframe component accepts.
    """
    NUMBER = "number"  # numeric cell (sortable as number)
    STRING = "str"     # plain text cell
    HTML = "html"      # rendered HTML (e.g. model links)
18
+
19
+
20
class ColumnGroup(Enum):
    """Column groupings for organization and filtering.

    auto() values depend on member order — do not reorder members.
    """
    CORE = auto()         # Always visible: Rank, Model
    LEGAL = auto()        # Legal benchmark scores
    MTEB = auto()         # MTEB task type scores
    TOKENIZER = auto()    # Tokenizer quality metrics
    MODEL_INFO = auto()   # Model metadata
    CORRELATION = auto()  # Correlation metrics
28
+
29
+
30
@dataclass
class ColumnDefinition:
    """
    Single source of truth for one leaderboard column.

    Every piece of column metadata (display name, CSV key, type, grouping,
    formatting, visibility) lives on this record.
    """
    name: str                                       # display name shown in the UI
    api_name: Optional[str] = None                  # key in CSV/API when it differs from name
    column_type: ColumnType = ColumnType.STRING     # Gradio cell datatype
    group: ColumnGroup = ColumnGroup.CORE           # logical grouping for filters
    width: str = "120px"                            # column width (CSS)
    decimals: int = 2                               # decimal places for numeric display
    default_visible: bool = True                    # shown before any user selection
    colorize: bool = False                          # apply score color gradient
    description: str = ""                           # tooltip/help text

    @property
    def csv_key(self) -> str:
        """Key under which this column appears in CSV files."""
        if self.api_name:
            return self.api_name
        return self.name
51
+
52
+
53
+ COLUMN_DEFINITIONS: List[ColumnDefinition] = [
54
+ # 1. Rank (always first)
55
+ ColumnDefinition(
56
+ name="Rank",
57
+ column_type=ColumnType.NUMBER,
58
+ group=ColumnGroup.CORE,
59
+ width="50px",
60
+ decimals=0,
61
+ default_visible=True,
62
+ description="Rank by MTEB Score (Mean TaskType)"
63
+ ),
64
+ # 2. Model (always second)
65
+ ColumnDefinition(
66
+ name="Model",
67
+ column_type=ColumnType.HTML,
68
+ group=ColumnGroup.CORE,
69
+ width="280px",
70
+ default_visible=True,
71
+ colorize=False,
72
+ description="Model name with HuggingFace link"
73
+ ),
74
+ # 3. MTEB Score - default
75
+ ColumnDefinition(
76
+ name="MTEB Score",
77
+ api_name="Mean (TaskType)",
78
+ column_type=ColumnType.NUMBER,
79
+ group=ColumnGroup.MTEB,
80
+ width="140px",
81
+ default_visible=True,
82
+ colorize=True,
83
+ description="MTEB Score: Average of task type category scores"
84
+ ),
85
+ # 4. Legal Score - default
86
+ ColumnDefinition(
87
+ name="Legal Score",
88
+ api_name="Score(Legal)",
89
+ column_type=ColumnType.NUMBER,
90
+ group=ColumnGroup.LEGAL,
91
+ width="120px",
92
+ default_visible=True,
93
+ colorize=True,
94
+ description="Mean of legal benchmark scores (Contracts, Regulation, Caselaw)"
95
+ ),
96
+ # 5. Pure Token Count - default
97
+ ColumnDefinition(
98
+ name="Pure Token Count",
99
+ column_type=ColumnType.NUMBER,
100
+ group=ColumnGroup.TOKENIZER,
101
+ width="150px",
102
+ decimals=0,
103
+ default_visible=True,
104
+ description="Tokens that are morphologically pure"
105
+ ),
106
+ # 6. Max Sequence Length - default
107
+ ColumnDefinition(
108
+ name="Max Sequence Length",
109
+ api_name="Max Tokens",
110
+ column_type=ColumnType.NUMBER,
111
+ group=ColumnGroup.MODEL_INFO,
112
+ width="160px",
113
+ decimals=0,
114
+ default_visible=True,
115
+ description="Maximum sequence length"
116
+ ),
117
+ # 7. Parameters - default
118
+ ColumnDefinition(
119
+ name="Parameters",
120
+ api_name="Number of Parameters",
121
+ column_type=ColumnType.NUMBER,
122
+ group=ColumnGroup.MODEL_INFO,
123
+ width="120px",
124
+ decimals=0,
125
+ default_visible=True,
126
+ description="Number of model parameters (e.g., 1.2B)"
127
+ ),
128
+ # 8. Model Architecture - default
129
+ ColumnDefinition(
130
+ name="Model Architecture",
131
+ column_type=ColumnType.STRING,
132
+ group=ColumnGroup.MODEL_INFO,
133
+ width="180px",
134
+ default_visible=True,
135
+ description="Underlying model architecture (e.g., XLMRobertaModel)"
136
+ ),
137
+ # 9. Mean (Task) - optional
138
+ ColumnDefinition(
139
+ name="Mean (Task)",
140
+ column_type=ColumnType.NUMBER,
141
+ group=ColumnGroup.MTEB,
142
+ width="120px",
143
+ default_visible=False,
144
+ colorize=True,
145
+ description="Average of all individual task scores"
146
+ ),
147
+ # 10. Contracts - optional
148
+ ColumnDefinition(
149
+ name="Contracts",
150
+ column_type=ColumnType.NUMBER,
151
+ group=ColumnGroup.LEGAL,
152
+ width="110px",
153
+ default_visible=False,
154
+ colorize=True,
155
+ description="Performance on Turkish legal contract analysis"
156
+ ),
157
+ # 11. Regulation - optional
158
+ ColumnDefinition(
159
+ name="Regulation",
160
+ column_type=ColumnType.NUMBER,
161
+ group=ColumnGroup.LEGAL,
162
+ width="110px",
163
+ default_visible=False,
164
+ colorize=True,
165
+ description="Performance on Turkish tax rulings retrieval"
166
+ ),
167
+ # 12. Caselaw - optional
168
+ ColumnDefinition(
169
+ name="Caselaw",
170
+ column_type=ColumnType.NUMBER,
171
+ group=ColumnGroup.LEGAL,
172
+ width="110px",
173
+ default_visible=False,
174
+ colorize=True,
175
+ description="Performance on Court of Cassation case retrieval"
176
+ ),
177
+ # 13. Classification - optional
178
+ ColumnDefinition(
179
+ name="Classification",
180
+ column_type=ColumnType.NUMBER,
181
+ group=ColumnGroup.MTEB,
182
+ width="130px",
183
+ default_visible=False,
184
+ colorize=True,
185
+ description="Performance on Turkish classification tasks"
186
+ ),
187
+ # 14. Clustering - optional
188
+ ColumnDefinition(
189
+ name="Clustering",
190
+ column_type=ColumnType.NUMBER,
191
+ group=ColumnGroup.MTEB,
192
+ width="120px",
193
+ default_visible=False,
194
+ colorize=True,
195
+ description="Performance on Turkish clustering tasks"
196
+ ),
197
+ # 15. Pair Classification - optional
198
+ ColumnDefinition(
199
+ name="Pair Classification",
200
+ api_name="PairClassification",
201
+ column_type=ColumnType.NUMBER,
202
+ group=ColumnGroup.MTEB,
203
+ width="150px",
204
+ default_visible=False,
205
+ colorize=True,
206
+ description="Performance on pair classification tasks (NLI)"
207
+ ),
208
+ # 16. Retrieval - optional
209
+ ColumnDefinition(
210
+ name="Retrieval",
211
+ column_type=ColumnType.NUMBER,
212
+ group=ColumnGroup.MTEB,
213
+ width="120px",
214
+ default_visible=False,
215
+ colorize=True,
216
+ description="Performance on information retrieval tasks"
217
+ ),
218
+ # 17. STS - optional
219
+ ColumnDefinition(
220
+ name="STS",
221
+ column_type=ColumnType.NUMBER,
222
+ group=ColumnGroup.MTEB,
223
+ width="100px",
224
+ default_visible=False,
225
+ colorize=True,
226
+ description="Performance on Semantic Textual Similarity tasks"
227
+ ),
228
+ # 18. Correlation - optional
229
+ ColumnDefinition(
230
+ name="Correlation",
231
+ column_type=ColumnType.NUMBER,
232
+ group=ColumnGroup.CORRELATION,
233
+ width="120px",
234
+ decimals=3,
235
+ default_visible=False,
236
+ colorize=True,
237
+ description="Weighted average of correlation metrics"
238
+ ),
239
+ # 19. Tokenizer Type - optional
240
+ ColumnDefinition(
241
+ name="Tokenizer Type",
242
+ column_type=ColumnType.STRING,
243
+ group=ColumnGroup.TOKENIZER,
244
+ width="180px",
245
+ default_visible=False,
246
+ description="Tokenizer implementation type"
247
+ ),
248
+ # 20. Unique Token Count - optional
249
+ ColumnDefinition(
250
+ name="Unique Token Count",
251
+ column_type=ColumnType.NUMBER,
252
+ group=ColumnGroup.TOKENIZER,
253
+ width="160px",
254
+ decimals=0,
255
+ default_visible=False,
256
+ description="Number of unique tokens on Turkish MMLU"
257
+ ),
258
+ # 21. Turkish Token Count - optional
259
+ ColumnDefinition(
260
+ name="Turkish Token Count",
261
+ column_type=ColumnType.NUMBER,
262
+ group=ColumnGroup.TOKENIZER,
263
+ width="170px",
264
+ decimals=0,
265
+ default_visible=False,
266
+ description="Unique tokens that are valid Turkish"
267
+ ),
268
+ # 22. Turkish Token % - optional
269
+ ColumnDefinition(
270
+ name="Turkish Token %",
271
+ column_type=ColumnType.NUMBER,
272
+ group=ColumnGroup.TOKENIZER,
273
+ width="140px",
274
+ default_visible=False,
275
+ description="Percentage of valid Turkish tokens"
276
+ ),
277
+ # 23. Pure Token % - optional
278
+ ColumnDefinition(
279
+ name="Pure Token %",
280
+ column_type=ColumnType.NUMBER,
281
+ group=ColumnGroup.TOKENIZER,
282
+ width="130px",
283
+ default_visible=False,
284
+ description="Percentage of pure root word tokens"
285
+ ),
286
+ # 24. Embed Dim - optional
287
+ ColumnDefinition(
288
+ name="Embed Dim",
289
+ api_name="Embedding Dimensions",
290
+ column_type=ColumnType.NUMBER,
291
+ group=ColumnGroup.MODEL_INFO,
292
+ width="120px",
293
+ decimals=0,
294
+ default_visible=False,
295
+ description="Embedding dimension size"
296
+ ),
297
+ # 25. Vocab Size - optional
298
+ ColumnDefinition(
299
+ name="Vocab Size",
300
+ column_type=ColumnType.NUMBER,
301
+ group=ColumnGroup.MODEL_INFO,
302
+ width="120px",
303
+ decimals=0,
304
+ default_visible=False,
305
+ description="Vocabulary size"
306
+ ),
307
+ # 26. Model Type - optional
308
+ ColumnDefinition(
309
+ name="Model Type",
310
+ column_type=ColumnType.STRING,
311
+ group=ColumnGroup.MODEL_INFO,
312
+ width="130px",
313
+ default_visible=False,
314
+ description="Model type: Embedding, MLM, CLM-Embedding, or Seq2Seq"
315
+ ),
316
+ ]
317
+
318
+
319
class ColumnRegistry:
    """
    Central registry for column definitions.

    Provides convenient access methods for column metadata: lookups by
    display name / CSV key, group filtering, and Gradio rendering hints
    (datatypes and widths).
    """

    def __init__(self, definitions: Optional[List[ColumnDefinition]] = None):
        """
        Args:
            definitions: Column definitions to register. Falls back to the
                module-level COLUMN_DEFINITIONS when None (or empty).
        """
        self._definitions = definitions or COLUMN_DEFINITIONS
        # Index by display name and by CSV key for O(1) lookups.
        # NOTE(review): duplicate names/csv_keys would silently keep only the
        # last definition — assumed unique in COLUMN_DEFINITIONS.
        self._by_name: Dict[str, ColumnDefinition] = {
            col.name: col for col in self._definitions
        }
        self._by_csv_key: Dict[str, ColumnDefinition] = {
            col.csv_key: col for col in self._definitions
        }

    @property
    def all_columns(self) -> List[str]:
        """All column names in definition order."""
        return [col.name for col in self._definitions]

    @property
    def default_columns(self) -> List[str]:
        """Columns visible by default."""
        return [col.name for col in self._definitions if col.default_visible]

    @property
    def optional_columns(self) -> List[str]:
        """Columns that can be toggled on/off (hidden by default)."""
        return [col.name for col in self._definitions if not col.default_visible]

    @property
    def score_columns(self) -> List[str]:
        """Columns that should be colorized (score gradients)."""
        return [col.name for col in self._definitions if col.colorize]

    @property
    def numeric_columns(self) -> List[str]:
        """Columns with numeric type."""
        return [col.name for col in self._definitions if col.column_type == ColumnType.NUMBER]

    def get(self, name: str) -> Optional[ColumnDefinition]:
        """Get column definition by display name, or None if unknown."""
        return self._by_name.get(name)

    def get_by_csv_key(self, csv_key: str) -> Optional[ColumnDefinition]:
        """Get column definition by CSV key, or None if unknown."""
        return self._by_csv_key.get(csv_key)

    def get_by_group(self, group: ColumnGroup) -> List[ColumnDefinition]:
        """Get all column definitions belonging to a group."""
        return [col for col in self._definitions if col.group == group]

    def get_group_names(self, group: ColumnGroup) -> List[str]:
        """Get column display names for a group."""
        return [col.name for col in self.get_by_group(group)]

    def get_datatypes(self, columns: List[str]) -> List[str]:
        """Get Gradio datatypes for given columns (unknown names skipped)."""
        return [
            self._by_name[col].column_type.value
            for col in columns
            if col in self._by_name
        ]

    def get_widths(self, columns: List[str]) -> List[str]:
        """Get column widths for given columns (unknown names skipped)."""
        return [
            self._by_name[col].width
            for col in columns
            if col in self._by_name
        ]

    def get_csv_mapping(self) -> Dict[str, str]:
        """Get mapping from CSV keys to display names (only where they differ)."""
        return {
            col.csv_key: col.name
            for col in self._definitions
            if col.csv_key != col.name
        }


# Global registry instance
column_registry = ColumnRegistry()
src/core/config.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration Module for HuggingFace Space
3
+
4
+ Simplified configuration for the public Mizan leaderboard.
5
+ """
6
+
7
+ import os
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+
16
@dataclass
class ApiSettings:
    """Endpoint and credentials used when submitting evaluation jobs."""
    url: str = field(default_factory=lambda: os.getenv("API_URL", ""))
    username: str = field(default_factory=lambda: os.getenv("API_USERNAME", ""))
    password: str = field(default_factory=lambda: os.getenv("API_PASSWORD", ""))
    timeout: int = 30

    @property
    def is_configured(self) -> bool:
        """True only when URL, username, and password are all non-empty."""
        return all((self.url, self.username, self.password))
28
+
29
+
30
@dataclass
class UISettings:
    """Presentation options for the web front-end."""
    port: int = 7860
    max_table_height: int = 600
    # Debug mode is opt-in via the DEBUG environment variable ("true"/"false").
    debug: bool = field(default_factory=lambda: os.getenv("DEBUG", "false").lower() == "true")
36
+
37
+
38
@dataclass
class DataSettings:
    """Locations of on-disk data files."""
    # default_factory keeps the Path from being shared across instances
    csv_file: Path = field(default_factory=lambda: Path("leaderboard_data.csv"))
42
+
43
+
44
@dataclass
class Settings:
    """Main application settings container.

    Groups the three settings sections; each sub-dataclass reads its own
    environment variables via its default_factory at construction time.
    """
    api: ApiSettings = field(default_factory=ApiSettings)  # evaluation API credentials
    ui: UISettings = field(default_factory=UISettings)  # front-end options
    data: DataSettings = field(default_factory=DataSettings)  # data file paths


# Global settings instance, shared by importing modules.
settings = Settings()
src/data/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data processing modules."""
2
+
3
+ from .transformer import DataTransformer, parse_parameter_string, format_parameter_count
4
+ from .styler import LeaderboardStyler
5
+
6
+ __all__ = [
7
+ "DataTransformer",
8
+ "LeaderboardStyler",
9
+ "parse_parameter_string",
10
+ "format_parameter_count",
11
+ ]
src/data/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/data/__pycache__/__init__.cpython-312.pyc and b/src/data/__pycache__/__init__.cpython-312.pyc differ
 
src/data/__pycache__/styler.cpython-312.pyc CHANGED
Binary files a/src/data/__pycache__/styler.cpython-312.pyc and b/src/data/__pycache__/styler.cpython-312.pyc differ
 
src/data/__pycache__/transformer.cpython-312.pyc CHANGED
Binary files a/src/data/__pycache__/transformer.cpython-312.pyc and b/src/data/__pycache__/transformer.cpython-312.pyc differ
 
src/data/styler.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Leaderboard Styling Module
3
+
4
+ Handles color gradients and visual styling for the leaderboard.
5
+ """
6
+
7
+ import logging
8
+ import html
9
+ from typing import Dict, Tuple, List
10
+ import pandas as pd
11
+ from matplotlib.colors import LinearSegmentedColormap
12
+
13
+ from ..core.columns import column_registry
14
+ from .transformer import format_parameter_count
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class LeaderboardStyler:
    """
    Applies visual styling to leaderboard DataFrames.

    Uses Excel-like Red-Yellow-Green color gradients for score columns
    (as listed by column_registry.score_columns) and produces a pandas
    Styler that Gradio can render as HTML.
    """

    # Excel-style color gradient: Red -> Yellow -> Green
    GRADIENT_COLORS = [
        (0.9, 0.1, 0.2),  # Red (low scores)
        (1.0, 1.0, 0.0),  # Yellow (medium scores)
        (0/255, 176/255, 80/255)  # Excel Green (high scores)
    ]

    def __init__(self):
        # Interpolated 256-step colormap over the three anchor colors.
        self._colormap = LinearSegmentedColormap.from_list(
            "ExcelRedYellowGreen",
            self.GRADIENT_COLORS,
            N=256
        )

    @staticmethod
    def rgb_to_hex(rgb: Tuple[float, float, float]) -> str:
        """Convert RGB tuple (0-1 range) to hex color like '#e51933'."""
        r = int(rgb[0] * 255)
        g = int(rgb[1] * 255)
        b = int(rgb[2] * 255)
        return f"#{r:02x}{g:02x}{b:02x}"

    def get_color_for_value(self, value: float, min_val: float, max_val: float) -> str:
        """Get hex color for a value within [min_val, max_val].

        A degenerate range (min == max) maps to the gradient midpoint.
        """
        if max_val == min_val:
            normalized = 0.5
        else:
            normalized = (value - min_val) / (max_val - min_val)

        # Clamp to [0, 0.999] to avoid edge case at exactly 1.0
        normalized = max(0, min(0.999, normalized))

        # Colormap returns RGBA; drop alpha before hex conversion.
        rgba = self._colormap(normalized)
        return self.rgb_to_hex(rgba[:3])

    def calculate_color_ranges(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Calculate min/max per score column; all-NaN columns are omitted."""
        ranges = {}

        for col_name in column_registry.score_columns:
            if col_name not in df.columns:
                continue

            # Non-numeric entries become NaN and are ignored by min/max.
            numeric_values = pd.to_numeric(df[col_name], errors='coerce')
            if numeric_values.isna().all():
                continue

            ranges[col_name] = {
                'min': numeric_values.min(),
                'max': numeric_values.max()
            }

        return ranges

    def apply_styling(self, df: pd.DataFrame) -> "pd.io.formats.style.Styler":
        """
        Apply color styling to DataFrame.

        Returns a pandas Styler object that Gradio can render.
        The input DataFrame is not mutated; styling works on a copy.
        """
        if df.empty:
            return df.style

        df_copy = df.copy()

        # Convert "N/A" to NaN for proper formatting
        for col in column_registry.score_columns:
            if col in df_copy.columns:
                df_copy[col] = df_copy[col].replace("N/A", pd.NA)
                df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')

        # Calculate color ranges
        color_ranges = self.calculate_color_ranges(df_copy)

        # Create style function (returns a CSS string per cell, '' = unstyled)
        def apply_gradient(val, col_name: str):
            if col_name not in color_ranges:
                return ''

            if pd.isna(val):
                return ''

            try:
                min_val = color_ranges[col_name]['min']
                max_val = color_ranges[col_name]['max']
                color_hex = self.get_color_for_value(float(val), min_val, max_val)
                return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
            except (ValueError, TypeError):
                return ''

        # Apply styling
        styler = df_copy.style

        # NOTE(review): Styler.map requires pandas >= 2.1 (renamed from
        # applymap) — confirm the pinned pandas version.
        for col in column_registry.score_columns:
            if col in df_copy.columns:
                styler = styler.map(
                    # default-arg binding captures the current column name,
                    # avoiding the late-binding closure pitfall
                    lambda val, c=col: apply_gradient(val, c),
                    subset=[col]
                )

        # Format numeric columns
        format_dict = {}
        for col_name in column_registry.numeric_columns:
            if col_name in df_copy.columns:
                col_def = column_registry.get(col_name)
                # Special handling for Parameters column - use human-readable format
                if col_name == "Parameters":
                    format_dict[col_name] = format_parameter_count
                elif col_def and col_def.decimals == 0:
                    format_dict[col_name] = '{:.0f}'
                elif col_def and col_def.decimals == 3:
                    format_dict[col_name] = '{:.3f}'
                else:
                    format_dict[col_name] = '{:.2f}'

        # Format model column as hyperlink without mutating the underlying data
        if "Model" in df_copy.columns:
            def _model_link_formatter(value: object) -> str:
                # Escape the model id before embedding it in the HTML anchor.
                model_name = html.escape(str(value))
                return (
                    f'<a href="https://huggingface.co/{model_name}" target="_blank" '
                    f'style="color: #2563eb; text-decoration: underline;">{model_name}</a>'
                )

            format_dict["Model"] = _model_link_formatter

        if format_dict:
            # Don't replace NA values - let them display as they are in the CSV
            # escape=None keeps the Model anchor HTML un-escaped so it renders.
            styler = styler.format(format_dict, na_rep='', escape=None)

        return styler

    def get_datatypes(self, columns: List[str]) -> List[str]:
        """Get Gradio datatypes for columns (delegates to the registry)."""
        return column_registry.get_datatypes(columns)

    def get_column_widths(self, columns: List[str]) -> List[str]:
        """Get column widths for columns (delegates to the registry)."""
        return column_registry.get_widths(columns)
src/data/transformer.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Transformation Module
3
+
4
+ Handles DataFrame transformations and CSV loading.
5
+ """
6
+
7
+ import logging
8
+ import html
9
+ import re
10
+ from typing import List, Optional, Union
11
+ from pathlib import Path
12
+ import pandas as pd
13
+ import numpy as np
14
+
15
+ from ..core.columns import column_registry, ColumnType
16
+ from ..core.config import settings
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def parse_parameter_string(value: Union[str, float, int]) -> Optional[float]:
22
+ """
23
+ Parse parameter strings like '307M', '1B', '1.7B', '4B' to numeric values.
24
+
25
+ Args:
26
+ value: Parameter string (e.g., '307M', '1B', '1.7B') or numeric value.
27
+
28
+ Returns:
29
+ Numeric value (in millions for consistency) or None if parsing fails.
30
+ """
31
+ if pd.isna(value):
32
+ return None
33
+
34
+ # If already numeric, return as-is
35
+ if isinstance(value, (int, float)):
36
+ return float(value)
37
+
38
+ value_str = str(value).strip().upper()
39
+
40
+ # Handle special cases
41
+ if value_str in ('', 'N/A', 'NA', 'NAN', 'NONE', '∞'):
42
+ return None
43
+
44
+ # Pattern to match numbers with optional suffix (K, M, B, T)
45
+ pattern = r'^([\d.]+)\s*([KMBT])?$'
46
+ match = re.match(pattern, value_str)
47
+
48
+ if not match:
49
+ return None
50
+
51
+ try:
52
+ number = float(match.group(1))
53
+ suffix = match.group(2)
54
+
55
+ # Convert to raw count based on suffix
56
+ multipliers = {
57
+ None: 1,
58
+ 'K': 1_000,
59
+ 'M': 1_000_000,
60
+ 'B': 1_000_000_000,
61
+ 'T': 1_000_000_000_000
62
+ }
63
+
64
+ return number * multipliers.get(suffix, 1)
65
+ except (ValueError, TypeError):
66
+ return None
67
+
68
+
69
def format_parameter_count(value: Union[float, int, None]) -> str:
    """
    Format a numeric parameter count to human-readable string.

    Args:
        value: Numeric parameter count (raw count, e.g. 307_000_000).

    Returns:
        Formatted string like '307M', '1.7B', '4B'; '' for missing values.
        Non-numeric input is returned via str() unchanged.
    """
    if pd.isna(value) or value is None:
        return ''

    try:
        value = float(value)
    except (ValueError, TypeError):
        return str(value)

    def _scale(scaled: float, unit: str) -> str:
        # One decimal for fractional values, none for whole numbers.
        if scaled != int(scaled):
            return f"{scaled:.1f}{unit}"
        return f"{int(scaled)}{unit}"

    if value >= 1_000_000_000_000:
        return _scale(value / 1_000_000_000_000, "T")
    if value >= 1_000_000_000:
        return _scale(value / 1_000_000_000, "B")
    if value >= 1_000_000:
        scaled = value / 1_000_000
        # Two or more integer digits: drop the decimal entirely (e.g. '307M').
        if scaled >= 10:
            return f"{scaled:.0f}M"
        # Bug fix: the old ternary appended a second 'M' (1_500_000 -> '1.5MM').
        return _scale(scaled, "M")
    if value >= 1_000:
        scaled = value / 1_000
        return f"{scaled:.0f}K" if scaled >= 10 else f"{scaled:.1f}K"
    return str(int(value))
101
+
102
+
103
class DataTransformer:
    """
    Transforms data between different formats.

    Handles CSV -> DataFrame conversions and display preparation.
    Stateless: every method is a @staticmethod/@classmethod, and each
    transformation works on a copy of its input DataFrame.
    """

    @staticmethod
    def create_empty_dataframe() -> pd.DataFrame:
        """Create an empty DataFrame with all column definitions."""
        return pd.DataFrame(columns=column_registry.all_columns)

    @staticmethod
    def load_from_csv(file_path: Optional[Path] = None) -> pd.DataFrame:
        """
        Load leaderboard data from CSV file.

        Args:
            file_path: Path to CSV file (uses settings.data.csv_file if None).

        Returns:
            DataFrame with leaderboard data; an empty DataFrame when the file
            is missing or unreadable (errors are logged, never raised).
        """
        path = file_path or settings.data.csv_file

        if not path.exists():
            logger.warning(f"CSV file not found: {path}")
            return DataTransformer.create_empty_dataframe()

        try:
            df = pd.read_csv(path)
            logger.info(f"Loaded {len(df)} records from {path}")

            # Convert to display format
            df = DataTransformer._normalize_columns(df)
            df = DataTransformer._convert_parameters_to_numeric(df)
            df = DataTransformer._sort_by_rank(df)

            return df

        except Exception as e:
            # Deliberate best-effort: any read/parse failure degrades to an
            # empty leaderboard instead of crashing the UI.
            logger.error(f"Error loading CSV: {e}")
            return DataTransformer.create_empty_dataframe()

    @staticmethod
    def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Normalize column names from CSV variations to standard names."""
        # Column name mappings for variations
        column_mappings = {
            "Mean (TaskType)": "MTEB Score",
            "Score(Legal)": "Legal Score",
            "Embedding Dimensions": "Embed Dim",
            "Embedding Dim": "Embed Dim",
            "Max Tokens": "Max Sequence Length",
            "Max Seq Length": "Max Sequence Length",
            "Number of Parameters": "Parameters",
            "PairClassification": "Pair Classification",
            "Vocabulary Size": "Vocab Size",
            "Vocabulary": "Vocab Size",
        }

        df = df.copy()

        # Rename columns based on mappings; skip a rename when the target
        # name already exists so a populated column is never clobbered.
        for old_name, new_name in column_mappings.items():
            if old_name in df.columns and new_name not in df.columns:
                df = df.rename(columns={old_name: new_name})

        return df

    @staticmethod
    def _sort_by_rank(df: pd.DataFrame) -> pd.DataFrame:
        """Sort DataFrame by MTEB Score descending and recalculate ranks."""
        if "MTEB Score" in df.columns:
            # Sort by MTEB Score descending (higher is better)
            df = df.sort_values("MTEB Score", ascending=False, na_position='last').reset_index(drop=True)
            # Recalculate ranks as 1, 2, 3, 4... (no ties)
            df["Rank"] = range(1, len(df) + 1)
        elif "Rank" in df.columns:
            # Fallback to existing rank if MTEB Score not available
            df = df.sort_values("Rank", ascending=True).reset_index(drop=True)
        return df

    @staticmethod
    def _convert_parameters_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert Parameters column from string format to numeric for proper sorting.

        Converts values like '307M', '1B', '1.7B' to numeric values
        (raw counts; unparseable entries become None).
        """
        if "Parameters" not in df.columns:
            return df

        df = df.copy()
        df["Parameters"] = df["Parameters"].apply(parse_parameter_string)
        return df

    @staticmethod
    def add_model_links(df: pd.DataFrame) -> pd.DataFrame:
        """Add clickable HuggingFace links to Model column.

        Replaces the plain model id with an HTML anchor string; downstream
        rendering must treat the column as HTML.
        """
        if "Model" not in df.columns:
            return df

        df = df.copy()
        df["Model"] = df["Model"].apply(
            lambda x: f'<a href="https://huggingface.co/{html.escape(str(x))}" target="_blank" '
            f'style="color: #2563eb; text-decoration: underline;">{html.escape(str(x))}</a>'
        )
        return df

    @staticmethod
    def ensure_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Convert numeric columns to proper types."""
        df = df.copy()

        for col_name in column_registry.numeric_columns:
            if col_name not in df.columns:
                continue

            col_def = column_registry.get(col_name)
            if col_def is None:
                continue

            # Handle "N/A" and empty values
            df[col_name] = df[col_name].replace("N/A", pd.NA)
            df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

            # Round to specified decimals
            if col_def.decimals == 0:
                # Keep as float to preserve NaN, format later
                pass
            else:
                df[col_name] = df[col_name].round(col_def.decimals)

        return df

    @staticmethod
    def filter_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Filter DataFrame to only include specified columns (preserves order)."""
        available = [col for col in columns if col in df.columns]
        return df[available]

    @classmethod
    def prepare_for_display(
        cls,
        df: pd.DataFrame,
        columns: Optional[List[str]] = None,
        add_links: bool = True
    ) -> pd.DataFrame:
        """
        Prepare DataFrame for Gradio display.

        Args:
            df: Source DataFrame.
            columns: Columns to include (preserves order passed in).
            add_links: Whether to add HuggingFace links.

        Returns:
            Prepared DataFrame (new object; the input is never mutated).
        """
        if df is None or df.empty:
            return cls.create_empty_dataframe()

        # Work with a copy
        result = df.copy()

        # Filter columns if specified (preserves the order passed in)
        if columns:
            result = cls.filter_columns(result, columns)

        # Convert numeric columns
        result = cls.ensure_numeric_columns(result)

        # Add model links
        if add_links and "Model" in result.columns:
            result = cls.add_model_links(result)

        return result
ui_components.py DELETED
@@ -1,259 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
4
- Simplified version with only leaderboard and dataset components
5
- """
6
-
7
- import gradio as gr
8
- import pandas as pd
9
- from data_processor import (create_styled_leaderboard_dataframe,
10
- create_empty_leaderboard_dataframe)
11
-
12
-
13
def create_leaderboard_tab(current_data: pd.DataFrame):
    """Build the leaderboard tab: a color-styled, searchable results table plus usage notes."""

    # Unusable input (empty frame or no "Model" column): fall back to the
    # empty leaderboard skeleton so the table still renders with headers.
    usable = (not current_data.empty) and ("Model" in current_data.columns)
    if not usable:
        print("⚠️ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    # Red→green color gradient on score columns via pandas Styler.
    styled = create_styled_leaderboard_dataframe(current_data)

    leaderboard = gr.Dataframe(
        value=styled,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number", "str", "number", "str", "number"],  # Model column as HTML for clickable links
        column_widths=["70px", "250px", "130px", "130px", "160px", "130px", "170px", "130px", "100px", "130px", "120px", "120px", "120px", "120px"]
    )

    # Static help text shown beneath the table.
    gr.Markdown("""
    ### 🔍 How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score

    ### 📊 Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better but with exceptions
    """)

    return leaderboard
49
-
50
-
51
def create_dataset_tab():
    """Build the dataset-information tab.

    Renders a table of the 14 MTEB Turkish tasks (task names linked to their
    HuggingFace dataset cards), task-distribution notes, a statistics summary
    table, and the evaluation methodology.

    Returns:
        The task-details ``gr.Dataframe`` component.
    """

    gr.Markdown("### 📊 MTEB Turkish Dataset Overview")

    # Single source of truth for BOTH task order and dataset links.
    # (Fix: the task names were previously duplicated in a separate ordered
    # list, which could silently drift out of sync with this mapping; dicts
    # preserve insertion order, so iterating .items() yields the same order.)
    task_to_dataset = {
        'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
        'XQuADRetrieval': 'google/xquad',
        'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
        'MKQARetrieval': 'apple/mkqa',
        'MassiveIntentClassification': 'mteb/amazon_massive_intent',
        'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
        'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
        'SIB200Classification': 'mteb/sib200',
        'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
        'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
        'SIB200ClusteringS2S': 'mteb/sib200',
        'XNLI': 'mteb/xnli',
        'XNLIV2': 'mteb/xnli2.0-multi-pair',
        'STS22.v2': 'mteb/sts22-crosslingual-sts'
    }

    # Clickable task names pointing at each task's HF dataset card.
    clickable_task_names = []
    for task_name, dataset_path in task_to_dataset.items():
        hf_link = f"https://huggingface.co/datasets/{dataset_path}"
        clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        clickable_task_names.append(clickable_name)

    # Per-task metadata; these parallel lists follow task_to_dataset order.
    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [
            'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Clustering', 'PairClassification', 'PairClassification', 'STS'
        ],
        'Description': [
            'Turkish FAQ retrieval task',
            'Turkish question answering retrieval',
            'Historical Turkish document retrieval',
            'Multilingual knowledge QA retrieval',
            'Intent classification for Turkish',
            'Scenario classification for Turkish',
            'Multilingual sentiment classification',
            'SIB200 language identification',
            'Turkish movie review sentiment',
            'Turkish product review sentiment',
            'SIB200 clustering task',
            'Turkish natural language inference',
            'Enhanced Turkish NLI task',
            'Turkish semantic textual similarity'
        ],
        'Domain': [
            'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
            'Intent', 'Scenario',
            'Sentiment', 'Language ID',
            'Movies', 'Products',
            'Language ID', 'NLI', 'NLI', 'STS'
        ],
        'Samples': [
            '~135K', '~10K', '~1.4K', '~10K',
            '~11K', '~11K',
            '~4.5K', '~700',
            '~8K', '~4.8K',
            '~1K', '~1.4K', '~1.4K', '~400'
        ]
    })

    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        datatype=["html", "str", "str", "str", "str"]  # First column (Task Name) as HTML for clickable links
    )

    # Task type distribution notes.
    gr.Markdown("""
    ### 📈 Task Distribution:

    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)

    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)

    # High-level statistics summary table.
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })

    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )

    gr.Markdown("""
    ### 🎯 Evaluation Methodology:

    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type

    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths

    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)

    return dataset_table
209
-
210
def create_submit_evaluation_tab():
    """Build the model-submission tab (HF login, model/email form, status area).

    Returns:
        Tuple of (model textbox, email textbox, submit button, login button,
        status HTML output) — positional order matters to the caller wiring.
    """

    gr.Markdown("### 🚀 Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # HF OAuth entry point; must come before the form in layout order.
    hf_login = gr.LoginButton(value="Sign in with Hugging Face")

    model_box = gr.Textbox(
        label="🤖 Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
    )

    email_box = gr.Textbox(
        label="📧 Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results"
    )

    submit_button = gr.Button(
        "🚀 Submit",
        variant="primary",
        size="lg"
    )

    # Receives authentication / submission status messages.
    status_html = gr.HTML(label="Status")

    # Static explanation of the review-and-evaluate workflow.
    gr.Markdown("""
    ### 📋 Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications and results will appear on the leaderboard

    ### ⚠️ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - Valid email address is required for receiving results
    """)

    return (model_box, email_box, submit_button, hf_login, status_html)