Refactor codebase structure
Browse files- .gitattributes +0 -35
- .gitignore +53 -0
- api_client.py +0 -103
- app.py +79 -111
- config.py +0 -28
- data_processor.py +0 -208
- evaluation_service.py +0 -190
- leaderboard_data.csv +52 -33
- requirements.txt +3 -2
- src/__init__.py +19 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/api/__init__.py +7 -0
- src/api/__pycache__/__init__.cpython-312.pyc +0 -0
- src/api/__pycache__/client.cpython-312.pyc +0 -0
- src/api/client.py +74 -0
- src/components/__init__.py +11 -0
- src/components/__pycache__/__init__.cpython-312.pyc +0 -0
- src/components/__pycache__/dataset.cpython-312.pyc +0 -0
- src/components/__pycache__/leaderboard.cpython-312.pyc +0 -0
- src/components/__pycache__/submit.cpython-312.pyc +0 -0
- src/components/dataset.py +270 -0
- src/components/leaderboard.py +461 -0
- src/components/submit.py +186 -0
- src/core/__init__.py +12 -0
- src/core/__pycache__/__init__.cpython-312.pyc +0 -0
- src/core/__pycache__/columns.cpython-312.pyc +0 -0
- src/core/__pycache__/config.cpython-312.pyc +0 -0
- src/core/columns.py +402 -0
- src/core/config.py +53 -0
- src/data/__init__.py +11 -0
- src/data/__pycache__/__init__.cpython-312.pyc +0 -0
- src/data/__pycache__/styler.cpython-312.pyc +0 -0
- src/data/__pycache__/transformer.cpython-312.pyc +0 -0
- src/data/styler.py +164 -0
- src/data/transformer.py +280 -0
- ui_components.py +0 -259
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual environments
|
| 24 |
+
venv/
|
| 25 |
+
ENV/
|
| 26 |
+
env/
|
| 27 |
+
.venv/
|
| 28 |
+
|
| 29 |
+
# IDE
|
| 30 |
+
.idea/
|
| 31 |
+
.vscode/
|
| 32 |
+
*.swp
|
| 33 |
+
*.swo
|
| 34 |
+
*~
|
| 35 |
+
|
| 36 |
+
# Environment variables
|
| 37 |
+
.env
|
| 38 |
+
.env.local
|
| 39 |
+
.env.*.local
|
| 40 |
+
|
| 41 |
+
# Logs
|
| 42 |
+
*.log
|
| 43 |
+
logs/
|
| 44 |
+
|
| 45 |
+
# Docker
|
| 46 |
+
.docker/
|
| 47 |
+
|
| 48 |
+
# OS
|
| 49 |
+
.DS_Store
|
| 50 |
+
Thumbs.db
|
| 51 |
+
|
| 52 |
+
# Gradio
|
| 53 |
+
flagged/
|
api_client.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
API Client module for MTEB Turkish Leaderboard
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
from typing import Optional, Dict, Any
|
| 7 |
-
import traceback
|
| 8 |
-
import requests
|
| 9 |
-
|
| 10 |
-
from config import API_BASE_URL, API_TIMEOUT, API_URL, USERNAME, PASSWORD
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def check_api_health() -> bool:
|
| 14 |
-
"""Check if API is available"""
|
| 15 |
-
try:
|
| 16 |
-
response = requests.get(f"{API_BASE_URL}/api/v1/health", timeout=5)
|
| 17 |
-
return response.status_code == 200
|
| 18 |
-
except:
|
| 19 |
-
return False
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def send_evaluation_request_to_api(model_name: str, batch_size: int = 32, email: str = "user@example.com") -> Optional[Dict[str, Any]]:
|
| 23 |
-
"""
|
| 24 |
-
Send an evaluation request to the API for the specified model.
|
| 25 |
-
Returns the API response as a dictionary if successful, otherwise None.
|
| 26 |
-
"""
|
| 27 |
-
try:
|
| 28 |
-
payload = {
|
| 29 |
-
"model_name": model_name,
|
| 30 |
-
"model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
|
| 31 |
-
"batch_size": batch_size,
|
| 32 |
-
"email": email,
|
| 33 |
-
"model_type": "sentence-transformer"
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
# Authentication credentials
|
| 37 |
-
auth = (USERNAME, PASSWORD)
|
| 38 |
-
|
| 39 |
-
response = requests.post(
|
| 40 |
-
f"{API_URL}/api/mteb/request",
|
| 41 |
-
json=payload,
|
| 42 |
-
timeout=API_TIMEOUT,
|
| 43 |
-
auth=auth
|
| 44 |
-
)
|
| 45 |
-
|
| 46 |
-
print(f"Response Status: {response.status_code}")
|
| 47 |
-
|
| 48 |
-
if response.status_code == 200:
|
| 49 |
-
result = response.json()
|
| 50 |
-
return result
|
| 51 |
-
else:
|
| 52 |
-
print(f"API Error: {response.status_code}")
|
| 53 |
-
try:
|
| 54 |
-
error_detail = response.json()
|
| 55 |
-
print(f" Error Detail: {error_detail}")
|
| 56 |
-
except:
|
| 57 |
-
print(f" Raw Response: {response.text}")
|
| 58 |
-
return None
|
| 59 |
-
|
| 60 |
-
except Exception as e:
|
| 61 |
-
print(f"API Call Error: {e}")
|
| 62 |
-
traceback.print_exc()
|
| 63 |
-
return None
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
def get_evaluation_status(request_id: str) -> Optional[Dict[str, Any]]:
|
| 67 |
-
"""Get evaluation status from"""
|
| 68 |
-
try:
|
| 69 |
-
auth = (USERNAME, PASSWORD)
|
| 70 |
-
|
| 71 |
-
response = requests.get(
|
| 72 |
-
f"{API_URL}/api/mteb/status/{request_id}",
|
| 73 |
-
timeout=API_TIMEOUT,
|
| 74 |
-
auth=auth
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
if response.status_code == 200:
|
| 78 |
-
return response.json()
|
| 79 |
-
else:
|
| 80 |
-
print(f"Status check error: {response.status_code}")
|
| 81 |
-
return None
|
| 82 |
-
|
| 83 |
-
except Exception as e:
|
| 84 |
-
print(f"Status check error: {e}")
|
| 85 |
-
return None
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
def cancel_evaluation_request(request_id: str) -> bool:
|
| 89 |
-
"""Cancel an evaluation request"""
|
| 90 |
-
try:
|
| 91 |
-
auth = (USERNAME, PASSWORD)
|
| 92 |
-
|
| 93 |
-
response = requests.delete(
|
| 94 |
-
f"{API_URL}/api/mteb/request/{request_id}",
|
| 95 |
-
timeout=API_TIMEOUT,
|
| 96 |
-
auth=auth
|
| 97 |
-
)
|
| 98 |
-
|
| 99 |
-
return response.status_code == 200
|
| 100 |
-
|
| 101 |
-
except Exception as e:
|
| 102 |
-
print(f"Cancel request error: {e}")
|
| 103 |
-
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,136 +1,104 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Mizan Leaderboard -
|
| 4 |
-
|
|
|
|
| 5 |
"""
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
-
from
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
from data_processor import load_leaderboard_from_csv
|
| 13 |
-
from evaluation_service import submit_evaluation
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
| 18 |
|
| 19 |
-
def create_leaderboard_demo():
|
| 20 |
-
"""Create enhanced leaderboard demo interface with submit functionality"""
|
| 21 |
-
|
| 22 |
-
global current_data
|
| 23 |
-
|
| 24 |
-
# Setup directories
|
| 25 |
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
# Authentication check
|
| 55 |
-
if profile is None:
|
| 56 |
-
logging.warning("Unauthorized submission attempt with no profile")
|
| 57 |
-
return "<p style='color: red; font-weight: bold;'>Authentication required. Please log in with your Hugging Face account.</p>"
|
| 58 |
-
|
| 59 |
-
# IMPORTANT: In local development, Gradio returns "Sign in with Hugging Face" string
|
| 60 |
-
# This is NOT a real authentication, just a placeholder for local testing
|
| 61 |
-
if isinstance(profile, str) and profile == "Sign in with Hugging Face":
|
| 62 |
-
# Block submission in local dev with mock auth
|
| 63 |
-
return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"
|
| 64 |
-
|
| 65 |
-
# Email is required
|
| 66 |
-
if not email or email.strip() == "":
|
| 67 |
-
return "<p style='color: red; font-weight: bold;'>Email address is required to receive benchmark results.</p>"
|
| 68 |
-
|
| 69 |
-
global current_data
|
| 70 |
-
batch_size = 32 # Always use default batch size
|
| 71 |
-
result_msg, updated_data = submit_evaluation(model_name, email, batch_size, current_data, progress)
|
| 72 |
-
# Note: For now, we don't update the leaderboard since evaluation is async
|
| 73 |
-
# The leaderboard will be updated manually when results are available
|
| 74 |
-
logging.info(f"Submission processed for model: {model_name} by user: {profile}")
|
| 75 |
-
return result_msg
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
# Tab 3: Dataset Information
|
| 84 |
-
with gr.Tab("📊 Dataset Information"):
|
| 85 |
-
dataset_table = create_dataset_tab()
|
| 86 |
-
gr.Markdown("""
|
| 87 |
-
---
|
| 88 |
-
### 📊 Metrics Explanation:
|
| 89 |
-
- **Mean (Task)**: Average performance across all individual tasks
|
| 90 |
-
- **Mean (TaskType)**: Average performance by task categories
|
| 91 |
-
- **Classification**: Performance on Turkish classification tasks
|
| 92 |
-
- **Clustering**: Performance on Turkish clustering tasks
|
| 93 |
-
- **Pair Classification**: Performance on pair classification tasks (like NLI)
|
| 94 |
-
- **Retrieval**: Performance on information retrieval tasks
|
| 95 |
-
- **STS**: Performance on Semantic Textual Similarity tasks
|
| 96 |
-
- **Correlation**: Weighted average of correlation metrics for NLI and STSB datasets
|
| 97 |
-
- **Parameters**: Number of model parameters
|
| 98 |
-
- **Embed Dim**: Embedding dimension size
|
| 99 |
-
- **Max Seq Length**: Maximum sequence length the model can process (0 = infinite/unlimited)
|
| 100 |
-
- **Vocab Size**: Size of the model's vocabulary
|
| 101 |
-
|
| 102 |
-
### 📖 About Mizan:
|
| 103 |
-
This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
|
| 104 |
-
on Turkish language tasks across multiple domains including:
|
| 105 |
-
- Text classification and sentiment analysis
|
| 106 |
-
- Information retrieval and search
|
| 107 |
-
- Semantic textual similarity
|
| 108 |
-
- Text clustering and pair classification
|
| 109 |
-
|
| 110 |
-
### 🚀 Submit Your Model:
|
| 111 |
-
Use the **Submit** tab to submit your Turkish embedding model for evaluation.
|
| 112 |
-
Your request will be reviewed by administrators and you'll receive email notifications about the progress.
|
| 113 |
-
|
| 114 |
-
### Contact:
|
| 115 |
-
For any questions or feedback, please contact info@newmind.ai
|
| 116 |
-
|
| 117 |
-
### Links:
|
| 118 |
-
- **GitHub**: [mteb/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
|
| 119 |
-
""")
|
| 120 |
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
|
| 124 |
def main():
|
| 125 |
-
"""Main entry point"""
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
demo = create_leaderboard_demo()
|
| 129 |
-
demo.launch(
|
| 130 |
-
server_name="0.0.0.0",
|
| 131 |
-
server_port=7860,
|
| 132 |
-
share=False
|
| 133 |
-
)
|
| 134 |
|
| 135 |
|
| 136 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Mizan Turkish Leaderboard - HuggingFace Space Version
|
| 4 |
+
|
| 5 |
+
Clean entry point that wires together all components.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
import logging
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
import gradio as gr
|
| 12 |
|
| 13 |
+
from src.core.config import settings
|
| 14 |
+
from src.data import DataTransformer
|
| 15 |
+
from src.components import LeaderboardTab, DatasetTab, SubmitTab
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
# Configure logging
|
| 18 |
+
logging.basicConfig(
|
| 19 |
+
level=logging.DEBUG if settings.ui.debug else logging.INFO,
|
| 20 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 21 |
+
handlers=[
|
| 22 |
+
logging.StreamHandler(sys.stdout),
|
| 23 |
+
]
|
| 24 |
+
)
|
| 25 |
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
class MizanApp:
|
| 30 |
+
"""
|
| 31 |
+
Main application class.
|
| 32 |
|
| 33 |
+
Orchestrates all components and creates the Gradio interface.
|
| 34 |
+
"""
|
| 35 |
|
| 36 |
+
def __init__(self):
|
| 37 |
+
# Load data
|
| 38 |
+
self.transformer = DataTransformer()
|
| 39 |
+
self.data = self.transformer.load_from_csv()
|
| 40 |
|
| 41 |
+
# UI components (will be initialized during build)
|
| 42 |
+
self._leaderboard_tab: LeaderboardTab = None
|
| 43 |
+
self._dataset_tab: DatasetTab = None
|
| 44 |
+
self._submit_tab: SubmitTab = None
|
| 45 |
|
| 46 |
+
logger.info(f"Application initialized with {len(self.data)} models")
|
| 47 |
+
|
| 48 |
+
def build_interface(self) -> gr.Blocks:
|
| 49 |
+
"""
|
| 50 |
+
Build the complete Gradio interface.
|
| 51 |
|
| 52 |
+
Returns:
|
| 53 |
+
Gradio Blocks application.
|
| 54 |
+
"""
|
| 55 |
+
with gr.Blocks(
|
| 56 |
+
title="🇹🇷 Mizan Turkish Leaderboard",
|
| 57 |
+
theme=gr.themes.Soft()
|
| 58 |
+
) as demo:
|
| 59 |
+
|
| 60 |
+
# Header
|
| 61 |
+
gr.Markdown("""
|
| 62 |
+
# 🇹🇷 Mizan Turkish Evaluation Leaderboard
|
| 63 |
+
|
| 64 |
+
Performance comparison for Turkish embedding models
|
| 65 |
+
""")
|
| 66 |
|
| 67 |
+
with gr.Tabs():
|
| 68 |
+
# Tab 1: Leaderboard
|
| 69 |
+
with gr.Tab("Leaderboard"):
|
| 70 |
+
self._leaderboard_tab = LeaderboardTab(data=self.data)
|
| 71 |
+
self._leaderboard_tab.build()
|
| 72 |
|
| 73 |
+
# Tab 2: Submit
|
| 74 |
+
with gr.Tab("Submit"):
|
| 75 |
+
self._submit_tab = SubmitTab()
|
| 76 |
+
self._submit_tab.build()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
# Tab 3: Dataset Information
|
| 79 |
+
with gr.Tab("Dataset Information"):
|
| 80 |
+
self._dataset_tab = DatasetTab()
|
| 81 |
+
self._dataset_tab.build()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
return demo
|
| 84 |
+
|
| 85 |
+
def run(self):
|
| 86 |
+
"""Run the application."""
|
| 87 |
+
logger.info("Starting Mizan Turkish Leaderboard...")
|
| 88 |
+
|
| 89 |
+
# Build and launch
|
| 90 |
+
demo = self.build_interface()
|
| 91 |
+
demo.launch(
|
| 92 |
+
server_name="0.0.0.0",
|
| 93 |
+
server_port=settings.ui.port,
|
| 94 |
+
share=False
|
| 95 |
+
)
|
| 96 |
|
| 97 |
|
| 98 |
def main():
|
| 99 |
+
"""Main entry point."""
|
| 100 |
+
app = MizanApp()
|
| 101 |
+
app.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
if __name__ == "__main__":
|
config.py
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Configuration module for MTEB Turkish Leaderboard
|
| 4 |
-
Centralizes environment variables and configuration settings
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
-
|
| 10 |
-
# Load environment variables from .env file
|
| 11 |
-
load_dotenv()
|
| 12 |
-
|
| 13 |
-
# API Configuration
|
| 14 |
-
API_URL = os.environ.get("API_URL")
|
| 15 |
-
USERNAME = os.environ.get("API_USERNAME")
|
| 16 |
-
PASSWORD = os.environ.get("API_PASSWORD")
|
| 17 |
-
|
| 18 |
-
# API Configuration (public settings)
|
| 19 |
-
API_BASE_URL = "http://localhost:8000"
|
| 20 |
-
API_TIMEOUT = 30
|
| 21 |
-
|
| 22 |
-
# Polling and refresh intervals (public settings)
|
| 23 |
-
POLL_INTERVAL = 5 # seconds
|
| 24 |
-
LEADERBOARD_REFRESH_INTERVAL = 30 # seconds
|
| 25 |
-
|
| 26 |
-
# CSV file path for leaderboard data
|
| 27 |
-
CSV_FILE_PATH = "leaderboard_data.csv"
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_processor.py
DELETED
|
@@ -1,208 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
|
| 4 |
-
Simplified version for loading and processing CSV data
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import pandas as pd
|
| 9 |
-
from pandas.io.formats.style import Styler
|
| 10 |
-
from matplotlib.colors import LinearSegmentedColormap
|
| 11 |
-
import html
|
| 12 |
-
|
| 13 |
-
# CSV file path
|
| 14 |
-
CSV_FILE_PATH = "leaderboard_data.csv"
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def load_leaderboard_from_csv() -> pd.DataFrame:
|
| 18 |
-
"""Load leaderboard data from CSV file"""
|
| 19 |
-
try:
|
| 20 |
-
if not os.path.exists(CSV_FILE_PATH):
|
| 21 |
-
print(f"❌ CSV file not found: {CSV_FILE_PATH}")
|
| 22 |
-
return create_empty_leaderboard_dataframe()
|
| 23 |
-
|
| 24 |
-
df = pd.read_csv(CSV_FILE_PATH)
|
| 25 |
-
print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")
|
| 26 |
-
|
| 27 |
-
# Convert to leaderboard format
|
| 28 |
-
leaderboard_df = csv_to_leaderboard_format(df)
|
| 29 |
-
|
| 30 |
-
# Sort by Mean (Task) score and add rankings
|
| 31 |
-
leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True)
|
| 32 |
-
leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)
|
| 33 |
-
|
| 34 |
-
return leaderboard_df
|
| 35 |
-
|
| 36 |
-
except Exception as e:
|
| 37 |
-
print(f"❌ Error loading CSV: {e}")
|
| 38 |
-
return create_empty_leaderboard_dataframe()
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def create_empty_leaderboard_dataframe() -> pd.DataFrame:
|
| 42 |
-
"""Create an empty DataFrame with proper leaderboard column structure"""
|
| 43 |
-
return pd.DataFrame(columns=[
|
| 44 |
-
"Rank",
|
| 45 |
-
"Model",
|
| 46 |
-
"Mean (Task)",
|
| 47 |
-
"Mean (TaskType)",
|
| 48 |
-
"Classification",
|
| 49 |
-
"Clustering",
|
| 50 |
-
"Pair Classification",
|
| 51 |
-
"Retrieval",
|
| 52 |
-
"STS",
|
| 53 |
-
"Correlation",
|
| 54 |
-
"Parameters",
|
| 55 |
-
"Embed Dim",
|
| 56 |
-
"Max Sequence Length",
|
| 57 |
-
"Vocab Size",
|
| 58 |
-
])
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
|
| 62 |
-
"""Convert CSV data to leaderboard format"""
|
| 63 |
-
data = []
|
| 64 |
-
for idx, row in df.iterrows():
|
| 65 |
-
model_name = row['Model']
|
| 66 |
-
|
| 67 |
-
# Prepare model name for display
|
| 68 |
-
model_name_clean = html.escape(model_name)
|
| 69 |
-
|
| 70 |
-
# Create clickable HuggingFace link for model name
|
| 71 |
-
hf_link = f"https://huggingface.co/{model_name_clean}"
|
| 72 |
-
clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'
|
| 73 |
-
|
| 74 |
-
# Handle different column name variations
|
| 75 |
-
embedding_dim_col = 'Embedding Dim'
|
| 76 |
-
max_seq_col = 'Max Seq Length'
|
| 77 |
-
pair_classification_col = 'Pair Classification'
|
| 78 |
-
|
| 79 |
-
data_row = {
|
| 80 |
-
"Rank": idx + 1, # Initial ranking, will be recalculated
|
| 81 |
-
"Model": clickable_model,
|
| 82 |
-
"Mean (Task)": round(float(row['Mean (Task)']), 2),
|
| 83 |
-
"Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
|
| 84 |
-
"Classification": round(float(row['Classification']), 2),
|
| 85 |
-
"Clustering": round(float(row['Clustering']), 2),
|
| 86 |
-
"Pair Classification": round(float(row[pair_classification_col]), 2),
|
| 87 |
-
"Retrieval": round(float(row['Retrieval']), 2),
|
| 88 |
-
"STS": round(float(row['STS']), 2),
|
| 89 |
-
"Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
|
| 90 |
-
"Parameters": row['Number of Parameters'],
|
| 91 |
-
"Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
|
| 92 |
-
"Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
|
| 93 |
-
"Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0
|
| 94 |
-
}
|
| 95 |
-
data.append(data_row)
|
| 96 |
-
|
| 97 |
-
result_df = pd.DataFrame(data)
|
| 98 |
-
return result_df
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def create_excel_like_cmap():
|
| 102 |
-
"""Create Excel-like colormap for score visualization"""
|
| 103 |
-
colors = [
|
| 104 |
-
(0.9, 0.1, 0.2), # Red
|
| 105 |
-
(1.0, 1.0, 0.0), # Yellow
|
| 106 |
-
(0/255, 176/255, 80/255) # Excel-style Green
|
| 107 |
-
]
|
| 108 |
-
|
| 109 |
-
return LinearSegmentedColormap.from_list("excel_like", colors, N=256)
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
def rgb_to_hex(rgb_tuple):
|
| 113 |
-
"""Convert RGB tuple to hex color"""
|
| 114 |
-
r, g, b = [int(x * 255) for x in rgb_tuple[:3]]
|
| 115 |
-
return f"#{r:02x}{g:02x}{b:02x}"
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
|
| 119 |
-
"""Create colored cell HTML for score visualization"""
|
| 120 |
-
if pd.isna(value) or value == "N/A":
|
| 121 |
-
return str(value)
|
| 122 |
-
|
| 123 |
-
try:
|
| 124 |
-
# Normalize value to 0-1 range
|
| 125 |
-
if max_val > min_val:
|
| 126 |
-
normalized = (float(value) - min_val) / (max_val - min_val)
|
| 127 |
-
else:
|
| 128 |
-
normalized = 0.5
|
| 129 |
-
|
| 130 |
-
# Get color from colormap
|
| 131 |
-
color_rgba = colormap(normalized)
|
| 132 |
-
color_hex = rgb_to_hex(color_rgba)
|
| 133 |
-
|
| 134 |
-
# Create colored cell HTML with data-sort attribute for proper numeric sorting
|
| 135 |
-
return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
|
| 136 |
-
|
| 137 |
-
except (ValueError, TypeError):
|
| 138 |
-
return str(value)
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
|
| 142 |
-
"""Create styled leaderboard dataframe with color coding and clickable model names using pandas Styler
|
| 143 |
-
|
| 144 |
-
Returns a pandas Styler object that Gradio Dataframe can render with both colors AND correct sorting.
|
| 145 |
-
"""
|
| 146 |
-
if df.empty:
|
| 147 |
-
return df.style
|
| 148 |
-
|
| 149 |
-
colormap = create_excel_like_cmap()
|
| 150 |
-
|
| 151 |
-
# Score columns to colorize
|
| 152 |
-
score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
|
| 153 |
-
"Pair Classification", "Retrieval", "STS", "Correlation"]
|
| 154 |
-
|
| 155 |
-
# Calculate min/max for each score column for normalization
|
| 156 |
-
color_ranges = {}
|
| 157 |
-
for col in score_columns:
|
| 158 |
-
if col in df.columns:
|
| 159 |
-
numeric_values = pd.to_numeric(df[col], errors='coerce')
|
| 160 |
-
if not numeric_values.isna().all():
|
| 161 |
-
color_ranges[col] = {
|
| 162 |
-
'min': numeric_values.min(),
|
| 163 |
-
'max': numeric_values.max()
|
| 164 |
-
}
|
| 165 |
-
|
| 166 |
-
# Create styler with background colors for score columns
|
| 167 |
-
def apply_color_gradient(val, col_name):
|
| 168 |
-
"""Apply background color based on value"""
|
| 169 |
-
if col_name not in color_ranges:
|
| 170 |
-
return ''
|
| 171 |
-
|
| 172 |
-
if pd.isna(val) or val == "N/A":
|
| 173 |
-
return ''
|
| 174 |
-
|
| 175 |
-
try:
|
| 176 |
-
min_val = color_ranges[col_name]['min']
|
| 177 |
-
max_val = color_ranges[col_name]['max']
|
| 178 |
-
|
| 179 |
-
# Normalize value to 0-1 range
|
| 180 |
-
if max_val > min_val:
|
| 181 |
-
normalized = (float(val) - min_val) / (max_val - min_val)
|
| 182 |
-
else:
|
| 183 |
-
normalized = 0.5
|
| 184 |
-
|
| 185 |
-
# Get color from colormap
|
| 186 |
-
color_rgba = colormap(normalized)
|
| 187 |
-
color_hex = rgb_to_hex(color_rgba)
|
| 188 |
-
|
| 189 |
-
return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
|
| 190 |
-
except (ValueError, TypeError):
|
| 191 |
-
return ''
|
| 192 |
-
|
| 193 |
-
# Apply styling to score columns using map (applymap is deprecated)
|
| 194 |
-
styler = df.style
|
| 195 |
-
for col in score_columns:
|
| 196 |
-
if col in df.columns:
|
| 197 |
-
styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])
|
| 198 |
-
|
| 199 |
-
# Format score columns to 2 decimal places
|
| 200 |
-
format_dict = {}
|
| 201 |
-
for col in score_columns:
|
| 202 |
-
if col in df.columns:
|
| 203 |
-
format_dict[col] = '{:.2f}'
|
| 204 |
-
|
| 205 |
-
if format_dict:
|
| 206 |
-
styler = styler.format(format_dict, na_rep='N/A')
|
| 207 |
-
|
| 208 |
-
return styler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation_service.py
DELETED
|
@@ -1,190 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Evaluation Service module for MTEB Turkish Leaderboard
|
| 4 |
-
Handles evaluation submissions and status tracking
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import time
|
| 8 |
-
import re
|
| 9 |
-
from typing import Optional, Tuple, List
|
| 10 |
-
import traceback
|
| 11 |
-
import pandas as pd
|
| 12 |
-
import gradio as gr
|
| 13 |
-
|
| 14 |
-
from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request
|
| 15 |
-
|
| 16 |
-
# Global state management for active evaluations
|
| 17 |
-
active_evaluations = {} # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def get_active_evaluations_status() -> str:
|
| 21 |
-
"""Show status of active evaluations"""
|
| 22 |
-
if not active_evaluations:
|
| 23 |
-
return "🟢 No active evaluation requests"
|
| 24 |
-
|
| 25 |
-
status_lines = []
|
| 26 |
-
for request_id, info in active_evaluations.items():
|
| 27 |
-
model_name = info["model_name"]
|
| 28 |
-
email = info["email"]
|
| 29 |
-
elapsed = int(time.time() - info["start_time"])
|
| 30 |
-
status = info.get("status", "PENDING")
|
| 31 |
-
status_lines.append(f"🔄 {model_name} ({email}) - {request_id} [{status}] ({elapsed}s)")
|
| 32 |
-
|
| 33 |
-
return "\n".join(status_lines)
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
|
| 37 |
-
"""Get active evaluations status and cancellation options"""
|
| 38 |
-
status_text = get_active_evaluations_status()
|
| 39 |
-
|
| 40 |
-
cancel_options = []
|
| 41 |
-
for request_id, info in active_evaluations.items():
|
| 42 |
-
model_name = info["model_name"]
|
| 43 |
-
cancel_options.append(f"{request_id} - {model_name}")
|
| 44 |
-
|
| 45 |
-
return status_text, cancel_options
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
def clear_active_evaluations() -> str:
|
| 49 |
-
"""Clear all active evaluations from tracking"""
|
| 50 |
-
global active_evaluations
|
| 51 |
-
count = len(active_evaluations)
|
| 52 |
-
active_evaluations.clear()
|
| 53 |
-
return f"✅ Cleared {count} active evaluation(s) from tracking"
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def cancel_active_evaluation(selection: str) -> str:
|
| 57 |
-
"""Cancel a selected active evaluation"""
|
| 58 |
-
if not selection:
|
| 59 |
-
return "❌ No evaluation selected for cancellation"
|
| 60 |
-
|
| 61 |
-
try:
|
| 62 |
-
request_id = selection.split(" - ")[0]
|
| 63 |
-
|
| 64 |
-
if request_id not in active_evaluations:
|
| 65 |
-
return f"❌ Evaluation {request_id} not found in active evaluations"
|
| 66 |
-
|
| 67 |
-
# Try to cancel via API
|
| 68 |
-
success = cancel_evaluation_request(request_id)
|
| 69 |
-
|
| 70 |
-
if success:
|
| 71 |
-
model_name = active_evaluations[request_id]["model_name"]
|
| 72 |
-
del active_evaluations[request_id]
|
| 73 |
-
return f"✅ Successfully cancelled evaluation for {model_name} (ID: {request_id})"
|
| 74 |
-
else:
|
| 75 |
-
return f"❌ Failed to cancel evaluation {request_id}. Check API connection."
|
| 76 |
-
|
| 77 |
-
except Exception as e:
|
| 78 |
-
return f"❌ Error cancelling evaluation: {str(e)}"
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
def _validate_evaluation_request(model_name: str, email: str = None) -> Optional[str]:
|
| 82 |
-
"""Validate evaluation request parameters"""
|
| 83 |
-
# Model name validation
|
| 84 |
-
if not model_name or not model_name.strip():
|
| 85 |
-
return "❌ Model name cannot be empty!"
|
| 86 |
-
|
| 87 |
-
model_name = model_name.strip()
|
| 88 |
-
|
| 89 |
-
# Check model name length (format: org/model-name)
|
| 90 |
-
if len(model_name) < 3:
|
| 91 |
-
return "❌ Model name too short!"
|
| 92 |
-
|
| 93 |
-
if len(model_name) > 256:
|
| 94 |
-
return "❌ Model name too long (maximum 256 characters)!"
|
| 95 |
-
|
| 96 |
-
# Check for valid HuggingFace model name format (must be org/model)
|
| 97 |
-
if '/' not in model_name:
|
| 98 |
-
return "❌ Invalid model name format! Must include organization (e.g., organization/model-name)"
|
| 99 |
-
|
| 100 |
-
if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
|
| 101 |
-
return "❌ Invalid model name format! Use format: organization/model-name"
|
| 102 |
-
|
| 103 |
-
# Email validation
|
| 104 |
-
if not email or not email.strip():
|
| 105 |
-
return "❌ Email address cannot be empty!"
|
| 106 |
-
|
| 107 |
-
email = email.strip()
|
| 108 |
-
|
| 109 |
-
if len(email) > 254:
|
| 110 |
-
return "❌ Email address too long!"
|
| 111 |
-
|
| 112 |
-
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
|
| 113 |
-
if not re.match(email_pattern, email):
|
| 114 |
-
return "❌ Invalid email address format!"
|
| 115 |
-
|
| 116 |
-
return None
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
|
| 120 |
-
try:
|
| 121 |
-
# Input validation
|
| 122 |
-
error_msg = _validate_evaluation_request(model_name, email)
|
| 123 |
-
if error_msg:
|
| 124 |
-
return error_msg, None
|
| 125 |
-
|
| 126 |
-
# Show progress
|
| 127 |
-
progress(0.1, desc="Sending evaluation request to API...")
|
| 128 |
-
|
| 129 |
-
# Send request to API - regardless of backend response, show success to user
|
| 130 |
-
api_response = send_evaluation_request_to_api(model_name, batch_size, email)
|
| 131 |
-
|
| 132 |
-
# Always show success message to user
|
| 133 |
-
# Backend errors (like duplicate requests) are handled by API and communicated via email
|
| 134 |
-
progress(1.0, desc="Request submitted successfully!")
|
| 135 |
-
|
| 136 |
-
# Return success message regardless of backend response
|
| 137 |
-
success_msg = f"""
|
| 138 |
-
✅ Evaluation request submitted successfully!
|
| 139 |
-
|
| 140 |
-
🤖 Model: {model_name}
|
| 141 |
-
📧 Email: {email}
|
| 142 |
-
|
| 143 |
-
📋 Next Steps:
|
| 144 |
-
⏱️ Your request will be reviewed by our system
|
| 145 |
-
📧 You will receive email notifications about the status of your evaluation
|
| 146 |
-
🔄 If you've submitted this model before, you'll be notified via email
|
| 147 |
-
|
| 148 |
-
Thank you for contributing to the Mizan Leaderboard!
|
| 149 |
-
"""
|
| 150 |
-
|
| 151 |
-
return success_msg.strip(), current_data
|
| 152 |
-
|
| 153 |
-
except Exception as e:
|
| 154 |
-
# Log error for debugging
|
| 155 |
-
print(f"❌ Error submitting evaluation: {str(e)}")
|
| 156 |
-
traceback.print_exc()
|
| 157 |
-
|
| 158 |
-
error_msg = f"""
|
| 159 |
-
❌ Failed to submit evaluation request
|
| 160 |
-
|
| 161 |
-
🤖 Model: {model_name}
|
| 162 |
-
📧 Email: {email}
|
| 163 |
-
|
| 164 |
-
⚠️ Error: Unable to connect to the evaluation service.
|
| 165 |
-
|
| 166 |
-
Please try again later or contact support if the problem persists.
|
| 167 |
-
"""
|
| 168 |
-
return error_msg.strip(), None
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
def refresh_evaluation_status() -> str:
|
| 172 |
-
"""Refresh status of all active evaluations"""
|
| 173 |
-
if not active_evaluations:
|
| 174 |
-
return "🟢 No active evaluations to refresh"
|
| 175 |
-
|
| 176 |
-
updated_count = 0
|
| 177 |
-
for request_id, info in active_evaluations.items():
|
| 178 |
-
try:
|
| 179 |
-
status_data = get_evaluation_status(request_id)
|
| 180 |
-
if status_data and "status" in status_data:
|
| 181 |
-
old_status = info.get("status", "UNKNOWN")
|
| 182 |
-
new_status = status_data["status"]
|
| 183 |
-
if old_status != new_status:
|
| 184 |
-
info["status"] = new_status
|
| 185 |
-
updated_count += 1
|
| 186 |
-
print(f"Status updated for {request_id}: {old_status} -> {new_status}")
|
| 187 |
-
except Exception as e:
|
| 188 |
-
print(f"Error refreshing status for {request_id}: {e}")
|
| 189 |
-
|
| 190 |
-
return f"🔄 Refreshed status for {len(active_evaluations)} evaluation(s). {updated_count} status change(s) detected."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_data.csv
CHANGED
|
@@ -1,33 +1,52 @@
|
|
| 1 |
-
Model,
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
newmindai/
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
newmindai/
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
sentence-transformers/
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
newmindai/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Rank (Borda),Model,Model Architecture,Tokenizer Type,Unique Token Count,Turkish Token Count,Turkish Token %,Pure Token Count,Pure Token %,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Contracts,Regulation,Caselaw,Score(Legal),Memory Usage (MB),Number of Parameters,Embed Dim,Vocab Size,Max Seq Length,Correlation,Model Type
|
| 2 |
+
1,google/embeddinggemma-300m,Gemma3TextModel,GemmaTokenizer,13697.0,5910.0,43.15,3980.0,29.06,67.23,65.42,77.74,45.05,80.02,55.06,69.22,83.97,39.56,28.38,50.63,1173.0,307M,768.0,262144.0,2048,0.51,Embedding
|
| 3 |
+
2,newmindai/bge-m3-stsb,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.46,63.53068666666667,74.24768333333334,43.9295,78.50975,50.142,70.8245,82.609,38.141000000000005,29.167,49.97233333333333,2165.0,567M,1024.0,250002.0,8194,0.6350506506291465,Embedding
|
| 4 |
+
3,BAAI/bge-m3,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,64.75,62.87,75.35,35.86,78.88,54.42,69.83,86.08,38.09,29.3,51.16,2165.0,567M,1024.0,250002.0,8194,0.61,Embedding
|
| 5 |
+
4,Lajavaness/bilingual-embedding-large,BilingualModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.62,62.468826666666665,74.15278333333333,42.2467,73.0609,52.248250000000006,70.6355,82.14099999999999,35.399,24.551,47.36366666666666,2135.0,559M,1024.0,250002.0,514,0.611419419101738,Embedding
|
| 6 |
+
5,newmindai/TurkEmbed4STS,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,62.67,62.41829666666666,69.69163333333334,44.2897,81.76675,49.135,67.2084,78.877,35.18,27.635,47.23066666666666,1164.0,305M,768.0,250048.0,8192,0.6839028854791485,Embedding
|
| 7 |
+
6,intfloat/multilingual-e5-large,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.14,61.50873666666666,71.79943333333334,41.1967,72.76185000000001,54.29849999999999,67.4872,85.38,33.178000000000004,22.299,46.952333333333335,2135.0,559M,1024.0,250002.0,514,0.5844910512151045,Embedding
|
| 8 |
+
7,ytu-ce-cosmos/turkish-e5-large,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,62.0,60.36150666666667,72.41818333333333,38.1709,70.86345,51.114,69.241,80.729,37.384,26.476,48.196333333333335,2135.0,559M,1024.0,250002.0,514,0.5608614724386807,Embedding
|
| 9 |
+
8,Alibaba-NLP/gte-multilingual-base,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,61.18,60.12285333333333,67.99526666666667,39.1645,75.99780000000001,50.516000000000005,66.94069999999999,76.012,36.391,27.066000000000003,46.489666666666665,1164.0,305M,768.0,250048.0,8192,0.6170556873432124,Embedding
|
| 10 |
+
9,nomic-ai/nomic-embed-text-v2-moe,NomicBertModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,60.63,59.54449333333332,68.53571666666666,43.3523,64.42945,52.6895,68.71549999999999,84.466,39.939,27.849,50.75133333333333,1813.0,475M,768.0,250048.0,2048,0.530989593067926,Embedding
|
| 11 |
+
10,magibu/embeddingmagibu-200m,Gemma3TextModel,GemmaTokenizer,29799.0,18946.0,63.58,8515.0,28.57,59.989025,59.247110000000006,66.4086,40.1472,74.98685,48.2505,66.4424,75.745,33.984,27.033,45.587,789.0,206M,768.0,131072.0,8192,0.585573508421718,Embedding
|
| 12 |
+
11,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.62,58.92842666666667,70.87778333333333,41.799,83.59875000000001,39.8555,58.511100000000006,65.403,7.61,1.289,24.767333333333337,1060.0,278M,768.0,250002.0,514,0.6495769869027372,Embedding
|
| 13 |
+
12,intfloat/multilingual-e5-large-instruct,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.91,58.85126,72.24580000000002,31.5179,72.91635,48.01275,69.5635,78.985,35.735,25.351000000000003,46.690333333333335,2135.0,559M,1024.0,250002.0,514,0.5663941110812728,Embedding
|
| 14 |
+
13,newmindai/TurkEmbed4Retrieval,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.1,58.36369333333333,64.78041666666665,47.468700000000005,64.0415,48.86425,66.6636,74.626,36.121,28.898000000000003,46.54833333333334,1164.0,305M,768.0,250048.0,512,0.5743432298546475,Embedding
|
| 15 |
+
14,newmindai/Mursit-Embed-Qwen3-1.7B-TR,Qwen3ForCausalLM,Qwen2TokenizerFast,10226.0,4128.0,40.37,2865.0,28.02,58.08,56.84,68.46,42.22,59.67,50.1,63.77,70.22,17.94,16.11,34.76,6563.0,1.7B,2048.0,151936.0,40960,0.44,CLM-Embedding
|
| 16 |
+
15,newmindai/Mursit-Large-TR-Retrieval,ModernBertModel,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,58.57,56.43,67.47,38.76,59.88,51.59,64.44,81.63,32.39,25.24,46.42,1539.0,403M,1024.0,59008.0,2048,0.49,Embedding
|
| 17 |
+
16,newmindai/modernbert-base-tr-uncased-allnli-stsb,ModernBertModel,PreTrainedTokenizerFast,20502.0,16007.0,78.08,6077.0,29.64,56.35,56.31918666666665,71.45993333333332,35.4615,82.83494999999999,35.11075,56.7288,62.937,15.297,17.466,31.899999999999995,514.0,134M,768.0,32000.0,8192,0.6637952581670423,Embedding
|
| 18 |
+
17,newmindai/Mursit-Base-TR-Retrieval,ModernBertModel,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,58.01,55.86,66.25,39.75,61.31,50.07,61.9,80.4,34.1,28.07,47.52,593.0,155M,768.0,59008.0,1024,0.49,Embedding
|
| 19 |
+
18,emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,BertModel,BertTokenizerFast,21076.0,17028.0,80.79,7263.0,34.46,56.03,54.33,68.42,23.64,74.94,42.29,62.39,72.83,22.88,20.78,38.83,421.0,110M,768.0,32000.0,512,0.62,Embedding
|
| 20 |
+
19,newmindai/TurkEmbed4STS-HD,NewForTokenClassification,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,56.14,54.25491999999999,67.61245,36.856100000000005,80.07815000000001,39.2535,47.4744,70.233,4.837000000000001,6.1690000000000005,27.079666666666668,1164.0,305M,768.0,250048.0,8192,0.6504462482545317,Embedding
|
| 21 |
+
20,ibm-granite/granite-embedding-278m-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,53.68,53.93412333333333,58.37791666666666,39.4453,60.1335,45.139,66.5749,67.254,24.53,16.229,36.004333333333335,1060.0,278M,768.0,250002.0,514,0.4137480806327822,Embedding
|
| 22 |
+
21,newmindai/Mursit-Embed-Qwen3-4B-TR,Qwen3ForCausalLM,Qwen2TokenizerFast,10226.0,4128.0,40.37,2865.0,28.02,56.47,53.65,67.29,36.68,58.36,51.12,54.77,69.25,24.21,17.56,37.0,15344.0,4B,2560.0,151936.0,40960,0.34,CLM-Embedding
|
| 23 |
+
22,nvidia/llama-embed-nemotron-8b,LlamaBidirectionalModel,PreTrainedTokenizerFast,12041.0,5485.0,45.55,3507.0,29.13,51.06448333333333,53.52449666666666,68.51398333333334,39.8189,58.1497,30.656,70.4839,52.095,28.802,16.756999999999998,32.55133333333333,28629.0,8B,4096.0,128256.0,131072,0.3817553384080386,CLM-Embedding
|
| 24 |
+
23,KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5,Qwen2Model,Qwen2TokenizerFast,10262.0,3234.0,31.51,2294.0,22.35,52.71,52.83622666666668,64.64263333333334,37.6148,57.5669,35.511500000000005,68.8453,32.014,35.608000000000004,30.239,32.62033333333334,1884.0,494M,896.0,151936.0,131072,0.4053257465375148,CLM-Embedding
|
| 25 |
+
24,ibm-granite/granite-embedding-107m-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,50.52,51.07249,55.654500000000006,34.6266,59.86395,41.3655,63.85189999999999,60.72,20.033,11.705,30.819333333333333,408.0,106M,384.0,250002.0,514,0.3807947055039975,Embedding
|
| 26 |
+
25,sentence-transformers/LaBSE,BertModel,BertTokenizerFast,19595.0,11061.0,56.45,5800.0,29.6,51.83,50.72844,63.18349999999999,25.5499,64.0111,38.4625,62.4352,63.809000000000005,15.122,13.838,30.923,1798.0,471M,768.0,501153.0,512,0.4794392790632775,Embedding
|
| 27 |
+
26,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,BertModel,BertTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,53.99,50.29915666666667,67.18508333333332,42.3102,79.30365,35.82925,26.867600000000003,56.875,0.8410000000000001,0.713,19.476333333333333,448.0,117M,384.0,250037.0,512,0.6043096711195243,Embedding
|
| 28 |
+
27,numind/NuSentiment-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,54.0,50.16527553055566,73.67280773306626,14.960431297201202,76.8943051047361,35.343,49.95583351777477,64.037,10.431,10.38,28.282666666666668,1060.0,278M,768.0,250002.0,514,0.5183345151582207,Embedding
|
| 29 |
+
28,dbmdz/bert-base-turkish-uncased,BertModel,BertTokenizerFast,14807.0,10953.0,73.97,5876.0,39.68,51.99,46.44,67.93,34.76,60.54,31.98,37.01,52.48,12.02,10.09,24.86,421.0,110M,768.0,32000.0,512,0.36,MLM
|
| 30 |
+
29,minishlab/potion-multilingual-128M,StaticModel,XLMRobertaTokenizerFast,18943.0,12657.0,66.82,5986.0,31.6,47.96,45.95582333333334,58.34376666666668,25.4021,59.76105,36.3395,49.9327,65.022,21.481,14.031,33.51133333333334,488.0,128M,256.0,500358.0,∞,0.4306555947403001,Embedding
|
| 31 |
+
30,ytu-ce-cosmos/turkish-large-bert-cased,BertForPreTraining,BertTokenizerFast,21076.0,16830.0,79.85,8670.0,41.14,50.7,45.3,67.43,34.24,60.11,28.68,36.04,47.57,5.93,3.85,19.12,1286.0,337M,1024.0,32000.0,1024,0.33,MLM
|
| 32 |
+
31,dbmdz/bert-base-turkish-cased,BertModel,BertTokenizerFast,21076.0,17028.0,80.79,7263.0,34.46,47.89,45.17,66.39,35.28,60.05,30.52,33.62,54.03,10.13,9.07,24.41,421.0,110M,768.0,32000.0,512,0.33,MLM
|
| 33 |
+
32,newmindai/TurkEmbed4STS-Static,StaticModel,Tokenizer,13258.0,7304.0,55.09,4075.0,30.74,45.45,43.05512,57.01745,19.3065,65.30815000000001,32.834500000000006,40.809,63.33800000000001,19.964,12.687,31.996333333333336,244.0,64M,256.0,250002.0,∞,0.4254717954565192,Embedding
|
| 34 |
+
33,KocLab-Bilkent/BERTurk-Legal,BertForMaskedLM,BertTokenizerFast,27482.0,19590.0,71.28,8228.0,29.94,46.44,42.02,60.61,26.24,59.51,25.8,37.94,61.4,15.51,20.99,32.63,703.0,184M,768.0,128000.0,512,0.34,MLM
|
| 35 |
+
34,newmindai/Mursit-Large,ModernBertForMaskedLM,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,44.65,41.75,62.95,25.34,58.04,27.4,35.01,42.74,11.29,17.1,23.71,1539.0,403M,1024.0,59008.0,2048,0.28,MLM
|
| 36 |
+
35,nomic-ai/nomic-embed-text-v1,NomicBertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,41.75,41.66643666666667,47.90213333333333,9.1279,60.08205,34.3415,56.8786,58.672,23.771,15.572,32.67166666666667,521.0,136M,768.0,30528.0,8192,0.426704518889946,Embedding
|
| 37 |
+
36,ytu-ce-cosmos/turkish-base-bert-uncased,BertForPreTraining,BertTokenizerFast,17128.0,14329.0,83.66,6062.0,35.39,50.54,40.95,66.2,25.68,58.21,20.46,34.2,45.94,10.21,6.28,20.81,421.0,110M,768.0,32000.0,512,0.3,MLM
|
| 38 |
+
37,nomic-ai/nomic-embed-text-v1.5,NomicBertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,41.21,40.30043666666667,48.92313333333334,9.3571,58.52505,33.8085,50.8884,56.711,13.358,5.783,25.284,521.0,136M,768.0,30528.0,8192,0.4147406606805225,Embedding
|
| 39 |
+
38,newmindai/Mursit-Base,ModernBertForMaskedLM,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,41.34,40.23,59.78,25.48,58.65,20.82,36.45,36.0,7.4,10.4,17.93,593.0,155M,768.0,59008.0,1024,0.28,MLM
|
| 40 |
+
39,mixedbread-ai/mxbai-embed-large-v1,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,40.92,40.03663,49.5437,15.9903,56.6587,31.74075,46.2497,43.591,10.564,9.052,21.069,1278.0,335M,1024.0,30522.0,512,0.3720971359650719,Embedding
|
| 41 |
+
40,jhu-clsp/mmBERT-base,ModernBertForMaskedLM,PreTrainedTokenizerFast,13585.0,5611.0,41.3,5710.0,42.03,43.87,39.65,61.84,26.77,59.25,15.83,34.56,34.45,1.33,0.68,12.15,1170.0,306M,768.0,256000.0,8192,0.34,MLM
|
| 42 |
+
41,boun-tabilab/TabiBERT,ModernBertForMaskedLM,PreTrainedTokenizerFast,32444.0,20388.0,62.84,12186.0,37.56,42.15,37.77,59.63,25.75,58.19,14.96,30.32,32.02,1.86,0.63,11.5,567.0,148M,768.0,50176.0,8192,0.32,MLM
|
| 43 |
+
42,sentence-transformers/all-MiniLM-L12-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,33.56,33.19119,44.84295,7.693999999999999,58.1998,20.928,34.2912,38.948,2.771,2.557,14.758666666666668,127.0,33M,384.0,30522.0,512,0.3620264346982647,Embedding
|
| 44 |
+
43,sentence-transformers/multi-qa-MiniLM-L6-cos-v1,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,33.81,32.343716666666666,44.079283333333336,5.5512,58.2895,24.92,28.8786,36.243,4.816,5.283,15.447333333333336,86.0,22M,384.0,30522.0,512,0.3352620465291676,Embedding
|
| 45 |
+
44,boun-tabi-LMG/TURNA,T5ForConditionalGeneration,T5TokenizerFast,21630.0,18600.0,85.99,7923.0,36.63,31.74,31.622866666666663,47.17373333333333,10.2619,56.6155,16.333,27.7302,34.89,8.883000000000001,4.55,16.107666666666667,1889.0,495M,1024.0,32128.0,1024,0.2188615462224458,Seq2Seq
|
| 46 |
+
45,sentence-transformers/all-mpnet-base-v2,MPNetForMaskedLM,MPNetTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,31.51,31.580113333333333,43.75221666666667,10.0253,55.9924,17.051750000000002,31.0789,32.477000000000004,2.243,3.31,12.67666666666667,417.0,109M,768.0,30527.0,514,0.3072208676420578,Embedding
|
| 47 |
+
46,sentence-transformers/all-MiniLM-L6-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,30.84,30.223826666666668,44.49228333333334,6.576,56.7533,16.46825,26.8293,32.039,3.052,3.514,12.868333333333334,86.0,22M,384.0,30522.0,512,0.3117993950335187,Embedding
|
| 48 |
+
47,minishlab/potion-base-8M,StaticModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,31.26,30.1419,42.5097,2.2195,57.8614,22.4745,25.6444,46.72,13.243,9.77,23.244333333333334,28.0,7M,256.0,29528.0,∞,0.363850332504128,Embedding
|
| 49 |
+
48,sentence-transformers/paraphrase-MiniLM-L6-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,29.68,28.88314666666667,44.08553333333333,5.963100000000001,56.6191,14.424,23.324,22.977,4.347,2.266,9.863333333333337,86.0,22M,384.0,30522.0,512,0.3273012423895394,Embedding
|
| 50 |
+
49,answerdotai/ModernBERT-base,ModernBertForMaskedLM,PreTrainedTokenizerFast,8170.0,3329.0,40.75,2188.0,26.78,22.33,23.8,39.06,2.01,53.95,2.1,21.91,7.92,0.62,0.43,2.99,568.0,149M,768.0,50368.0,8192,0.23,MLM
|
| 51 |
+
50,answerdotai/ModernBERT-large,ModernBertForMaskedLM,PreTrainedTokenizerFast,8170.0,3329.0,40.75,2188.0,26.78,22.46,23.74,39.44,3.9,53.73,1.8,19.85,6.12,0.62,0.59,2.44,1505.0,394M,1024.0,50368.0,8192,0.2,MLM
|
| 52 |
+
51,google-bert/bert-base-uncased,BertForMaskedLM,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,22.86,23.49519,40.2581,2.7069,53.06465,2.8455,18.6008,8.535,0.393,0.912,3.2800000000000007,417.0,109M,768.0,30522.0,512,0.1652374209194242,MLM
|
requirements.txt
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
-
gradio
|
| 2 |
pandas>=2.3.3
|
| 3 |
numpy>=2.3.4
|
| 4 |
-
|
|
|
|
| 5 |
requests>=2.32.5
|
| 6 |
python-dotenv>=1.1.1
|
| 7 |
itsdangerous>=2.2.0
|
|
|
|
| 1 |
+
gradio==5.50.0
|
| 2 |
pandas>=2.3.3
|
| 3 |
numpy>=2.3.4
|
| 4 |
+
plotly>=6.5.0
|
| 5 |
+
matplotlib>=3.10.0
|
| 6 |
requests>=2.32.5
|
| 7 |
python-dotenv>=1.1.1
|
| 8 |
itsdangerous>=2.2.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Mizan Turkish Leaderboard - HuggingFace Space Version
|
| 3 |
+
|
| 4 |
+
Clean, modular architecture for the public leaderboard.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from .core import column_registry, settings
|
| 8 |
+
from .data import DataTransformer, LeaderboardStyler
|
| 9 |
+
from .components import LeaderboardTab, DatasetTab, SubmitTab
|
| 10 |
+
|
| 11 |
+
__all__ = [
|
| 12 |
+
"column_registry",
|
| 13 |
+
"settings",
|
| 14 |
+
"DataTransformer",
|
| 15 |
+
"LeaderboardStyler",
|
| 16 |
+
"LeaderboardTab",
|
| 17 |
+
"DatasetTab",
|
| 18 |
+
"SubmitTab",
|
| 19 |
+
]
|
src/__pycache__/__init__.cpython-312.pyc
CHANGED
|
Binary files a/src/__pycache__/__init__.cpython-312.pyc and b/src/__pycache__/__init__.cpython-312.pyc differ
|
|
|
src/api/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API client modules."""
|
| 2 |
+
|
| 3 |
+
from .client import EvaluationApiClient
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"EvaluationApiClient",
|
| 7 |
+
]
|
src/api/__pycache__/__init__.cpython-312.pyc
CHANGED
|
Binary files a/src/api/__pycache__/__init__.cpython-312.pyc and b/src/api/__pycache__/__init__.cpython-312.pyc differ
|
|
|
src/api/__pycache__/client.cpython-312.pyc
CHANGED
|
Binary files a/src/api/__pycache__/client.cpython-312.pyc and b/src/api/__pycache__/client.cpython-312.pyc differ
|
|
|
src/api/client.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API Client Module
|
| 3 |
+
|
| 4 |
+
Handles communication with the evaluation backend.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Optional
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
from ..core.config import settings
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class EvaluationApiClient:
|
| 17 |
+
"""
|
| 18 |
+
Client for evaluation API operations.
|
| 19 |
+
|
| 20 |
+
Handles submission of evaluation requests to the backend.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
def __init__(self):
|
| 24 |
+
self.api_url = settings.api.url
|
| 25 |
+
self.auth = (settings.api.username, settings.api.password)
|
| 26 |
+
self.timeout = settings.api.timeout
|
| 27 |
+
|
| 28 |
+
def submit_evaluation(
|
| 29 |
+
self,
|
| 30 |
+
model_name: str,
|
| 31 |
+
email: str,
|
| 32 |
+
batch_size: int = 32
|
| 33 |
+
) -> bool:
|
| 34 |
+
"""
|
| 35 |
+
Submit an evaluation request to the API.
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
model_name: HuggingFace model identifier.
|
| 39 |
+
email: Email for notifications.
|
| 40 |
+
batch_size: Batch size for evaluation.
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
True if submission was successful.
|
| 44 |
+
"""
|
| 45 |
+
if not settings.api.is_configured:
|
| 46 |
+
logger.error("API not configured - cannot submit evaluation")
|
| 47 |
+
return False
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
payload = {
|
| 51 |
+
"model_name": model_name,
|
| 52 |
+
"model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
|
| 53 |
+
"batch_size": batch_size,
|
| 54 |
+
"email": email,
|
| 55 |
+
"model_type": "sentence-transformer"
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
response = requests.post(
|
| 59 |
+
f"{self.api_url}/api/mteb/request",
|
| 60 |
+
json=payload,
|
| 61 |
+
timeout=self.timeout,
|
| 62 |
+
auth=self.auth
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
if response.status_code == 200:
|
| 66 |
+
logger.info(f"Evaluation submitted successfully for {model_name}")
|
| 67 |
+
return True
|
| 68 |
+
else:
|
| 69 |
+
logger.error(f"API returned status {response.status_code}")
|
| 70 |
+
return False
|
| 71 |
+
|
| 72 |
+
except Exception as e:
|
| 73 |
+
logger.error(f"Error submitting evaluation: {e}")
|
| 74 |
+
return False
|
src/components/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""UI Components for Gradio interface."""
|
| 2 |
+
|
| 3 |
+
from .leaderboard import LeaderboardTab
|
| 4 |
+
from .dataset import DatasetTab
|
| 5 |
+
from .submit import SubmitTab
|
| 6 |
+
|
| 7 |
+
__all__ = [
|
| 8 |
+
"LeaderboardTab",
|
| 9 |
+
"DatasetTab",
|
| 10 |
+
"SubmitTab",
|
| 11 |
+
]
|
src/components/__pycache__/__init__.cpython-312.pyc
CHANGED
|
Binary files a/src/components/__pycache__/__init__.cpython-312.pyc and b/src/components/__pycache__/__init__.cpython-312.pyc differ
|
|
|
src/components/__pycache__/dataset.cpython-312.pyc
CHANGED
|
Binary files a/src/components/__pycache__/dataset.cpython-312.pyc and b/src/components/__pycache__/dataset.cpython-312.pyc differ
|
|
|
src/components/__pycache__/leaderboard.cpython-312.pyc
CHANGED
|
Binary files a/src/components/__pycache__/leaderboard.cpython-312.pyc and b/src/components/__pycache__/leaderboard.cpython-312.pyc differ
|
|
|
src/components/__pycache__/submit.cpython-312.pyc
CHANGED
|
Binary files a/src/components/__pycache__/submit.cpython-312.pyc and b/src/components/__pycache__/submit.cpython-312.pyc differ
|
|
|
src/components/dataset.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dataset Tab Component
|
| 3 |
+
|
| 4 |
+
Displays task and dataset information.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import html
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DatasetTab:
|
| 13 |
+
"""
|
| 14 |
+
Dataset information tab component.
|
| 15 |
+
|
| 16 |
+
Shows details about the evaluation tasks and datasets.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
def build(self) -> None:
|
| 20 |
+
"""Build the dataset tab UI."""
|
| 21 |
+
gr.Markdown("### MTEB Turkish + Turkish Legal Dataset Overview")
|
| 22 |
+
|
| 23 |
+
# Task name to dataset path mapping
|
| 24 |
+
task_to_dataset = {
|
| 25 |
+
'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
|
| 26 |
+
'XQuADRetrieval': 'google/xquad',
|
| 27 |
+
'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
|
| 28 |
+
'MKQARetrieval': 'apple/mkqa',
|
| 29 |
+
'MassiveIntentClassification': 'mteb/amazon_massive_intent',
|
| 30 |
+
'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
|
| 31 |
+
'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
|
| 32 |
+
'SIB200Classification': 'mteb/sib200',
|
| 33 |
+
'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
|
| 34 |
+
'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
|
| 35 |
+
'SIB200ClusteringS2S': 'mteb/sib200',
|
| 36 |
+
'XNLI': 'mteb/xnli',
|
| 37 |
+
'XNLIV2': 'mteb/xnli2.0-multi-pair',
|
| 38 |
+
'STS22.v2': 'mteb/sts22-crosslingual-sts'
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
# Create clickable task names
|
| 42 |
+
clickable_task_names = []
|
| 43 |
+
task_list = [
|
| 44 |
+
'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
|
| 45 |
+
'MassiveIntentClassification', 'MassiveScenarioClassification',
|
| 46 |
+
'MultilingualSentimentClassification', 'SIB200Classification',
|
| 47 |
+
'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
|
| 48 |
+
'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
for task_name in task_list:
|
| 52 |
+
dataset_path = task_to_dataset[task_name]
|
| 53 |
+
hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
|
| 54 |
+
clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{html.escape(task_name)}</a>'
|
| 55 |
+
clickable_task_names.append(clickable_name)
|
| 56 |
+
|
| 57 |
+
# Create dataset information table
|
| 58 |
+
dataset_data = pd.DataFrame({
|
| 59 |
+
'Task Name': clickable_task_names,
|
| 60 |
+
'Task Type': [
|
| 61 |
+
'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
|
| 62 |
+
'Classification', 'Classification',
|
| 63 |
+
'Classification', 'Classification',
|
| 64 |
+
'Classification', 'Classification',
|
| 65 |
+
'Clustering', 'PairClassification', 'PairClassification', 'STS'
|
| 66 |
+
],
|
| 67 |
+
'Description': [
|
| 68 |
+
'Turkish FAQ retrieval task',
|
| 69 |
+
'Turkish question answering retrieval',
|
| 70 |
+
'Historical Turkish document retrieval',
|
| 71 |
+
'Multilingual knowledge QA retrieval',
|
| 72 |
+
'Intent classification for Turkish',
|
| 73 |
+
'Scenario classification for Turkish',
|
| 74 |
+
'Multilingual sentiment classification',
|
| 75 |
+
'SIB200 language identification',
|
| 76 |
+
'Turkish movie review sentiment',
|
| 77 |
+
'Turkish product review sentiment',
|
| 78 |
+
'SIB200 clustering task',
|
| 79 |
+
'Turkish natural language inference',
|
| 80 |
+
'Enhanced Turkish NLI task',
|
| 81 |
+
'Turkish semantic textual similarity'
|
| 82 |
+
],
|
| 83 |
+
'Domain': [
|
| 84 |
+
'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
|
| 85 |
+
'Intent', 'Scenario',
|
| 86 |
+
'Sentiment', 'Language ID',
|
| 87 |
+
'Movies', 'Products',
|
| 88 |
+
'Language ID', 'NLI', 'NLI', 'STS'
|
| 89 |
+
],
|
| 90 |
+
'Samples': [
|
| 91 |
+
'~145K', '~1.19K', '~1.33K', '~10K',
|
| 92 |
+
'~5K', '~5K',
|
| 93 |
+
'211', '~899',
|
| 94 |
+
'~2.64K', '800',
|
| 95 |
+
'99', '~7.5K', '~5.01K', '~208'
|
| 96 |
+
]
|
| 97 |
+
})
|
| 98 |
+
|
| 99 |
+
gr.Dataframe(
|
| 100 |
+
value=dataset_data,
|
| 101 |
+
label="MTEB Turkish Task Details",
|
| 102 |
+
interactive=False,
|
| 103 |
+
wrap=True,
|
| 104 |
+
datatype=["html", "str", "str", "str", "str"]
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
# Turkish Legal Tasks Section
|
| 108 |
+
self._build_legal_tasks_section()
|
| 109 |
+
|
| 110 |
+
# Task distribution
|
| 111 |
+
self._build_task_distribution_section()
|
| 112 |
+
|
| 113 |
+
# Metrics explanation
|
| 114 |
+
self._build_metrics_explanation_section()
|
| 115 |
+
|
| 116 |
+
def _build_legal_tasks_section(self):
|
| 117 |
+
"""Build the Turkish Legal Tasks section."""
|
| 118 |
+
gr.Markdown("---")
|
| 119 |
+
gr.Markdown("### Turkish Legal Tasks")
|
| 120 |
+
|
| 121 |
+
legal_task_to_dataset = {
|
| 122 |
+
'TurkishLegalQA': 'newmindai/contract-retrieval',
|
| 123 |
+
'TurkishTaxRulings': 'newmindai/regulation-retrieval',
|
| 124 |
+
'TurkishCourtOfCassation': 'newmindai/caselaw-retrieval'
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
clickable_legal_task_names = []
|
| 128 |
+
for task_name in ['TurkishLegalQA', 'TurkishTaxRulings', 'TurkishCourtOfCassation']:
|
| 129 |
+
dataset_path = legal_task_to_dataset[task_name]
|
| 130 |
+
hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
|
| 131 |
+
clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{html.escape(task_name)}</a>'
|
| 132 |
+
clickable_legal_task_names.append(clickable_name)
|
| 133 |
+
|
| 134 |
+
legal_task_data = pd.DataFrame({
|
| 135 |
+
'Task Name': clickable_legal_task_names,
|
| 136 |
+
'Task Type': ['Contracts', 'Regulation', 'Case Law'],
|
| 137 |
+
'Description': [
|
| 138 |
+
'Turkish legal question answering retrieval',
|
| 139 |
+
'Turkish legal tax rulings retrieval',
|
| 140 |
+
'Turkish Court of Cassation caselaw retrieval'
|
| 141 |
+
],
|
| 142 |
+
'Domain': ['Contracts', 'Regulation', 'Caselaw'],
|
| 143 |
+
'Samples': ['272', '~120K', '~1.39K']
|
| 144 |
+
})
|
| 145 |
+
|
| 146 |
+
gr.Dataframe(
|
| 147 |
+
value=legal_task_data,
|
| 148 |
+
label="Turkish Legal Task Details",
|
| 149 |
+
interactive=False,
|
| 150 |
+
wrap=True,
|
| 151 |
+
datatype=["html", "str", "str", "str", "str"]
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
def _build_task_distribution_section(self):
|
| 155 |
+
"""Build the task distribution section."""
|
| 156 |
+
gr.Markdown("""
|
| 157 |
+
### Task Distribution:
|
| 158 |
+
|
| 159 |
+
**Turkish Tasks (14):**
|
| 160 |
+
- **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
|
| 161 |
+
- **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
|
| 162 |
+
- **Pair Classification**: 2 tasks (natural language inference)
|
| 163 |
+
- **Clustering**: 1 task (language clustering)
|
| 164 |
+
- **STS**: 1 task (semantic textual similarity)
|
| 165 |
+
|
| 166 |
+
**Turkish Legal Tasks (3):**
|
| 167 |
+
- **Contracts**: 1 task (Turkish legal QA retrieval)
|
| 168 |
+
- **Regulation**: 1 task (Turkish tax rulings retrieval)
|
| 169 |
+
- **Caselaw**: 1 task (Turkish Court of Cassation case law retrieval)
|
| 170 |
+
|
| 171 |
+
**Total: 17 tasks across 8 categories**
|
| 172 |
+
""")
|
| 173 |
+
|
| 174 |
+
# Statistics summary
|
| 175 |
+
stats_data = pd.DataFrame({
|
| 176 |
+
'Metric': [
|
| 177 |
+
'Total Tasks',
|
| 178 |
+
'Turkish Tasks',
|
| 179 |
+
'Legal Tasks',
|
| 180 |
+
'Task Categories',
|
| 181 |
+
'Languages',
|
| 182 |
+
'Avg. Tokens per Sample'
|
| 183 |
+
],
|
| 184 |
+
'Value': [
|
| 185 |
+
'17 tasks',
|
| 186 |
+
'14 tasks',
|
| 187 |
+
'3 tasks',
|
| 188 |
+
'8 categories',
|
| 189 |
+
'Turkish',
|
| 190 |
+
'~150 tokens'
|
| 191 |
+
],
|
| 192 |
+
'Notes': [
|
| 193 |
+
'Comprehensive evaluation: Turkish NLP + Legal',
|
| 194 |
+
'Classification, Retrieval, STS, NLI, Clustering',
|
| 195 |
+
'Contracts, Regulation, Caselaw',
|
| 196 |
+
'Turkish: 5 types, Legal: 3 types',
|
| 197 |
+
'Turkish-focused',
|
| 198 |
+
'Varies by task type and domain'
|
| 199 |
+
]
|
| 200 |
+
})
|
| 201 |
+
|
| 202 |
+
gr.Dataframe(
|
| 203 |
+
value=stats_data,
|
| 204 |
+
label="Dataset Statistics Summary",
|
| 205 |
+
interactive=False
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
    def _build_metrics_explanation_section(self):
        """Build the metrics explanation section.

        Renders one static Markdown block documenting every leaderboard
        column group (task-category scores, legal-category scores, tokenizer
        quality metrics and model info), then delegates to
        ``_build_about_section`` for the closing about/contact/links block.
        """
        gr.Markdown("""
        ---
        ### Metrics Explanation:

        **Task Categories:**
        - **MTEB Score**: Average performance by task categories (refers to Mean (TaskType))
        - **Mean (Task)**: Average performance across all individual tasks
        - **Classification**: Performance on Turkish classification tasks
        - **Clustering**: Performance on Turkish clustering tasks
        - **Pair Classification**: Performance on pair classification tasks (like NLI)
        - **Retrieval**: Performance on Turkish information retrieval tasks
        - **STS**: Performance on Semantic Textual Similarity tasks

        **Turkish Legal Categories:**
        - **Contracts**: Performance on Turkish legal contract analysis tasks
        - **Regulation**: Performance on Turkish legal regulation analysis tasks
        - **Caselaw**: Performance on Turkish Court of Cassation case law retrieval tasks

        ### Tokenizer Quality Metrics:
        - **Unique Token Count**: Number of unique tokens generated by the tokenizer on Turkish MMLU dataset
        - **Turkish Token Count**: How many unique tokens are valid Turkish words/morphemes
        - **Turkish Token %**: Percentage of unique tokens that are linguistically valid Turkish
        - **Pure Token Count**: How many unique tokens are morphologically pure (root words)
        - **Pure Token %**: Percentage of unique tokens that are root words without suffixes

        ### Model Information:
        - **Parameters**: Number of model parameters
        - **Embed Dim**: Embedding dimension size
        - **Max Seq Length**: Maximum sequence length the model can process
        - **Vocab Size**: Size of the model's vocabulary
        - **Model Architecture**: The underlying model architecture
        - **Tokenizer Type**: The tokenizer implementation used
        """)

        # About, Contact, and Links section
        self._build_about_section()
|
| 246 |
+
|
| 247 |
+
    def _build_about_section(self):
        """Build the about, contact, and links section.

        Static Markdown only: benchmark description, submission pointer,
        contact address and upstream repository links (MTEB and the
        tokenizer-benchmark tooling the tokenizer metrics come from).
        """
        gr.Markdown("""
        ---
        ### About Mizan:
        This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
        on Turkish language tasks across multiple domains including:
        - Text classification and sentiment analysis
        - Information retrieval and search
        - Semantic textual similarity
        - Text clustering and pair classification
        - **Turkish Legal**: Contract analysis, regulation, and case law retrieval

        ### Submit Your Model:
        Use the **Submit** tab to submit your Turkish embedding model for evaluation.
        Your request will be reviewed by administrators and you'll receive email notifications about the progress.

        ### Contact:
        For any questions or feedback, please contact info@newmind.ai

        ### Links:
        - **GitHub**: [embeddings-benchmark/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
        - **Github**: [malibayram/tokenizer_benchmark](https://github.com/malibayram/tokenizer_benchmark) - Tokenizer evaluation is done with code from this repository, developed by Mehmet Ali Bayram, which utilizes ITU NLP tools for Turkish linguistic analysis.
        """)
|
src/components/leaderboard.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Leaderboard Tab Component
|
| 3 |
+
|
| 4 |
+
Main leaderboard display with column filtering.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
from typing import Dict, List, Optional
|
| 9 |
+
import gradio as gr
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import plotly.graph_objects as go
|
| 13 |
+
|
| 14 |
+
from ..core.columns import column_registry, ColumnGroup
|
| 15 |
+
from ..core.config import settings
|
| 16 |
+
from ..data import DataTransformer, LeaderboardStyler
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class LeaderboardTab:
|
| 22 |
+
"""
|
| 23 |
+
Leaderboard tab component.
|
| 24 |
+
|
| 25 |
+
Displays the main ranking table with:
|
| 26 |
+
- Color-coded scores
|
| 27 |
+
- Column filtering via checkbox groups
|
| 28 |
+
- Clickable model links
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
    def __init__(self, data: pd.DataFrame):
        """Store leaderboard data and create display helpers.

        Args:
            data: Pre-loaded leaderboard rows; may be None or empty, in
                which case an empty table is rendered by ``_get_styled_data``.
        """
        self.data = data
        self.transformer = DataTransformer()
        self.styler = LeaderboardStyler()

        # UI components — populated in build(); None until then.
        self.leaderboard: Optional[gr.Dataframe] = None
        self._column_checkboxes: Dict[str, gr.CheckboxGroup] = {}
        self._selected_columns_state: Optional[gr.State] = None
        self._model_type_filter_state: Optional[gr.State] = None
        # NOTE(review): _search_state is never assigned a component in this
        # file — presumably a leftover; confirm before removing.
        self._search_state: Optional[gr.State] = None
|
| 42 |
+
|
| 43 |
+
def _get_styled_data(
|
| 44 |
+
self,
|
| 45 |
+
columns: List[str],
|
| 46 |
+
model_type_filter: str = "All"
|
| 47 |
+
) -> "pd.io.formats.style.Styler":
|
| 48 |
+
"""Get styled DataFrame for given columns."""
|
| 49 |
+
if self.data is None or self.data.empty:
|
| 50 |
+
empty = self.transformer.create_empty_dataframe()
|
| 51 |
+
return empty.style
|
| 52 |
+
|
| 53 |
+
# Apply model type filter
|
| 54 |
+
filtered_data = self.data.copy()
|
| 55 |
+
if model_type_filter != "All" and "Model Type" in filtered_data.columns:
|
| 56 |
+
filtered_data = filtered_data[filtered_data["Model Type"] == model_type_filter]
|
| 57 |
+
|
| 58 |
+
filtered = self.transformer.prepare_for_display(filtered_data, columns, add_links=False)
|
| 59 |
+
return self.styler.apply_styling(filtered)
|
| 60 |
+
|
| 61 |
+
def _get_column_groups(self) -> Dict[str, List[str]]:
|
| 62 |
+
"""Get optional columns organized by group (exclude default columns)."""
|
| 63 |
+
groups = {}
|
| 64 |
+
|
| 65 |
+
# Get default column names to exclude
|
| 66 |
+
default_cols = set(column_registry.default_columns)
|
| 67 |
+
|
| 68 |
+
# MTEB Task Scores (only optional ones)
|
| 69 |
+
mteb_cols = [col for col in column_registry.get_group_names(ColumnGroup.MTEB) if col not in default_cols]
|
| 70 |
+
if mteb_cols:
|
| 71 |
+
groups["MTEB Scores"] = mteb_cols
|
| 72 |
+
|
| 73 |
+
# Legal Task Scores (only optional ones)
|
| 74 |
+
legal_cols = [col for col in column_registry.get_group_names(ColumnGroup.LEGAL) if col not in default_cols]
|
| 75 |
+
if legal_cols:
|
| 76 |
+
groups["Legal Scores"] = legal_cols
|
| 77 |
+
|
| 78 |
+
# Correlation
|
| 79 |
+
corr_cols = [col for col in column_registry.get_group_names(ColumnGroup.CORRELATION) if col not in default_cols]
|
| 80 |
+
if corr_cols:
|
| 81 |
+
groups["Correlation"] = corr_cols
|
| 82 |
+
|
| 83 |
+
# Tokenizer Quality (only optional ones)
|
| 84 |
+
tok_cols = [col for col in column_registry.get_group_names(ColumnGroup.TOKENIZER) if col not in default_cols]
|
| 85 |
+
if tok_cols:
|
| 86 |
+
groups["Tokenizer Quality"] = tok_cols
|
| 87 |
+
|
| 88 |
+
# Additional Model Info (only optional ones)
|
| 89 |
+
model_info_cols = [col for col in column_registry.get_group_names(ColumnGroup.MODEL_INFO) if col not in default_cols]
|
| 90 |
+
if model_info_cols:
|
| 91 |
+
groups["Model Info"] = model_info_cols
|
| 92 |
+
|
| 93 |
+
return groups
|
| 94 |
+
|
| 95 |
+
def _filter_columns_handler(
|
| 96 |
+
self,
|
| 97 |
+
previous_selected: List[str],
|
| 98 |
+
model_type_filter: str,
|
| 99 |
+
*checkbox_values
|
| 100 |
+
) -> tuple:
|
| 101 |
+
"""Handle checkbox group changes with click-order tracking."""
|
| 102 |
+
# Collect all currently selected columns from all checkbox groups
|
| 103 |
+
currently_selected = set()
|
| 104 |
+
for selected_list in checkbox_values:
|
| 105 |
+
if selected_list:
|
| 106 |
+
for col_name in selected_list:
|
| 107 |
+
currently_selected.add(col_name)
|
| 108 |
+
|
| 109 |
+
previous_set = set(previous_selected)
|
| 110 |
+
|
| 111 |
+
# Find newly added columns (in current but not in previous)
|
| 112 |
+
newly_added = currently_selected - previous_set
|
| 113 |
+
|
| 114 |
+
# Find removed columns (in previous but not in current)
|
| 115 |
+
removed = previous_set - currently_selected
|
| 116 |
+
|
| 117 |
+
# Update the ordered list: keep previous order, remove deselected, append new
|
| 118 |
+
updated_selected = [col for col in previous_selected if col not in removed]
|
| 119 |
+
for col in newly_added:
|
| 120 |
+
updated_selected.append(col)
|
| 121 |
+
|
| 122 |
+
# Build final column list: defaults + selected optional in order
|
| 123 |
+
ordered_columns = list(column_registry.default_columns) + updated_selected
|
| 124 |
+
|
| 125 |
+
# Get styled data with model type filter
|
| 126 |
+
styled = self._get_styled_data(ordered_columns, model_type_filter)
|
| 127 |
+
datatypes = self.styler.get_datatypes(ordered_columns)
|
| 128 |
+
widths = self.styler.get_column_widths(ordered_columns)
|
| 129 |
+
|
| 130 |
+
return gr.update(value=styled, datatype=datatypes, column_widths=widths), updated_selected
|
| 131 |
+
|
| 132 |
+
def _model_type_filter_handler(self, previous_selected: List[str], model_type_filter: str) -> tuple:
|
| 133 |
+
"""Handle model type filter changes."""
|
| 134 |
+
# Build final column list: defaults + selected optional in order
|
| 135 |
+
ordered_columns = list(column_registry.default_columns) + previous_selected
|
| 136 |
+
|
| 137 |
+
# Get styled data with model type filter
|
| 138 |
+
styled = self._get_styled_data(ordered_columns, model_type_filter)
|
| 139 |
+
datatypes = self.styler.get_datatypes(ordered_columns)
|
| 140 |
+
widths = self.styler.get_column_widths(ordered_columns)
|
| 141 |
+
|
| 142 |
+
return gr.update(value=styled, datatype=datatypes, column_widths=widths), model_type_filter
|
| 143 |
+
|
| 144 |
+
def _model_type_and_plots_handler(
|
| 145 |
+
self,
|
| 146 |
+
previous_selected: List[str],
|
| 147 |
+
model_type_filter: str
|
| 148 |
+
) -> tuple:
|
| 149 |
+
"""Handle model type filter changes and update both leaderboard and plots."""
|
| 150 |
+
# Build final column list: defaults + selected optional in order
|
| 151 |
+
ordered_columns = list(column_registry.default_columns) + previous_selected
|
| 152 |
+
|
| 153 |
+
# Get styled data with model type filter
|
| 154 |
+
styled = self._get_styled_data(ordered_columns, model_type_filter)
|
| 155 |
+
datatypes = self.styler.get_datatypes(ordered_columns)
|
| 156 |
+
widths = self.styler.get_column_widths(ordered_columns)
|
| 157 |
+
|
| 158 |
+
# Update plots with filtered data
|
| 159 |
+
plot1 = self._get_pure_vs_mean_task_plot(model_type_filter)
|
| 160 |
+
plot2 = self._get_pure_vs_legal_score_plot(model_type_filter)
|
| 161 |
+
|
| 162 |
+
return gr.update(value=styled, datatype=datatypes, column_widths=widths), model_type_filter, plot1, plot2
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _create_bubble_plot(self, x_col: str, y_col: str, size_col: str,
|
| 166 |
+
title: str, xlabel: str, ylabel: str, model_type_filter: str = "All") -> Optional[go.Figure]:
|
| 167 |
+
"""
|
| 168 |
+
Create an interactive Plotly bubble plot for tokenizer visualization.
|
| 169 |
+
|
| 170 |
+
Features:
|
| 171 |
+
- Interactive hover, zoom, pan
|
| 172 |
+
- Text annotations on bubbles
|
| 173 |
+
- Viridis colormap
|
| 174 |
+
- Matches matplotlib styling
|
| 175 |
+
- Model type filtering
|
| 176 |
+
"""
|
| 177 |
+
try:
|
| 178 |
+
# Load leaderboard summary
|
| 179 |
+
file_path = settings.data.csv_file
|
| 180 |
+
if not file_path.exists():
|
| 181 |
+
logger.warning(f"Leaderboard data not found: {file_path}")
|
| 182 |
+
return None
|
| 183 |
+
|
| 184 |
+
df = pd.read_csv(file_path)
|
| 185 |
+
|
| 186 |
+
# Apply column name mappings from CSV to display names
|
| 187 |
+
csv_mapping = column_registry.get_csv_mapping()
|
| 188 |
+
df = df.rename(columns=csv_mapping)
|
| 189 |
+
|
| 190 |
+
# Apply model type filter
|
| 191 |
+
if model_type_filter != "All" and "Model Type" in df.columns:
|
| 192 |
+
df = df[df["Model Type"] == model_type_filter]
|
| 193 |
+
|
| 194 |
+
# Filter rows that have the required columns
|
| 195 |
+
required_cols = [x_col, y_col, size_col, 'Model']
|
| 196 |
+
if not all(col in df.columns for col in required_cols):
|
| 197 |
+
logger.warning(f"Missing required columns for plot")
|
| 198 |
+
return None
|
| 199 |
+
|
| 200 |
+
# Filter out rows with missing data
|
| 201 |
+
plot_df = df[required_cols].copy()
|
| 202 |
+
plot_df = plot_df.dropna(subset=[x_col, y_col, size_col])
|
| 203 |
+
|
| 204 |
+
if plot_df.empty:
|
| 205 |
+
logger.warning(f"No data available for plotting {x_col} vs {y_col}")
|
| 206 |
+
return None
|
| 207 |
+
|
| 208 |
+
# Prepare data
|
| 209 |
+
x = plot_df[x_col]
|
| 210 |
+
y = plot_df[y_col]
|
| 211 |
+
sizes = plot_df[size_col]
|
| 212 |
+
models = plot_df['Model']
|
| 213 |
+
|
| 214 |
+
# Normalize sizes for bubble plot (smaller bubbles for cleaner look)
|
| 215 |
+
size_min, size_max = sizes.min(), sizes.max()
|
| 216 |
+
if size_max > size_min:
|
| 217 |
+
normalized_sizes = 8 + (sizes - size_min) / (size_max - size_min) * 35
|
| 218 |
+
else:
|
| 219 |
+
normalized_sizes = np.full(len(sizes), 20)
|
| 220 |
+
|
| 221 |
+
# Create Plotly figure
|
| 222 |
+
fig = go.Figure()
|
| 223 |
+
|
| 224 |
+
# Add scatter trace with bubbles
|
| 225 |
+
fig.add_trace(go.Scatter(
|
| 226 |
+
x=x,
|
| 227 |
+
y=y,
|
| 228 |
+
mode='markers',
|
| 229 |
+
marker=dict(
|
| 230 |
+
size=normalized_sizes,
|
| 231 |
+
color=sizes, # Color by Turkish Token Count
|
| 232 |
+
colorscale='Viridis',
|
| 233 |
+
showscale=True,
|
| 234 |
+
colorbar=dict(
|
| 235 |
+
title=dict(text="Turkish<br>Token<br>Count", font=dict(size=12, family='Arial, sans-serif')),
|
| 236 |
+
thickness=12,
|
| 237 |
+
len=1
|
| 238 |
+
),
|
| 239 |
+
line=dict(width=0.5, color='rgba(0,0,0,0.3)'),
|
| 240 |
+
opacity=0.7
|
| 241 |
+
),
|
| 242 |
+
text=models,
|
| 243 |
+
hovertemplate='<b>%{text}</b><br>' +
|
| 244 |
+
f'{xlabel}: %{{x:.2f}}<br>' +
|
| 245 |
+
f'{ylabel}: %{{y:.0f}}<br>' +
|
| 246 |
+
f'{size_col}: %{{marker.color:.0f}}<br>' +
|
| 247 |
+
'<extra></extra>',
|
| 248 |
+
name='',
|
| 249 |
+
showlegend=False
|
| 250 |
+
))
|
| 251 |
+
|
| 252 |
+
# Get top 5 models by Pure Token Count for custom legend
|
| 253 |
+
top_5_df = plot_df.nlargest(5, y_col)
|
| 254 |
+
top_5_models = top_5_df['Model'].tolist()
|
| 255 |
+
|
| 256 |
+
# Build custom legend text using annotations (pixel-perfect control)
|
| 257 |
+
legend_lines = ["<b>Top 5 Models</b>"] + [f"{i}. {name}" for i, name in enumerate(top_5_models, 1)]
|
| 258 |
+
legend_text = "<br>".join(legend_lines)
|
| 259 |
+
|
| 260 |
+
# Update layout for responsive, clean display
|
| 261 |
+
fig.update_layout(
|
| 262 |
+
title=dict(
|
| 263 |
+
text=title,
|
| 264 |
+
font=dict(size=14, family='Arial, sans-serif', color='black'),
|
| 265 |
+
x=0.5,
|
| 266 |
+
xanchor='center',
|
| 267 |
+
y=0.98,
|
| 268 |
+
yanchor='top'
|
| 269 |
+
),
|
| 270 |
+
xaxis=dict(
|
| 271 |
+
title=dict(text=xlabel, font=dict(size=12, family='Arial, sans-serif')),
|
| 272 |
+
gridcolor='rgba(128,128,128,0.2)',
|
| 273 |
+
gridwidth=0.5,
|
| 274 |
+
showgrid=True,
|
| 275 |
+
zeroline=False
|
| 276 |
+
),
|
| 277 |
+
yaxis=dict(
|
| 278 |
+
title=dict(text=ylabel, font=dict(size=12, family='Arial, sans-serif')),
|
| 279 |
+
gridcolor='rgba(128,128,128,0.2)',
|
| 280 |
+
gridwidth=0.5,
|
| 281 |
+
showgrid=True,
|
| 282 |
+
zeroline=False
|
| 283 |
+
),
|
| 284 |
+
plot_bgcolor='white',
|
| 285 |
+
paper_bgcolor='white',
|
| 286 |
+
autosize=True,
|
| 287 |
+
hovermode='closest',
|
| 288 |
+
showlegend=False,
|
| 289 |
+
margin=dict(l=60, r=60, t=80, b=60)
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
# Add custom legend as annotation
|
| 293 |
+
fig.add_annotation(
|
| 294 |
+
text=legend_text,
|
| 295 |
+
xref='paper',
|
| 296 |
+
yref='paper',
|
| 297 |
+
x=1.14,
|
| 298 |
+
y=1.255,
|
| 299 |
+
xanchor='right',
|
| 300 |
+
yanchor='top',
|
| 301 |
+
showarrow=False,
|
| 302 |
+
font=dict(size=9, family='Arial, sans-serif', color='#333'),
|
| 303 |
+
align='left',
|
| 304 |
+
bgcolor='rgba(255,255,255,0.9)',
|
| 305 |
+
bordercolor='rgba(0,0,0,0.15)',
|
| 306 |
+
borderwidth=1,
|
| 307 |
+
borderpad=4
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
# Expand x-axis range for better spacing
|
| 311 |
+
x_min, x_max = x.min(), x.max()
|
| 312 |
+
x_range = x_max - x_min
|
| 313 |
+
fig.update_xaxes(range=[x_min - x_range * 0.05, x_max + x_range * 0.05])
|
| 314 |
+
|
| 315 |
+
return fig
|
| 316 |
+
|
| 317 |
+
except Exception as e:
|
| 318 |
+
logger.error(f"Error creating bubble plot: {e}")
|
| 319 |
+
return None
|
| 320 |
+
|
| 321 |
+
def _get_pure_vs_mean_task_plot(self, model_type_filter: str = "All") -> Optional[go.Figure]:
|
| 322 |
+
"""Get Plotly figure for Pure Token Count vs MTEB Score plot."""
|
| 323 |
+
return self._create_bubble_plot(
|
| 324 |
+
x_col='MTEB Score',
|
| 325 |
+
y_col='Pure Token Count',
|
| 326 |
+
size_col='Turkish Token Count',
|
| 327 |
+
title='Pure Token Count vs MTEB Score',
|
| 328 |
+
xlabel='MTEB Score (%)',
|
| 329 |
+
ylabel='Pure Token Count',
|
| 330 |
+
model_type_filter=model_type_filter
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
def _get_pure_vs_legal_score_plot(self, model_type_filter: str = "All") -> Optional[go.Figure]:
|
| 334 |
+
"""Get Plotly figure for Pure Token Count vs Legal Score plot."""
|
| 335 |
+
return self._create_bubble_plot(
|
| 336 |
+
x_col='Legal Score',
|
| 337 |
+
y_col='Pure Token Count',
|
| 338 |
+
size_col='Turkish Token Count',
|
| 339 |
+
title='Pure Token Count vs Legal Score',
|
| 340 |
+
xlabel='Legal Score (%)',
|
| 341 |
+
ylabel='Pure Token Count',
|
| 342 |
+
model_type_filter=model_type_filter
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
+
    def build(self) -> gr.Dataframe:
        """
        Build the leaderboard tab UI.

        Creates (in order): hidden state components, the model-type radio,
        the optional-column checkbox accordion, the main table, two bubble
        plots and usage notes, then wires all change events.  Component
        creation order matters: ``gr`` attaches components to the currently
        open Blocks context.

        Returns:
            The main leaderboard Dataframe component.
        """
        # Initial styled data (filter to All by default)
        initial_columns = column_registry.default_columns
        initial_styled = self._get_styled_data(initial_columns, "All")
        initial_datatypes = self.styler.get_datatypes(initial_columns)
        initial_widths = self.styler.get_column_widths(initial_columns)

        # State to track selected optional columns in click order
        self._selected_columns_state = gr.State([])

        # State to track the active model type filter
        self._model_type_filter_state = gr.State("All")

        # Optional columns grouped for the checkbox accordion
        column_groups = self._get_column_groups()

        # Model Type Filter (Radio buttons)
        model_type_choices = ["All", "CLM-Embedding", "Embedding", "MLM", "Seq2Seq"]
        model_type_radio = gr.Radio(
            choices=model_type_choices,
            value="All",
            label="Filter by Model Type",
            container=True,
        )

        # Create checkbox groups in a compact accordion layout
        checkbox_components = []

        with gr.Accordion("Optional Columns", open=False):
            with gr.Row():
                # One CheckboxGroup per column group, side by side.
                for group_name, columns in column_groups.items():
                    checkbox = gr.CheckboxGroup(
                        choices=columns,
                        value=[],
                        label=group_name,
                        container=True,
                    )
                    self._column_checkboxes[group_name] = checkbox
                    checkbox_components.append(checkbox)

        # Main leaderboard table (non-interactive; styled via pandas Styler)
        self.leaderboard = gr.Dataframe(
            value=initial_styled,
            datatype=initial_datatypes,
            column_widths=initial_widths,
            interactive=False,
            wrap=True,
            max_height=settings.ui.max_table_height,
            show_search=True,
            show_copy_button=True,
            show_fullscreen_button=True,
        )

        # Tokenizer visualizations
        gr.Markdown("### Tokenizer Quality Visualizations")
        gr.Markdown("""
        Interactive bubble plots showing tokenizer quality metrics vs model performance.
        Bubble size and color represent Turkish Token Count. Hover for details, zoom, and pan.
        """)

        with gr.Row():
            # Plot 1: Pure Token Count vs Mean Task
            self.plot_mean_task = gr.Plot(
                value=self._get_pure_vs_mean_task_plot("All"),
                label="Pure Token Count vs Mean Task (MTEB)",
                show_label=False,
            )

            # Plot 2: Pure Token Count vs Legal Score
            self.plot_legal_score = gr.Plot(
                value=self._get_pure_vs_legal_score_plot("All"),
                label="Pure Token Count vs Score(Legal)",
                show_label=False,
            )

        # Usage instructions
        gr.Markdown("""
        ### How to Use:
        - **Search**: Use the search box to find specific models
        - **Color Coding**: Scores are color-coded from red (low) to green (high)
        - **Sorting**: Click on column headers to sort
        - **Rankings**: Models ranked by MTEB Score
        - **Toggle Columns**: Use the checkboxes above to show/hide additional metrics
        - **Filter by Model Type**: Use the radio buttons to filter models by their type
        """)

        # Wire up events (needs self.leaderboard and both plots to exist)
        self._setup_events(checkbox_components, model_type_radio)

        return self.leaderboard
|
| 441 |
+
|
| 442 |
+
def _setup_events(
|
| 443 |
+
self,
|
| 444 |
+
checkbox_components: List[gr.CheckboxGroup],
|
| 445 |
+
model_type_radio: gr.Radio
|
| 446 |
+
):
|
| 447 |
+
"""Set up event handlers."""
|
| 448 |
+
# Each checkbox group triggers column filtering with state tracking
|
| 449 |
+
for checkbox in checkbox_components:
|
| 450 |
+
checkbox.change(
|
| 451 |
+
fn=self._filter_columns_handler,
|
| 452 |
+
inputs=[self._selected_columns_state, self._model_type_filter_state] + checkbox_components,
|
| 453 |
+
outputs=[self.leaderboard, self._selected_columns_state]
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
# Model type radio triggers filtering and plot updates
|
| 457 |
+
model_type_radio.change(
|
| 458 |
+
fn=self._model_type_and_plots_handler,
|
| 459 |
+
inputs=[self._selected_columns_state, model_type_radio],
|
| 460 |
+
outputs=[self.leaderboard, self._model_type_filter_state, self.plot_mean_task, self.plot_legal_score]
|
| 461 |
+
)
|
src/components/submit.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Submit Tab Component
|
| 3 |
+
|
| 4 |
+
Model evaluation submission with HuggingFace authentication.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import re
|
| 9 |
+
from typing import Optional, Tuple
|
| 10 |
+
import gradio as gr
|
| 11 |
+
|
| 12 |
+
from ..api import EvaluationApiClient
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class SubmitTab:
|
| 18 |
+
"""
|
| 19 |
+
Submit evaluation tab component.
|
| 20 |
+
|
| 21 |
+
Provides:
|
| 22 |
+
- HuggingFace OAuth login
|
| 23 |
+
- Model submission form
|
| 24 |
+
- Email notification setup
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
    def __init__(self):
        """Create the API client; UI components stay None until build."""
        self.api_client = EvaluationApiClient()

        # UI components — populated when the tab is built; None until then.
        self.model_input: Optional[gr.Textbox] = None
        self.email_input: Optional[gr.Textbox] = None
        self.submit_btn: Optional[gr.Button] = None
        self.login_button: Optional[gr.LoginButton] = None
        self.result_output: Optional[gr.HTML] = None
| 37 |
+
def _validate_model_name(self, model_name: str) -> Optional[str]:
|
| 38 |
+
"""Validate model name format."""
|
| 39 |
+
if not model_name or not model_name.strip():
|
| 40 |
+
return "Model name cannot be empty!"
|
| 41 |
+
|
| 42 |
+
model_name = model_name.strip()
|
| 43 |
+
|
| 44 |
+
if len(model_name) < 3:
|
| 45 |
+
return "Model name too short!"
|
| 46 |
+
|
| 47 |
+
if len(model_name) > 256:
|
| 48 |
+
return "Model name too long (maximum 256 characters)!"
|
| 49 |
+
|
| 50 |
+
if '/' not in model_name:
|
| 51 |
+
return "Invalid format! Must include organization (e.g., organization/model-name)"
|
| 52 |
+
|
| 53 |
+
if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
|
| 54 |
+
return "Invalid format! Use format: organization/model-name"
|
| 55 |
+
|
| 56 |
+
return None
|
| 57 |
+
|
| 58 |
+
def _validate_email(self, email: str) -> Optional[str]:
|
| 59 |
+
"""Validate email format."""
|
| 60 |
+
if not email or not email.strip():
|
| 61 |
+
return "Email address cannot be empty!"
|
| 62 |
+
|
| 63 |
+
email = email.strip()
|
| 64 |
+
|
| 65 |
+
if len(email) > 254:
|
| 66 |
+
return "Email address too long!"
|
| 67 |
+
|
| 68 |
+
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
|
| 69 |
+
if not re.match(email_pattern, email):
|
| 70 |
+
return "Invalid email address format!"
|
| 71 |
+
|
| 72 |
+
return None
|
| 73 |
+
|
| 74 |
+
def _handle_submit(self, model_name: str, email: str, profile) -> str:
|
| 75 |
+
"""Handle evaluation submission."""
|
| 76 |
+
# Authentication check
|
| 77 |
+
if profile is None:
|
| 78 |
+
return "<p style='color: red; font-weight: bold;'>⚠️ Authentication required. Please log in with your Hugging Face account.</p>"
|
| 79 |
+
|
| 80 |
+
# Check for local dev mock auth
|
| 81 |
+
if isinstance(profile, str) and profile == "Sign in with Hugging Face":
|
| 82 |
+
return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"
|
| 83 |
+
|
| 84 |
+
# Validate model name
|
| 85 |
+
model_error = self._validate_model_name(model_name)
|
| 86 |
+
if model_error:
|
| 87 |
+
return f"<p style='color: red; font-weight: bold;'>❌ {model_error}</p>"
|
| 88 |
+
|
| 89 |
+
# Validate email
|
| 90 |
+
email_error = self._validate_email(email)
|
| 91 |
+
if email_error:
|
| 92 |
+
return f"<p style='color: red; font-weight: bold;'>❌ {email_error}</p>"
|
| 93 |
+
|
| 94 |
+
# Submit to API
|
| 95 |
+
model_name = model_name.strip()
|
| 96 |
+
email = email.strip()
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
success = self.api_client.submit_evaluation(model_name, email)
|
| 100 |
+
|
| 101 |
+
if success:
|
| 102 |
+
return f"""
|
| 103 |
+
<div style='padding: 16px; background: #d4edda; border-radius: 8px; border: 1px solid #c3e6cb; color: #155724;'>
|
| 104 |
+
<h3 style='color: #155724; margin: 0 0 12px 0;'>✅ Evaluation Request Submitted!</h3>
|
| 105 |
+
<p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Model:</strong> {model_name}</p>
|
| 106 |
+
<p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Email:</strong> {email}</p>
|
| 107 |
+
<hr style='margin: 12px 0; border-color: #c3e6cb;'>
|
| 108 |
+
<p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Next Steps:</strong></p>
|
| 109 |
+
<ul style='color: #155724; margin: 8px 0; padding-left: 20px;'>
|
| 110 |
+
<li style='color: #155724;'>Your request will be reviewed by our system</li>
|
| 111 |
+
<li style='color: #155724;'>You will receive email notifications about the status</li>
|
| 112 |
+
<li style='color: #155724;'>Results will appear on the leaderboard when complete</li>
|
| 113 |
+
</ul>
|
| 114 |
+
<p style='color: #155724; margin-top: 12px; font-style: italic;'>Thank you for contributing to the Mizan Leaderboard!</p>
|
| 115 |
+
</div>
|
| 116 |
+
"""
|
| 117 |
+
else:
|
| 118 |
+
return """
|
| 119 |
+
<div style='padding: 16px; background: #f8d7da; border-radius: 8px; border: 1px solid #f5c6cb;'>
|
| 120 |
+
<h3 style='color: #721c24; margin: 0 0 8px 0;'>❌ Submission Failed</h3>
|
| 121 |
+
<p>Unable to connect to the evaluation service. Please try again later.</p>
|
| 122 |
+
</div>
|
| 123 |
+
"""
|
| 124 |
+
except Exception as e:
|
| 125 |
+
logger.error(f"Error submitting evaluation: {e}")
|
| 126 |
+
return f"""
|
| 127 |
+
<div style='padding: 16px; background: #f8d7da; border-radius: 8px; border: 1px solid #f5c6cb;'>
|
| 128 |
+
<h3 style='color: #721c24; margin: 0 0 8px 0;'>❌ Error</h3>
|
| 129 |
+
<p>An unexpected error occurred. Please try again later.</p>
|
| 130 |
+
</div>
|
| 131 |
+
"""
|
| 132 |
+
|
| 133 |
+
def build(self) -> None:
|
| 134 |
+
"""Build the submit tab UI."""
|
| 135 |
+
gr.Markdown("### Submit Model for Evaluation")
|
| 136 |
+
gr.Markdown("""
|
| 137 |
+
Submit your Turkish embedding model for evaluation on the Mizan benchmark.
|
| 138 |
+
**Authentication with Hugging Face is required to submit evaluations.**
|
| 139 |
+
""")
|
| 140 |
+
|
| 141 |
+
# OAuth login button
|
| 142 |
+
self.login_button = gr.LoginButton(value="Sign in with Hugging Face")
|
| 143 |
+
|
| 144 |
+
self.model_input = gr.Textbox(
|
| 145 |
+
label="Model Name",
|
| 146 |
+
placeholder="sentence-transformers/your-model",
|
| 147 |
+
info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
self.email_input = gr.Textbox(
|
| 151 |
+
label="Email Address",
|
| 152 |
+
placeholder="your.email@example.com",
|
| 153 |
+
info="Email for notifications about evaluation status and results"
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
self.submit_btn = gr.Button(
|
| 157 |
+
"Submit",
|
| 158 |
+
variant="primary",
|
| 159 |
+
size="lg"
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
# Result output
|
| 163 |
+
self.result_output = gr.HTML(label="Status")
|
| 164 |
+
|
| 165 |
+
# Wire up submit button
|
| 166 |
+
self.submit_btn.click(
|
| 167 |
+
fn=self._handle_submit,
|
| 168 |
+
inputs=[self.model_input, self.email_input, self.login_button],
|
| 169 |
+
outputs=[self.result_output]
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
# Information about the evaluation process
|
| 173 |
+
gr.Markdown("""
|
| 174 |
+
### Evaluation Process:
|
| 175 |
+
1. **Sign In**: First, sign in with your Hugging Face account using the button above
|
| 176 |
+
2. **Submit Request**: Fill out the form with your model details and email
|
| 177 |
+
3. **Admin Review**: Your request will be reviewed by administrators
|
| 178 |
+
4. **Evaluation**: If approved, your model will be evaluated on Mizan benchmark
|
| 179 |
+
5. **Results**: You'll receive email notifications and results will appear on the leaderboard
|
| 180 |
+
|
| 181 |
+
### Important Notes:
|
| 182 |
+
- **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
|
| 183 |
+
- You'll receive email updates about your request status
|
| 184 |
+
- Make sure your model is publicly available on HuggingFace
|
| 185 |
+
- Valid email address is required for receiving results
|
| 186 |
+
""")
|
src/core/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core modules - configuration and column definitions."""
|
| 2 |
+
|
| 3 |
+
from .columns import column_registry, ColumnType, ColumnGroup, ColumnDefinition
|
| 4 |
+
from .config import settings
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"column_registry",
|
| 8 |
+
"ColumnType",
|
| 9 |
+
"ColumnGroup",
|
| 10 |
+
"ColumnDefinition",
|
| 11 |
+
"settings",
|
| 12 |
+
]
|
src/core/__pycache__/__init__.cpython-312.pyc
CHANGED
|
Binary files a/src/core/__pycache__/__init__.cpython-312.pyc and b/src/core/__pycache__/__init__.cpython-312.pyc differ
|
|
|
src/core/__pycache__/columns.cpython-312.pyc
CHANGED
|
Binary files a/src/core/__pycache__/columns.cpython-312.pyc and b/src/core/__pycache__/columns.cpython-312.pyc differ
|
|
|
src/core/__pycache__/config.cpython-312.pyc
CHANGED
|
Binary files a/src/core/__pycache__/config.cpython-312.pyc and b/src/core/__pycache__/config.cpython-312.pyc differ
|
|
|
src/core/columns.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Centralized Column Definitions
|
| 3 |
+
|
| 4 |
+
Single source of truth for all leaderboard columns.
|
| 5 |
+
Add new columns here and they propagate everywhere automatically.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from enum import Enum, auto
|
| 10 |
+
from typing import List, Dict, Optional
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ColumnType(Enum):
    """Column data types for Gradio.

    Values are the exact datatype strings that gr.Dataframe expects.
    """
    NUMBER = "number"  # numeric cell
    STRING = "str"     # plain-text cell ("str" is Gradio's datatype name)
    HTML = "html"      # rendered-HTML cell (e.g. model links)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class ColumnGroup(Enum):
    """Column groupings for organization and filtering.

    Used by ColumnRegistry.get_by_group() to select related columns.
    """
    CORE = auto()         # Always visible: Rank, Model
    LEGAL = auto()        # Legal benchmark scores
    MTEB = auto()         # MTEB task type scores
    TOKENIZER = auto()    # Tokenizer quality metrics
    MODEL_INFO = auto()   # Model metadata
    CORRELATION = auto()  # Correlation metrics
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class ColumnDefinition:
    """
    Complete definition for a leaderboard column.

    This is the single source of truth - all column metadata lives here.
    """
    name: str                                      # display name shown in the UI
    api_name: Optional[str] = None                 # CSV/API key when it differs from `name`
    column_type: ColumnType = ColumnType.STRING    # Gradio datatype for the cell
    group: ColumnGroup = ColumnGroup.CORE          # logical grouping for filtering
    width: str = "120px"                           # CSS column width
    decimals: int = 2                              # decimal places for numeric display
    default_visible: bool = True                   # shown without user toggling?
    colorize: bool = False                         # apply the score color gradient?
    description: str = ""                          # tooltip / help text

    @property
    def csv_key(self) -> str:
        """Get the key used in CSV files."""
        # Falls back to the display name when no separate api_name is set.
        return self.api_name or self.name
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Ordered list of every leaderboard column. Order here is display order;
# `default_visible` controls the initial column set, `colorize` marks the
# score columns that receive the red-yellow-green gradient.
COLUMN_DEFINITIONS: List[ColumnDefinition] = [
    # 1. Rank (always first)
    ColumnDefinition(
        name="Rank", column_type=ColumnType.NUMBER, group=ColumnGroup.CORE,
        width="50px", decimals=0, default_visible=True,
        description="Rank by MTEB Score (Mean TaskType)"
    ),
    # 2. Model (always second)
    ColumnDefinition(
        name="Model", column_type=ColumnType.HTML, group=ColumnGroup.CORE,
        width="280px", default_visible=True, colorize=False,
        description="Model name with HuggingFace link"
    ),
    # 3. MTEB Score - default
    ColumnDefinition(
        name="MTEB Score", api_name="Mean (TaskType)",
        column_type=ColumnType.NUMBER, group=ColumnGroup.MTEB,
        width="140px", default_visible=True, colorize=True,
        description="MTEB Score: Average of task type category scores"
    ),
    # 4. Legal Score - default
    ColumnDefinition(
        name="Legal Score", api_name="Score(Legal)",
        column_type=ColumnType.NUMBER, group=ColumnGroup.LEGAL,
        width="120px", default_visible=True, colorize=True,
        description="Mean of legal benchmark scores (Contracts, Regulation, Caselaw)"
    ),
    # 5. Pure Token Count - default
    ColumnDefinition(
        name="Pure Token Count", column_type=ColumnType.NUMBER,
        group=ColumnGroup.TOKENIZER, width="150px", decimals=0,
        default_visible=True,
        description="Tokens that are morphologically pure"
    ),
    # 6. Max Sequence Length - default
    ColumnDefinition(
        name="Max Sequence Length", api_name="Max Tokens",
        column_type=ColumnType.NUMBER, group=ColumnGroup.MODEL_INFO,
        width="160px", decimals=0, default_visible=True,
        description="Maximum sequence length"
    ),
    # 7. Parameters - default
    ColumnDefinition(
        name="Parameters", api_name="Number of Parameters",
        column_type=ColumnType.NUMBER, group=ColumnGroup.MODEL_INFO,
        width="120px", decimals=0, default_visible=True,
        description="Number of model parameters (e.g., 1.2B)"
    ),
    # 8. Model Architecture - default
    ColumnDefinition(
        name="Model Architecture", column_type=ColumnType.STRING,
        group=ColumnGroup.MODEL_INFO, width="180px", default_visible=True,
        description="Underlying model architecture (e.g., XLMRobertaModel)"
    ),
    # 9. Mean (Task) - optional
    ColumnDefinition(
        name="Mean (Task)", column_type=ColumnType.NUMBER,
        group=ColumnGroup.MTEB, width="120px", default_visible=False,
        colorize=True,
        description="Average of all individual task scores"
    ),
    # 10. Contracts - optional
    ColumnDefinition(
        name="Contracts", column_type=ColumnType.NUMBER,
        group=ColumnGroup.LEGAL, width="110px", default_visible=False,
        colorize=True,
        description="Performance on Turkish legal contract analysis"
    ),
    # 11. Regulation - optional
    ColumnDefinition(
        name="Regulation", column_type=ColumnType.NUMBER,
        group=ColumnGroup.LEGAL, width="110px", default_visible=False,
        colorize=True,
        description="Performance on Turkish tax rulings retrieval"
    ),
    # 12. Caselaw - optional
    ColumnDefinition(
        name="Caselaw", column_type=ColumnType.NUMBER,
        group=ColumnGroup.LEGAL, width="110px", default_visible=False,
        colorize=True,
        description="Performance on Court of Cassation case retrieval"
    ),
    # 13. Classification - optional
    ColumnDefinition(
        name="Classification", column_type=ColumnType.NUMBER,
        group=ColumnGroup.MTEB, width="130px", default_visible=False,
        colorize=True,
        description="Performance on Turkish classification tasks"
    ),
    # 14. Clustering - optional
    ColumnDefinition(
        name="Clustering", column_type=ColumnType.NUMBER,
        group=ColumnGroup.MTEB, width="120px", default_visible=False,
        colorize=True,
        description="Performance on Turkish clustering tasks"
    ),
    # 15. Pair Classification - optional
    ColumnDefinition(
        name="Pair Classification", api_name="PairClassification",
        column_type=ColumnType.NUMBER, group=ColumnGroup.MTEB,
        width="150px", default_visible=False, colorize=True,
        description="Performance on pair classification tasks (NLI)"
    ),
    # 16. Retrieval - optional
    ColumnDefinition(
        name="Retrieval", column_type=ColumnType.NUMBER,
        group=ColumnGroup.MTEB, width="120px", default_visible=False,
        colorize=True,
        description="Performance on information retrieval tasks"
    ),
    # 17. STS - optional
    ColumnDefinition(
        name="STS", column_type=ColumnType.NUMBER,
        group=ColumnGroup.MTEB, width="100px", default_visible=False,
        colorize=True,
        description="Performance on Semantic Textual Similarity tasks"
    ),
    # 18. Correlation - optional
    ColumnDefinition(
        name="Correlation", column_type=ColumnType.NUMBER,
        group=ColumnGroup.CORRELATION, width="120px", decimals=3,
        default_visible=False, colorize=True,
        description="Weighted average of correlation metrics"
    ),
    # 19. Tokenizer Type - optional
    ColumnDefinition(
        name="Tokenizer Type", column_type=ColumnType.STRING,
        group=ColumnGroup.TOKENIZER, width="180px", default_visible=False,
        description="Tokenizer implementation type"
    ),
    # 20. Unique Token Count - optional
    ColumnDefinition(
        name="Unique Token Count", column_type=ColumnType.NUMBER,
        group=ColumnGroup.TOKENIZER, width="160px", decimals=0,
        default_visible=False,
        description="Number of unique tokens on Turkish MMLU"
    ),
    # 21. Turkish Token Count - optional
    ColumnDefinition(
        name="Turkish Token Count", column_type=ColumnType.NUMBER,
        group=ColumnGroup.TOKENIZER, width="170px", decimals=0,
        default_visible=False,
        description="Unique tokens that are valid Turkish"
    ),
    # 22. Turkish Token % - optional
    ColumnDefinition(
        name="Turkish Token %", column_type=ColumnType.NUMBER,
        group=ColumnGroup.TOKENIZER, width="140px", default_visible=False,
        description="Percentage of valid Turkish tokens"
    ),
    # 23. Pure Token % - optional
    ColumnDefinition(
        name="Pure Token %", column_type=ColumnType.NUMBER,
        group=ColumnGroup.TOKENIZER, width="130px", default_visible=False,
        description="Percentage of pure root word tokens"
    ),
    # 24. Embed Dim - optional
    ColumnDefinition(
        name="Embed Dim", api_name="Embedding Dimensions",
        column_type=ColumnType.NUMBER, group=ColumnGroup.MODEL_INFO,
        width="120px", decimals=0, default_visible=False,
        description="Embedding dimension size"
    ),
    # 25. Vocab Size - optional
    ColumnDefinition(
        name="Vocab Size", column_type=ColumnType.NUMBER,
        group=ColumnGroup.MODEL_INFO, width="120px", decimals=0,
        default_visible=False,
        description="Vocabulary size"
    ),
    # 26. Model Type - optional
    ColumnDefinition(
        name="Model Type", column_type=ColumnType.STRING,
        group=ColumnGroup.MODEL_INFO, width="130px", default_visible=False,
        description="Model type: Embedding, MLM, CLM-Embedding, or Seq2Seq"
    ),
]
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
class ColumnRegistry:
    """
    Central registry for column definitions.

    Wraps a list of ColumnDefinition objects and provides lookup by display
    name, by CSV key, and by group, plus convenience views over the set
    (default / optional / colorized / numeric column names).
    """

    def __init__(self, definitions: Optional[List[ColumnDefinition]] = None):
        """Initialize the registry.

        Args:
            definitions: Column definitions to register. Defaults to the
                module-level COLUMN_DEFINITIONS when omitted.
        """
        # `is not None` (rather than `definitions or ...`) so that an
        # explicitly-passed empty list is honored instead of silently
        # falling back to the defaults.
        self._definitions = definitions if definitions is not None else COLUMN_DEFINITIONS
        # Index by display name and by CSV key for O(1) lookups.
        self._by_name: Dict[str, ColumnDefinition] = {
            col.name: col for col in self._definitions
        }
        self._by_csv_key: Dict[str, ColumnDefinition] = {
            col.csv_key: col for col in self._definitions
        }

    @property
    def all_columns(self) -> List[str]:
        """All column names in order."""
        return [col.name for col in self._definitions]

    @property
    def default_columns(self) -> List[str]:
        """Columns visible by default."""
        return [col.name for col in self._definitions if col.default_visible]

    @property
    def optional_columns(self) -> List[str]:
        """Columns that can be toggled on/off."""
        return [col.name for col in self._definitions if not col.default_visible]

    @property
    def score_columns(self) -> List[str]:
        """Columns that should be colorized."""
        return [col.name for col in self._definitions if col.colorize]

    @property
    def numeric_columns(self) -> List[str]:
        """Columns with numeric type."""
        return [col.name for col in self._definitions if col.column_type == ColumnType.NUMBER]

    def get(self, name: str) -> Optional[ColumnDefinition]:
        """Get column definition by display name, or None if unknown."""
        return self._by_name.get(name)

    def get_by_csv_key(self, csv_key: str) -> Optional[ColumnDefinition]:
        """Get column definition by CSV key, or None if unknown."""
        return self._by_csv_key.get(csv_key)

    def get_by_group(self, group: ColumnGroup) -> List[ColumnDefinition]:
        """Get all columns in a group, in registry order."""
        return [col for col in self._definitions if col.group == group]

    def get_group_names(self, group: ColumnGroup) -> List[str]:
        """Get column names for a group."""
        return [col.name for col in self.get_by_group(group)]

    def get_datatypes(self, columns: List[str]) -> List[str]:
        """Get Gradio datatypes for given columns (unknown names are skipped)."""
        return [
            self._by_name[col].column_type.value
            for col in columns
            if col in self._by_name
        ]

    def get_widths(self, columns: List[str]) -> List[str]:
        """Get column widths for given columns (unknown names are skipped)."""
        return [
            self._by_name[col].width
            for col in columns
            if col in self._by_name
        ]

    def get_csv_mapping(self) -> Dict[str, str]:
        """Get mapping from CSV keys to display names.

        Only columns whose CSV key differs from the display name are
        included, so the mapping can be fed directly to DataFrame.rename.
        """
        return {
            col.csv_key: col.name
            for col in self._definitions
            if col.csv_key != col.name
        }


# Global registry instance
column_registry = ColumnRegistry()
|
src/core/config.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration Module for HuggingFace Space
|
| 3 |
+
|
| 4 |
+
Simplified configuration for the public Mizan leaderboard.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
# Load environment variables
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass
class ApiSettings:
    """Connection settings for the evaluation submission API.

    Each credential is read lazily from the environment so tests can
    override variables before instantiation.
    """
    url: str = field(default_factory=lambda: os.environ.get("API_URL", ""))
    username: str = field(default_factory=lambda: os.environ.get("API_USERNAME", ""))
    password: str = field(default_factory=lambda: os.environ.get("API_PASSWORD", ""))
    timeout: int = 30

    @property
    def is_configured(self) -> bool:
        """True only when url, username, and password are all non-empty."""
        return all((self.url, self.username, self.password))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class UISettings:
    """UI-specific settings."""
    port: int = 7860               # Gradio server port
    max_table_height: int = 600    # leaderboard table height in pixels
    # DEBUG env var toggles debug mode; any case of "true" enables it
    debug: bool = field(default_factory=lambda: os.environ.get("DEBUG", "false").lower() == "true")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@dataclass
class DataSettings:
    """Data file settings."""
    # Path to the leaderboard CSV, relative to the working directory
    csv_file: Path = field(default_factory=lambda: Path("leaderboard_data.csv"))
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class Settings:
    """Main application settings container.

    Aggregates the API, UI, and data sub-settings; each is constructed
    fresh per instance via default_factory.
    """
    api: ApiSettings = field(default_factory=ApiSettings)
    ui: UISettings = field(default_factory=UISettings)
    data: DataSettings = field(default_factory=DataSettings)


# Global settings instance shared across the application
settings = Settings()
|
src/data/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data processing modules."""
|
| 2 |
+
|
| 3 |
+
from .transformer import DataTransformer, parse_parameter_string, format_parameter_count
|
| 4 |
+
from .styler import LeaderboardStyler
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"DataTransformer",
|
| 8 |
+
"LeaderboardStyler",
|
| 9 |
+
"parse_parameter_string",
|
| 10 |
+
"format_parameter_count",
|
| 11 |
+
]
|
src/data/__pycache__/__init__.cpython-312.pyc
CHANGED
|
Binary files a/src/data/__pycache__/__init__.cpython-312.pyc and b/src/data/__pycache__/__init__.cpython-312.pyc differ
|
|
|
src/data/__pycache__/styler.cpython-312.pyc
CHANGED
|
Binary files a/src/data/__pycache__/styler.cpython-312.pyc and b/src/data/__pycache__/styler.cpython-312.pyc differ
|
|
|
src/data/__pycache__/transformer.cpython-312.pyc
CHANGED
|
Binary files a/src/data/__pycache__/transformer.cpython-312.pyc and b/src/data/__pycache__/transformer.cpython-312.pyc differ
|
|
|
src/data/styler.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Leaderboard Styling Module
|
| 3 |
+
|
| 4 |
+
Handles color gradients and visual styling for the leaderboard.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import html
|
| 9 |
+
from typing import Dict, Tuple, List
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from matplotlib.colors import LinearSegmentedColormap
|
| 12 |
+
|
| 13 |
+
from ..core.columns import column_registry
|
| 14 |
+
from .transformer import format_parameter_count
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class LeaderboardStyler:
    """
    Applies visual styling to leaderboard DataFrames.

    Uses Excel-like Red-Yellow-Green color gradients for score columns.
    """

    # Excel-style color gradient stops: Red -> Yellow -> Green,
    # expressed as (r, g, b) tuples in the 0-1 range.
    GRADIENT_COLORS = [
        (0.9, 0.1, 0.2),          # Red (low scores)
        (1.0, 1.0, 0.0),          # Yellow (medium scores)
        (0/255, 176/255, 80/255)  # Excel Green (high scores)
    ]
|
| 32 |
+
|
| 33 |
+
    def __init__(self):
        # Build the 256-step colormap once; reused for every styled cell.
        self._colormap = LinearSegmentedColormap.from_list(
            "ExcelRedYellowGreen",
            self.GRADIENT_COLORS,
            N=256
        )
|
| 39 |
+
|
| 40 |
+
@staticmethod
|
| 41 |
+
def rgb_to_hex(rgb: Tuple[float, float, float]) -> str:
|
| 42 |
+
"""Convert RGB tuple (0-1 range) to hex color."""
|
| 43 |
+
r = int(rgb[0] * 255)
|
| 44 |
+
g = int(rgb[1] * 255)
|
| 45 |
+
b = int(rgb[2] * 255)
|
| 46 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
| 47 |
+
|
| 48 |
+
def get_color_for_value(self, value: float, min_val: float, max_val: float) -> str:
|
| 49 |
+
"""Get hex color for a value within a range."""
|
| 50 |
+
if max_val == min_val:
|
| 51 |
+
normalized = 0.5
|
| 52 |
+
else:
|
| 53 |
+
normalized = (value - min_val) / (max_val - min_val)
|
| 54 |
+
|
| 55 |
+
# Clamp to [0, 0.999] to avoid edge case at exactly 1.0
|
| 56 |
+
normalized = max(0, min(0.999, normalized))
|
| 57 |
+
|
| 58 |
+
rgba = self._colormap(normalized)
|
| 59 |
+
return self.rgb_to_hex(rgba[:3])
|
| 60 |
+
|
| 61 |
+
def calculate_color_ranges(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
|
| 62 |
+
"""Calculate min/max for each score column."""
|
| 63 |
+
ranges = {}
|
| 64 |
+
|
| 65 |
+
for col_name in column_registry.score_columns:
|
| 66 |
+
if col_name not in df.columns:
|
| 67 |
+
continue
|
| 68 |
+
|
| 69 |
+
numeric_values = pd.to_numeric(df[col_name], errors='coerce')
|
| 70 |
+
if numeric_values.isna().all():
|
| 71 |
+
continue
|
| 72 |
+
|
| 73 |
+
ranges[col_name] = {
|
| 74 |
+
'min': numeric_values.min(),
|
| 75 |
+
'max': numeric_values.max()
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
return ranges
|
| 79 |
+
|
| 80 |
+
def apply_styling(self, df: pd.DataFrame) -> "pd.io.formats.style.Styler":
|
| 81 |
+
"""
|
| 82 |
+
Apply color styling to DataFrame.
|
| 83 |
+
|
| 84 |
+
Returns a pandas Styler object that Gradio can render.
|
| 85 |
+
"""
|
| 86 |
+
if df.empty:
|
| 87 |
+
return df.style
|
| 88 |
+
|
| 89 |
+
df_copy = df.copy()
|
| 90 |
+
|
| 91 |
+
# Convert "N/A" to NaN for proper formatting
|
| 92 |
+
for col in column_registry.score_columns:
|
| 93 |
+
if col in df_copy.columns:
|
| 94 |
+
df_copy[col] = df_copy[col].replace("N/A", pd.NA)
|
| 95 |
+
df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
|
| 96 |
+
|
| 97 |
+
# Calculate color ranges
|
| 98 |
+
color_ranges = self.calculate_color_ranges(df_copy)
|
| 99 |
+
|
| 100 |
+
# Create style function
|
| 101 |
+
def apply_gradient(val, col_name: str):
|
| 102 |
+
if col_name not in color_ranges:
|
| 103 |
+
return ''
|
| 104 |
+
|
| 105 |
+
if pd.isna(val):
|
| 106 |
+
return ''
|
| 107 |
+
|
| 108 |
+
try:
|
| 109 |
+
min_val = color_ranges[col_name]['min']
|
| 110 |
+
max_val = color_ranges[col_name]['max']
|
| 111 |
+
color_hex = self.get_color_for_value(float(val), min_val, max_val)
|
| 112 |
+
return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
|
| 113 |
+
except (ValueError, TypeError):
|
| 114 |
+
return ''
|
| 115 |
+
|
| 116 |
+
# Apply styling
|
| 117 |
+
styler = df_copy.style
|
| 118 |
+
|
| 119 |
+
for col in column_registry.score_columns:
|
| 120 |
+
if col in df_copy.columns:
|
| 121 |
+
styler = styler.map(
|
| 122 |
+
lambda val, c=col: apply_gradient(val, c),
|
| 123 |
+
subset=[col]
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# Format numeric columns
|
| 127 |
+
format_dict = {}
|
| 128 |
+
for col_name in column_registry.numeric_columns:
|
| 129 |
+
if col_name in df_copy.columns:
|
| 130 |
+
col_def = column_registry.get(col_name)
|
| 131 |
+
# Special handling for Parameters column - use human-readable format
|
| 132 |
+
if col_name == "Parameters":
|
| 133 |
+
format_dict[col_name] = format_parameter_count
|
| 134 |
+
elif col_def and col_def.decimals == 0:
|
| 135 |
+
format_dict[col_name] = '{:.0f}'
|
| 136 |
+
elif col_def and col_def.decimals == 3:
|
| 137 |
+
format_dict[col_name] = '{:.3f}'
|
| 138 |
+
else:
|
| 139 |
+
format_dict[col_name] = '{:.2f}'
|
| 140 |
+
|
| 141 |
+
# Format model column as hyperlink without mutating the underlying data
|
| 142 |
+
if "Model" in df_copy.columns:
|
| 143 |
+
def _model_link_formatter(value: object) -> str:
|
| 144 |
+
model_name = html.escape(str(value))
|
| 145 |
+
return (
|
| 146 |
+
f'<a href="https://huggingface.co/{model_name}" target="_blank" '
|
| 147 |
+
f'style="color: #2563eb; text-decoration: underline;">{model_name}</a>'
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
format_dict["Model"] = _model_link_formatter
|
| 151 |
+
|
| 152 |
+
if format_dict:
|
| 153 |
+
# Don't replace NA values - let them display as they are in the CSV
|
| 154 |
+
styler = styler.format(format_dict, na_rep='', escape=None)
|
| 155 |
+
|
| 156 |
+
return styler
|
| 157 |
+
|
| 158 |
+
def get_datatypes(self, columns: List[str]) -> List[str]:
|
| 159 |
+
"""Get Gradio datatypes for columns."""
|
| 160 |
+
return column_registry.get_datatypes(columns)
|
| 161 |
+
|
| 162 |
+
def get_column_widths(self, columns: List[str]) -> List[str]:
|
| 163 |
+
"""Get column widths for columns."""
|
| 164 |
+
return column_registry.get_widths(columns)
|
src/data/transformer.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Transformation Module
|
| 3 |
+
|
| 4 |
+
Handles DataFrame transformations and CSV loading.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import html
|
| 9 |
+
import re
|
| 10 |
+
from typing import List, Optional, Union
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
from ..core.columns import column_registry, ColumnType
|
| 16 |
+
from ..core.config import settings
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def parse_parameter_string(value: Union[str, float, int]) -> Optional[float]:
    """
    Parse parameter strings like '307M', '1B', '1.7B', '4B' to numeric values.

    Args:
        value: Parameter string (e.g., '307M', '1B', '1.7B') or numeric value.

    Returns:
        Raw parameter count as a float (e.g. '307M' -> 307_000_000.0), or
        None if the value is missing or cannot be parsed. Note: the result
        is the raw count, not "in millions" -- this is what
        format_parameter_count and the display layer expect.
    """
    if pd.isna(value):
        return None

    # If already numeric, return as-is
    if isinstance(value, (int, float)):
        return float(value)

    value_str = str(value).strip().upper()

    # Handle special placeholder values
    if value_str in ('', 'N/A', 'NA', 'NAN', 'NONE', '∞'):
        return None

    # Pattern to match numbers with optional suffix (K, M, B, T)
    pattern = r'^([\d.]+)\s*([KMBT])?$'
    match = re.match(pattern, value_str)

    if not match:
        return None

    # Convert to raw count based on suffix (no suffix -> raw number)
    multipliers = {
        None: 1,
        'K': 1_000,
        'M': 1_000_000,
        'B': 1_000_000_000,
        'T': 1_000_000_000_000
    }

    try:
        # float() can still fail on inputs like '1.2.3' that the
        # permissive [\d.]+ pattern lets through.
        number = float(match.group(1))
    except (ValueError, TypeError):
        return None

    return number * multipliers.get(match.group(2), 1)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def format_parameter_count(value: Union[float, int, None]) -> str:
    """
    Format a numeric parameter count to human-readable string.

    Args:
        value: Raw numeric parameter count (e.g. 307_000_000).

    Returns:
        Formatted string like '307M', '1.7B', '4B'; '' for missing values;
        str(value) for non-numeric input.
    """
    if pd.isna(value) or value is None:
        return ''

    try:
        value = float(value)
    except (ValueError, TypeError):
        return str(value)

    def _scaled(scaled: float, suffix: str) -> str:
        # One decimal place for fractional values, none for whole values.
        if scaled != int(scaled):
            return f"{scaled:.1f}{suffix}"
        return f"{int(scaled)}{suffix}"

    if value >= 1_000_000_000_000:
        return _scaled(value / 1_000_000_000_000, "T")
    if value >= 1_000_000_000:
        return _scaled(value / 1_000_000_000, "B")
    if value >= 1_000_000:
        scaled = value / 1_000_000
        if scaled >= 10:
            return f"{scaled:.0f}M"
        # BUG FIX: the previous expression appended the 'M' suffix twice for
        # fractional values below 10 (1_500_000 -> '1.5MM'); now '1.5M'.
        return _scaled(scaled, "M")
    if value >= 1_000:
        scaled = value / 1_000
        return f"{scaled:.0f}K" if scaled >= 10 else f"{scaled:.1f}K"
    return str(int(value))
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class DataTransformer:
    """
    Transforms data between different formats.

    Handles CSV -> DataFrame conversions and display preparation:
    column-name normalisation, numeric coercion, ranking by MTEB Score,
    and rendering Model cells as HuggingFace links.
    """

    @staticmethod
    def create_empty_dataframe() -> pd.DataFrame:
        """Create an empty DataFrame with all registered column definitions."""
        return pd.DataFrame(columns=column_registry.all_columns)

    @staticmethod
    def load_from_csv(file_path: Optional[Path] = None) -> pd.DataFrame:
        """
        Load leaderboard data from CSV file.

        Args:
            file_path: Path to CSV file (uses the configured default if None).

        Returns:
            DataFrame with normalised, ranked leaderboard data; an empty
            frame if the file is missing or unreadable.
        """
        path = file_path or settings.data.csv_file

        if not path.exists():
            logger.warning(f"CSV file not found: {path}")
            return DataTransformer.create_empty_dataframe()

        try:
            df = pd.read_csv(path)
            logger.info(f"Loaded {len(df)} records from {path}")

            # Convert to display format
            df = DataTransformer._normalize_columns(df)
            df = DataTransformer._convert_parameters_to_numeric(df)
            df = DataTransformer._sort_by_rank(df)

            return df

        except Exception as e:
            # Broad catch is deliberate at this boundary: a malformed CSV
            # must not take the app down; fall back to an empty leaderboard.
            logger.error(f"Error loading CSV: {e}")
            return DataTransformer.create_empty_dataframe()

    @staticmethod
    def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Normalize column names from CSV variations to standard names."""
        # Maps historical/alternate CSV headers to the canonical names used
        # by the column registry.
        column_mappings = {
            "Mean (TaskType)": "MTEB Score",
            "Score(Legal)": "Legal Score",
            "Embedding Dimensions": "Embed Dim",
            "Embedding Dim": "Embed Dim",
            "Max Tokens": "Max Sequence Length",
            "Max Seq Length": "Max Sequence Length",
            "Number of Parameters": "Parameters",
            "PairClassification": "Pair Classification",
            "Vocabulary Size": "Vocab Size",
            "Vocabulary": "Vocab Size",
        }

        df = df.copy()

        # Rename only when the canonical name is not already present,
        # so an existing canonical column is never clobbered.
        for old_name, new_name in column_mappings.items():
            if old_name in df.columns and new_name not in df.columns:
                df = df.rename(columns={old_name: new_name})

        return df

    @staticmethod
    def _sort_by_rank(df: pd.DataFrame) -> pd.DataFrame:
        """Sort DataFrame by MTEB Score descending and recalculate ranks."""
        if "MTEB Score" in df.columns:
            # Sort by MTEB Score descending (higher is better)
            df = df.sort_values("MTEB Score", ascending=False, na_position='last').reset_index(drop=True)
            # Recalculate ranks as 1, 2, 3, 4... (no ties)
            df["Rank"] = range(1, len(df) + 1)
        elif "Rank" in df.columns:
            # Fallback to existing rank if MTEB Score not available
            df = df.sort_values("Rank", ascending=True).reset_index(drop=True)
        return df

    @staticmethod
    def _convert_parameters_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert Parameters column from string format to numeric for proper sorting.

        Converts values like '307M', '1B', '1.7B' to numeric values.
        """
        if "Parameters" not in df.columns:
            return df

        df = df.copy()
        df["Parameters"] = df["Parameters"].apply(parse_parameter_string)
        return df

    @staticmethod
    def add_model_links(df: pd.DataFrame) -> pd.DataFrame:
        """Add clickable HuggingFace links to Model column."""
        if "Model" not in df.columns:
            return df

        df = df.copy()
        # html.escape guards against markup injection via model names.
        df["Model"] = df["Model"].apply(
            lambda x: f'<a href="https://huggingface.co/{html.escape(str(x))}" target="_blank" '
                      f'style="color: #2563eb; text-decoration: underline;">{html.escape(str(x))}</a>'
        )
        return df

    @staticmethod
    def ensure_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Convert numeric columns to proper types and round per definition."""
        df = df.copy()

        for col_name in column_registry.numeric_columns:
            if col_name not in df.columns:
                continue

            col_def = column_registry.get(col_name)
            if col_def is None:
                continue

            # Handle "N/A" and empty values
            df[col_name] = df[col_name].replace("N/A", pd.NA)
            df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

            # Round to specified decimals; zero-decimal columns stay float
            # to preserve NaN and are formatted at display time instead.
            if col_def.decimals != 0:
                df[col_name] = df[col_name].round(col_def.decimals)

        return df

    @staticmethod
    def filter_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Filter DataFrame to only include specified columns (preserves order)."""
        available = [col for col in columns if col in df.columns]
        return df[available]

    @classmethod
    def prepare_for_display(
        cls,
        df: pd.DataFrame,
        columns: Optional[List[str]] = None,
        add_links: bool = True
    ) -> pd.DataFrame:
        """
        Prepare DataFrame for Gradio display.

        Args:
            df: Source DataFrame.
            columns: Columns to include (preserves order passed in);
                None keeps all columns.
            add_links: Whether to add HuggingFace links to the Model column.

        Returns:
            Prepared DataFrame (always a new object; input is not mutated).
        """
        if df is None or df.empty:
            return cls.create_empty_dataframe()

        # Work with a copy
        result = df.copy()

        # Filter columns if specified (preserves the order passed in)
        if columns:
            result = cls.filter_columns(result, columns)

        # Convert numeric columns
        result = cls.ensure_numeric_columns(result)

        # Add model links
        if add_links and "Model" in result.columns:
            result = cls.add_model_links(result)

        return result
|
ui_components.py
DELETED
|
@@ -1,259 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
|
| 4 |
-
Simplified version with only leaderboard and dataset components
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import gradio as gr
|
| 8 |
-
import pandas as pd
|
| 9 |
-
from data_processor import (create_styled_leaderboard_dataframe,
|
| 10 |
-
create_empty_leaderboard_dataframe)
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def create_leaderboard_tab(current_data: pd.DataFrame):
|
| 14 |
-
"""Create the main leaderboard tab with color styling"""
|
| 15 |
-
|
| 16 |
-
# Handle empty or invalid data
|
| 17 |
-
if current_data.empty or "Model" not in current_data.columns:
|
| 18 |
-
print("⚠️ Warning: Empty or invalid data, using empty leaderboard structure")
|
| 19 |
-
current_data = create_empty_leaderboard_dataframe()
|
| 20 |
-
|
| 21 |
-
# Apply color styling to score columns using pandas Styler
|
| 22 |
-
styled_data = create_styled_leaderboard_dataframe(current_data)
|
| 23 |
-
|
| 24 |
-
leaderboard = gr.Dataframe(
|
| 25 |
-
value=styled_data,
|
| 26 |
-
interactive=False,
|
| 27 |
-
wrap=True,
|
| 28 |
-
max_height=600,
|
| 29 |
-
show_search=True,
|
| 30 |
-
datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number", "str", "number", "str", "number"], # Model column as HTML for clickable links
|
| 31 |
-
column_widths=["70px", "250px", "130px", "130px", "160px", "130px", "170px", "130px", "100px", "130px", "120px", "120px", "120px", "120px"]
|
| 32 |
-
)
|
| 33 |
-
|
| 34 |
-
# Information about the leaderboard
|
| 35 |
-
gr.Markdown("""
|
| 36 |
-
### 🔍 How to Use the Leaderboard:
|
| 37 |
-
- **Search**: Use the search box to find specific models
|
| 38 |
-
- **Color Coding**: Scores are color-coded from red (low) to green (high)
|
| 39 |
-
- **Sorting**: Click on column headers to sort by different metrics
|
| 40 |
-
- **Rankings**: Models are ranked by Mean (Task) score
|
| 41 |
-
|
| 42 |
-
### 📊 Performance Insights:
|
| 43 |
-
- **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
|
| 44 |
-
- **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
|
| 45 |
-
- **Model Size vs Performance**: Larger models generally perform better but with exceptions
|
| 46 |
-
""")
|
| 47 |
-
|
| 48 |
-
return leaderboard
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def create_dataset_tab():
|
| 52 |
-
"""Create the dataset information tab"""
|
| 53 |
-
|
| 54 |
-
gr.Markdown("### 📊 MTEB Turkish Dataset Overview")
|
| 55 |
-
|
| 56 |
-
# Task name to dataset path mapping
|
| 57 |
-
task_to_dataset = {
|
| 58 |
-
'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
|
| 59 |
-
'XQuADRetrieval': 'google/xquad',
|
| 60 |
-
'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
|
| 61 |
-
'MKQARetrieval': 'apple/mkqa',
|
| 62 |
-
'MassiveIntentClassification': 'mteb/amazon_massive_intent',
|
| 63 |
-
'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
|
| 64 |
-
'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
|
| 65 |
-
'SIB200Classification': 'mteb/sib200',
|
| 66 |
-
'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
|
| 67 |
-
'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
|
| 68 |
-
'SIB200ClusteringS2S': 'mteb/sib200',
|
| 69 |
-
'XNLI': 'mteb/xnli',
|
| 70 |
-
'XNLIV2': 'mteb/xnli2.0-multi-pair',
|
| 71 |
-
'STS22.v2': 'mteb/sts22-crosslingual-sts'
|
| 72 |
-
}
|
| 73 |
-
|
| 74 |
-
# Create clickable task names
|
| 75 |
-
clickable_task_names = []
|
| 76 |
-
for task_name in [
|
| 77 |
-
'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
|
| 78 |
-
'MassiveIntentClassification', 'MassiveScenarioClassification',
|
| 79 |
-
'MultilingualSentimentClassification', 'SIB200Classification',
|
| 80 |
-
'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
|
| 81 |
-
'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
|
| 82 |
-
]:
|
| 83 |
-
dataset_path = task_to_dataset[task_name]
|
| 84 |
-
hf_link = f"https://huggingface.co/datasets/{dataset_path}"
|
| 85 |
-
clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
|
| 86 |
-
clickable_task_names.append(clickable_name)
|
| 87 |
-
|
| 88 |
-
# Create dataset information table
|
| 89 |
-
dataset_data = pd.DataFrame({
|
| 90 |
-
'Task Name': clickable_task_names,
|
| 91 |
-
'Task Type': [
|
| 92 |
-
'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
|
| 93 |
-
'Classification', 'Classification',
|
| 94 |
-
'Classification', 'Classification',
|
| 95 |
-
'Classification', 'Classification',
|
| 96 |
-
'Clustering', 'PairClassification', 'PairClassification', 'STS'
|
| 97 |
-
],
|
| 98 |
-
'Description': [
|
| 99 |
-
'Turkish FAQ retrieval task',
|
| 100 |
-
'Turkish question answering retrieval',
|
| 101 |
-
'Historical Turkish document retrieval',
|
| 102 |
-
'Multilingual knowledge QA retrieval',
|
| 103 |
-
'Intent classification for Turkish',
|
| 104 |
-
'Scenario classification for Turkish',
|
| 105 |
-
'Multilingual sentiment classification',
|
| 106 |
-
'SIB200 language identification',
|
| 107 |
-
'Turkish movie review sentiment',
|
| 108 |
-
'Turkish product review sentiment',
|
| 109 |
-
'SIB200 clustering task',
|
| 110 |
-
'Turkish natural language inference',
|
| 111 |
-
'Enhanced Turkish NLI task',
|
| 112 |
-
'Turkish semantic textual similarity'
|
| 113 |
-
],
|
| 114 |
-
'Domain': [
|
| 115 |
-
'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
|
| 116 |
-
'Intent', 'Scenario',
|
| 117 |
-
'Sentiment', 'Language ID',
|
| 118 |
-
'Movies', 'Products',
|
| 119 |
-
'Language ID', 'NLI', 'NLI', 'STS'
|
| 120 |
-
],
|
| 121 |
-
'Samples': [
|
| 122 |
-
'~135K', '~10K', '~1.4K', '~10K',
|
| 123 |
-
'~11K', '~11K',
|
| 124 |
-
'~4.5K', '~700',
|
| 125 |
-
'~8K', '~4.8K',
|
| 126 |
-
'~1K', '~1.4K', '~1.4K', '~400'
|
| 127 |
-
]
|
| 128 |
-
})
|
| 129 |
-
|
| 130 |
-
dataset_table = gr.Dataframe(
|
| 131 |
-
value=dataset_data,
|
| 132 |
-
label="MTEB Turkish Task Details",
|
| 133 |
-
interactive=False,
|
| 134 |
-
wrap=True,
|
| 135 |
-
datatype=["html", "str", "str", "str", "str"] # First column (Task Name) as HTML for clickable links
|
| 136 |
-
)
|
| 137 |
-
|
| 138 |
-
# Task type distribution
|
| 139 |
-
gr.Markdown("""
|
| 140 |
-
### 📈 Task Distribution:
|
| 141 |
-
|
| 142 |
-
**By Task Type:**
|
| 143 |
-
- **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
|
| 144 |
-
- **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
|
| 145 |
-
- **Pair Classification**: 2 tasks (natural language inference)
|
| 146 |
-
- **Clustering**: 1 task (language clustering)
|
| 147 |
-
- **STS**: 1 task (semantic textual similarity)
|
| 148 |
-
|
| 149 |
-
**By Domain:**
|
| 150 |
-
- **Sentiment Analysis**: Movie and product reviews
|
| 151 |
-
- **Question Answering**: FAQ, reading comprehension, and knowledge QA
|
| 152 |
-
- **Intent/Scenario**: Conversational AI applications
|
| 153 |
-
- **Language Tasks**: NLI, STS, clustering
|
| 154 |
-
- **Multilingual**: Cross-lingual evaluation capabilities
|
| 155 |
-
""")
|
| 156 |
-
|
| 157 |
-
# Statistics summary
|
| 158 |
-
stats_data = pd.DataFrame({
|
| 159 |
-
'Metric': [
|
| 160 |
-
'Total Tasks',
|
| 161 |
-
'Total Samples',
|
| 162 |
-
'Task Types',
|
| 163 |
-
'Languages',
|
| 164 |
-
'Avg. Tokens per Sample'
|
| 165 |
-
],
|
| 166 |
-
'Value': [
|
| 167 |
-
'14 tasks',
|
| 168 |
-
'~190K samples',
|
| 169 |
-
'5 types',
|
| 170 |
-
'Turkish + Multilingual',
|
| 171 |
-
'~150 tokens'
|
| 172 |
-
],
|
| 173 |
-
'Notes': [
|
| 174 |
-
'Comprehensive evaluation across domains',
|
| 175 |
-
'Large-scale evaluation dataset',
|
| 176 |
-
'Classification, Retrieval, STS, NLI, Clustering',
|
| 177 |
-
'Focus on Turkish with multilingual support',
|
| 178 |
-
'Varies by task type and domain'
|
| 179 |
-
]
|
| 180 |
-
})
|
| 181 |
-
|
| 182 |
-
gr.Dataframe(
|
| 183 |
-
value=stats_data,
|
| 184 |
-
label="Dataset Statistics Summary",
|
| 185 |
-
interactive=False
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
gr.Markdown("""
|
| 189 |
-
### 🎯 Evaluation Methodology:
|
| 190 |
-
|
| 191 |
-
**Scoring:**
|
| 192 |
-
- Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
|
| 193 |
-
- **Mean (Task)**: Direct average of all individual task scores
|
| 194 |
-
- **Mean (TaskType)**: Average of task category means
|
| 195 |
-
- **Individual Categories**: Performance in each task type
|
| 196 |
-
|
| 197 |
-
**Model Ranking:**
|
| 198 |
-
- Primary ranking by **Mean (Task)** score
|
| 199 |
-
- Correlation metrics provide additional insights
|
| 200 |
-
- Task-specific performance shows model strengths
|
| 201 |
-
|
| 202 |
-
**Quality Assurance:**
|
| 203 |
-
- Standardized evaluation protocols
|
| 204 |
-
- Consistent preprocessing across tasks
|
| 205 |
-
- Multiple metrics per task for robustness
|
| 206 |
-
""")
|
| 207 |
-
|
| 208 |
-
return dataset_table
|
| 209 |
-
|
| 210 |
-
def create_submit_evaluation_tab():
|
| 211 |
-
"""Create the submit evaluation tab with form"""
|
| 212 |
-
|
| 213 |
-
gr.Markdown("### 🚀 Submit Model for Evaluation")
|
| 214 |
-
gr.Markdown("""
|
| 215 |
-
Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
|
| 216 |
-
**Authentication with Hugging Face is required to submit evaluations.**
|
| 217 |
-
""")
|
| 218 |
-
|
| 219 |
-
# OAuth login button
|
| 220 |
-
login_button = gr.LoginButton(value="Sign in with Hugging Face")
|
| 221 |
-
|
| 222 |
-
model_input = gr.Textbox(
|
| 223 |
-
label="🤖 Model Name",
|
| 224 |
-
placeholder="sentence-transformers/your-model",
|
| 225 |
-
info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
|
| 226 |
-
)
|
| 227 |
-
|
| 228 |
-
email_input = gr.Textbox(
|
| 229 |
-
label="📧 Email Address",
|
| 230 |
-
placeholder="your.email@example.com",
|
| 231 |
-
info="Email for notifications about evaluation status and results"
|
| 232 |
-
)
|
| 233 |
-
|
| 234 |
-
submit_btn = gr.Button(
|
| 235 |
-
"🚀 Submit",
|
| 236 |
-
variant="primary",
|
| 237 |
-
size="lg"
|
| 238 |
-
)
|
| 239 |
-
|
| 240 |
-
# Result output for authentication messages
|
| 241 |
-
result_output = gr.HTML(label="Status")
|
| 242 |
-
|
| 243 |
-
# Information about the evaluation process
|
| 244 |
-
gr.Markdown("""
|
| 245 |
-
### 📋 Evaluation Process:
|
| 246 |
-
1. **Sign In**: First, sign in with your Hugging Face account using the button above
|
| 247 |
-
2. **Submit Request**: Fill out the form with your model details and email
|
| 248 |
-
3. **Admin Review**: Your request will be reviewed by administrators
|
| 249 |
-
4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
|
| 250 |
-
5. **Results**: You'll receive email notifications and results will appear on the leaderboard
|
| 251 |
-
|
| 252 |
-
### ⚠️ Important Notes:
|
| 253 |
-
- **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
|
| 254 |
-
- You'll receive email updates about your request status
|
| 255 |
-
- Make sure your model is publicly available on HuggingFace
|
| 256 |
-
- Valid email address is required for receiving results
|
| 257 |
-
""")
|
| 258 |
-
|
| 259 |
-
return (model_input, email_input, submit_btn, login_button, result_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|