nmmursit commited on
Commit
bc37111
·
1 Parent(s): 28277a5

Refactor codebase structure

Browse files
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .venv/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Environment variables
37
+ .env
38
+ .env.local
39
+ .env.*.local
40
+
41
+ # Logs
42
+ *.log
43
+ logs/
44
+
45
+ # Docker
46
+ .docker/
47
+
48
+ # OS
49
+ .DS_Store
50
+ Thumbs.db
51
+
52
+ # Gradio
53
+ flagged/
api_client.py DELETED
@@ -1,103 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- API Client module for MTEB Turkish Leaderboard
4
- """
5
-
6
- from typing import Optional, Dict, Any
7
- import traceback
8
- import requests
9
-
10
- from config import API_BASE_URL, API_TIMEOUT, API_URL, USERNAME, PASSWORD
11
-
12
-
13
- def check_api_health() -> bool:
14
- """Check if API is available"""
15
- try:
16
- response = requests.get(f"{API_BASE_URL}/api/v1/health", timeout=5)
17
- return response.status_code == 200
18
- except:
19
- return False
20
-
21
-
22
- def send_evaluation_request_to_api(model_name: str, batch_size: int = 32, email: str = "user@example.com") -> Optional[Dict[str, Any]]:
23
- """
24
- Send an evaluation request to the API for the specified model.
25
- Returns the API response as a dictionary if successful, otherwise None.
26
- """
27
- try:
28
- payload = {
29
- "model_name": model_name,
30
- "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
31
- "batch_size": batch_size,
32
- "email": email,
33
- "model_type": "sentence-transformer"
34
- }
35
-
36
- # Authentication credentials
37
- auth = (USERNAME, PASSWORD)
38
-
39
- response = requests.post(
40
- f"{API_URL}/api/mteb/request",
41
- json=payload,
42
- timeout=API_TIMEOUT,
43
- auth=auth
44
- )
45
-
46
- print(f"Response Status: {response.status_code}")
47
-
48
- if response.status_code == 200:
49
- result = response.json()
50
- return result
51
- else:
52
- print(f"API Error: {response.status_code}")
53
- try:
54
- error_detail = response.json()
55
- print(f" Error Detail: {error_detail}")
56
- except:
57
- print(f" Raw Response: {response.text}")
58
- return None
59
-
60
- except Exception as e:
61
- print(f"API Call Error: {e}")
62
- traceback.print_exc()
63
- return None
64
-
65
-
66
- def get_evaluation_status(request_id: str) -> Optional[Dict[str, Any]]:
67
- """Get evaluation status from"""
68
- try:
69
- auth = (USERNAME, PASSWORD)
70
-
71
- response = requests.get(
72
- f"{API_URL}/api/mteb/status/{request_id}",
73
- timeout=API_TIMEOUT,
74
- auth=auth
75
- )
76
-
77
- if response.status_code == 200:
78
- return response.json()
79
- else:
80
- print(f"Status check error: {response.status_code}")
81
- return None
82
-
83
- except Exception as e:
84
- print(f"Status check error: {e}")
85
- return None
86
-
87
-
88
- def cancel_evaluation_request(request_id: str) -> bool:
89
- """Cancel an evaluation request"""
90
- try:
91
- auth = (USERNAME, PASSWORD)
92
-
93
- response = requests.delete(
94
- f"{API_URL}/api/mteb/request/{request_id}",
95
- timeout=API_TIMEOUT,
96
- auth=auth
97
- )
98
-
99
- return response.status_code == 200
100
-
101
- except Exception as e:
102
- print(f"Cancel request error: {e}")
103
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,136 +1,104 @@
1
  #!/usr/bin/env python3
2
  """
3
- Mizan Leaderboard - Enhanced Version with Submit Functionality
4
- Includes leaderboard display, model submission, and evaluation tracking
 
5
  """
6
 
 
 
 
7
  import gradio as gr
8
 
9
- from ui_components import (
10
- create_leaderboard_tab, create_dataset_tab, create_submit_evaluation_tab
11
- )
12
- from data_processor import load_leaderboard_from_csv
13
- from evaluation_service import submit_evaluation
14
 
15
- # Global data storage
16
- current_data = None
 
 
 
 
 
 
17
 
 
18
 
19
- def create_leaderboard_demo():
20
- """Create enhanced leaderboard demo interface with submit functionality"""
21
-
22
- global current_data
23
-
24
- # Setup directories
25
 
 
 
 
26
 
27
- # Load data from CSV file
28
- current_data = load_leaderboard_from_csv()
29
 
30
- with gr.Blocks(
31
- title="Mizan",
32
- theme=gr.themes.Soft()
33
- ) as demo:
34
 
35
- gr.Markdown("""
36
- # Mizan Leaderboard
 
 
37
 
38
- Performance comparison for Turkish embedding models
39
- """)
 
 
 
40
 
41
- with gr.Tabs():
42
- # Tab 1: Leaderboard
43
- with gr.Tab("📊 Leaderboard"):
44
- leaderboard_table = create_leaderboard_tab(current_data)
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Tab 2: Submit
47
- with gr.Tab("🚀 Submit"):
48
- (model_input, email_input, submit_btn, login_button, result_output) = create_submit_evaluation_tab()
 
 
49
 
50
- # Submit evaluation functionality with authentication
51
- def handle_submit_evaluation(model_name, email, profile, progress=gr.Progress()):
52
- import logging
53
-
54
- # Authentication check
55
- if profile is None:
56
- logging.warning("Unauthorized submission attempt with no profile")
57
- return "<p style='color: red; font-weight: bold;'>Authentication required. Please log in with your Hugging Face account.</p>"
58
-
59
- # IMPORTANT: In local development, Gradio returns "Sign in with Hugging Face" string
60
- # This is NOT a real authentication, just a placeholder for local testing
61
- if isinstance(profile, str) and profile == "Sign in with Hugging Face":
62
- # Block submission in local dev with mock auth
63
- return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"
64
-
65
- # Email is required
66
- if not email or email.strip() == "":
67
- return "<p style='color: red; font-weight: bold;'>Email address is required to receive benchmark results.</p>"
68
-
69
- global current_data
70
- batch_size = 32 # Always use default batch size
71
- result_msg, updated_data = submit_evaluation(model_name, email, batch_size, current_data, progress)
72
- # Note: For now, we don't update the leaderboard since evaluation is async
73
- # The leaderboard will be updated manually when results are available
74
- logging.info(f"Submission processed for model: {model_name} by user: {profile}")
75
- return result_msg
76
 
77
- submit_btn.click(
78
- fn=handle_submit_evaluation,
79
- inputs=[model_input, email_input, login_button],
80
- outputs=[result_output]
81
- )
82
-
83
- # Tab 3: Dataset Information
84
- with gr.Tab("📊 Dataset Information"):
85
- dataset_table = create_dataset_tab()
86
- gr.Markdown("""
87
- ---
88
- ### 📊 Metrics Explanation:
89
- - **Mean (Task)**: Average performance across all individual tasks
90
- - **Mean (TaskType)**: Average performance by task categories
91
- - **Classification**: Performance on Turkish classification tasks
92
- - **Clustering**: Performance on Turkish clustering tasks
93
- - **Pair Classification**: Performance on pair classification tasks (like NLI)
94
- - **Retrieval**: Performance on information retrieval tasks
95
- - **STS**: Performance on Semantic Textual Similarity tasks
96
- - **Correlation**: Weighted average of correlation metrics for NLI and STSB datasets
97
- - **Parameters**: Number of model parameters
98
- - **Embed Dim**: Embedding dimension size
99
- - **Max Seq Length**: Maximum sequence length the model can process (0 = infinite/unlimited)
100
- - **Vocab Size**: Size of the model's vocabulary
101
-
102
- ### 📖 About Mizan:
103
- This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
104
- on Turkish language tasks across multiple domains including:
105
- - Text classification and sentiment analysis
106
- - Information retrieval and search
107
- - Semantic textual similarity
108
- - Text clustering and pair classification
109
-
110
- ### 🚀 Submit Your Model:
111
- Use the **Submit** tab to submit your Turkish embedding model for evaluation.
112
- Your request will be reviewed by administrators and you'll receive email notifications about the progress.
113
-
114
- ### Contact:
115
- For any questions or feedback, please contact info@newmind.ai
116
-
117
- ### Links:
118
- - **GitHub**: [mteb/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
119
- """)
120
 
121
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
  def main():
125
- """Main entry point"""
126
- print("🚀 Starting Mizan Leaderboard...")
127
-
128
- demo = create_leaderboard_demo()
129
- demo.launch(
130
- server_name="0.0.0.0",
131
- server_port=7860,
132
- share=False
133
- )
134
 
135
 
136
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """
3
+ Mizan Turkish Leaderboard - HuggingFace Space Version
4
+
5
+ Clean entry point that wires together all components.
6
  """
7
 
8
+ import logging
9
+ import sys
10
+
11
  import gradio as gr
12
 
13
+ from src.core.config import settings
14
+ from src.data import DataTransformer
15
+ from src.components import LeaderboardTab, DatasetTab, SubmitTab
 
 
16
 
17
+ # Configure logging
18
+ logging.basicConfig(
19
+ level=logging.DEBUG if settings.ui.debug else logging.INFO,
20
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
21
+ handlers=[
22
+ logging.StreamHandler(sys.stdout),
23
+ ]
24
+ )
25
 
26
+ logger = logging.getLogger(__name__)
27
 
 
 
 
 
 
 
28
 
29
+ class MizanApp:
30
+ """
31
+ Main application class.
32
 
33
+ Orchestrates all components and creates the Gradio interface.
34
+ """
35
 
36
+ def __init__(self):
37
+ # Load data
38
+ self.transformer = DataTransformer()
39
+ self.data = self.transformer.load_from_csv()
40
 
41
+ # UI components (will be initialized during build)
42
+ self._leaderboard_tab: LeaderboardTab = None
43
+ self._dataset_tab: DatasetTab = None
44
+ self._submit_tab: SubmitTab = None
45
 
46
+ logger.info(f"Application initialized with {len(self.data)} models")
47
+
48
+ def build_interface(self) -> gr.Blocks:
49
+ """
50
+ Build the complete Gradio interface.
51
 
52
+ Returns:
53
+ Gradio Blocks application.
54
+ """
55
+ with gr.Blocks(
56
+ title="🇹🇷 Mizan Turkish Leaderboard",
57
+ theme=gr.themes.Soft()
58
+ ) as demo:
59
+
60
+ # Header
61
+ gr.Markdown("""
62
+ # 🇹🇷 Mizan Turkish Evaluation Leaderboard
63
+
64
+ Performance comparison for Turkish embedding models
65
+ """)
66
 
67
+ with gr.Tabs():
68
+ # Tab 1: Leaderboard
69
+ with gr.Tab("Leaderboard"):
70
+ self._leaderboard_tab = LeaderboardTab(data=self.data)
71
+ self._leaderboard_tab.build()
72
 
73
+ # Tab 2: Submit
74
+ with gr.Tab("Submit"):
75
+ self._submit_tab = SubmitTab()
76
+ self._submit_tab.build()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ # Tab 3: Dataset Information
79
+ with gr.Tab("Dataset Information"):
80
+ self._dataset_tab = DatasetTab()
81
+ self._dataset_tab.build()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ return demo
84
+
85
+ def run(self):
86
+ """Run the application."""
87
+ logger.info("Starting Mizan Turkish Leaderboard...")
88
+
89
+ # Build and launch
90
+ demo = self.build_interface()
91
+ demo.launch(
92
+ server_name="0.0.0.0",
93
+ server_port=settings.ui.port,
94
+ share=False
95
+ )
96
 
97
 
98
  def main():
99
+ """Main entry point."""
100
+ app = MizanApp()
101
+ app.run()
 
 
 
 
 
 
102
 
103
 
104
  if __name__ == "__main__":
config.py DELETED
@@ -1,28 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Configuration module for MTEB Turkish Leaderboard
4
- Centralizes environment variables and configuration settings
5
- """
6
-
7
- import os
8
- from dotenv import load_dotenv
9
-
10
- # Load environment variables from .env file
11
- load_dotenv()
12
-
13
- # API Configuration
14
- API_URL = os.environ.get("API_URL")
15
- USERNAME = os.environ.get("API_USERNAME")
16
- PASSWORD = os.environ.get("API_PASSWORD")
17
-
18
- # API Configuration (public settings)
19
- API_BASE_URL = "http://localhost:8000"
20
- API_TIMEOUT = 30
21
-
22
- # Polling and refresh intervals (public settings)
23
- POLL_INTERVAL = 5 # seconds
24
- LEADERBOARD_REFRESH_INTERVAL = 30 # seconds
25
-
26
- # CSV file path for leaderboard data
27
- CSV_FILE_PATH = "leaderboard_data.csv"
28
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_processor.py DELETED
@@ -1,208 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Data Processing module for MTEB Turkish Leaderboard - HF Spaces Version
4
- Simplified version for loading and processing CSV data
5
- """
6
-
7
- import os
8
- import pandas as pd
9
- from pandas.io.formats.style import Styler
10
- from matplotlib.colors import LinearSegmentedColormap
11
- import html
12
-
13
- # CSV file path
14
- CSV_FILE_PATH = "leaderboard_data.csv"
15
-
16
-
17
- def load_leaderboard_from_csv() -> pd.DataFrame:
18
- """Load leaderboard data from CSV file"""
19
- try:
20
- if not os.path.exists(CSV_FILE_PATH):
21
- print(f"❌ CSV file not found: {CSV_FILE_PATH}")
22
- return create_empty_leaderboard_dataframe()
23
-
24
- df = pd.read_csv(CSV_FILE_PATH)
25
- print(f"✅ Loaded {len(df)} records from {CSV_FILE_PATH}")
26
-
27
- # Convert to leaderboard format
28
- leaderboard_df = csv_to_leaderboard_format(df)
29
-
30
- # Sort by Mean (Task) score and add rankings
31
- leaderboard_df = leaderboard_df.sort_values("Mean (Task)", ascending=False).reset_index(drop=True)
32
- leaderboard_df["Rank"] = range(1, len(leaderboard_df) + 1)
33
-
34
- return leaderboard_df
35
-
36
- except Exception as e:
37
- print(f"❌ Error loading CSV: {e}")
38
- return create_empty_leaderboard_dataframe()
39
-
40
-
41
- def create_empty_leaderboard_dataframe() -> pd.DataFrame:
42
- """Create an empty DataFrame with proper leaderboard column structure"""
43
- return pd.DataFrame(columns=[
44
- "Rank",
45
- "Model",
46
- "Mean (Task)",
47
- "Mean (TaskType)",
48
- "Classification",
49
- "Clustering",
50
- "Pair Classification",
51
- "Retrieval",
52
- "STS",
53
- "Correlation",
54
- "Parameters",
55
- "Embed Dim",
56
- "Max Sequence Length",
57
- "Vocab Size",
58
- ])
59
-
60
-
61
- def csv_to_leaderboard_format(df: pd.DataFrame) -> pd.DataFrame:
62
- """Convert CSV data to leaderboard format"""
63
- data = []
64
- for idx, row in df.iterrows():
65
- model_name = row['Model']
66
-
67
- # Prepare model name for display
68
- model_name_clean = html.escape(model_name)
69
-
70
- # Create clickable HuggingFace link for model name
71
- hf_link = f"https://huggingface.co/{model_name_clean}"
72
- clickable_model = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{model_name_clean}</a>'
73
-
74
- # Handle different column name variations
75
- embedding_dim_col = 'Embedding Dim'
76
- max_seq_col = 'Max Seq Length'
77
- pair_classification_col = 'Pair Classification'
78
-
79
- data_row = {
80
- "Rank": idx + 1, # Initial ranking, will be recalculated
81
- "Model": clickable_model,
82
- "Mean (Task)": round(float(row['Mean (Task)']), 2),
83
- "Mean (TaskType)": round(float(row['Mean (TaskType)']), 2),
84
- "Classification": round(float(row['Classification']), 2),
85
- "Clustering": round(float(row['Clustering']), 2),
86
- "Pair Classification": round(float(row[pair_classification_col]), 2),
87
- "Retrieval": round(float(row['Retrieval']), 2),
88
- "STS": round(float(row['STS']), 2),
89
- "Correlation": round(float(row['Correlation']), 3) if not pd.isna(row['Correlation']) else "N/A",
90
- "Parameters": row['Number of Parameters'],
91
- "Embed Dim": int(float(row[embedding_dim_col])) if not pd.isna(row[embedding_dim_col]) else 0,
92
- "Max Sequence Length": "N/A" if pd.isna(row[max_seq_col]) or row[max_seq_col] == "Unknown" else int(float(row[max_seq_col])),
93
- "Vocab Size": int(float(row['Vocab Size'])) if 'Vocab Size' in row and not pd.isna(row['Vocab Size']) else 0
94
- }
95
- data.append(data_row)
96
-
97
- result_df = pd.DataFrame(data)
98
- return result_df
99
-
100
-
101
- def create_excel_like_cmap():
102
- """Create Excel-like colormap for score visualization"""
103
- colors = [
104
- (0.9, 0.1, 0.2), # Red
105
- (1.0, 1.0, 0.0), # Yellow
106
- (0/255, 176/255, 80/255) # Excel-style Green
107
- ]
108
-
109
- return LinearSegmentedColormap.from_list("excel_like", colors, N=256)
110
-
111
-
112
- def rgb_to_hex(rgb_tuple):
113
- """Convert RGB tuple to hex color"""
114
- r, g, b = [int(x * 255) for x in rgb_tuple[:3]]
115
- return f"#{r:02x}{g:02x}{b:02x}"
116
-
117
-
118
- def create_colored_cell(value: float, min_val: float, max_val: float, colormap) -> str:
119
- """Create colored cell HTML for score visualization"""
120
- if pd.isna(value) or value == "N/A":
121
- return str(value)
122
-
123
- try:
124
- # Normalize value to 0-1 range
125
- if max_val > min_val:
126
- normalized = (float(value) - min_val) / (max_val - min_val)
127
- else:
128
- normalized = 0.5
129
-
130
- # Get color from colormap
131
- color_rgba = colormap(normalized)
132
- color_hex = rgb_to_hex(color_rgba)
133
-
134
- # Create colored cell HTML with data-sort attribute for proper numeric sorting
135
- return f'<div style="background-color: {color_hex}; padding: 4px 8px; border-radius: 4px; text-align: center; font-weight: bold; color: #333;" data-sort="{value}">{value}</div>'
136
-
137
- except (ValueError, TypeError):
138
- return str(value)
139
-
140
-
141
- def create_styled_leaderboard_dataframe(df: pd.DataFrame) -> Styler:
142
- """Create styled leaderboard dataframe with color coding and clickable model names using pandas Styler
143
-
144
- Returns a pandas Styler object that Gradio Dataframe can render with both colors AND correct sorting.
145
- """
146
- if df.empty:
147
- return df.style
148
-
149
- colormap = create_excel_like_cmap()
150
-
151
- # Score columns to colorize
152
- score_columns = ["Mean (Task)", "Mean (TaskType)", "Classification", "Clustering",
153
- "Pair Classification", "Retrieval", "STS", "Correlation"]
154
-
155
- # Calculate min/max for each score column for normalization
156
- color_ranges = {}
157
- for col in score_columns:
158
- if col in df.columns:
159
- numeric_values = pd.to_numeric(df[col], errors='coerce')
160
- if not numeric_values.isna().all():
161
- color_ranges[col] = {
162
- 'min': numeric_values.min(),
163
- 'max': numeric_values.max()
164
- }
165
-
166
- # Create styler with background colors for score columns
167
- def apply_color_gradient(val, col_name):
168
- """Apply background color based on value"""
169
- if col_name not in color_ranges:
170
- return ''
171
-
172
- if pd.isna(val) or val == "N/A":
173
- return ''
174
-
175
- try:
176
- min_val = color_ranges[col_name]['min']
177
- max_val = color_ranges[col_name]['max']
178
-
179
- # Normalize value to 0-1 range
180
- if max_val > min_val:
181
- normalized = (float(val) - min_val) / (max_val - min_val)
182
- else:
183
- normalized = 0.5
184
-
185
- # Get color from colormap
186
- color_rgba = colormap(normalized)
187
- color_hex = rgb_to_hex(color_rgba)
188
-
189
- return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
190
- except (ValueError, TypeError):
191
- return ''
192
-
193
- # Apply styling to score columns using map (applymap is deprecated)
194
- styler = df.style
195
- for col in score_columns:
196
- if col in df.columns:
197
- styler = styler.map(lambda val, c=col: apply_color_gradient(val, c), subset=[col])
198
-
199
- # Format score columns to 2 decimal places
200
- format_dict = {}
201
- for col in score_columns:
202
- if col in df.columns:
203
- format_dict[col] = '{:.2f}'
204
-
205
- if format_dict:
206
- styler = styler.format(format_dict, na_rep='N/A')
207
-
208
- return styler
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation_service.py DELETED
@@ -1,190 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Evaluation Service module for MTEB Turkish Leaderboard
4
- Handles evaluation submissions and status tracking
5
- """
6
-
7
- import time
8
- import re
9
- from typing import Optional, Tuple, List
10
- import traceback
11
- import pandas as pd
12
- import gradio as gr
13
-
14
- from api_client import send_evaluation_request_to_api, get_evaluation_status, cancel_evaluation_request
15
-
16
- # Global state management for active evaluations
17
- active_evaluations = {} # request_id -> {"status": str, "model_name": str, "email": str, "start_time": float}
18
-
19
-
20
- def get_active_evaluations_status() -> str:
21
- """Show status of active evaluations"""
22
- if not active_evaluations:
23
- return "🟢 No active evaluation requests"
24
-
25
- status_lines = []
26
- for request_id, info in active_evaluations.items():
27
- model_name = info["model_name"]
28
- email = info["email"]
29
- elapsed = int(time.time() - info["start_time"])
30
- status = info.get("status", "PENDING")
31
- status_lines.append(f"🔄 {model_name} ({email}) - {request_id} [{status}] ({elapsed}s)")
32
-
33
- return "\n".join(status_lines)
34
-
35
-
36
- def get_active_evaluations_with_cancel_options() -> Tuple[str, List[str]]:
37
- """Get active evaluations status and cancellation options"""
38
- status_text = get_active_evaluations_status()
39
-
40
- cancel_options = []
41
- for request_id, info in active_evaluations.items():
42
- model_name = info["model_name"]
43
- cancel_options.append(f"{request_id} - {model_name}")
44
-
45
- return status_text, cancel_options
46
-
47
-
48
- def clear_active_evaluations() -> str:
49
- """Clear all active evaluations from tracking"""
50
- global active_evaluations
51
- count = len(active_evaluations)
52
- active_evaluations.clear()
53
- return f"✅ Cleared {count} active evaluation(s) from tracking"
54
-
55
-
56
- def cancel_active_evaluation(selection: str) -> str:
57
- """Cancel a selected active evaluation"""
58
- if not selection:
59
- return "❌ No evaluation selected for cancellation"
60
-
61
- try:
62
- request_id = selection.split(" - ")[0]
63
-
64
- if request_id not in active_evaluations:
65
- return f"❌ Evaluation {request_id} not found in active evaluations"
66
-
67
- # Try to cancel via API
68
- success = cancel_evaluation_request(request_id)
69
-
70
- if success:
71
- model_name = active_evaluations[request_id]["model_name"]
72
- del active_evaluations[request_id]
73
- return f"✅ Successfully cancelled evaluation for {model_name} (ID: {request_id})"
74
- else:
75
- return f"❌ Failed to cancel evaluation {request_id}. Check API connection."
76
-
77
- except Exception as e:
78
- return f"❌ Error cancelling evaluation: {str(e)}"
79
-
80
-
81
- def _validate_evaluation_request(model_name: str, email: str = None) -> Optional[str]:
82
- """Validate evaluation request parameters"""
83
- # Model name validation
84
- if not model_name or not model_name.strip():
85
- return "❌ Model name cannot be empty!"
86
-
87
- model_name = model_name.strip()
88
-
89
- # Check model name length (format: org/model-name)
90
- if len(model_name) < 3:
91
- return "❌ Model name too short!"
92
-
93
- if len(model_name) > 256:
94
- return "❌ Model name too long (maximum 256 characters)!"
95
-
96
- # Check for valid HuggingFace model name format (must be org/model)
97
- if '/' not in model_name:
98
- return "❌ Invalid model name format! Must include organization (e.g., organization/model-name)"
99
-
100
- if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
101
- return "❌ Invalid model name format! Use format: organization/model-name"
102
-
103
- # Email validation
104
- if not email or not email.strip():
105
- return "❌ Email address cannot be empty!"
106
-
107
- email = email.strip()
108
-
109
- if len(email) > 254:
110
- return "❌ Email address too long!"
111
-
112
- email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
113
- if not re.match(email_pattern, email):
114
- return "❌ Invalid email address format!"
115
-
116
- return None
117
-
118
-
119
- def submit_evaluation(model_name: str, email: str, batch_size: int, current_data: pd.DataFrame, progress=gr.Progress()) -> Tuple[str, Optional[pd.DataFrame]]:
120
- try:
121
- # Input validation
122
- error_msg = _validate_evaluation_request(model_name, email)
123
- if error_msg:
124
- return error_msg, None
125
-
126
- # Show progress
127
- progress(0.1, desc="Sending evaluation request to API...")
128
-
129
- # Send request to API - regardless of backend response, show success to user
130
- api_response = send_evaluation_request_to_api(model_name, batch_size, email)
131
-
132
- # Always show success message to user
133
- # Backend errors (like duplicate requests) are handled by API and communicated via email
134
- progress(1.0, desc="Request submitted successfully!")
135
-
136
- # Return success message regardless of backend response
137
- success_msg = f"""
138
- ✅ Evaluation request submitted successfully!
139
-
140
- 🤖 Model: {model_name}
141
- 📧 Email: {email}
142
-
143
- 📋 Next Steps:
144
- ⏱️ Your request will be reviewed by our system
145
- 📧 You will receive email notifications about the status of your evaluation
146
- 🔄 If you've submitted this model before, you'll be notified via email
147
-
148
- Thank you for contributing to the Mizan Leaderboard!
149
- """
150
-
151
- return success_msg.strip(), current_data
152
-
153
- except Exception as e:
154
- # Log error for debugging
155
- print(f"❌ Error submitting evaluation: {str(e)}")
156
- traceback.print_exc()
157
-
158
- error_msg = f"""
159
- ❌ Failed to submit evaluation request
160
-
161
- 🤖 Model: {model_name}
162
- 📧 Email: {email}
163
-
164
- ⚠️ Error: Unable to connect to the evaluation service.
165
-
166
- Please try again later or contact support if the problem persists.
167
- """
168
- return error_msg.strip(), None
169
-
170
-
171
- def refresh_evaluation_status() -> str:
172
- """Refresh status of all active evaluations"""
173
- if not active_evaluations:
174
- return "🟢 No active evaluations to refresh"
175
-
176
- updated_count = 0
177
- for request_id, info in active_evaluations.items():
178
- try:
179
- status_data = get_evaluation_status(request_id)
180
- if status_data and "status" in status_data:
181
- old_status = info.get("status", "UNKNOWN")
182
- new_status = status_data["status"]
183
- if old_status != new_status:
184
- info["status"] = new_status
185
- updated_count += 1
186
- print(f"Status updated for {request_id}: {old_status} -> {new_status}")
187
- except Exception as e:
188
- print(f"Error refreshing status for {request_id}: {e}")
189
-
190
- return f"🔄 Refreshed status for {len(active_evaluations)} evaluation(s). {updated_count} status change(s) detected."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
leaderboard_data.csv CHANGED
@@ -1,33 +1,52 @@
1
- Model,Number of Parameters,Embedding Dim,Max Seq Length,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Correlation,Vocab Size
2
- BAAI/bge-m3,567M,1024,8192,69.39,63.51,75.68,35.26,78.88,57.89,69.83,0.61,250002
3
- intfloat/multilingual-e5-large,559M,1024,512,66.61,62.08,71.8,41.2,72.76,57.17,67.49,0.58,250002
4
- newmindai/TurkEmbed4STS,305M,768,8192,65.66,62.03,69.69,44.29,81.77,47.6,66.79,0.68,250048
5
- ytu-ce-cosmos/turkish-e5-large,559M,1024,512,64.93,59.73,72.42,38.51,70.86,47.6,69.24,0.56,250002
6
- intfloat/multilingual-e5-large-instruct,559M,1024,512,64.33,58.57,72.25,33.16,72.92,44.95,69.56,0.57,250002
7
- nomic-ai/nomic-embed-text-v2-moe,475M,768,512,64.28,60.15,70.07,41.28,63.87,56.4,69.16,0.53,250048
8
- Alibaba-NLP/gte-multilingual-base,305M,768,32768,63.86,60.04,68.0,39.16,76.0,50.12,66.94,0.62,250048
9
- sentence-transformers/paraphrase-multilingual-mpnet-base-v2,278M,768,512,63.33,57.63,70.88,41.35,83.6,33.81,58.51,0.65,250002
10
- newmindai/modernbert-base-tr-uncased-allnli-stsb,134M,768,8192,61.29,54.09,71.47,35.46,82.83,24.81,55.89,0.66,32000
11
- numind/NuSentiment-multilingual,278M,768,512,60.52,49.65,73.67,14.96,76.89,32.76,49.96,0.52,250002
12
- newmindai/TurkEmbed4Retrieval,305M,768,512,60.5,58.04,64.78,47.47,64.04,47.82,66.1,0.57,250048
13
- Qwen/Qwen3-Embedding-0.6B,595M,1024,131072,60.18,56.53,64.68,33.36,66.02,50.06,68.55,0.48,151669
14
- sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,117M,384,512,59.95,54.8,67.21,42.31,79.3,29.95,55.24,0.6,250037
15
- newmindai/TurkEmbed4STS-HD,305M,768,8192,59.94,53.06,67.61,34.24,80.08,35.88,47.47,0.65,250048
16
- emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,110M,768,512,59.92,52.65,68.38,24.61,74.94,39.0,56.3,0.62,32000
17
- ibm-granite/granite-embedding-278m-multilingual,278M,768,512,55.9,54.48,58.64,41.98,60.13,45.08,66.57,0.41,250002
18
- newmindai/ModernBERT-tr-uncased-stsb-HD,134M,768,8192,54.51,43.94,67.17,17.96,82.51,16.08,35.98,0.61,32000
19
- ibm-granite/granite-embedding-107m-multilingual,106M,384,512,52.68,50.72,55.75,34.17,59.86,39.97,63.85,0.38,250002
20
- minishlab/potion-multilingual-128M,128M,256,N/A,50.39,44.47,58.34,23.47,59.76,30.84,49.93,0.43,500358
21
- google/embeddinggemma-300m,307M,768,2048,49.08,44.98,55.23,22.84,61.02,26.92,58.91,0.27,262144
22
- nomic-ai/nomic-embed-text-v1,136M,768,8192,45.12,41.46,48.3,9.45,59.75,32.9,56.88,0.42,30528
23
- nomic-ai/nomic-embed-text-v1.5,136M,768,8192,44.63,40.04,48.92,9.69,58.53,32.19,50.89,0.41,30528
24
- mixedbread-ai/mxbai-embed-large-v1,335M,1024,512,44.0,39.23,49.49,15.99,56.66,27.75,46.25,0.37,30522
25
- sentence-transformers/multi-qa-MiniLM-L6-cos-v1,22M,384,512,38.82,32.39,44.08,5.55,58.29,25.16,28.88,0.34,30522
26
- boun-tabi-LMG/TURNA,495M,1024,1024,38.36,30.96,47.17,10.26,56.62,13.04,27.73,0.22,32128
27
- sentence-transformers/all-MiniLM-L12-v2,33M,384,512,38.28,31.13,44.77,7.82,58.2,21.64,23.24,0.36,30522
28
- nielsr/lilt-xlm-roberta-base,284M,768,512,38.01,29.57,50.1,12.79,55.35,2.45,27.14,0.22,250002
29
- sentence-transformers/all-MiniLM-L6-v2,22M,384,512,37.95,31.97,44.46,6.58,56.75,16.48,35.55,0.31,30522
30
- sentence-transformers/all-mpnet-base-v2,109M,768,512,37.21,31.31,43.75,10.56,55.99,15.16,31.08,0.31,30527
31
- minishlab/potion-base-8M,7M,256,N/A,36.85,30.01,42.51,2.26,57.86,21.75,25.64,0.36,29528
32
- sentence-transformers/paraphrase-MiniLM-L6-v2,22M,384,512,36.26,28.19,44.02,4.53,56.62,17.47,18.29,0.33,30522
33
- newmindai/lettucedect-210m-eurobert-tr-v1,211M,768,8192,27.66,21.55,34.32,1.54,52.34,0.22,19.34,0.1,128256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank (Borda),Model,Model Architecture,Tokenizer Type,Unique Token Count,Turkish Token Count,Turkish Token %,Pure Token Count,Pure Token %,Mean (Task),Mean (TaskType),Classification,Clustering,Pair Classification,Retrieval,STS,Contracts,Regulation,Caselaw,Score(Legal),Memory Usage (MB),Number of Parameters,Embed Dim,Vocab Size,Max Seq Length,Correlation,Model Type
2
+ 1,google/embeddinggemma-300m,Gemma3TextModel,GemmaTokenizer,13697.0,5910.0,43.15,3980.0,29.06,67.23,65.42,77.74,45.05,80.02,55.06,69.22,83.97,39.56,28.38,50.63,1173.0,307M,768.0,262144.0,2048,0.51,Embedding
3
+ 2,newmindai/bge-m3-stsb,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.46,63.53068666666667,74.24768333333334,43.9295,78.50975,50.142,70.8245,82.609,38.141000000000005,29.167,49.97233333333333,2165.0,567M,1024.0,250002.0,8194,0.6350506506291465,Embedding
4
+ 3,BAAI/bge-m3,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,64.75,62.87,75.35,35.86,78.88,54.42,69.83,86.08,38.09,29.3,51.16,2165.0,567M,1024.0,250002.0,8194,0.61,Embedding
5
+ 4,Lajavaness/bilingual-embedding-large,BilingualModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.62,62.468826666666665,74.15278333333333,42.2467,73.0609,52.248250000000006,70.6355,82.14099999999999,35.399,24.551,47.36366666666666,2135.0,559M,1024.0,250002.0,514,0.611419419101738,Embedding
6
+ 5,newmindai/TurkEmbed4STS,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,62.67,62.41829666666666,69.69163333333334,44.2897,81.76675,49.135,67.2084,78.877,35.18,27.635,47.23066666666666,1164.0,305M,768.0,250048.0,8192,0.6839028854791485,Embedding
7
+ 6,intfloat/multilingual-e5-large,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,63.14,61.50873666666666,71.79943333333334,41.1967,72.76185000000001,54.29849999999999,67.4872,85.38,33.178000000000004,22.299,46.952333333333335,2135.0,559M,1024.0,250002.0,514,0.5844910512151045,Embedding
8
+ 7,ytu-ce-cosmos/turkish-e5-large,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,62.0,60.36150666666667,72.41818333333333,38.1709,70.86345,51.114,69.241,80.729,37.384,26.476,48.196333333333335,2135.0,559M,1024.0,250002.0,514,0.5608614724386807,Embedding
9
+ 8,Alibaba-NLP/gte-multilingual-base,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,61.18,60.12285333333333,67.99526666666667,39.1645,75.99780000000001,50.516000000000005,66.94069999999999,76.012,36.391,27.066000000000003,46.489666666666665,1164.0,305M,768.0,250048.0,8192,0.6170556873432124,Embedding
10
+ 9,nomic-ai/nomic-embed-text-v2-moe,NomicBertModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,60.63,59.54449333333332,68.53571666666666,43.3523,64.42945,52.6895,68.71549999999999,84.466,39.939,27.849,50.75133333333333,1813.0,475M,768.0,250048.0,2048,0.530989593067926,Embedding
11
+ 10,magibu/embeddingmagibu-200m,Gemma3TextModel,GemmaTokenizer,29799.0,18946.0,63.58,8515.0,28.57,59.989025,59.247110000000006,66.4086,40.1472,74.98685,48.2505,66.4424,75.745,33.984,27.033,45.587,789.0,206M,768.0,131072.0,8192,0.585573508421718,Embedding
12
+ 11,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.62,58.92842666666667,70.87778333333333,41.799,83.59875000000001,39.8555,58.511100000000006,65.403,7.61,1.289,24.767333333333337,1060.0,278M,768.0,250002.0,514,0.6495769869027372,Embedding
13
+ 12,intfloat/multilingual-e5-large-instruct,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.91,58.85126,72.24580000000002,31.5179,72.91635,48.01275,69.5635,78.985,35.735,25.351000000000003,46.690333333333335,2135.0,559M,1024.0,250002.0,514,0.5663941110812728,Embedding
14
+ 13,newmindai/TurkEmbed4Retrieval,NewModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,59.1,58.36369333333333,64.78041666666665,47.468700000000005,64.0415,48.86425,66.6636,74.626,36.121,28.898000000000003,46.54833333333334,1164.0,305M,768.0,250048.0,512,0.5743432298546475,Embedding
15
+ 14,newmindai/Mursit-Embed-Qwen3-1.7B-TR,Qwen3ForCausalLM,Qwen2TokenizerFast,10226.0,4128.0,40.37,2865.0,28.02,58.08,56.84,68.46,42.22,59.67,50.1,63.77,70.22,17.94,16.11,34.76,6563.0,1.7B,2048.0,151936.0,40960,0.44,CLM-Embedding
16
+ 15,newmindai/Mursit-Large-TR-Retrieval,ModernBertModel,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,58.57,56.43,67.47,38.76,59.88,51.59,64.44,81.63,32.39,25.24,46.42,1539.0,403M,1024.0,59008.0,2048,0.49,Embedding
17
+ 16,newmindai/modernbert-base-tr-uncased-allnli-stsb,ModernBertModel,PreTrainedTokenizerFast,20502.0,16007.0,78.08,6077.0,29.64,56.35,56.31918666666665,71.45993333333332,35.4615,82.83494999999999,35.11075,56.7288,62.937,15.297,17.466,31.899999999999995,514.0,134M,768.0,32000.0,8192,0.6637952581670423,Embedding
18
+ 17,newmindai/Mursit-Base-TR-Retrieval,ModernBertModel,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,58.01,55.86,66.25,39.75,61.31,50.07,61.9,80.4,34.1,28.07,47.52,593.0,155M,768.0,59008.0,1024,0.49,Embedding
19
+ 18,emrecan/bert-base-turkish-cased-mean-nli-stsb-tr,BertModel,BertTokenizerFast,21076.0,17028.0,80.79,7263.0,34.46,56.03,54.33,68.42,23.64,74.94,42.29,62.39,72.83,22.88,20.78,38.83,421.0,110M,768.0,32000.0,512,0.62,Embedding
20
+ 19,newmindai/TurkEmbed4STS-HD,NewForTokenClassification,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,56.14,54.25491999999999,67.61245,36.856100000000005,80.07815000000001,39.2535,47.4744,70.233,4.837000000000001,6.1690000000000005,27.079666666666668,1164.0,305M,768.0,250048.0,8192,0.6504462482545317,Embedding
21
+ 20,ibm-granite/granite-embedding-278m-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,53.68,53.93412333333333,58.37791666666666,39.4453,60.1335,45.139,66.5749,67.254,24.53,16.229,36.004333333333335,1060.0,278M,768.0,250002.0,514,0.4137480806327822,Embedding
22
+ 21,newmindai/Mursit-Embed-Qwen3-4B-TR,Qwen3ForCausalLM,Qwen2TokenizerFast,10226.0,4128.0,40.37,2865.0,28.02,56.47,53.65,67.29,36.68,58.36,51.12,54.77,69.25,24.21,17.56,37.0,15344.0,4B,2560.0,151936.0,40960,0.34,CLM-Embedding
23
+ 22,nvidia/llama-embed-nemotron-8b,LlamaBidirectionalModel,PreTrainedTokenizerFast,12041.0,5485.0,45.55,3507.0,29.13,51.06448333333333,53.52449666666666,68.51398333333334,39.8189,58.1497,30.656,70.4839,52.095,28.802,16.756999999999998,32.55133333333333,28629.0,8B,4096.0,128256.0,131072,0.3817553384080386,CLM-Embedding
24
+ 23,KaLM-Embedding/KaLM-embedding-multilingual-mini-instruct-v2.5,Qwen2Model,Qwen2TokenizerFast,10262.0,3234.0,31.51,2294.0,22.35,52.71,52.83622666666668,64.64263333333334,37.6148,57.5669,35.511500000000005,68.8453,32.014,35.608000000000004,30.239,32.62033333333334,1884.0,494M,896.0,151936.0,131072,0.4053257465375148,CLM-Embedding
25
+ 24,ibm-granite/granite-embedding-107m-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,50.52,51.07249,55.654500000000006,34.6266,59.86395,41.3655,63.85189999999999,60.72,20.033,11.705,30.819333333333333,408.0,106M,384.0,250002.0,514,0.3807947055039975,Embedding
26
+ 25,sentence-transformers/LaBSE,BertModel,BertTokenizerFast,19595.0,11061.0,56.45,5800.0,29.6,51.83,50.72844,63.18349999999999,25.5499,64.0111,38.4625,62.4352,63.809000000000005,15.122,13.838,30.923,1798.0,471M,768.0,501153.0,512,0.4794392790632775,Embedding
27
+ 26,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,BertModel,BertTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,53.99,50.29915666666667,67.18508333333332,42.3102,79.30365,35.82925,26.867600000000003,56.875,0.8410000000000001,0.713,19.476333333333333,448.0,117M,384.0,250037.0,512,0.6043096711195243,Embedding
28
+ 27,numind/NuSentiment-multilingual,XLMRobertaModel,XLMRobertaTokenizerFast,14860.0,8443.0,56.82,4884.0,32.87,54.0,50.16527553055566,73.67280773306626,14.960431297201202,76.8943051047361,35.343,49.95583351777477,64.037,10.431,10.38,28.282666666666668,1060.0,278M,768.0,250002.0,514,0.5183345151582207,Embedding
29
+ 28,dbmdz/bert-base-turkish-uncased,BertModel,BertTokenizerFast,14807.0,10953.0,73.97,5876.0,39.68,51.99,46.44,67.93,34.76,60.54,31.98,37.01,52.48,12.02,10.09,24.86,421.0,110M,768.0,32000.0,512,0.36,MLM
30
+ 29,minishlab/potion-multilingual-128M,StaticModel,XLMRobertaTokenizerFast,18943.0,12657.0,66.82,5986.0,31.6,47.96,45.95582333333334,58.34376666666668,25.4021,59.76105,36.3395,49.9327,65.022,21.481,14.031,33.51133333333334,488.0,128M,256.0,500358.0,∞,0.4306555947403001,Embedding
31
+ 30,ytu-ce-cosmos/turkish-large-bert-cased,BertForPreTraining,BertTokenizerFast,21076.0,16830.0,79.85,8670.0,41.14,50.7,45.3,67.43,34.24,60.11,28.68,36.04,47.57,5.93,3.85,19.12,1286.0,337M,1024.0,32000.0,1024,0.33,MLM
32
+ 31,dbmdz/bert-base-turkish-cased,BertModel,BertTokenizerFast,21076.0,17028.0,80.79,7263.0,34.46,47.89,45.17,66.39,35.28,60.05,30.52,33.62,54.03,10.13,9.07,24.41,421.0,110M,768.0,32000.0,512,0.33,MLM
33
+ 32,newmindai/TurkEmbed4STS-Static,StaticModel,Tokenizer,13258.0,7304.0,55.09,4075.0,30.74,45.45,43.05512,57.01745,19.3065,65.30815000000001,32.834500000000006,40.809,63.33800000000001,19.964,12.687,31.996333333333336,244.0,64M,256.0,250002.0,∞,0.4254717954565192,Embedding
34
+ 33,KocLab-Bilkent/BERTurk-Legal,BertForMaskedLM,BertTokenizerFast,27482.0,19590.0,71.28,8228.0,29.94,46.44,42.02,60.61,26.24,59.51,25.8,37.94,61.4,15.51,20.99,32.63,703.0,184M,768.0,128000.0,512,0.34,MLM
35
+ 34,newmindai/Mursit-Large,ModernBertForMaskedLM,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,44.65,41.75,62.95,25.34,58.04,27.4,35.01,42.74,11.29,17.1,23.71,1539.0,403M,1024.0,59008.0,2048,0.28,MLM
36
+ 35,nomic-ai/nomic-embed-text-v1,NomicBertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,41.75,41.66643666666667,47.90213333333333,9.1279,60.08205,34.3415,56.8786,58.672,23.771,15.572,32.67166666666667,521.0,136M,768.0,30528.0,8192,0.426704518889946,Embedding
37
+ 36,ytu-ce-cosmos/turkish-base-bert-uncased,BertForPreTraining,BertTokenizerFast,17128.0,14329.0,83.66,6062.0,35.39,50.54,40.95,66.2,25.68,58.21,20.46,34.2,45.94,10.21,6.28,20.81,421.0,110M,768.0,32000.0,512,0.3,MLM
38
+ 37,nomic-ai/nomic-embed-text-v1.5,NomicBertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,41.21,40.30043666666667,48.92313333333334,9.3571,58.52505,33.8085,50.8884,56.711,13.358,5.783,25.284,521.0,136M,768.0,30528.0,8192,0.4147406606805225,Embedding
39
+ 38,newmindai/Mursit-Base,ModernBertForMaskedLM,PreTrainedTokenizerFast,30047.0,20130.0,67.0,8724.0,29.03,41.34,40.23,59.78,25.48,58.65,20.82,36.45,36.0,7.4,10.4,17.93,593.0,155M,768.0,59008.0,1024,0.28,MLM
40
+ 39,mixedbread-ai/mxbai-embed-large-v1,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,40.92,40.03663,49.5437,15.9903,56.6587,31.74075,46.2497,43.591,10.564,9.052,21.069,1278.0,335M,1024.0,30522.0,512,0.3720971359650719,Embedding
41
+ 40,jhu-clsp/mmBERT-base,ModernBertForMaskedLM,PreTrainedTokenizerFast,13585.0,5611.0,41.3,5710.0,42.03,43.87,39.65,61.84,26.77,59.25,15.83,34.56,34.45,1.33,0.68,12.15,1170.0,306M,768.0,256000.0,8192,0.34,MLM
42
+ 41,boun-tabilab/TabiBERT,ModernBertForMaskedLM,PreTrainedTokenizerFast,32444.0,20388.0,62.84,12186.0,37.56,42.15,37.77,59.63,25.75,58.19,14.96,30.32,32.02,1.86,0.63,11.5,567.0,148M,768.0,50176.0,8192,0.32,MLM
43
+ 42,sentence-transformers/all-MiniLM-L12-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,33.56,33.19119,44.84295,7.693999999999999,58.1998,20.928,34.2912,38.948,2.771,2.557,14.758666666666668,127.0,33M,384.0,30522.0,512,0.3620264346982647,Embedding
44
+ 43,sentence-transformers/multi-qa-MiniLM-L6-cos-v1,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,33.81,32.343716666666666,44.079283333333336,5.5512,58.2895,24.92,28.8786,36.243,4.816,5.283,15.447333333333336,86.0,22M,384.0,30522.0,512,0.3352620465291676,Embedding
45
+ 44,boun-tabi-LMG/TURNA,T5ForConditionalGeneration,T5TokenizerFast,21630.0,18600.0,85.99,7923.0,36.63,31.74,31.622866666666663,47.17373333333333,10.2619,56.6155,16.333,27.7302,34.89,8.883000000000001,4.55,16.107666666666667,1889.0,495M,1024.0,32128.0,1024,0.2188615462224458,Seq2Seq
46
+ 45,sentence-transformers/all-mpnet-base-v2,MPNetForMaskedLM,MPNetTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,31.51,31.580113333333333,43.75221666666667,10.0253,55.9924,17.051750000000002,31.0789,32.477000000000004,2.243,3.31,12.67666666666667,417.0,109M,768.0,30527.0,514,0.3072208676420578,Embedding
47
+ 46,sentence-transformers/all-MiniLM-L6-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,30.84,30.223826666666668,44.49228333333334,6.576,56.7533,16.46825,26.8293,32.039,3.052,3.514,12.868333333333334,86.0,22M,384.0,30522.0,512,0.3117993950335187,Embedding
48
+ 47,minishlab/potion-base-8M,StaticModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,31.26,30.1419,42.5097,2.2195,57.8614,22.4745,25.6444,46.72,13.243,9.77,23.244333333333334,28.0,7M,256.0,29528.0,∞,0.363850332504128,Embedding
49
+ 48,sentence-transformers/paraphrase-MiniLM-L6-v2,BertModel,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,29.68,28.88314666666667,44.08553333333333,5.963100000000001,56.6191,14.424,23.324,22.977,4.347,2.266,9.863333333333337,86.0,22M,384.0,30522.0,512,0.3273012423895394,Embedding
50
+ 49,answerdotai/ModernBERT-base,ModernBertForMaskedLM,PreTrainedTokenizerFast,8170.0,3329.0,40.75,2188.0,26.78,22.33,23.8,39.06,2.01,53.95,2.1,21.91,7.92,0.62,0.43,2.99,568.0,149M,768.0,50368.0,8192,0.23,MLM
51
+ 50,answerdotai/ModernBERT-large,ModernBertForMaskedLM,PreTrainedTokenizerFast,8170.0,3329.0,40.75,2188.0,26.78,22.46,23.74,39.44,3.9,53.73,1.8,19.85,6.12,0.62,0.59,2.44,1505.0,394M,1024.0,50368.0,8192,0.2,MLM
52
+ 51,google-bert/bert-base-uncased,BertForMaskedLM,BertTokenizerFast,5820.0,1999.0,34.35,1277.0,21.94,22.86,23.49519,40.2581,2.7069,53.06465,2.8455,18.6008,8.535,0.393,0.912,3.2800000000000007,417.0,109M,768.0,30522.0,512,0.1652374209194242,MLM
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
- gradio>=5.49.1
2
  pandas>=2.3.3
3
  numpy>=2.3.4
4
- matplotlib>=3.10.7
 
5
  requests>=2.32.5
6
  python-dotenv>=1.1.1
7
  itsdangerous>=2.2.0
 
1
+ gradio==5.50.0
2
  pandas>=2.3.3
3
  numpy>=2.3.4
4
+ plotly>=6.5.0
5
+ matplotlib>=3.10.0
6
  requests>=2.32.5
7
  python-dotenv>=1.1.1
8
  itsdangerous>=2.2.0
src/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mizan Turkish Leaderboard - HuggingFace Space Version
3
+
4
+ Clean, modular architecture for the public leaderboard.
5
+ """
6
+
7
+ from .core import column_registry, settings
8
+ from .data import DataTransformer, LeaderboardStyler
9
+ from .components import LeaderboardTab, DatasetTab, SubmitTab
10
+
11
+ __all__ = [
12
+ "column_registry",
13
+ "settings",
14
+ "DataTransformer",
15
+ "LeaderboardStyler",
16
+ "LeaderboardTab",
17
+ "DatasetTab",
18
+ "SubmitTab",
19
+ ]
src/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/__pycache__/__init__.cpython-312.pyc and b/src/__pycache__/__init__.cpython-312.pyc differ
 
src/api/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """API client modules."""
2
+
3
+ from .client import EvaluationApiClient
4
+
5
+ __all__ = [
6
+ "EvaluationApiClient",
7
+ ]
src/api/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/api/__pycache__/__init__.cpython-312.pyc and b/src/api/__pycache__/__init__.cpython-312.pyc differ
 
src/api/__pycache__/client.cpython-312.pyc CHANGED
Binary files a/src/api/__pycache__/client.cpython-312.pyc and b/src/api/__pycache__/client.cpython-312.pyc differ
 
src/api/client.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API Client Module
3
+
4
+ Handles communication with the evaluation backend.
5
+ """
6
+
7
+ import logging
8
+ from typing import Optional
9
+ import requests
10
+
11
+ from ..core.config import settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class EvaluationApiClient:
17
+ """
18
+ Client for evaluation API operations.
19
+
20
+ Handles submission of evaluation requests to the backend.
21
+ """
22
+
23
+ def __init__(self):
24
+ self.api_url = settings.api.url
25
+ self.auth = (settings.api.username, settings.api.password)
26
+ self.timeout = settings.api.timeout
27
+
28
+ def submit_evaluation(
29
+ self,
30
+ model_name: str,
31
+ email: str,
32
+ batch_size: int = 32
33
+ ) -> bool:
34
+ """
35
+ Submit an evaluation request to the API.
36
+
37
+ Args:
38
+ model_name: HuggingFace model identifier.
39
+ email: Email for notifications.
40
+ batch_size: Batch size for evaluation.
41
+
42
+ Returns:
43
+ True if submission was successful.
44
+ """
45
+ if not settings.api.is_configured:
46
+ logger.error("API not configured - cannot submit evaluation")
47
+ return False
48
+
49
+ try:
50
+ payload = {
51
+ "model_name": model_name,
52
+ "model_repo": model_name.split("/")[0] if "/" in model_name else "unknown",
53
+ "batch_size": batch_size,
54
+ "email": email,
55
+ "model_type": "sentence-transformer"
56
+ }
57
+
58
+ response = requests.post(
59
+ f"{self.api_url}/api/mteb/request",
60
+ json=payload,
61
+ timeout=self.timeout,
62
+ auth=self.auth
63
+ )
64
+
65
+ if response.status_code == 200:
66
+ logger.info(f"Evaluation submitted successfully for {model_name}")
67
+ return True
68
+ else:
69
+ logger.error(f"API returned status {response.status_code}")
70
+ return False
71
+
72
+ except Exception as e:
73
+ logger.error(f"Error submitting evaluation: {e}")
74
+ return False
src/components/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """UI Components for Gradio interface."""
2
+
3
+ from .leaderboard import LeaderboardTab
4
+ from .dataset import DatasetTab
5
+ from .submit import SubmitTab
6
+
7
+ __all__ = [
8
+ "LeaderboardTab",
9
+ "DatasetTab",
10
+ "SubmitTab",
11
+ ]
src/components/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/__init__.cpython-312.pyc and b/src/components/__pycache__/__init__.cpython-312.pyc differ
 
src/components/__pycache__/dataset.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/dataset.cpython-312.pyc and b/src/components/__pycache__/dataset.cpython-312.pyc differ
 
src/components/__pycache__/leaderboard.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/leaderboard.cpython-312.pyc and b/src/components/__pycache__/leaderboard.cpython-312.pyc differ
 
src/components/__pycache__/submit.cpython-312.pyc CHANGED
Binary files a/src/components/__pycache__/submit.cpython-312.pyc and b/src/components/__pycache__/submit.cpython-312.pyc differ
 
src/components/dataset.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset Tab Component
3
+
4
+ Displays task and dataset information.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import html
10
+
11
+
12
+ class DatasetTab:
13
+ """
14
+ Dataset information tab component.
15
+
16
+ Shows details about the evaluation tasks and datasets.
17
+ """
18
+
19
+ def build(self) -> None:
20
+ """Build the dataset tab UI."""
21
+ gr.Markdown("### MTEB Turkish + Turkish Legal Dataset Overview")
22
+
23
+ # Task name to dataset path mapping
24
+ task_to_dataset = {
25
+ 'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
26
+ 'XQuADRetrieval': 'google/xquad',
27
+ 'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
28
+ 'MKQARetrieval': 'apple/mkqa',
29
+ 'MassiveIntentClassification': 'mteb/amazon_massive_intent',
30
+ 'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
31
+ 'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
32
+ 'SIB200Classification': 'mteb/sib200',
33
+ 'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
34
+ 'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
35
+ 'SIB200ClusteringS2S': 'mteb/sib200',
36
+ 'XNLI': 'mteb/xnli',
37
+ 'XNLIV2': 'mteb/xnli2.0-multi-pair',
38
+ 'STS22.v2': 'mteb/sts22-crosslingual-sts'
39
+ }
40
+
41
+ # Create clickable task names
42
+ clickable_task_names = []
43
+ task_list = [
44
+ 'WebFAQRetrieval', 'XQuADRetrieval', 'TurHistQuadRetrieval', 'MKQARetrieval',
45
+ 'MassiveIntentClassification', 'MassiveScenarioClassification',
46
+ 'MultilingualSentimentClassification', 'SIB200Classification',
47
+ 'TurkishMovieSentimentClassification', 'TurkishProductSentimentClassification',
48
+ 'SIB200ClusteringS2S', 'XNLI', 'XNLIV2', 'STS22.v2'
49
+ ]
50
+
51
+ for task_name in task_list:
52
+ dataset_path = task_to_dataset[task_name]
53
+ hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
54
+ clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{html.escape(task_name)}</a>'
55
+ clickable_task_names.append(clickable_name)
56
+
57
+ # Create dataset information table
58
+ dataset_data = pd.DataFrame({
59
+ 'Task Name': clickable_task_names,
60
+ 'Task Type': [
61
+ 'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
62
+ 'Classification', 'Classification',
63
+ 'Classification', 'Classification',
64
+ 'Classification', 'Classification',
65
+ 'Clustering', 'PairClassification', 'PairClassification', 'STS'
66
+ ],
67
+ 'Description': [
68
+ 'Turkish FAQ retrieval task',
69
+ 'Turkish question answering retrieval',
70
+ 'Historical Turkish document retrieval',
71
+ 'Multilingual knowledge QA retrieval',
72
+ 'Intent classification for Turkish',
73
+ 'Scenario classification for Turkish',
74
+ 'Multilingual sentiment classification',
75
+ 'SIB200 language identification',
76
+ 'Turkish movie review sentiment',
77
+ 'Turkish product review sentiment',
78
+ 'SIB200 clustering task',
79
+ 'Turkish natural language inference',
80
+ 'Enhanced Turkish NLI task',
81
+ 'Turkish semantic textual similarity'
82
+ ],
83
+ 'Domain': [
84
+ 'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
85
+ 'Intent', 'Scenario',
86
+ 'Sentiment', 'Language ID',
87
+ 'Movies', 'Products',
88
+ 'Language ID', 'NLI', 'NLI', 'STS'
89
+ ],
90
+ 'Samples': [
91
+ '~145K', '~1.19K', '~1.33K', '~10K',
92
+ '~5K', '~5K',
93
+ '211', '~899',
94
+ '~2.64K', '800',
95
+ '99', '~7.5K', '~5.01K', '~208'
96
+ ]
97
+ })
98
+
99
+ gr.Dataframe(
100
+ value=dataset_data,
101
+ label="MTEB Turkish Task Details",
102
+ interactive=False,
103
+ wrap=True,
104
+ datatype=["html", "str", "str", "str", "str"]
105
+ )
106
+
107
+ # Turkish Legal Tasks Section
108
+ self._build_legal_tasks_section()
109
+
110
+ # Task distribution
111
+ self._build_task_distribution_section()
112
+
113
+ # Metrics explanation
114
+ self._build_metrics_explanation_section()
115
+
116
+ def _build_legal_tasks_section(self):
117
+ """Build the Turkish Legal Tasks section."""
118
+ gr.Markdown("---")
119
+ gr.Markdown("### Turkish Legal Tasks")
120
+
121
+ legal_task_to_dataset = {
122
+ 'TurkishLegalQA': 'newmindai/contract-retrieval',
123
+ 'TurkishTaxRulings': 'newmindai/regulation-retrieval',
124
+ 'TurkishCourtOfCassation': 'newmindai/caselaw-retrieval'
125
+ }
126
+
127
+ clickable_legal_task_names = []
128
+ for task_name in ['TurkishLegalQA', 'TurkishTaxRulings', 'TurkishCourtOfCassation']:
129
+ dataset_path = legal_task_to_dataset[task_name]
130
+ hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
131
+ clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{html.escape(task_name)}</a>'
132
+ clickable_legal_task_names.append(clickable_name)
133
+
134
+ legal_task_data = pd.DataFrame({
135
+ 'Task Name': clickable_legal_task_names,
136
+ 'Task Type': ['Contracts', 'Regulation', 'Case Law'],
137
+ 'Description': [
138
+ 'Turkish legal question answering retrieval',
139
+ 'Turkish legal tax rulings retrieval',
140
+ 'Turkish Court of Cassation caselaw retrieval'
141
+ ],
142
+ 'Domain': ['Contracts', 'Regulation', 'Caselaw'],
143
+ 'Samples': ['272', '~120K', '~1.39K']
144
+ })
145
+
146
+ gr.Dataframe(
147
+ value=legal_task_data,
148
+ label="Turkish Legal Task Details",
149
+ interactive=False,
150
+ wrap=True,
151
+ datatype=["html", "str", "str", "str", "str"]
152
+ )
153
+
154
+ def _build_task_distribution_section(self):
155
+ """Build the task distribution section."""
156
+ gr.Markdown("""
157
+ ### Task Distribution:
158
+
159
+ **Turkish Tasks (14):**
160
+ - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
161
+ - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
162
+ - **Pair Classification**: 2 tasks (natural language inference)
163
+ - **Clustering**: 1 task (language clustering)
164
+ - **STS**: 1 task (semantic textual similarity)
165
+
166
+ **Turkish Legal Tasks (3):**
167
+ - **Contracts**: 1 task (Turkish legal QA retrieval)
168
+ - **Regulation**: 1 task (Turkish tax rulings retrieval)
169
+ - **Caselaw**: 1 task (Turkish Court of Cassation case law retrieval)
170
+
171
+ **Total: 17 tasks across 8 categories**
172
+ """)
173
+
174
+ # Statistics summary
175
+ stats_data = pd.DataFrame({
176
+ 'Metric': [
177
+ 'Total Tasks',
178
+ 'Turkish Tasks',
179
+ 'Legal Tasks',
180
+ 'Task Categories',
181
+ 'Languages',
182
+ 'Avg. Tokens per Sample'
183
+ ],
184
+ 'Value': [
185
+ '17 tasks',
186
+ '14 tasks',
187
+ '3 tasks',
188
+ '8 categories',
189
+ 'Turkish',
190
+ '~150 tokens'
191
+ ],
192
+ 'Notes': [
193
+ 'Comprehensive evaluation: Turkish NLP + Legal',
194
+ 'Classification, Retrieval, STS, NLI, Clustering',
195
+ 'Contracts, Regulation, Caselaw',
196
+ 'Turkish: 5 types, Legal: 3 types',
197
+ 'Turkish-focused',
198
+ 'Varies by task type and domain'
199
+ ]
200
+ })
201
+
202
+ gr.Dataframe(
203
+ value=stats_data,
204
+ label="Dataset Statistics Summary",
205
+ interactive=False
206
+ )
207
+
208
+ def _build_metrics_explanation_section(self):
209
+ """Build the metrics explanation section."""
210
+ gr.Markdown("""
211
+ ---
212
+ ### Metrics Explanation:
213
+
214
+ **Task Categories:**
215
+ - **MTEB Score**: Average performance by task categories (refers to Mean (TaskType))
216
+ - **Mean (Task)**: Average performance across all individual tasks
217
+ - **Classification**: Performance on Turkish classification tasks
218
+ - **Clustering**: Performance on Turkish clustering tasks
219
+ - **Pair Classification**: Performance on pair classification tasks (like NLI)
220
+ - **Retrieval**: Performance on Turkish information retrieval tasks
221
+ - **STS**: Performance on Semantic Textual Similarity tasks
222
+
223
+ **Turkish Legal Categories:**
224
+ - **Contracts**: Performance on Turkish legal contract analysis tasks
225
+ - **Regulation**: Performance on Turkish legal regulation analysis tasks
226
+ - **Caselaw**: Performance on Turkish Court of Cassation case law retrieval tasks
227
+
228
+ ### Tokenizer Quality Metrics:
229
+ - **Unique Token Count**: Number of unique tokens generated by the tokenizer on Turkish MMLU dataset
230
+ - **Turkish Token Count**: How many unique tokens are valid Turkish words/morphemes
231
+ - **Turkish Token %**: Percentage of unique tokens that are linguistically valid Turkish
232
+ - **Pure Token Count**: How many unique tokens are morphologically pure (root words)
233
+ - **Pure Token %**: Percentage of unique tokens that are root words without suffixes
234
+
235
+ ### Model Information:
236
+ - **Parameters**: Number of model parameters
237
+ - **Embed Dim**: Embedding dimension size
238
+ - **Max Seq Length**: Maximum sequence length the model can process
239
+ - **Vocab Size**: Size of the model's vocabulary
240
+ - **Model Architecture**: The underlying model architecture
241
+ - **Tokenizer Type**: The tokenizer implementation used
242
+ """)
243
+
244
+ # About, Contact, and Links section
245
+ self._build_about_section()
246
+
247
+ def _build_about_section(self):
248
+ """Build the about, contact, and links section."""
249
+ gr.Markdown("""
250
+ ---
251
+ ### About Mizan:
252
+ This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
253
+ on Turkish language tasks across multiple domains including:
254
+ - Text classification and sentiment analysis
255
+ - Information retrieval and search
256
+ - Semantic textual similarity
257
+ - Text clustering and pair classification
258
+ - **Turkish Legal**: Contract analysis, regulation, and case law retrieval
259
+
260
+ ### Submit Your Model:
261
+ Use the **Submit** tab to submit your Turkish embedding model for evaluation.
262
+ Your request will be reviewed by administrators and you'll receive email notifications about the progress.
263
+
264
+ ### Contact:
265
+ For any questions or feedback, please contact info@newmind.ai
266
+
267
+ ### Links:
268
+ - **GitHub**: [embeddings-benchmark/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
269
+ - **Github**: [malibayram/tokenizer_benchmark](https://github.com/malibayram/tokenizer_benchmark) - Tokenizer evaluation is done with code from this repository, developed by Mehmet Ali Bayram, which utilizes ITU NLP tools for Turkish linguistic analysis.
270
+ """)
src/components/leaderboard.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Leaderboard Tab Component
3
+
4
+ Main leaderboard display with column filtering.
5
+ """
6
+
7
+ import logging
8
+ from typing import Dict, List, Optional
9
+ import gradio as gr
10
+ import pandas as pd
11
+ import numpy as np
12
+ import plotly.graph_objects as go
13
+
14
+ from ..core.columns import column_registry, ColumnGroup
15
+ from ..core.config import settings
16
+ from ..data import DataTransformer, LeaderboardStyler
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class LeaderboardTab:
    """
    Leaderboard tab component.

    Displays the main ranking table with:
    - Color-coded scores
    - Column filtering via checkbox groups
    - Clickable model links

    Optional-column selection is tracked in click order through a gr.State
    list so newly enabled columns are appended after the default columns in
    the order the user selected them.
    """

    def __init__(self, data: pd.DataFrame):
        # Full, unfiltered leaderboard data; per-event filtered copies are
        # derived from it in _get_styled_data.
        self.data = data
        self.transformer = DataTransformer()
        self.styler = LeaderboardStyler()

        # UI components (will be set during build)
        self.leaderboard: Optional[gr.Dataframe] = None
        self._column_checkboxes: Dict[str, gr.CheckboxGroup] = {}
        self._selected_columns_state: Optional[gr.State] = None
        self._model_type_filter_state: Optional[gr.State] = None
        # NOTE(review): _search_state is never assigned in build() — appears unused.
        self._search_state: Optional[gr.State] = None

    def _get_styled_data(
        self,
        columns: List[str],
        model_type_filter: str = "All"
    ) -> "pd.io.formats.style.Styler":
        """Get styled DataFrame restricted to `columns`, optionally filtered by model type.

        Falls back to an empty styled frame when no data is loaded.
        """
        if self.data is None or self.data.empty:
            empty = self.transformer.create_empty_dataframe()
            return empty.style

        # Apply model type filter ("All" disables filtering)
        filtered_data = self.data.copy()
        if model_type_filter != "All" and "Model Type" in filtered_data.columns:
            filtered_data = filtered_data[filtered_data["Model Type"] == model_type_filter]

        filtered = self.transformer.prepare_for_display(filtered_data, columns, add_links=False)
        return self.styler.apply_styling(filtered)

    def _get_column_groups(self) -> Dict[str, List[str]]:
        """Get optional columns organized by group (exclude default columns).

        Returns a mapping of UI group label -> column display names; groups
        whose columns are all defaults are omitted entirely.
        """
        groups = {}

        # Get default column names to exclude
        default_cols = set(column_registry.default_columns)

        # MTEB Task Scores (only optional ones)
        mteb_cols = [col for col in column_registry.get_group_names(ColumnGroup.MTEB) if col not in default_cols]
        if mteb_cols:
            groups["MTEB Scores"] = mteb_cols

        # Legal Task Scores (only optional ones)
        legal_cols = [col for col in column_registry.get_group_names(ColumnGroup.LEGAL) if col not in default_cols]
        if legal_cols:
            groups["Legal Scores"] = legal_cols

        # Correlation
        corr_cols = [col for col in column_registry.get_group_names(ColumnGroup.CORRELATION) if col not in default_cols]
        if corr_cols:
            groups["Correlation"] = corr_cols

        # Tokenizer Quality (only optional ones)
        tok_cols = [col for col in column_registry.get_group_names(ColumnGroup.TOKENIZER) if col not in default_cols]
        if tok_cols:
            groups["Tokenizer Quality"] = tok_cols

        # Additional Model Info (only optional ones)
        model_info_cols = [col for col in column_registry.get_group_names(ColumnGroup.MODEL_INFO) if col not in default_cols]
        if model_info_cols:
            groups["Model Info"] = model_info_cols

        return groups

    def _filter_columns_handler(
        self,
        previous_selected: List[str],
        model_type_filter: str,
        *checkbox_values
    ) -> tuple:
        """Handle checkbox group changes with click-order tracking.

        Returns (dataframe update, new ordered selection list) so the gr.State
        stays in sync with the table.
        """
        # Collect all currently selected columns from all checkbox groups
        currently_selected = set()
        for selected_list in checkbox_values:
            if selected_list:
                for col_name in selected_list:
                    currently_selected.add(col_name)

        previous_set = set(previous_selected)

        # Find newly added columns (in current but not in previous)
        newly_added = currently_selected - previous_set

        # Find removed columns (in previous but not in current)
        removed = previous_set - currently_selected

        # Update the ordered list: keep previous order, remove deselected, append new
        # NOTE(review): newly_added is a set, so if one event adds several
        # columns their relative order is arbitrary; per-click events add one
        # at a time, where this doesn't matter.
        updated_selected = [col for col in previous_selected if col not in removed]
        for col in newly_added:
            updated_selected.append(col)

        # Build final column list: defaults + selected optional in order
        ordered_columns = list(column_registry.default_columns) + updated_selected

        # Get styled data with model type filter
        styled = self._get_styled_data(ordered_columns, model_type_filter)
        datatypes = self.styler.get_datatypes(ordered_columns)
        widths = self.styler.get_column_widths(ordered_columns)

        return gr.update(value=styled, datatype=datatypes, column_widths=widths), updated_selected

    def _model_type_filter_handler(self, previous_selected: List[str], model_type_filter: str) -> tuple:
        """Handle model type filter changes (table only; plots untouched).

        NOTE(review): not wired in _setup_events, which uses
        _model_type_and_plots_handler instead — possibly dead code.
        """
        # Build final column list: defaults + selected optional in order
        ordered_columns = list(column_registry.default_columns) + previous_selected

        # Get styled data with model type filter
        styled = self._get_styled_data(ordered_columns, model_type_filter)
        datatypes = self.styler.get_datatypes(ordered_columns)
        widths = self.styler.get_column_widths(ordered_columns)

        return gr.update(value=styled, datatype=datatypes, column_widths=widths), model_type_filter

    def _model_type_and_plots_handler(
        self,
        previous_selected: List[str],
        model_type_filter: str
    ) -> tuple:
        """Handle model type filter changes and update both leaderboard and plots."""
        # Build final column list: defaults + selected optional in order
        ordered_columns = list(column_registry.default_columns) + previous_selected

        # Get styled data with model type filter
        styled = self._get_styled_data(ordered_columns, model_type_filter)
        datatypes = self.styler.get_datatypes(ordered_columns)
        widths = self.styler.get_column_widths(ordered_columns)

        # Update plots with filtered data
        plot1 = self._get_pure_vs_mean_task_plot(model_type_filter)
        plot2 = self._get_pure_vs_legal_score_plot(model_type_filter)

        return gr.update(value=styled, datatype=datatypes, column_widths=widths), model_type_filter, plot1, plot2


    def _create_bubble_plot(self, x_col: str, y_col: str, size_col: str,
                            title: str, xlabel: str, ylabel: str, model_type_filter: str = "All") -> Optional[go.Figure]:
        """
        Create an interactive Plotly bubble plot for tokenizer visualization.

        Features:
        - Interactive hover, zoom, pan
        - Text annotations on bubbles
        - Viridis colormap
        - Matches matplotlib styling
        - Model type filtering

        Returns None (rather than raising) when the CSV is missing, required
        columns are absent, no rows survive filtering, or any error occurs —
        gr.Plot renders an empty plot for None.

        NOTE(review): re-reads the CSV from disk on every call instead of
        reusing self.data — confirm this is intentional (e.g. to include
        columns dropped from the display frame).
        """
        try:
            # Load leaderboard summary
            file_path = settings.data.csv_file
            if not file_path.exists():
                logger.warning(f"Leaderboard data not found: {file_path}")
                return None

            df = pd.read_csv(file_path)

            # Apply column name mappings from CSV to display names
            csv_mapping = column_registry.get_csv_mapping()
            df = df.rename(columns=csv_mapping)

            # Apply model type filter
            if model_type_filter != "All" and "Model Type" in df.columns:
                df = df[df["Model Type"] == model_type_filter]

            # Filter rows that have the required columns
            required_cols = [x_col, y_col, size_col, 'Model']
            if not all(col in df.columns for col in required_cols):
                logger.warning(f"Missing required columns for plot")
                return None

            # Filter out rows with missing data
            plot_df = df[required_cols].copy()
            plot_df = plot_df.dropna(subset=[x_col, y_col, size_col])

            if plot_df.empty:
                logger.warning(f"No data available for plotting {x_col} vs {y_col}")
                return None

            # Prepare data
            x = plot_df[x_col]
            y = plot_df[y_col]
            sizes = plot_df[size_col]
            models = plot_df['Model']

            # Normalize sizes for bubble plot (smaller bubbles for cleaner look)
            # Linear map of size_col into the 8..43 px marker range; constant
            # 20 px when all sizes are equal (avoids division by zero).
            size_min, size_max = sizes.min(), sizes.max()
            if size_max > size_min:
                normalized_sizes = 8 + (sizes - size_min) / (size_max - size_min) * 35
            else:
                normalized_sizes = np.full(len(sizes), 20)

            # Create Plotly figure
            fig = go.Figure()

            # Add scatter trace with bubbles
            fig.add_trace(go.Scatter(
                x=x,
                y=y,
                mode='markers',
                marker=dict(
                    size=normalized_sizes,
                    color=sizes,  # Color by Turkish Token Count
                    colorscale='Viridis',
                    showscale=True,
                    colorbar=dict(
                        title=dict(text="Turkish<br>Token<br>Count", font=dict(size=12, family='Arial, sans-serif')),
                        thickness=12,
                        len=1
                    ),
                    line=dict(width=0.5, color='rgba(0,0,0,0.3)'),
                    opacity=0.7
                ),
                text=models,
                # %{...} placeholders are resolved by Plotly at hover time;
                # {{ }} escapes them inside the f-strings.
                hovertemplate='<b>%{text}</b><br>' +
                              f'{xlabel}: %{{x:.2f}}<br>' +
                              f'{ylabel}: %{{y:.0f}}<br>' +
                              f'{size_col}: %{{marker.color:.0f}}<br>' +
                              '<extra></extra>',
                name='',
                showlegend=False
            ))

            # Get top 5 models by Pure Token Count for custom legend
            top_5_df = plot_df.nlargest(5, y_col)
            top_5_models = top_5_df['Model'].tolist()

            # Build custom legend text using annotations (pixel-perfect control)
            legend_lines = ["<b>Top 5 Models</b>"] + [f"{i}. {name}" for i, name in enumerate(top_5_models, 1)]
            legend_text = "<br>".join(legend_lines)

            # Update layout for responsive, clean display
            fig.update_layout(
                title=dict(
                    text=title,
                    font=dict(size=14, family='Arial, sans-serif', color='black'),
                    x=0.5,
                    xanchor='center',
                    y=0.98,
                    yanchor='top'
                ),
                xaxis=dict(
                    title=dict(text=xlabel, font=dict(size=12, family='Arial, sans-serif')),
                    gridcolor='rgba(128,128,128,0.2)',
                    gridwidth=0.5,
                    showgrid=True,
                    zeroline=False
                ),
                yaxis=dict(
                    title=dict(text=ylabel, font=dict(size=12, family='Arial, sans-serif')),
                    gridcolor='rgba(128,128,128,0.2)',
                    gridwidth=0.5,
                    showgrid=True,
                    zeroline=False
                ),
                plot_bgcolor='white',
                paper_bgcolor='white',
                autosize=True,
                hovermode='closest',
                showlegend=False,
                margin=dict(l=60, r=60, t=80, b=60)
            )

            # Add custom legend as annotation (paper coordinates place it
            # just outside the top-right of the plotting area)
            fig.add_annotation(
                text=legend_text,
                xref='paper',
                yref='paper',
                x=1.14,
                y=1.255,
                xanchor='right',
                yanchor='top',
                showarrow=False,
                font=dict(size=9, family='Arial, sans-serif', color='#333'),
                align='left',
                bgcolor='rgba(255,255,255,0.9)',
                bordercolor='rgba(0,0,0,0.15)',
                borderwidth=1,
                borderpad=4
            )

            # Expand x-axis range for better spacing (5% padding each side)
            x_min, x_max = x.min(), x.max()
            x_range = x_max - x_min
            fig.update_xaxes(range=[x_min - x_range * 0.05, x_max + x_range * 0.05])

            return fig

        except Exception as e:
            logger.error(f"Error creating bubble plot: {e}")
            return None

    def _get_pure_vs_mean_task_plot(self, model_type_filter: str = "All") -> Optional[go.Figure]:
        """Get Plotly figure for Pure Token Count vs MTEB Score plot."""
        return self._create_bubble_plot(
            x_col='MTEB Score',
            y_col='Pure Token Count',
            size_col='Turkish Token Count',
            title='Pure Token Count vs MTEB Score',
            xlabel='MTEB Score (%)',
            ylabel='Pure Token Count',
            model_type_filter=model_type_filter
        )

    def _get_pure_vs_legal_score_plot(self, model_type_filter: str = "All") -> Optional[go.Figure]:
        """Get Plotly figure for Pure Token Count vs Legal Score plot."""
        return self._create_bubble_plot(
            x_col='Legal Score',
            y_col='Pure Token Count',
            size_col='Turkish Token Count',
            title='Pure Token Count vs Legal Score',
            xlabel='Legal Score (%)',
            ylabel='Pure Token Count',
            model_type_filter=model_type_filter
        )

    def build(self) -> gr.Dataframe:
        """
        Build the leaderboard tab UI.

        Must be called inside an active gr.Blocks context; creates states,
        filters, the table, both plots, and wires all event handlers.

        Returns:
            The main leaderboard Dataframe component.
        """
        # Initial styled data (filter to All by default)
        initial_columns = column_registry.default_columns
        initial_styled = self._get_styled_data(initial_columns, "All")
        initial_datatypes = self.styler.get_datatypes(initial_columns)
        initial_widths = self.styler.get_column_widths(initial_columns)

        # State to track selected columns in click order
        self._selected_columns_state = gr.State([])

        # State to track model type filter
        self._model_type_filter_state = gr.State("All")

        # Get column groups
        column_groups = self._get_column_groups()

        # Model Type Filter (Radio buttons)
        model_type_choices = ["All", "CLM-Embedding", "Embedding", "MLM", "Seq2Seq"]
        model_type_radio = gr.Radio(
            choices=model_type_choices,
            value="All",
            label="Filter by Model Type",
            container=True,
        )

        # Create checkbox groups in a compact accordion layout
        checkbox_components = []

        with gr.Accordion("Optional Columns", open=False):
            with gr.Row():
                for group_name, columns in column_groups.items():
                    checkbox = gr.CheckboxGroup(
                        choices=columns,
                        value=[],
                        label=group_name,
                        container=True,
                    )
                    self._column_checkboxes[group_name] = checkbox
                    checkbox_components.append(checkbox)

        # Main leaderboard
        self.leaderboard = gr.Dataframe(
            value=initial_styled,
            datatype=initial_datatypes,
            column_widths=initial_widths,
            interactive=False,
            wrap=True,
            max_height=settings.ui.max_table_height,
            show_search=True,
            show_copy_button=True,
            show_fullscreen_button=True,
        )

        # Tokenizer visualizations
        gr.Markdown("### Tokenizer Quality Visualizations")
        gr.Markdown("""
        Interactive bubble plots showing tokenizer quality metrics vs model performance.
        Bubble size and color represent Turkish Token Count. Hover for details, zoom, and pan.
        """)

        with gr.Row():
            # Plot 1: Pure Token Count vs Mean Task
            self.plot_mean_task = gr.Plot(
                value=self._get_pure_vs_mean_task_plot("All"),
                label="Pure Token Count vs Mean Task (MTEB)",
                show_label=False,
            )

            # Plot 2: Pure Token Count vs Legal Score
            self.plot_legal_score = gr.Plot(
                value=self._get_pure_vs_legal_score_plot("All"),
                label="Pure Token Count vs Score(Legal)",
                show_label=False,
            )

        # Usage instructions
        gr.Markdown("""
        ### How to Use:
        - **Search**: Use the search box to find specific models
        - **Color Coding**: Scores are color-coded from red (low) to green (high)
        - **Sorting**: Click on column headers to sort
        - **Rankings**: Models ranked by MTEB Score
        - **Toggle Columns**: Use the checkboxes above to show/hide additional metrics
        - **Filter by Model Type**: Use the radio buttons to filter models by their type
        """)

        # Wire up events
        self._setup_events(checkbox_components, model_type_radio)

        return self.leaderboard

    def _setup_events(
        self,
        checkbox_components: List[gr.CheckboxGroup],
        model_type_radio: gr.Radio
    ):
        """Set up event handlers.

        Every checkbox group receives ALL groups as inputs so the handler can
        rebuild the full selection regardless of which group fired.
        """
        # Each checkbox group triggers column filtering with state tracking
        for checkbox in checkbox_components:
            checkbox.change(
                fn=self._filter_columns_handler,
                inputs=[self._selected_columns_state, self._model_type_filter_state] + checkbox_components,
                outputs=[self.leaderboard, self._selected_columns_state]
            )

        # Model type radio triggers filtering and plot updates
        model_type_radio.change(
            fn=self._model_type_and_plots_handler,
            inputs=[self._selected_columns_state, model_type_radio],
            outputs=[self.leaderboard, self._model_type_filter_state, self.plot_mean_task, self.plot_legal_score]
        )
src/components/submit.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Submit Tab Component
3
+
4
+ Model evaluation submission with HuggingFace authentication.
5
+ """
6
+
7
+ import logging
8
+ import re
9
+ from typing import Optional, Tuple
10
+ import gradio as gr
11
+
12
+ from ..api import EvaluationApiClient
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class SubmitTab:
    """
    Submit evaluation tab component.

    Provides:
    - HuggingFace OAuth login
    - Model submission form
    - Email notification setup

    All user feedback is returned as inline HTML rendered in a gr.HTML
    component; validation failures never raise.
    """

    def __init__(self):
        self.api_client = EvaluationApiClient()

        # UI components (will be set during build)
        self.model_input: Optional[gr.Textbox] = None
        self.email_input: Optional[gr.Textbox] = None
        self.submit_btn: Optional[gr.Button] = None
        self.login_button: Optional[gr.LoginButton] = None
        self.result_output: Optional[gr.HTML] = None

    def _validate_model_name(self, model_name: str) -> Optional[str]:
        """Validate model name format.

        Returns an error message string, or None when the name is valid.
        Expected format: "organization/model-name" (HF repo id).
        """
        if not model_name or not model_name.strip():
            return "Model name cannot be empty!"

        model_name = model_name.strip()

        if len(model_name) < 3:
            return "Model name too short!"

        if len(model_name) > 256:
            return "Model name too long (maximum 256 characters)!"

        if '/' not in model_name:
            return "Invalid format! Must include organization (e.g., organization/model-name)"

        # Exactly one org segment and one model segment, each limited to
        # alphanumerics plus . _ -
        if not re.match(r'^[a-zA-Z0-9._-]+/[a-zA-Z0-9._-]+$', model_name):
            return "Invalid format! Use format: organization/model-name"

        return None

    def _validate_email(self, email: str) -> Optional[str]:
        """Validate email format.

        Returns an error message string, or None when the address passes the
        basic pattern check (not a full RFC 5322 validation).
        """
        if not email or not email.strip():
            return "Email address cannot be empty!"

        email = email.strip()

        if len(email) > 254:
            return "Email address too long!"

        email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        if not re.match(email_pattern, email):
            return "Invalid email address format!"

        return None

    def _handle_submit(self, model_name: str, email: str, profile) -> str:
        """Handle evaluation submission; returns an HTML status fragment.

        `profile` comes from the LoginButton input: presumably an OAuth
        profile object when logged in, None when absent, and the button's
        label string in local dev — TODO confirm against the Gradio version
        in use.
        """
        # Authentication check
        if profile is None:
            return "<p style='color: red; font-weight: bold;'>⚠️ Authentication required. Please log in with your Hugging Face account.</p>"

        # Check for local dev mock auth
        if isinstance(profile, str) and profile == "Sign in with Hugging Face":
            return "<p style='color: orange; font-weight: bold;'>⚠️ HF authentication required.</p>"

        # Validate model name
        model_error = self._validate_model_name(model_name)
        if model_error:
            return f"<p style='color: red; font-weight: bold;'>❌ {model_error}</p>"

        # Validate email
        email_error = self._validate_email(email)
        if email_error:
            return f"<p style='color: red; font-weight: bold;'>❌ {email_error}</p>"

        # Submit to API
        model_name = model_name.strip()
        email = email.strip()

        try:
            success = self.api_client.submit_evaluation(model_name, email)

            if success:
                return f"""
                <div style='padding: 16px; background: #d4edda; border-radius: 8px; border: 1px solid #c3e6cb; color: #155724;'>
                    <h3 style='color: #155724; margin: 0 0 12px 0;'>✅ Evaluation Request Submitted!</h3>
                    <p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Model:</strong> {model_name}</p>
                    <p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Email:</strong> {email}</p>
                    <hr style='margin: 12px 0; border-color: #c3e6cb;'>
                    <p style='color: #155724; margin: 4px 0;'><strong style='color: #155724;'>Next Steps:</strong></p>
                    <ul style='color: #155724; margin: 8px 0; padding-left: 20px;'>
                        <li style='color: #155724;'>Your request will be reviewed by our system</li>
                        <li style='color: #155724;'>You will receive email notifications about the status</li>
                        <li style='color: #155724;'>Results will appear on the leaderboard when complete</li>
                    </ul>
                    <p style='color: #155724; margin-top: 12px; font-style: italic;'>Thank you for contributing to the Mizan Leaderboard!</p>
                </div>
                """
            else:
                return """
                <div style='padding: 16px; background: #f8d7da; border-radius: 8px; border: 1px solid #f5c6cb;'>
                    <h3 style='color: #721c24; margin: 0 0 8px 0;'>❌ Submission Failed</h3>
                    <p>Unable to connect to the evaluation service. Please try again later.</p>
                </div>
                """
        except Exception as e:
            # Boundary handler: never surface raw exceptions to the UI.
            logger.error(f"Error submitting evaluation: {e}")
            return f"""
            <div style='padding: 16px; background: #f8d7da; border-radius: 8px; border: 1px solid #f5c6cb;'>
                <h3 style='color: #721c24; margin: 0 0 8px 0;'>❌ Error</h3>
                <p>An unexpected error occurred. Please try again later.</p>
            </div>
            """

    def build(self) -> None:
        """Build the submit tab UI.

        Must be called inside an active gr.Blocks context.
        """
        gr.Markdown("### Submit Model for Evaluation")
        gr.Markdown("""
        Submit your Turkish embedding model for evaluation on the Mizan benchmark.
        **Authentication with Hugging Face is required to submit evaluations.**
        """)

        # OAuth login button
        self.login_button = gr.LoginButton(value="Sign in with Hugging Face")

        self.model_input = gr.Textbox(
            label="Model Name",
            placeholder="sentence-transformers/your-model",
            info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
        )

        self.email_input = gr.Textbox(
            label="Email Address",
            placeholder="your.email@example.com",
            info="Email for notifications about evaluation status and results"
        )

        self.submit_btn = gr.Button(
            "Submit",
            variant="primary",
            size="lg"
        )

        # Result output
        self.result_output = gr.HTML(label="Status")

        # Wire up submit button; the login button as an input supplies the
        # `profile` argument of _handle_submit.
        self.submit_btn.click(
            fn=self._handle_submit,
            inputs=[self.model_input, self.email_input, self.login_button],
            outputs=[self.result_output]
        )

        # Information about the evaluation process
        gr.Markdown("""
        ### Evaluation Process:
        1. **Sign In**: First, sign in with your Hugging Face account using the button above
        2. **Submit Request**: Fill out the form with your model details and email
        3. **Admin Review**: Your request will be reviewed by administrators
        4. **Evaluation**: If approved, your model will be evaluated on Mizan benchmark
        5. **Results**: You'll receive email notifications and results will appear on the leaderboard

        ### Important Notes:
        - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
        - You'll receive email updates about your request status
        - Make sure your model is publicly available on HuggingFace
        - Valid email address is required for receiving results
        """)
src/core/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core modules - configuration and column definitions."""
2
+
3
+ from .columns import column_registry, ColumnType, ColumnGroup, ColumnDefinition
4
+ from .config import settings
5
+
6
+ __all__ = [
7
+ "column_registry",
8
+ "ColumnType",
9
+ "ColumnGroup",
10
+ "ColumnDefinition",
11
+ "settings",
12
+ ]
src/core/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/core/__pycache__/__init__.cpython-312.pyc and b/src/core/__pycache__/__init__.cpython-312.pyc differ
 
src/core/__pycache__/columns.cpython-312.pyc CHANGED
Binary files a/src/core/__pycache__/columns.cpython-312.pyc and b/src/core/__pycache__/columns.cpython-312.pyc differ
 
src/core/__pycache__/config.cpython-312.pyc CHANGED
Binary files a/src/core/__pycache__/config.cpython-312.pyc and b/src/core/__pycache__/config.cpython-312.pyc differ
 
src/core/columns.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized Column Definitions
3
+
4
+ Single source of truth for all leaderboard columns.
5
+ Add new columns here and they propagate everywhere automatically.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from enum import Enum, auto
10
+ from typing import List, Dict, Optional
11
+
12
+
13
class ColumnType(Enum):
    """Column data types for Gradio.

    Values are the datatype strings the gr.Dataframe component accepts.
    """
    NUMBER = "number"  # numeric cell (sortable as number)
    STRING = "str"     # plain text cell
    HTML = "html"      # rendered HTML (e.g. model links)
18
+
19
+
20
class ColumnGroup(Enum):
    """Column groupings for organization and filtering.

    auto() values depend on member order — do not reorder members.
    """
    CORE = auto()         # Always visible: Rank, Model
    LEGAL = auto()        # Legal benchmark scores
    MTEB = auto()         # MTEB task type scores
    TOKENIZER = auto()    # Tokenizer quality metrics
    MODEL_INFO = auto()   # Model metadata
    CORRELATION = auto()  # Correlation metrics
28
+
29
+
30
@dataclass
class ColumnDefinition:
    """
    Single source of truth for one leaderboard column.

    Every piece of column metadata (display name, CSV key, type, grouping,
    formatting, visibility) lives on this record.
    """
    name: str                                       # display name shown in the UI
    api_name: Optional[str] = None                  # key in CSV/API when it differs from name
    column_type: ColumnType = ColumnType.STRING     # Gradio cell datatype
    group: ColumnGroup = ColumnGroup.CORE           # logical grouping for filters
    width: str = "120px"                            # column width (CSS)
    decimals: int = 2                               # decimal places for numeric display
    default_visible: bool = True                    # shown before any user selection
    colorize: bool = False                          # apply score color gradient
    description: str = ""                           # tooltip/help text

    @property
    def csv_key(self) -> str:
        """Key under which this column appears in CSV files."""
        if self.api_name:
            return self.api_name
        return self.name
51
+
52
+
53
+ COLUMN_DEFINITIONS: List[ColumnDefinition] = [
54
+ # 1. Rank (always first)
55
+ ColumnDefinition(
56
+ name="Rank",
57
+ column_type=ColumnType.NUMBER,
58
+ group=ColumnGroup.CORE,
59
+ width="50px",
60
+ decimals=0,
61
+ default_visible=True,
62
+ description="Rank by MTEB Score (Mean TaskType)"
63
+ ),
64
+ # 2. Model (always second)
65
+ ColumnDefinition(
66
+ name="Model",
67
+ column_type=ColumnType.HTML,
68
+ group=ColumnGroup.CORE,
69
+ width="280px",
70
+ default_visible=True,
71
+ colorize=False,
72
+ description="Model name with HuggingFace link"
73
+ ),
74
+ # 3. MTEB Score - default
75
+ ColumnDefinition(
76
+ name="MTEB Score",
77
+ api_name="Mean (TaskType)",
78
+ column_type=ColumnType.NUMBER,
79
+ group=ColumnGroup.MTEB,
80
+ width="140px",
81
+ default_visible=True,
82
+ colorize=True,
83
+ description="MTEB Score: Average of task type category scores"
84
+ ),
85
+ # 4. Legal Score - default
86
+ ColumnDefinition(
87
+ name="Legal Score",
88
+ api_name="Score(Legal)",
89
+ column_type=ColumnType.NUMBER,
90
+ group=ColumnGroup.LEGAL,
91
+ width="120px",
92
+ default_visible=True,
93
+ colorize=True,
94
+ description="Mean of legal benchmark scores (Contracts, Regulation, Caselaw)"
95
+ ),
96
+ # 5. Pure Token Count - default
97
+ ColumnDefinition(
98
+ name="Pure Token Count",
99
+ column_type=ColumnType.NUMBER,
100
+ group=ColumnGroup.TOKENIZER,
101
+ width="150px",
102
+ decimals=0,
103
+ default_visible=True,
104
+ description="Tokens that are morphologically pure"
105
+ ),
106
+ # 6. Max Sequence Length - default
107
+ ColumnDefinition(
108
+ name="Max Sequence Length",
109
+ api_name="Max Tokens",
110
+ column_type=ColumnType.NUMBER,
111
+ group=ColumnGroup.MODEL_INFO,
112
+ width="160px",
113
+ decimals=0,
114
+ default_visible=True,
115
+ description="Maximum sequence length"
116
+ ),
117
+ # 7. Parameters - default
118
+ ColumnDefinition(
119
+ name="Parameters",
120
+ api_name="Number of Parameters",
121
+ column_type=ColumnType.NUMBER,
122
+ group=ColumnGroup.MODEL_INFO,
123
+ width="120px",
124
+ decimals=0,
125
+ default_visible=True,
126
+ description="Number of model parameters (e.g., 1.2B)"
127
+ ),
128
+ # 8. Model Architecture - default
129
+ ColumnDefinition(
130
+ name="Model Architecture",
131
+ column_type=ColumnType.STRING,
132
+ group=ColumnGroup.MODEL_INFO,
133
+ width="180px",
134
+ default_visible=True,
135
+ description="Underlying model architecture (e.g., XLMRobertaModel)"
136
+ ),
137
+ # 9. Mean (Task) - optional
138
+ ColumnDefinition(
139
+ name="Mean (Task)",
140
+ column_type=ColumnType.NUMBER,
141
+ group=ColumnGroup.MTEB,
142
+ width="120px",
143
+ default_visible=False,
144
+ colorize=True,
145
+ description="Average of all individual task scores"
146
+ ),
147
+ # 10. Contracts - optional
148
+ ColumnDefinition(
149
+ name="Contracts",
150
+ column_type=ColumnType.NUMBER,
151
+ group=ColumnGroup.LEGAL,
152
+ width="110px",
153
+ default_visible=False,
154
+ colorize=True,
155
+ description="Performance on Turkish legal contract analysis"
156
+ ),
157
+ # 11. Regulation - optional
158
+ ColumnDefinition(
159
+ name="Regulation",
160
+ column_type=ColumnType.NUMBER,
161
+ group=ColumnGroup.LEGAL,
162
+ width="110px",
163
+ default_visible=False,
164
+ colorize=True,
165
+ description="Performance on Turkish tax rulings retrieval"
166
+ ),
167
+ # 12. Caselaw - optional
168
+ ColumnDefinition(
169
+ name="Caselaw",
170
+ column_type=ColumnType.NUMBER,
171
+ group=ColumnGroup.LEGAL,
172
+ width="110px",
173
+ default_visible=False,
174
+ colorize=True,
175
+ description="Performance on Court of Cassation case retrieval"
176
+ ),
177
+ # 13. Classification - optional
178
+ ColumnDefinition(
179
+ name="Classification",
180
+ column_type=ColumnType.NUMBER,
181
+ group=ColumnGroup.MTEB,
182
+ width="130px",
183
+ default_visible=False,
184
+ colorize=True,
185
+ description="Performance on Turkish classification tasks"
186
+ ),
187
+ # 14. Clustering - optional
188
+ ColumnDefinition(
189
+ name="Clustering",
190
+ column_type=ColumnType.NUMBER,
191
+ group=ColumnGroup.MTEB,
192
+ width="120px",
193
+ default_visible=False,
194
+ colorize=True,
195
+ description="Performance on Turkish clustering tasks"
196
+ ),
197
+ # 15. Pair Classification - optional
198
+ ColumnDefinition(
199
+ name="Pair Classification",
200
+ api_name="PairClassification",
201
+ column_type=ColumnType.NUMBER,
202
+ group=ColumnGroup.MTEB,
203
+ width="150px",
204
+ default_visible=False,
205
+ colorize=True,
206
+ description="Performance on pair classification tasks (NLI)"
207
+ ),
208
+ # 16. Retrieval - optional
209
+ ColumnDefinition(
210
+ name="Retrieval",
211
+ column_type=ColumnType.NUMBER,
212
+ group=ColumnGroup.MTEB,
213
+ width="120px",
214
+ default_visible=False,
215
+ colorize=True,
216
+ description="Performance on information retrieval tasks"
217
+ ),
218
+ # 17. STS - optional
219
+ ColumnDefinition(
220
+ name="STS",
221
+ column_type=ColumnType.NUMBER,
222
+ group=ColumnGroup.MTEB,
223
+ width="100px",
224
+ default_visible=False,
225
+ colorize=True,
226
+ description="Performance on Semantic Textual Similarity tasks"
227
+ ),
228
+ # 18. Correlation - optional
229
+ ColumnDefinition(
230
+ name="Correlation",
231
+ column_type=ColumnType.NUMBER,
232
+ group=ColumnGroup.CORRELATION,
233
+ width="120px",
234
+ decimals=3,
235
+ default_visible=False,
236
+ colorize=True,
237
+ description="Weighted average of correlation metrics"
238
+ ),
239
+ # 19. Tokenizer Type - optional
240
+ ColumnDefinition(
241
+ name="Tokenizer Type",
242
+ column_type=ColumnType.STRING,
243
+ group=ColumnGroup.TOKENIZER,
244
+ width="180px",
245
+ default_visible=False,
246
+ description="Tokenizer implementation type"
247
+ ),
248
+ # 20. Unique Token Count - optional
249
+ ColumnDefinition(
250
+ name="Unique Token Count",
251
+ column_type=ColumnType.NUMBER,
252
+ group=ColumnGroup.TOKENIZER,
253
+ width="160px",
254
+ decimals=0,
255
+ default_visible=False,
256
+ description="Number of unique tokens on Turkish MMLU"
257
+ ),
258
+ # 21. Turkish Token Count - optional
259
+ ColumnDefinition(
260
+ name="Turkish Token Count",
261
+ column_type=ColumnType.NUMBER,
262
+ group=ColumnGroup.TOKENIZER,
263
+ width="170px",
264
+ decimals=0,
265
+ default_visible=False,
266
+ description="Unique tokens that are valid Turkish"
267
+ ),
268
+ # 22. Turkish Token % - optional
269
+ ColumnDefinition(
270
+ name="Turkish Token %",
271
+ column_type=ColumnType.NUMBER,
272
+ group=ColumnGroup.TOKENIZER,
273
+ width="140px",
274
+ default_visible=False,
275
+ description="Percentage of valid Turkish tokens"
276
+ ),
277
+ # 23. Pure Token % - optional
278
+ ColumnDefinition(
279
+ name="Pure Token %",
280
+ column_type=ColumnType.NUMBER,
281
+ group=ColumnGroup.TOKENIZER,
282
+ width="130px",
283
+ default_visible=False,
284
+ description="Percentage of pure root word tokens"
285
+ ),
286
+ # 24. Embed Dim - optional
287
+ ColumnDefinition(
288
+ name="Embed Dim",
289
+ api_name="Embedding Dimensions",
290
+ column_type=ColumnType.NUMBER,
291
+ group=ColumnGroup.MODEL_INFO,
292
+ width="120px",
293
+ decimals=0,
294
+ default_visible=False,
295
+ description="Embedding dimension size"
296
+ ),
297
+ # 25. Vocab Size - optional
298
+ ColumnDefinition(
299
+ name="Vocab Size",
300
+ column_type=ColumnType.NUMBER,
301
+ group=ColumnGroup.MODEL_INFO,
302
+ width="120px",
303
+ decimals=0,
304
+ default_visible=False,
305
+ description="Vocabulary size"
306
+ ),
307
+ # 26. Model Type - optional
308
+ ColumnDefinition(
309
+ name="Model Type",
310
+ column_type=ColumnType.STRING,
311
+ group=ColumnGroup.MODEL_INFO,
312
+ width="130px",
313
+ default_visible=False,
314
+ description="Model type: Embedding, MLM, CLM-Embedding, or Seq2Seq"
315
+ ),
316
+ ]
317
+
318
+
319
class ColumnRegistry:
    """
    Central registry for column definitions.

    Provides convenient access methods for column metadata: lookups by
    display name / CSV key, group filtering, and Gradio rendering hints
    (datatypes and widths).
    """

    def __init__(self, definitions: Optional[List[ColumnDefinition]] = None):
        """
        Args:
            definitions: Column definitions to register. Falls back to the
                module-level COLUMN_DEFINITIONS when None (or empty).
        """
        self._definitions = definitions or COLUMN_DEFINITIONS
        # Index by display name and by CSV key for O(1) lookups.
        # NOTE(review): duplicate names/csv_keys would silently keep only the
        # last definition — assumed unique in COLUMN_DEFINITIONS.
        self._by_name: Dict[str, ColumnDefinition] = {
            col.name: col for col in self._definitions
        }
        self._by_csv_key: Dict[str, ColumnDefinition] = {
            col.csv_key: col for col in self._definitions
        }

    @property
    def all_columns(self) -> List[str]:
        """All column names in definition order."""
        return [col.name for col in self._definitions]

    @property
    def default_columns(self) -> List[str]:
        """Columns visible by default."""
        return [col.name for col in self._definitions if col.default_visible]

    @property
    def optional_columns(self) -> List[str]:
        """Columns that can be toggled on/off (hidden by default)."""
        return [col.name for col in self._definitions if not col.default_visible]

    @property
    def score_columns(self) -> List[str]:
        """Columns that should be colorized (score gradients)."""
        return [col.name for col in self._definitions if col.colorize]

    @property
    def numeric_columns(self) -> List[str]:
        """Columns with numeric type."""
        return [col.name for col in self._definitions if col.column_type == ColumnType.NUMBER]

    def get(self, name: str) -> Optional[ColumnDefinition]:
        """Get column definition by display name, or None if unknown."""
        return self._by_name.get(name)

    def get_by_csv_key(self, csv_key: str) -> Optional[ColumnDefinition]:
        """Get column definition by CSV key, or None if unknown."""
        return self._by_csv_key.get(csv_key)

    def get_by_group(self, group: ColumnGroup) -> List[ColumnDefinition]:
        """Get all column definitions belonging to a group."""
        return [col for col in self._definitions if col.group == group]

    def get_group_names(self, group: ColumnGroup) -> List[str]:
        """Get column display names for a group."""
        return [col.name for col in self.get_by_group(group)]

    def get_datatypes(self, columns: List[str]) -> List[str]:
        """Get Gradio datatypes for given columns (unknown names skipped)."""
        return [
            self._by_name[col].column_type.value
            for col in columns
            if col in self._by_name
        ]

    def get_widths(self, columns: List[str]) -> List[str]:
        """Get column widths for given columns (unknown names skipped)."""
        return [
            self._by_name[col].width
            for col in columns
            if col in self._by_name
        ]

    def get_csv_mapping(self) -> Dict[str, str]:
        """Get mapping from CSV keys to display names (only where they differ)."""
        return {
            col.csv_key: col.name
            for col in self._definitions
            if col.csv_key != col.name
        }


# Global registry instance
column_registry = ColumnRegistry()
src/core/config.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration Module for HuggingFace Space
3
+
4
+ Simplified configuration for the public Mizan leaderboard.
5
+ """
6
+
7
+ import os
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+
16
@dataclass
class ApiSettings:
    """Endpoint and credentials used when submitting evaluation jobs."""
    url: str = field(default_factory=lambda: os.getenv("API_URL", ""))
    username: str = field(default_factory=lambda: os.getenv("API_USERNAME", ""))
    password: str = field(default_factory=lambda: os.getenv("API_PASSWORD", ""))
    timeout: int = 30

    @property
    def is_configured(self) -> bool:
        """True only when URL, username, and password are all non-empty."""
        return all((self.url, self.username, self.password))
28
+
29
+
30
@dataclass
class UISettings:
    """Presentation options for the web front-end."""
    port: int = 7860
    max_table_height: int = 600
    # Debug mode is opt-in via the DEBUG environment variable ("true"/"false").
    debug: bool = field(default_factory=lambda: os.getenv("DEBUG", "false").lower() == "true")
36
+
37
+
38
@dataclass
class DataSettings:
    """Locations of on-disk data files."""
    # default_factory keeps the Path from being shared across instances
    csv_file: Path = field(default_factory=lambda: Path("leaderboard_data.csv"))
42
+
43
+
44
@dataclass
class Settings:
    """Main application settings container.

    Groups the three settings sections; each sub-dataclass reads its own
    environment variables via its default_factory at construction time.
    """
    api: ApiSettings = field(default_factory=ApiSettings)  # evaluation API credentials
    ui: UISettings = field(default_factory=UISettings)  # front-end options
    data: DataSettings = field(default_factory=DataSettings)  # data file paths


# Global settings instance, shared by importing modules.
settings = Settings()
src/data/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data processing modules."""
2
+
3
+ from .transformer import DataTransformer, parse_parameter_string, format_parameter_count
4
+ from .styler import LeaderboardStyler
5
+
6
+ __all__ = [
7
+ "DataTransformer",
8
+ "LeaderboardStyler",
9
+ "parse_parameter_string",
10
+ "format_parameter_count",
11
+ ]
src/data/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src/data/__pycache__/__init__.cpython-312.pyc and b/src/data/__pycache__/__init__.cpython-312.pyc differ
 
src/data/__pycache__/styler.cpython-312.pyc CHANGED
Binary files a/src/data/__pycache__/styler.cpython-312.pyc and b/src/data/__pycache__/styler.cpython-312.pyc differ
 
src/data/__pycache__/transformer.cpython-312.pyc CHANGED
Binary files a/src/data/__pycache__/transformer.cpython-312.pyc and b/src/data/__pycache__/transformer.cpython-312.pyc differ
 
src/data/styler.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Leaderboard Styling Module
3
+
4
+ Handles color gradients and visual styling for the leaderboard.
5
+ """
6
+
7
+ import logging
8
+ import html
9
+ from typing import Dict, Tuple, List
10
+ import pandas as pd
11
+ from matplotlib.colors import LinearSegmentedColormap
12
+
13
+ from ..core.columns import column_registry
14
+ from .transformer import format_parameter_count
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class LeaderboardStyler:
    """
    Applies visual styling to leaderboard DataFrames.

    Uses Excel-like Red-Yellow-Green color gradients for score columns
    (as listed by column_registry.score_columns) and produces a pandas
    Styler that Gradio can render as HTML.
    """

    # Excel-style color gradient: Red -> Yellow -> Green
    GRADIENT_COLORS = [
        (0.9, 0.1, 0.2),  # Red (low scores)
        (1.0, 1.0, 0.0),  # Yellow (medium scores)
        (0/255, 176/255, 80/255)  # Excel Green (high scores)
    ]

    def __init__(self):
        # Interpolated 256-step colormap over the three anchor colors.
        self._colormap = LinearSegmentedColormap.from_list(
            "ExcelRedYellowGreen",
            self.GRADIENT_COLORS,
            N=256
        )

    @staticmethod
    def rgb_to_hex(rgb: Tuple[float, float, float]) -> str:
        """Convert RGB tuple (0-1 range) to hex color like '#e51933'."""
        r = int(rgb[0] * 255)
        g = int(rgb[1] * 255)
        b = int(rgb[2] * 255)
        return f"#{r:02x}{g:02x}{b:02x}"

    def get_color_for_value(self, value: float, min_val: float, max_val: float) -> str:
        """Get hex color for a value within [min_val, max_val].

        A degenerate range (min == max) maps to the gradient midpoint.
        """
        if max_val == min_val:
            normalized = 0.5
        else:
            normalized = (value - min_val) / (max_val - min_val)

        # Clamp to [0, 0.999] to avoid edge case at exactly 1.0
        normalized = max(0, min(0.999, normalized))

        # Colormap returns RGBA; drop alpha before hex conversion.
        rgba = self._colormap(normalized)
        return self.rgb_to_hex(rgba[:3])

    def calculate_color_ranges(self, df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
        """Calculate min/max per score column; all-NaN columns are omitted."""
        ranges = {}

        for col_name in column_registry.score_columns:
            if col_name not in df.columns:
                continue

            # Non-numeric entries become NaN and are ignored by min/max.
            numeric_values = pd.to_numeric(df[col_name], errors='coerce')
            if numeric_values.isna().all():
                continue

            ranges[col_name] = {
                'min': numeric_values.min(),
                'max': numeric_values.max()
            }

        return ranges

    def apply_styling(self, df: pd.DataFrame) -> "pd.io.formats.style.Styler":
        """
        Apply color styling to DataFrame.

        Returns a pandas Styler object that Gradio can render.
        The input DataFrame is not mutated; styling works on a copy.
        """
        if df.empty:
            return df.style

        df_copy = df.copy()

        # Convert "N/A" to NaN for proper formatting
        for col in column_registry.score_columns:
            if col in df_copy.columns:
                df_copy[col] = df_copy[col].replace("N/A", pd.NA)
                df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')

        # Calculate color ranges
        color_ranges = self.calculate_color_ranges(df_copy)

        # Create style function (returns a CSS string per cell, '' = unstyled)
        def apply_gradient(val, col_name: str):
            if col_name not in color_ranges:
                return ''

            if pd.isna(val):
                return ''

            try:
                min_val = color_ranges[col_name]['min']
                max_val = color_ranges[col_name]['max']
                color_hex = self.get_color_for_value(float(val), min_val, max_val)
                return f'background-color: {color_hex}; text-align: center; font-weight: bold; color: #333;'
            except (ValueError, TypeError):
                return ''

        # Apply styling
        styler = df_copy.style

        # NOTE(review): Styler.map requires pandas >= 2.1 (renamed from
        # applymap) — confirm the pinned pandas version.
        for col in column_registry.score_columns:
            if col in df_copy.columns:
                styler = styler.map(
                    # default-arg binding captures the current column name,
                    # avoiding the late-binding closure pitfall
                    lambda val, c=col: apply_gradient(val, c),
                    subset=[col]
                )

        # Format numeric columns
        format_dict = {}
        for col_name in column_registry.numeric_columns:
            if col_name in df_copy.columns:
                col_def = column_registry.get(col_name)
                # Special handling for Parameters column - use human-readable format
                if col_name == "Parameters":
                    format_dict[col_name] = format_parameter_count
                elif col_def and col_def.decimals == 0:
                    format_dict[col_name] = '{:.0f}'
                elif col_def and col_def.decimals == 3:
                    format_dict[col_name] = '{:.3f}'
                else:
                    format_dict[col_name] = '{:.2f}'

        # Format model column as hyperlink without mutating the underlying data
        if "Model" in df_copy.columns:
            def _model_link_formatter(value: object) -> str:
                # Escape the model id before embedding it in the HTML anchor.
                model_name = html.escape(str(value))
                return (
                    f'<a href="https://huggingface.co/{model_name}" target="_blank" '
                    f'style="color: #2563eb; text-decoration: underline;">{model_name}</a>'
                )

            format_dict["Model"] = _model_link_formatter

        if format_dict:
            # Don't replace NA values - let them display as they are in the CSV
            # escape=None keeps the Model anchor HTML un-escaped so it renders.
            styler = styler.format(format_dict, na_rep='', escape=None)

        return styler

    def get_datatypes(self, columns: List[str]) -> List[str]:
        """Get Gradio datatypes for columns (delegates to the registry)."""
        return column_registry.get_datatypes(columns)

    def get_column_widths(self, columns: List[str]) -> List[str]:
        """Get column widths for columns (delegates to the registry)."""
        return column_registry.get_widths(columns)
src/data/transformer.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Transformation Module
3
+
4
+ Handles DataFrame transformations and CSV loading.
5
+ """
6
+
7
+ import logging
8
+ import html
9
+ import re
10
+ from typing import List, Optional, Union
11
+ from pathlib import Path
12
+ import pandas as pd
13
+ import numpy as np
14
+
15
+ from ..core.columns import column_registry, ColumnType
16
+ from ..core.config import settings
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def parse_parameter_string(value: Union[str, float, int]) -> Optional[float]:
22
+ """
23
+ Parse parameter strings like '307M', '1B', '1.7B', '4B' to numeric values.
24
+
25
+ Args:
26
+ value: Parameter string (e.g., '307M', '1B', '1.7B') or numeric value.
27
+
28
+ Returns:
29
+ Numeric value (in millions for consistency) or None if parsing fails.
30
+ """
31
+ if pd.isna(value):
32
+ return None
33
+
34
+ # If already numeric, return as-is
35
+ if isinstance(value, (int, float)):
36
+ return float(value)
37
+
38
+ value_str = str(value).strip().upper()
39
+
40
+ # Handle special cases
41
+ if value_str in ('', 'N/A', 'NA', 'NAN', 'NONE', '∞'):
42
+ return None
43
+
44
+ # Pattern to match numbers with optional suffix (K, M, B, T)
45
+ pattern = r'^([\d.]+)\s*([KMBT])?$'
46
+ match = re.match(pattern, value_str)
47
+
48
+ if not match:
49
+ return None
50
+
51
+ try:
52
+ number = float(match.group(1))
53
+ suffix = match.group(2)
54
+
55
+ # Convert to raw count based on suffix
56
+ multipliers = {
57
+ None: 1,
58
+ 'K': 1_000,
59
+ 'M': 1_000_000,
60
+ 'B': 1_000_000_000,
61
+ 'T': 1_000_000_000_000
62
+ }
63
+
64
+ return number * multipliers.get(suffix, 1)
65
+ except (ValueError, TypeError):
66
+ return None
67
+
68
+
69
def format_parameter_count(value: Union[float, int, None]) -> str:
    """
    Format a numeric parameter count to human-readable string.

    Args:
        value: Numeric parameter count (raw count, e.g. 307_000_000).

    Returns:
        Formatted string like '307M', '1.7B', '4B'; '' for missing values.
        Non-numeric input is returned via str() unchanged.
    """
    if pd.isna(value) or value is None:
        return ''

    try:
        value = float(value)
    except (ValueError, TypeError):
        return str(value)

    def _scale(scaled: float, unit: str) -> str:
        # One decimal for fractional values, none for whole numbers.
        if scaled != int(scaled):
            return f"{scaled:.1f}{unit}"
        return f"{int(scaled)}{unit}"

    if value >= 1_000_000_000_000:
        return _scale(value / 1_000_000_000_000, "T")
    if value >= 1_000_000_000:
        return _scale(value / 1_000_000_000, "B")
    if value >= 1_000_000:
        scaled = value / 1_000_000
        # Two or more integer digits: drop the decimal entirely (e.g. '307M').
        if scaled >= 10:
            return f"{scaled:.0f}M"
        # Bug fix: the old ternary appended a second 'M' (1_500_000 -> '1.5MM').
        return _scale(scaled, "M")
    if value >= 1_000:
        scaled = value / 1_000
        return f"{scaled:.0f}K" if scaled >= 10 else f"{scaled:.1f}K"
    return str(int(value))
101
+
102
+
103
class DataTransformer:
    """
    Transforms data between different formats.

    Handles CSV -> DataFrame conversions and display preparation.
    Stateless: every method is a @staticmethod/@classmethod, and each
    transformation works on a copy of its input DataFrame.
    """

    @staticmethod
    def create_empty_dataframe() -> pd.DataFrame:
        """Create an empty DataFrame with all column definitions."""
        return pd.DataFrame(columns=column_registry.all_columns)

    @staticmethod
    def load_from_csv(file_path: Optional[Path] = None) -> pd.DataFrame:
        """
        Load leaderboard data from CSV file.

        Args:
            file_path: Path to CSV file (uses settings.data.csv_file if None).

        Returns:
            DataFrame with leaderboard data; an empty DataFrame when the file
            is missing or unreadable (errors are logged, never raised).
        """
        path = file_path or settings.data.csv_file

        if not path.exists():
            logger.warning(f"CSV file not found: {path}")
            return DataTransformer.create_empty_dataframe()

        try:
            df = pd.read_csv(path)
            logger.info(f"Loaded {len(df)} records from {path}")

            # Convert to display format
            df = DataTransformer._normalize_columns(df)
            df = DataTransformer._convert_parameters_to_numeric(df)
            df = DataTransformer._sort_by_rank(df)

            return df

        except Exception as e:
            # Deliberate best-effort: any read/parse failure degrades to an
            # empty leaderboard instead of crashing the UI.
            logger.error(f"Error loading CSV: {e}")
            return DataTransformer.create_empty_dataframe()

    @staticmethod
    def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Normalize column names from CSV variations to standard names."""
        # Column name mappings for variations
        column_mappings = {
            "Mean (TaskType)": "MTEB Score",
            "Score(Legal)": "Legal Score",
            "Embedding Dimensions": "Embed Dim",
            "Embedding Dim": "Embed Dim",
            "Max Tokens": "Max Sequence Length",
            "Max Seq Length": "Max Sequence Length",
            "Number of Parameters": "Parameters",
            "PairClassification": "Pair Classification",
            "Vocabulary Size": "Vocab Size",
            "Vocabulary": "Vocab Size",
        }

        df = df.copy()

        # Rename columns based on mappings; skip a rename when the target
        # name already exists so a populated column is never clobbered.
        for old_name, new_name in column_mappings.items():
            if old_name in df.columns and new_name not in df.columns:
                df = df.rename(columns={old_name: new_name})

        return df

    @staticmethod
    def _sort_by_rank(df: pd.DataFrame) -> pd.DataFrame:
        """Sort DataFrame by MTEB Score descending and recalculate ranks."""
        if "MTEB Score" in df.columns:
            # Sort by MTEB Score descending (higher is better)
            df = df.sort_values("MTEB Score", ascending=False, na_position='last').reset_index(drop=True)
            # Recalculate ranks as 1, 2, 3, 4... (no ties)
            df["Rank"] = range(1, len(df) + 1)
        elif "Rank" in df.columns:
            # Fallback to existing rank if MTEB Score not available
            df = df.sort_values("Rank", ascending=True).reset_index(drop=True)
        return df

    @staticmethod
    def _convert_parameters_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert Parameters column from string format to numeric for proper sorting.

        Converts values like '307M', '1B', '1.7B' to numeric values
        (raw counts; unparseable entries become None).
        """
        if "Parameters" not in df.columns:
            return df

        df = df.copy()
        df["Parameters"] = df["Parameters"].apply(parse_parameter_string)
        return df

    @staticmethod
    def add_model_links(df: pd.DataFrame) -> pd.DataFrame:
        """Add clickable HuggingFace links to Model column.

        Replaces the plain model id with an HTML anchor string; downstream
        rendering must treat the column as HTML.
        """
        if "Model" not in df.columns:
            return df

        df = df.copy()
        df["Model"] = df["Model"].apply(
            lambda x: f'<a href="https://huggingface.co/{html.escape(str(x))}" target="_blank" '
            f'style="color: #2563eb; text-decoration: underline;">{html.escape(str(x))}</a>'
        )
        return df

    @staticmethod
    def ensure_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Convert numeric columns to proper types."""
        df = df.copy()

        for col_name in column_registry.numeric_columns:
            if col_name not in df.columns:
                continue

            col_def = column_registry.get(col_name)
            if col_def is None:
                continue

            # Handle "N/A" and empty values
            df[col_name] = df[col_name].replace("N/A", pd.NA)
            df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

            # Round to specified decimals
            if col_def.decimals == 0:
                # Keep as float to preserve NaN, format later
                pass
            else:
                df[col_name] = df[col_name].round(col_def.decimals)

        return df

    @staticmethod
    def filter_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """Filter DataFrame to only include specified columns (preserves order)."""
        available = [col for col in columns if col in df.columns]
        return df[available]

    @classmethod
    def prepare_for_display(
        cls,
        df: pd.DataFrame,
        columns: Optional[List[str]] = None,
        add_links: bool = True
    ) -> pd.DataFrame:
        """
        Prepare DataFrame for Gradio display.

        Args:
            df: Source DataFrame.
            columns: Columns to include (preserves order passed in).
            add_links: Whether to add HuggingFace links.

        Returns:
            Prepared DataFrame (new object; the input is never mutated).
        """
        if df is None or df.empty:
            return cls.create_empty_dataframe()

        # Work with a copy
        result = df.copy()

        # Filter columns if specified (preserves the order passed in)
        if columns:
            result = cls.filter_columns(result, columns)

        # Convert numeric columns
        result = cls.ensure_numeric_columns(result)

        # Add model links
        if add_links and "Model" in result.columns:
            result = cls.add_model_links(result)

        return result
ui_components.py DELETED
@@ -1,259 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- UI Components module for MTEB Turkish Leaderboard - HF Spaces Version
4
- Simplified version with only leaderboard and dataset components
5
- """
6
-
7
- import gradio as gr
8
- import pandas as pd
9
- from data_processor import (create_styled_leaderboard_dataframe,
10
- create_empty_leaderboard_dataframe)
11
-
12
-
13
def create_leaderboard_tab(current_data: pd.DataFrame):
    """Build the leaderboard tab: a color-styled, searchable results table plus usage notes."""

    # Unusable input (empty frame or no "Model" column): fall back to the
    # empty leaderboard skeleton so the table still renders with headers.
    usable = (not current_data.empty) and ("Model" in current_data.columns)
    if not usable:
        print("⚠️ Warning: Empty or invalid data, using empty leaderboard structure")
        current_data = create_empty_leaderboard_dataframe()

    # Red→green color gradient on score columns via pandas Styler.
    styled = create_styled_leaderboard_dataframe(current_data)

    leaderboard = gr.Dataframe(
        value=styled,
        interactive=False,
        wrap=True,
        max_height=600,
        show_search=True,
        datatype=["number", "html", "number", "number", "number", "number", "number", "number", "number", "number", "str", "number", "str", "number"],  # Model column as HTML for clickable links
        column_widths=["70px", "250px", "130px", "130px", "160px", "130px", "170px", "130px", "100px", "130px", "120px", "120px", "120px", "120px"]
    )

    # Static help text shown beneath the table.
    gr.Markdown("""
    ### 🔍 How to Use the Leaderboard:
    - **Search**: Use the search box to find specific models
    - **Color Coding**: Scores are color-coded from red (low) to green (high)
    - **Sorting**: Click on column headers to sort by different metrics
    - **Rankings**: Models are ranked by Mean (Task) score

    ### 📊 Performance Insights:
    - **Top Performers**: Models with Mean (Task) > 65 show strong overall performance
    - **Specialized Models**: Some models excel in specific tasks (e.g., retrieval vs classification)
    - **Model Size vs Performance**: Larger models generally perform better but with exceptions
    """)

    return leaderboard
49
-
50
-
51
def create_dataset_tab():
    """Build the dataset-information tab.

    Renders a table of the 14 MTEB Turkish tasks (task names linked to their
    HuggingFace dataset cards), task-distribution notes, a statistics summary
    table, and the evaluation methodology.

    Returns:
        The task-details ``gr.Dataframe`` component.
    """

    gr.Markdown("### 📊 MTEB Turkish Dataset Overview")

    # Single source of truth for BOTH task order and dataset links.
    # (Fix: the task names were previously duplicated in a separate ordered
    # list, which could silently drift out of sync with this mapping; dicts
    # preserve insertion order, so iterating .items() yields the same order.)
    task_to_dataset = {
        'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
        'XQuADRetrieval': 'google/xquad',
        'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
        'MKQARetrieval': 'apple/mkqa',
        'MassiveIntentClassification': 'mteb/amazon_massive_intent',
        'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
        'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
        'SIB200Classification': 'mteb/sib200',
        'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
        'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
        'SIB200ClusteringS2S': 'mteb/sib200',
        'XNLI': 'mteb/xnli',
        'XNLIV2': 'mteb/xnli2.0-multi-pair',
        'STS22.v2': 'mteb/sts22-crosslingual-sts'
    }

    # Clickable task names pointing at each task's HF dataset card.
    clickable_task_names = []
    for task_name, dataset_path in task_to_dataset.items():
        hf_link = f"https://huggingface.co/datasets/{dataset_path}"
        clickable_name = f'<a href="{hf_link}" target="_blank" style="color: #2563eb; text-decoration: underline;">{task_name}</a>'
        clickable_task_names.append(clickable_name)

    # Per-task metadata; these parallel lists follow task_to_dataset order.
    dataset_data = pd.DataFrame({
        'Task Name': clickable_task_names,
        'Task Type': [
            'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Classification', 'Classification',
            'Clustering', 'PairClassification', 'PairClassification', 'STS'
        ],
        'Description': [
            'Turkish FAQ retrieval task',
            'Turkish question answering retrieval',
            'Historical Turkish document retrieval',
            'Multilingual knowledge QA retrieval',
            'Intent classification for Turkish',
            'Scenario classification for Turkish',
            'Multilingual sentiment classification',
            'SIB200 language identification',
            'Turkish movie review sentiment',
            'Turkish product review sentiment',
            'SIB200 clustering task',
            'Turkish natural language inference',
            'Enhanced Turkish NLI task',
            'Turkish semantic textual similarity'
        ],
        'Domain': [
            'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
            'Intent', 'Scenario',
            'Sentiment', 'Language ID',
            'Movies', 'Products',
            'Language ID', 'NLI', 'NLI', 'STS'
        ],
        'Samples': [
            '~135K', '~10K', '~1.4K', '~10K',
            '~11K', '~11K',
            '~4.5K', '~700',
            '~8K', '~4.8K',
            '~1K', '~1.4K', '~1.4K', '~400'
        ]
    })

    dataset_table = gr.Dataframe(
        value=dataset_data,
        label="MTEB Turkish Task Details",
        interactive=False,
        wrap=True,
        datatype=["html", "str", "str", "str", "str"]  # First column (Task Name) as HTML for clickable links
    )

    # Task type distribution notes.
    gr.Markdown("""
    ### 📈 Task Distribution:

    **By Task Type:**
    - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
    - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
    - **Pair Classification**: 2 tasks (natural language inference)
    - **Clustering**: 1 task (language clustering)
    - **STS**: 1 task (semantic textual similarity)

    **By Domain:**
    - **Sentiment Analysis**: Movie and product reviews
    - **Question Answering**: FAQ, reading comprehension, and knowledge QA
    - **Intent/Scenario**: Conversational AI applications
    - **Language Tasks**: NLI, STS, clustering
    - **Multilingual**: Cross-lingual evaluation capabilities
    """)

    # High-level statistics summary table.
    stats_data = pd.DataFrame({
        'Metric': [
            'Total Tasks',
            'Total Samples',
            'Task Types',
            'Languages',
            'Avg. Tokens per Sample'
        ],
        'Value': [
            '14 tasks',
            '~190K samples',
            '5 types',
            'Turkish + Multilingual',
            '~150 tokens'
        ],
        'Notes': [
            'Comprehensive evaluation across domains',
            'Large-scale evaluation dataset',
            'Classification, Retrieval, STS, NLI, Clustering',
            'Focus on Turkish with multilingual support',
            'Varies by task type and domain'
        ]
    })

    gr.Dataframe(
        value=stats_data,
        label="Dataset Statistics Summary",
        interactive=False
    )

    gr.Markdown("""
    ### 🎯 Evaluation Methodology:

    **Scoring:**
    - Each task uses task-specific metrics (accuracy, F1, recall@k, etc.)
    - **Mean (Task)**: Direct average of all individual task scores
    - **Mean (TaskType)**: Average of task category means
    - **Individual Categories**: Performance in each task type

    **Model Ranking:**
    - Primary ranking by **Mean (Task)** score
    - Correlation metrics provide additional insights
    - Task-specific performance shows model strengths

    **Quality Assurance:**
    - Standardized evaluation protocols
    - Consistent preprocessing across tasks
    - Multiple metrics per task for robustness
    """)

    return dataset_table
209
-
210
def create_submit_evaluation_tab():
    """Build the model-submission tab (HF login, model/email form, status area).

    Returns:
        Tuple of (model textbox, email textbox, submit button, login button,
        status HTML output) — positional order matters to the caller wiring.
    """

    gr.Markdown("### 🚀 Submit Model for Evaluation")
    gr.Markdown("""
    Submit your Turkish embedding model for evaluation on the MTEB Turkish benchmark.
    **Authentication with Hugging Face is required to submit evaluations.**
    """)

    # HF OAuth entry point; must come before the form in layout order.
    hf_login = gr.LoginButton(value="Sign in with Hugging Face")

    model_box = gr.Textbox(
        label="🤖 Model Name",
        placeholder="sentence-transformers/your-model",
        info="HuggingFace model identifier (e.g., sentence-transformers/your-model-name)"
    )

    email_box = gr.Textbox(
        label="📧 Email Address",
        placeholder="your.email@example.com",
        info="Email for notifications about evaluation status and results"
    )

    submit_button = gr.Button(
        "🚀 Submit",
        variant="primary",
        size="lg"
    )

    # Receives authentication / submission status messages.
    status_html = gr.HTML(label="Status")

    # Static explanation of the review-and-evaluate workflow.
    gr.Markdown("""
    ### 📋 Evaluation Process:
    1. **Sign In**: First, sign in with your Hugging Face account using the button above
    2. **Submit Request**: Fill out the form with your model details and email
    3. **Admin Review**: Your request will be reviewed by administrators
    4. **Evaluation**: If approved, your model will be evaluated on MTEB Turkish benchmark
    5. **Results**: You'll receive email notifications and results will appear on the leaderboard

    ### ⚠️ Important Notes:
    - **Authentication Required**: You must be logged in with Hugging Face to submit evaluations
    - You'll receive email updates about your request status
    - Make sure your model is publicly available on HuggingFace
    - Valid email address is required for receiving results
    """)

    return (model_box, email_box, submit_button, hf_login, status_html)