"""
Data Transformation Module

Handles DataFrame transformations and CSV loading.
"""
import logging
import html
import re
from typing import List, Optional, Union
from pathlib import Path
import pandas as pd
import numpy as np
from ..core.columns import column_registry, ColumnType
from ..core.config import settings
logger = logging.getLogger(__name__)
# Compiled once at import time: the parser is applied to every DataFrame row.
_PARAMETER_PATTERN = re.compile(r'^([\d.]+)\s*([KMBT])?$')

# Scale factor for each magnitude suffix (None means "no suffix").
_PARAMETER_MULTIPLIERS = {
    None: 1,
    'K': 1_000,
    'M': 1_000_000,
    'B': 1_000_000_000,
    'T': 1_000_000_000_000,
}

# Upper-cased spellings that mean "no parameter count available".
_PARAMETER_MISSING_TOKENS = frozenset(('', 'N/A', 'NA', 'NAN', 'NONE', '∞'))


def parse_parameter_string(value: Union[str, float, int]) -> Optional[float]:
    """
    Parse parameter strings like '307M', '1B', '1.7B', '4B' to numeric values.

    Matching is case-insensitive and tolerates surrounding whitespace as well
    as whitespace between the number and its suffix.

    Args:
        value: Parameter string (e.g., '307M', '1B', '1.7B') or numeric value.

    Returns:
        The raw parameter count as a float (e.g. '307M' -> 307000000.0), or
        None if the value is missing, infinite, or cannot be parsed.
    """
    if pd.isna(value):
        return None

    # Already numeric: pass through, but treat infinity like the textual '∞'.
    if isinstance(value, (int, float)):
        try:
            number = float(value)
        except OverflowError:
            # Python ints too large for a float cannot be represented.
            return None
        return number if np.isfinite(number) else None

    text = str(value).strip().upper()
    if text in _PARAMETER_MISSING_TOKENS:
        return None

    match = _PARAMETER_PATTERN.match(text)
    if match is None:
        return None

    try:
        # The pattern admits malformed numbers such as '1.2.3'; float() rejects them.
        number = float(match.group(1))
    except ValueError:
        return None

    result = number * _PARAMETER_MULTIPLIERS[match.group(2)]
    # Extremely long digit strings overflow to inf; report them as unparseable.
    return result if np.isfinite(result) else None
def format_parameter_count(value: Union[float, int, None]) -> str:
    """
    Format a numeric parameter count to human-readable string.

    Args:
        value: Numeric parameter count.

    Returns:
        Formatted string like '307M', '1.7B', '4B'. Missing or non-finite
        values (None, NaN, infinity) yield an empty string; values that
        cannot be converted to a number are returned as ``str(value)``.
    """
    if value is None or pd.isna(value):
        return ''

    try:
        value = float(value)
    except (ValueError, TypeError, OverflowError):
        return str(value)

    # inf/NaN cannot be scaled or truncated to int (int(inf) raises).
    if not np.isfinite(value):
        return ''

    def whole_or_tenths(scaled: float) -> str:
        """Render whole numbers without decimals, everything else with one."""
        return str(int(scaled)) if scaled == int(scaled) else f"{scaled:.1f}"

    if value >= 1_000_000_000_000:
        return whole_or_tenths(value / 1_000_000_000_000) + "T"

    if value >= 1_000_000_000:
        return whole_or_tenths(value / 1_000_000_000) + "B"

    if value >= 1_000_000:
        scaled = value / 1_000_000
        if scaled >= 10:
            return f"{scaled:.0f}M"
        # Strip a trailing '.0' BEFORE adding the suffix, so the suffix is
        # appended exactly once ('1.5M', '1M').
        return f"{scaled:.1f}".rstrip('0').rstrip('.') + "M"

    if value >= 1_000:
        scaled = value / 1_000
        return f"{scaled:.0f}K" if scaled >= 10 else f"{scaled:.1f}K"

    return str(int(value))
class DataTransformer:
    """
    Transforms data between different formats.

    Handles CSV -> DataFrame conversions and display preparation. All
    methods return new DataFrames and leave their inputs unmodified.
    """

    @staticmethod
    def create_empty_dataframe() -> pd.DataFrame:
        """Create an empty DataFrame with all column definitions."""
        return pd.DataFrame(columns=column_registry.all_columns)

    @staticmethod
    def load_from_csv(file_path: Optional[Union[str, Path]] = None) -> pd.DataFrame:
        """
        Load leaderboard data from CSV file.

        Loading is best-effort: a missing file or any error while reading or
        transforming is logged and an empty DataFrame is returned instead.

        Args:
            file_path: Path to CSV file (uses default if None).

        Returns:
            DataFrame with leaderboard data.
        """
        # Coerce to Path so that plain string paths work with .exists().
        path = Path(file_path or settings.data.csv_file)

        if not path.exists():
            logger.warning("CSV file not found: %s", path)
            return DataTransformer.create_empty_dataframe()

        try:
            df = pd.read_csv(path)
            logger.info("Loaded %d records from %s", len(df), path)

            # Convert to display format
            df = DataTransformer._normalize_columns(df)
            df = DataTransformer._convert_parameters_to_numeric(df)
            df = DataTransformer._sort_by_rank(df)
            return df
        except Exception as e:
            # Deliberate catch-all at the loading boundary; keep the traceback.
            logger.exception("Error loading CSV: %s", e)
            return DataTransformer.create_empty_dataframe()

    @staticmethod
    def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize column names from CSV variations to standard names.

        A variant is renamed only when the standard name is not already
        present, so when several variants of one column exist the first one
        listed wins and the others keep their original names.
        """
        # Column name mappings for variations
        column_mappings = {
            "Mean (TaskType)": "MTEB Score",
            "Score(Legal)": "Legal Score",
            "Embedding Dimensions": "Embed Dim",
            "Embedding Dim": "Embed Dim",
            "Max Tokens": "Max Sequence Length",
            "Max Seq Length": "Max Sequence Length",
            "Number of Parameters": "Parameters",
            "PairClassification": "Pair Classification",
            "Vocabulary Size": "Vocab Size",
            "Vocabulary": "Vocab Size",
        }

        # Track names as they would be after each rename, then rename once.
        present = set(df.columns)
        renames = {}
        for old_name, new_name in column_mappings.items():
            if old_name in present and new_name not in present:
                renames[old_name] = new_name
                present.discard(old_name)
                present.add(new_name)

        # rename() returns a new DataFrame, so the caller's frame is untouched.
        return df.rename(columns=renames)

    @staticmethod
    def _sort_by_rank(df: pd.DataFrame) -> pd.DataFrame:
        """
        Sort DataFrame by MTEB Score descending and recalculate ranks.

        Scores are compared numerically (unparseable scores sort last) and
        tied scores keep their original relative order. Without an
        "MTEB Score" column the existing "Rank" column is used instead; with
        neither, the DataFrame is returned unchanged.
        """
        if "MTEB Score" in df.columns:
            scores = pd.to_numeric(df["MTEB Score"], errors="coerce").to_numpy(
                dtype=float, na_value=np.nan
            )
            # Negating turns "descending" into an ascending stable sort;
            # NumPy places NaN at the end.
            order = np.argsort(-scores, kind="stable")
            df = df.iloc[order].reset_index(drop=True)
            # Recalculate ranks as 1, 2, 3, 4... (no ties)
            df["Rank"] = range(1, len(df) + 1)
        elif "Rank" in df.columns:
            # Fallback to existing rank if MTEB Score not available
            df = df.sort_values("Rank", ascending=True).reset_index(drop=True)

        return df

    @staticmethod
    def _convert_parameters_to_numeric(df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert Parameters column from string format to numeric for proper sorting.

        Converts values like '307M', '1B', '1.7B' to numeric values; values
        that cannot be parsed become NaN.
        """
        if "Parameters" not in df.columns:
            return df

        df = df.copy()
        parsed = df["Parameters"].apply(parse_parameter_string)
        # Guarantee a numeric dtype even when every value failed to parse.
        df["Parameters"] = pd.to_numeric(parsed, errors="coerce")
        return df

    @staticmethod
    def _model_link(name) -> str:
        """Build the HTML anchor for one model name ('' for missing names)."""
        if pd.isna(name):
            # Avoid emitting a bogus link to ".../nan" for missing models.
            return ''
        escaped = html.escape(str(name))
        return (
            f'<a href="https://huggingface.co/{escaped}" target="_blank" '
            f'style="color: #2563eb; text-decoration: underline;">{escaped}</a>'
        )

    @staticmethod
    def add_model_links(df: pd.DataFrame) -> pd.DataFrame:
        """
        Add clickable HuggingFace links to Model column.

        Model names are HTML-escaped; missing names become empty strings.
        Returns the input unchanged when there is no "Model" column.
        """
        if "Model" not in df.columns:
            return df

        df = df.copy()
        df["Model"] = df["Model"].apply(DataTransformer._model_link)
        return df

    @staticmethod
    def ensure_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert numeric columns to proper types.

        Every registered numeric column present in ``df`` is coerced with
        ``pd.to_numeric``; non-numeric entries such as "N/A" become NaN.
        Columns whose definition specifies a non-zero number of decimals are
        rounded accordingly.
        """
        df = df.copy()

        for col_name in column_registry.numeric_columns:
            if col_name not in df.columns:
                continue

            col_def = column_registry.get(col_name)
            if col_def is None:
                continue

            # errors='coerce' already maps "N/A" and empty values to NaN.
            numeric = pd.to_numeric(df[col_name], errors='coerce')

            # With 0 decimals keep the float as-is to preserve NaN; format later.
            if col_def.decimals != 0:
                numeric = numeric.round(col_def.decimals)

            df[col_name] = numeric

        return df

    @staticmethod
    def filter_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        """
        Filter DataFrame to only include specified columns (preserves order).

        Names missing from ``df`` are skipped, and a name requested more than
        once is included only once (at its first position).
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        available = [col for col in dict.fromkeys(columns) if col in df.columns]
        return df[available]

    @classmethod
    def prepare_for_display(
        cls,
        df: pd.DataFrame,
        columns: Optional[List[str]] = None,
        add_links: bool = True
    ) -> pd.DataFrame:
        """
        Prepare DataFrame for Gradio display.

        Args:
            df: Source DataFrame.
            columns: Columns to include (preserves order passed in).
            add_links: Whether to add HuggingFace links.

        Returns:
            Prepared DataFrame. For a None or empty input this is an empty
            DataFrame restricted to ``columns`` when given.
        """
        if df is None or df.empty:
            empty = cls.create_empty_dataframe()
            # Honor the requested column selection for the empty table too.
            return cls.filter_columns(empty, columns) if columns else empty

        # Work with a copy
        result = df.copy()

        # Filter columns if specified (preserves the order passed in)
        if columns:
            result = cls.filter_columns(result, columns)

        # Convert numeric columns
        result = cls.ensure_numeric_columns(result)

        # Add model links
        if add_links and "Model" in result.columns:
            result = cls.add_model_links(result)

        return result
|