import pandas as pd

# Display labels for the main (native-script) leaderboard columns.
COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "mteb_avg": "MTEB",
    "sts_spearman": "STS",
    "retrieval_top20": "Retrieval",
    "msmarco_top10": "MS MARCO",
}

# Display labels for the transliteration summary leaderboard columns.
TRANSLIT_COLUMN_LABELS = {
    "model_name": "Model",
    "model_size": "Size",
    "retrieval_translit_top20": "Retrieval",
    "msmarco_translit_top10": "MS MARCO",
}

# Metrics used for computing overall average (native script only).
SCORE_COLS = ["mteb_avg", "sts_spearman", "retrieval_top20", "msmarco_top10"]

# Dataset configurations for the detailed leaderboards:
# (dataset_key, dataset_label, {original_column: display_column}).
_DETAILED_DATASETS = [
    ("mteb", "MTEB", {
        "FloresBitextMining_devtest": "Flores",
        "NTREXBitextMining_test": "NTREX",
        "Tatoeba_test": "Tatoeba",
        "MassiveIntentClassification_test": "Intent",
        "MassiveScenarioClassification_test": "Scenario",
        "SIB200Classification_test": "SIB200 Cls",
        "SIB200ClusteringS2S_test": "SIB200 Clust",
        "ArmenianParaphrasePC_test": "Paraphrase",
        "BelebeleRetrieval_test": "Belebele",
    }),
    ("sts", "STS", {
        "Pearson_correlation": "Pearson",
        "Spearman_correlation": "Spearman",
    }),
    ("retrieval", "Retrieval", {
        "top1 within document": "Top-1 Doc",
        "top3 within document": "Top-3 Doc",
        "top5 within document": "Top-5 Doc",
        "top20 group mean macro": "Top-20 Type",
        "top20 all": "Top-20 All",
    }),
    ("msmarco", "MS MARCO", {
        "reranking_mrr": "Rerank MRR",
        "retrieval_mrr": "Retr. MRR",
        "retrieval_top5_accuracy": "Top-5",
        "retrieval_top10_accuracy": "Top-10",
    }),
]

_TRANSLIT_DATASETS = [
    ("retrieval_translit", "Retrieval", {
        "top1 within document": "Top-1 Doc",
        "top3 within document": "Top-3 Doc",
        "top5 within document": "Top-5 Doc",
        "top20 group mean macro": "Top-20 Type",
        "top20 all": "Top-20 All",
    }),
    ("msmarco_translit", "MS MARCO", {
        "reranking_mrr": "Rerank MRR",
        "retrieval_mrr": "Retr. MRR",
        "retrieval_top5_accuracy": "Top-5",
        "retrieval_top10_accuracy": "Top-10",
    }),
]


def _linkify_model_names(df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite model_name as a Markdown hyperlink when model_url is present.

    Mutates and returns *df*; rows with a missing/NaN model_url keep the
    plain model name.
    """
    if "model_url" in df.columns:
        df["model_name"] = df.apply(
            lambda row: f"[{row['model_name']}]({row['model_url']})"
            if pd.notna(row.get("model_url"))
            else row["model_name"],
            axis=1,
        )
    return df


def _format_summary(df: pd.DataFrame, metric_cols: list, labels: dict) -> pd.DataFrame:
    """Shared tail of the summary leaderboards: rank, select, clean, rename.

    Args:
        df: Results frame, optionally containing an "average" column.
        metric_cols: Metric columns to display (already filtered to existing).
        labels: Column-rename mapping for display.

    Returns:
        pd.DataFrame: Display-ready leaderboard.
    """
    # Rank by average when it exists; otherwise leave the order untouched
    # (and the Rank column is simply absent from the selection below).
    if "average" in df.columns:
        df = df.sort_values(by="average", ascending=False).reset_index(drop=True)
        df.insert(0, "Rank", range(1, len(df) + 1))

    # Select display columns; model_size is optional in the raw data.
    size_col = ["model_size"] if "model_size" in df.columns else []
    display_cols = ["Rank", "model_name"] + size_col + metric_cols + ["average"]
    df = df[[c for c in display_cols if c in df.columns]]

    # Replace missing model_size with "-" (round() skips the resulting
    # object-dtype column, so ordering vs. rounding is irrelevant).
    if "model_size" in df.columns:
        df["model_size"] = df["model_size"].fillna("-").replace("", "-")

    df = df.round(4)
    return df.rename(columns={**labels, "average": "Average"})


def prepare_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare embedding benchmark leaderboard from raw results DataFrame."""
    if df.empty:
        return df
    df = _linkify_model_names(df.copy())

    # Overall average uses only native-script metrics (translit excluded).
    available_cols = [c for c in SCORE_COLS if c in df.columns]
    if available_cols:
        df["average"] = df[available_cols].mean(axis=1).round(4)

    return _format_summary(df, available_cols, COLUMN_LABELS)


def prepare_translit_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """Prepare translit summary leaderboard from raw results DataFrame."""
    if df.empty:
        return df
    df = _linkify_model_names(df.copy())

    # Only transliteration metrics participate in this leaderboard.
    translit_cols = ["retrieval_translit_top20", "msmarco_translit_top10"]
    available_cols = [c for c in translit_cols if c in df.columns]
    if not available_cols:
        return pd.DataFrame()

    # Keep only models that have at least one translit score.
    df = df.dropna(subset=available_cols, how="all")
    if df.empty:
        return pd.DataFrame()

    df["average"] = df[available_cols].mean(axis=1).round(4)
    return _format_summary(df, available_cols, TRANSLIT_COLUMN_LABELS)


def _combine_detailed(datasets: list, detailed_results: dict,
                      model_order: list, use_multiindex: bool) -> pd.DataFrame:
    """Build one combined detailed table from per-dataset result frames.

    Args:
        datasets: List of (key, label, {orig_col: display_col}) tuples.
        detailed_results: Dict mapping dataset key -> DataFrame with a
            'model_name' column plus metric columns.
        model_order: Optional explicit ordering of model names; unknown
            extra models are appended alphabetically.
        use_multiindex: If True, emit hierarchical (category, metric)
            columns; otherwise flat "Category | Metric" names.

    Returns:
        pd.DataFrame: Combined table, or an empty DataFrame if no models
        were found in any dataset.
    """
    # Collect every model appearing in any dataset.
    all_models = set()
    for key, _, _ in datasets:
        df = detailed_results.get(key, pd.DataFrame())
        if not df.empty and "model_name" in df.columns:
            all_models.update(df["model_name"].unique())
    if not all_models:
        return pd.DataFrame()

    if model_order:
        # Honor the caller's order; append models it doesn't know about.
        ordered = [m for m in model_order if m in all_models]
        ordered += sorted(m for m in all_models if m not in ordered)
    else:
        ordered = sorted(all_models)

    combined = pd.DataFrame({"Model": ordered})
    column_tuples = [("", "Model")]  # (level1, level2) pairs for MultiIndex

    # Left-join each available metric column onto the model list, so models
    # missing from a dataset get NaN rather than being dropped.
    for key, label, col_map in datasets:
        df = detailed_results.get(key, pd.DataFrame())
        if df.empty:
            continue
        df = df.drop_duplicates(subset=["model_name"], keep="first")
        for orig_col, new_col in col_map.items():
            if orig_col not in df.columns:
                continue
            col_name = f"{label} | {new_col}"
            column_tuples.append((label, new_col))
            combined = combined.merge(
                df[["model_name", orig_col]].rename(
                    columns={"model_name": "Model", orig_col: col_name}
                ),
                on="Model",
                how="left",
            )

    combined = combined.round(4)

    if not model_order:
        # Backward-compatible default: sort by the first numeric column.
        numeric_cols = combined.select_dtypes(include=["number"]).columns.tolist()
        if numeric_cols:
            combined = combined.sort_values(
                by=numeric_cols[0], ascending=False, na_position="last"
            )

    # Always reset index so the row-number column matches display order.
    combined = combined.reset_index(drop=True)
    combined.insert(0, "#", range(1, len(combined) + 1))
    column_tuples.insert(0, ("", "#"))

    if use_multiindex:
        # Hierarchical columns give merged category headers in HTML/Gradio.
        combined.columns = pd.MultiIndex.from_tuples(column_tuples)
    return combined


def prepare_detailed_leaderboards(detailed_results: dict, model_order: list = None,
                                  use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined detailed leaderboard with hierarchical columns.

    Args:
        detailed_results: Dict with DataFrames from ModelHandler.get_detailed_results()
        model_order: Optional list of model names in desired order. If provided,
            models will be displayed in this order instead of being sorted
            independently.
        use_multiindex: If True, return DataFrame with MultiIndex columns for
            proper hierarchical display (merged headers in HTML/Gradio).
            If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with dataset names as hierarchical column headers
    """
    return _combine_detailed(_DETAILED_DATASETS, detailed_results,
                             model_order, use_multiindex)


def prepare_translit_detailed(detailed_results: dict, model_order: list = None,
                              use_multiindex: bool = True) -> pd.DataFrame:
    """Prepare a single combined translit detailed leaderboard with hierarchical columns.

    Args:
        detailed_results: Dict with 'retrieval_translit' and 'msmarco_translit' DataFrames
        model_order: Optional list of model names in desired order. If provided,
            models will be displayed in this order instead of being sorted
            independently.
        use_multiindex: If True, return DataFrame with MultiIndex columns for
            proper hierarchical display (merged headers in HTML/Gradio).
            If False, use flat "Category | Metric" column names.

    Returns:
        pd.DataFrame: Combined table with dataset names as hierarchical column headers
    """
    return _combine_detailed(_TRANSLIT_DATASETS, detailed_results,
                             model_order, use_multiindex)