Spaces:
Running
Running
| import pandas as pd | |
| from statistics import mean | |
| import pandas as pd | |
| import json | |
| import numpy as np | |
| from statistics import mean | |
| import re | |
| from datasets import load_dataset, concatenate_datasets | |
| import os | |
| from collections import defaultdict | |
| from src.envs import API, SAHARA_DATA, SAHARA_RESULTS | |
# Task ID -> human-readable task name used as the display label in the tables.
TASKS_LIST = {
    'xlni': 'Cross-Lingual Natural Language Inference',
    'lid': 'Language Identification',
    'news': 'News Classification',
    'sentiment': 'Sentiment Analysis',
    'topic': 'Topic Classification',
    'mt_eng2xx': 'Machine Translation - English to African',
    'mt_fra2xx': 'Machine Translation - French to African',
    'mt_xx2xx': 'Machine Translation - African to African',
    'paraphrase': 'Paraphrase',
    'summary': 'Summarization',
    'title': 'Title Generation',
    'mmlu': 'General Knowledge',
    'mgsm': 'Mathematical Word Problems',
    'belebele': 'Reading Comprehension',
    'squad_qa': 'Context-based Question Answering',
    'ner': 'Named Entity Recognition',
    'phrase': 'Phrase Chunking',
    'pos': 'Part-of-Speech Tagging',
}
# Cluster display name -> ordered list of the task IDs grouped under it.
CLUSTERS = {
    "Text Classification Tasks": [
        'xlni', 'lid', 'news', 'sentiment', 'topic',
    ],
    "Text Generation Tasks": [
        'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
    ],
    "MCCR Tasks": [
        'mmlu', 'mgsm', 'belebele', 'squad_qa',
    ],
    "Tokens Level Tasks": [
        'ner', 'phrase', 'pos',
    ],
}

# Flat list of every task ID, in cluster order.
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]

# This dictionary maps each task ID to its parent cluster name
TASK_TO_CLUSTER_MAP = {
    task: cluster_name
    for cluster_name, tasks in CLUSTERS.items()
    for task in tasks
}
# ===== Authenticate and Load Data From Private HF Repo =====
def load_private_leaderboard_df():
    """
    Load the leaderboard results from the ``SAHARA_DATA`` dataset repo.

    The repo's files are listed through ``API.list_repo_files``; every
    ``.jsonl`` file under ``data/users/`` is selected and ``SAHARA_RESULTS``
    is appended to that selection. The selected files are loaded together as
    the ``train`` split with a forced re-download.

    Returns:
        pandas.DataFrame: the loaded dataset converted with ``to_pandas()``.
    """
    all_repo_files = API.list_repo_files(repo_id=SAHARA_DATA, repo_type="dataset")
    folder_path = "data/users/"
    jsonl_files_in_folder = [
        f for f in all_repo_files
        if f.startswith(folder_path) and f.endswith(".jsonl")
    ]
    jsonl_files_in_folder.append(SAHARA_RESULTS)
    print("++++++", jsonl_files_in_folder)
    ds = load_dataset(
        path=SAHARA_DATA,
        name=None,
        data_files=jsonl_files_in_folder,
        split="train",
        # Always fetch fresh files instead of reusing a cached copy.
        download_mode="force_redownload",
    )
    print(">>>>>>>", ds)
    return ds.to_pandas()
# Metric ID -> display label (labels may contain HTML markup).
metrics_list = {
    'bleu_1k': 'spBleu<sup>1K</sup>',
    'accuracy': 'Accuracy',
    'f1': 'Macro-F1',
    'exact_match': 'Exact Match',
    'rougeL': 'RougeL',
}
# ISO language code -> display name. Several codes may share one name
# (e.g. 'swa' and 'swh' are both 'Swahili').
LANG_ISO2NAME = {
    'eng': 'English',
    'fra': 'French',
    # 'ara': 'Arabic',
    'amh': 'Amharic',
    'ewe': 'Ewe',
    'hau': 'Hausa',
    'ibo': 'Igbo',
    'kin': 'Kinyarwanda',
    'lin': 'Lingala',
    'lug': 'Ganda',
    'orm': 'Oromo',
    'sna': 'Shona',
    'sot': 'Southern Sotho',
    'swa': 'Swahili', 'swh': 'Swahili',
    'twi': 'Twi',
    'wol': 'Wolof',
    'xho': 'Xhosa',
    'yor': 'Yoruba',
    'zul': 'Zulu',
    'afr': 'Afrikaans',
    'run': 'Rundi',
    'tir': 'Tigrinya',
    'som': 'Somali',
    'pcm': 'Nigerian Pidgin',
    'teo': 'Teso',
    'nyn': 'Nyankore',  # (Nyankole)
    'lgg': 'Lugbara',
    'bem': 'Bemba',  # (Chibemba)
    'tsn': 'Tswana',
    'bbj': 'Ghomálá',
    'mos': 'Moore',
    'bam': 'Bambara',
    'fon': 'Fon',
    'ach': 'Acholi',
    'nso': 'Sepedi',
    'tso': 'Tsonga',
    'fuv': 'Fulfude Nigeria',
    'gaz': 'Oromo',  # West Central
    'kea': 'Kabuverdianu',
    'nya': 'Nyanja',
    'ssw': 'Swati',
    'luo': 'Dholuo',  # (Luo)
    'ven': 'Venda',
    # NOTE(review): in ISO 639-3 'kir' is Kyrgyz, while Kirundi is 'run'
    # (listed above as 'Rundi') - confirm which code the data files use.
    'kir': "Kirundi",
}
# ===== Build Language Name -> ISOs map =====
def build_langname_to_isos(iso2name):
    """
    Invert an ISO-code -> language-name mapping.

    Args:
        iso2name: mapping of ISO code to language name.

    Returns:
        collections.defaultdict: language name -> set of every ISO code that
        maps to that name. Use ``.get`` for lookups that must not create a
        new empty entry.
    """
    name2isos = defaultdict(set)
    for iso, name in iso2name.items():
        name2isos[name].add(iso)
    return name2isos
def compare_models(model_1_name, model_2_name):
    """
    Prepares a DataFrame comparing the performance of two models task-by-task.

    Only rows of the module-level ``all_df`` whose ``leaderboard`` is
    ``'main'`` are considered. Tasks present for just one of the two models
    are kept (outer merge) and shown with ``"---"`` for the missing score.

    Args:
        model_1_name: value of the ``model`` column for the first model.
        model_2_name: value of the ``model`` column for the second model.

    Returns:
        pandas.DataFrame with columns ``Cluster``, ``Task Name``, ``Task ID``,
        ``Metric``, one score column per model and ``Difference`` (model 1
        minus model 2, wrapped in a coloured ``<span>`` when non-zero), sorted
        by cluster then task name. A single-cell ``Info`` frame is returned
        when both names are equal or either model has no 'main' rows.
    """
    if model_1_name == model_2_name:
        return pd.DataFrame([{"Info": "Please select two different models to compare."}])

    # Get data for each model from the main leaderboard results
    main_rows = all_df[all_df['leaderboard'] == 'main']
    df1 = main_rows.loc[
        main_rows['model'] == model_1_name, ['task', 'score', 'metric']
    ].rename(columns={'score': model_1_name})
    df2 = main_rows.loc[
        main_rows['model'] == model_2_name, ['task', 'score', 'metric']
    ].rename(columns={'score': model_2_name, 'metric': '__metric_2'})

    if df1.empty or df2.empty:
        return pd.DataFrame([{"Info": "One or both selected models have no 'main' leaderboard data to compare."}])

    # Merge the two dataframes on the task ID
    comp_df = pd.merge(df1, df2, on='task', how='outer')

    # A task that only model 2 has carries no metric from model 1's rows;
    # fall back to model 2's metric so the Metric column is not left blank.
    comp_df['metric'] = comp_df['metric'].combine_first(comp_df['__metric_2'])

    # Add descriptive columns
    comp_df['Cluster'] = comp_df['task'].map(TASK_TO_CLUSTER_MAP).fillna('Uncategorized')
    comp_df['Task Name'] = comp_df['task'].map(TASKS_LIST)
    comp_df['Metric'] = comp_df['metric'].map(metrics_list)

    # Calculate the score difference, ensuring scores are numeric
    score1 = pd.to_numeric(comp_df[model_1_name], errors='coerce')
    score2 = pd.to_numeric(comp_df[model_2_name], errors='coerce')
    difference = score1 - score2

    def format_score(x):
        return f"{x:.2f}" if pd.notna(x) else "---"

    # Format the difference column with colors
    def format_diff(d):
        if pd.isna(d):
            return "---"
        if d > 0.001:  # Model 1 is better
            return f"<span style='color:green !important; font-weight:bold !important;'>+{d:.2f}</span>"
        if d < -0.001:  # Model 2 is better
            return f"<span style='color:red !important; font-weight:bold !important;'>{d:.2f}</span>"
        return f"{d:.2f}"

    # Format from the coerced numeric values so a non-numeric raw score
    # shows as "---" instead of raising inside the format spec.
    comp_df[model_1_name] = score1.apply(format_score)
    comp_df[model_2_name] = score2.apply(format_score)
    comp_df['Difference'] = difference.apply(format_diff)

    final_cols = ['Cluster', 'Task Name', 'task', 'Metric', model_1_name, model_2_name, 'Difference']
    comp_df = comp_df[final_cols]
    comp_df = comp_df.sort_values(by=['Cluster', 'Task Name']).reset_index(drop=True)

    # 'task' is shown to the user as 'Task ID'
    return comp_df.rename(columns={'task': 'Task ID'})
def get_model_table(model_name):
    """
    Generates a performance table for a specific model, showing cluster, task, and score.
    The table is sorted by Cluster and then by Task Name.

    Args:
        model_name: value of the ``model`` column in the module-level ``all_df``.

    Returns:
        pandas.DataFrame with columns ``Cluster``, ``Task Name``, ``Task ID``,
        ``Metric`` and ``Score`` built from the model's 'main' leaderboard
        rows, or a single-cell ``Info`` frame when there are none.
    """
    # Filter for the selected model and only 'main' leaderboard entries
    model_df = all_df[(all_df['model'] == model_name) & (all_df['leaderboard'] == 'main')].copy()

    if model_df.empty:
        return pd.DataFrame([{"Info": f"No 'main' leaderboard data available for the model: {model_name}"}])

    # Descriptive columns derived from the task / metric IDs
    model_df['Cluster'] = model_df['task'].map(TASK_TO_CLUSTER_MAP)
    model_df['Task Name'] = model_df['task'].map(TASKS_LIST)
    model_df['Metric'] = model_df['metric'].map(metrics_list)
    numeric_score = pd.to_numeric(model_df['score'], errors='coerce')
    model_df['Score'] = numeric_score.apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")

    table = model_df[['Cluster', 'Task Name', 'task', 'Metric', 'Score']].rename(columns={'task': 'Task ID'})

    # Sort by Cluster first, then by Task Name (tasks without a cluster sort last)
    table = table.sort_values(by=['Cluster', 'Task Name']).reset_index(drop=True)

    # Handle cases where a task might not be in a cluster. Assign the result
    # back: an in-place fillna on a column selection is not guaranteed to
    # reach the parent frame (and never does under pandas Copy-on-Write).
    table['Cluster'] = table['Cluster'].fillna('Uncategorized')

    return table
def get_task_leaderboard(task_key):
    """
    Generates a leaderboard for a specific task, showing model performance across all languages.

    Args:
        task_key: task ID to look up in the ``task`` column of the
            module-level ``all_df``.

    Returns:
        pandas.DataFrame with one row per model, a ``Task Score`` column taken
        from the task's 'main' leaderboard rows, and one score column per
        language (or language pair). Rows are ranked and decorated by
        ``add_medals_to_models``. A single-cell ``Info`` frame is returned
        when no data is available.
    """
    # Filter the main DataFrame for the selected task
    task_df = all_df[all_df['task'] == task_key].copy()

    if task_df.empty:
        return pd.DataFrame([{"Info": f"No data available for the task: {TASKS_LIST.get(task_key, task_key)}"}])

    # Create a user-friendly column name for each language/leaderboard
    def make_lang_col(row):
        lb = row['leaderboard']
        if lb == 'main':
            # Skip the 'main' leaderboard for task-specific views as it's an aggregate
            return None
        if '-' in lb:
            pair_lang = lb.split('-')
            # Handles cases where an ISO code might not be in our map
            src_lang = LANG_ISO2NAME.get(pair_lang[0], pair_lang[0])
            tgt_lang = LANG_ISO2NAME.get(pair_lang[1], pair_lang[1])
            return f"{src_lang} to {tgt_lang}"
        return LANG_ISO2NAME.get(lb, lb)

    if task_key not in ['lid']:
        task_df['lang_col'] = task_df.apply(make_lang_col, axis=1)
        # Remove rows where lang_col is None (the 'main' aggregate rows)
        task_df = task_df.dropna(subset=['lang_col'])

        if task_df.empty:
            return pd.DataFrame([{"Info": f"No language-specific data for the task: {TASKS_LIST.get(task_key, task_key)}"}])

        # Pivot the table to have models as rows and languages as columns
        table = task_df.pivot_table(index='model', columns='lang_col', values='score', aggfunc='mean').reset_index()
    else:
        # 'lid' gets a single score column named after the task itself
        table = task_df.pivot_table(index='model', columns='task', values='score', aggfunc='mean').reset_index()

    score_cols = [col for col in table.columns if col != 'model']

    def format_cell(x):
        if isinstance(x, (int, float, np.integer, np.floating)):
            # A model with no result for a language shows "---", not "nan".
            return f"{x:.2f}" if pd.notna(x) else "---"
        return x

    for col in score_cols:
        table[col] = table[col].apply(format_cell)

    # Average per model so that duplicate 'main' rows for one model cannot
    # make the lookup index non-unique (Series.map raises on that).
    main_rows = all_df[(all_df['task'] == task_key) & (all_df['leaderboard'] == 'main')]
    main_score_map = main_rows.groupby('model')['score'].mean()
    task_scores = table['model'].map(main_score_map)
    table.insert(1, 'Task Score', task_scores.apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))

    # Add ranking medals based on the "Task Score"
    table = add_medals_to_models(table, score_col="Task Score")

    # These tasks have many language columns, so their headers are wrapped
    # in a 'rotate_div' element; other tasks keep the plain column names.
    if task_key in ['belebele', 'ner', 'mgsm', 'mmlu']:
        rename_cols = {col: f"<div class='rotate_div'><br>{col}</div>" for col in score_cols}
        table = table.rename(columns=rename_cols)

    return table
def get_task_metric_map(df):
    """
    Build a task ID -> metric ID lookup from a results frame.

    Args:
        df: DataFrame with ``task`` and ``metric`` columns.

    Returns:
        dict mapping each ``task`` value to its ``metric`` value. When a task
        appears on several rows the last row wins. An empty frame yields ``{}``.
    """
    if df.empty:
        return {}
    # zip walks both columns in row order, so later rows overwrite earlier ones.
    return dict(zip(df["task"], df["metric"]))
def cluster_average(row, tasks):
    """
    Average the values of ``row`` found under the keys in ``tasks``.

    Entries that are missing from ``row`` or cannot be converted with
    ``float()`` (for example the ``'---'`` placeholder) are skipped.

    NOTE(review): a NaN value, or the string ``"nan"``, converts successfully
    and therefore makes the whole average NaN. Whether a model with a missing
    score is meant to lose its average cannot be told from this function -
    confirm before changing it.

    Args:
        row: mapping-like object indexed by task ID (e.g. a pandas Series).
        tasks: iterable of task IDs to include.

    Returns:
        The mean of the convertible values, or ``np.nan`` when there are none.
    """
    vals = []
    for t in tasks:
        try:
            vals.append(float(row[t]))
        except (KeyError, IndexError, TypeError, ValueError):
            # Missing key or non-numeric placeholder: leave it out.
            continue
    return np.mean(vals) if vals else np.nan
def add_medals_to_models(df, score_col="overall score"):
    """
    Sort a table by score and prefix the top models' names with rank symbols.

    The three highest distinct scores receive a trophy, a silver medal and a
    bronze medal; models sharing a score share the symbol. Every model name
    is prefixed with ``"<symbol> "``, where the symbol is empty for rows
    outside the top three or without a score.

    Args:
        df: DataFrame with a ``model`` column and the score column. It is not
            modified; a sorted copy is returned.
        score_col: name of the score column. Values that cannot be parsed as
            numbers (such as ``"---"``) are treated as missing.

    Returns:
        pandas.DataFrame sorted by score, highest first and missing last, with
        a fresh 0..n-1 index and the decorated ``model`` column.
    """
    score_float_col = "__score_float"
    # Work on a copy so the helper column never leaks into the caller's frame.
    ranked = df.copy()
    ranked[score_float_col] = pd.to_numeric(ranked[score_col], errors="coerce")
    # mergesort is stable: rows with equal scores keep their incoming order.
    ranked = ranked.sort_values(
        by=score_float_col, ascending=False, kind="mergesort"
    ).reset_index(drop=True)

    scores = ranked[score_float_col].tolist()
    top_scores = sorted({s for s in scores if not pd.isna(s)}, reverse=True)[:3]
    # Trophy, silver medal, bronze medal - written as escapes so the symbols
    # survive any re-encoding of this file.
    symbols = ["\U0001F3C6", "\U0001F948", "\U0001F949"]
    score_to_symbol = dict(zip(top_scores, symbols))

    ranked['model'] = [
        f"{score_to_symbol.get(score, '')} {model}"
        for score, model in zip(scores, ranked['model'])
    ]
    return ranked.drop(columns=[score_float_col])
def format_cluster_table(df, cluster_tasks, metric_map):
    """
    Build the display table for one task cluster.

    Args:
        df: DataFrame with a ``model`` column and one score column per task.
            NOTE: any task of ``cluster_tasks`` that is missing from ``df`` is
            added to ``df`` itself as a ``'---'`` column. That side effect is
            kept on purpose because ``load_leaderboards`` reuses the same
            frame afterwards.
        cluster_tasks: task IDs of the cluster, in display order.
        metric_map: task ID -> metric ID.

    Returns:
        pandas.DataFrame with ``model``, ``Cluster Score`` and one column per
        task, headed ``"<task name><br>Metric: <metric label>"``, ranked and
        decorated by ``add_medals_to_models``.
    """
    for t in cluster_tasks:
        if t not in df.columns:
            df[t] = '---'

    # Copy the selection so the formatting below writes to an independent
    # frame rather than to a slice of the caller's data.
    table = df[["model"] + cluster_tasks].copy()

    for t in cluster_tasks:
        table[t] = table[t].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x
        )

    table["Cluster Score"] = table[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    table["Cluster Score"] = table["Cluster Score"].apply(
        lambda x: f"{x:.2f}" if pd.notna(x) else "---"
    )
    table = table[["model", "Cluster Score"] + cluster_tasks]

    # A task with no data has no entry in metric_map; fall back to the raw
    # IDs instead of raising KeyError while building the header.
    rename = {}
    for t in cluster_tasks:
        metric = metric_map.get(t, '')
        rename[t] = f"{TASKS_LIST.get(t, t)}<br>Metric: {metrics_list.get(metric, metric)}"
    table = table.rename(columns=rename)

    return add_medals_to_models(table, score_col="Cluster Score")
def format_main_overall_table(df, metric_map):
    """
    Build the overall leaderboard: one score per cluster plus a global score.

    Args:
        df: DataFrame with a ``model`` column and one column per task listed
            in ``CLUSTERS``. It is copied, not modified.
        metric_map: accepted for signature compatibility; not used here.

    Returns:
        pandas.DataFrame with ``model``, ``Sahara Score`` (the mean of the
        cluster scores that are not missing) and one column per cluster, all
        formatted to two decimals (``"---"`` when missing), ranked and
        decorated by ``add_medals_to_models``.
    """
    main = df.copy()
    for cname, tasks in CLUSTERS.items():
        main[cname] = main[tasks].apply(lambda row: cluster_average(row, tasks), axis=1)

    cluster_cols = list(CLUSTERS.keys())

    def mean_of_present(row):
        present = [x for x in row if pd.notna(x)]
        # Guard the empty case explicitly: averaging an empty list would emit
        # a RuntimeWarning before yielding NaN.
        return np.mean(present) if present else np.nan

    main["Overall Score"] = main[cluster_cols].apply(mean_of_present, axis=1)

    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")

    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    return main.rename(columns={'Overall Score': 'Sahara Score'})
def load_leaderboards():
    """
    Load the results and build every pre-computed leaderboard table.

    Returns:
        A 4-tuple ``(cluster_tabs, main_overall_tab, df, metric_map)``:
        ``cluster_tabs`` maps each cluster name to its table,
        ``main_overall_tab`` is the overall table, ``df`` is the full results
        frame and ``metric_map`` maps task ID -> metric ID. When there are no
        'main' leaderboard rows, the tables are single-cell "No data" frames;
        the tuple has the same four elements in both cases.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()

    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        # Same arity as the normal return below, so callers can unpack both.
        return cluster_tabs, main_overall_tab, df, metric_map

    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()

    # Keep this order: format_cluster_table adds a '---' column to
    # main_tasks_df for every cluster task that has no data.
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)

    # Make sure every task column exists before the overall table is built
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan

    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """
    Render a table as HTML, hiding internal task columns.

    Every column whose name is a string containing the lowercase substring
    ``"task"`` is dropped. Cell contents are NOT escaped, so HTML markup
    inside cells and headers is emitted as-is.

    Args:
        df: DataFrame to render. It is not modified.
        col_minwidth, col_maxwidth, model_col_width: accepted for signature
            compatibility; not used by this function.

    Returns:
        str: the ``<table>`` markup, without the index column.
    """
    # Remove any column whose name contains "task". Non-string labels cannot
    # contain it and are kept.
    drop_cols = [col for col in df.columns if isinstance(col, str) and "task" in col]
    rendered = df.drop(columns=drop_cols, errors="ignore")
    # Clear the columns-axis name (pivot tables carry one) on the new frame
    # only; a renamed Index copy leaves the caller's columns untouched.
    rendered.columns = rendered.columns.rename(None)
    return rendered.to_html(index=False, escape=False)
# Load the data once at import time and pre-compute the tables and the
# dropdown choices.
cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
# show only African langs
LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
# Create a list of choices in the format "Task Name (id)"
TASK_NAME_LIST = sorted([f"{name} ({key})" for key, name in TASKS_LIST.items()])
# Resolve a choice back to its task ID. Both the bare task name and the
# "Task Name (id)" label offered in TASK_NAME_LIST are accepted.
TASK_NAME2KEY = {v: k for k, v in TASKS_LIST.items()}
TASK_NAME2KEY.update({f"{name} ({key})": key for key, name in TASKS_LIST.items()})
# Get the list of unique model names for the new dropdown
MODEL_NAME_LIST = sorted(all_df['model'].unique()) if not all_df.empty else []
def get_lang_table(lang_name):
    """
    Build the leaderboard for one language across all tasks.

    Every leaderboard ID of the module-level ``all_df`` (other than
    ``'main'``) that contains one of the language's ISO codes as a
    ``-``-separated component is included, so language pairs such as
    ``eng-swa`` count for both languages.

    NOTE(review): the ``Language Score`` is the average of the formatted
    two-decimal cells, and a model missing any column gets ``"---"`` because
    the missing cell parses as NaN. That behaviour is kept unchanged here.

    Args:
        lang_name: language display name, a key of ``LANGNAME2ISOS``.

    Returns:
        pandas.DataFrame with ``model``, ``Language Score`` and one column per
        task (and language pair), ranked and decorated by
        ``add_medals_to_models``; or a single-cell ``Info`` frame when the
        language is unknown or has no data.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])

    # Find all leaderboards containing any ISO in this language group
    pattern = re.compile(
        r"(^|-)(" + "|".join(re.escape(iso) for iso in sorted(iso_codes)) + r")(-|$)"
    )
    matched_langs = [
        lb for lb in all_df['leaderboard'].unique()
        if isinstance(lb, str) and lb != 'main' and pattern.search(lb)
    ]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])

    def make_task_col(row):
        lb = row['leaderboard']
        # Unknown task / metric / ISO codes fall back to the raw ID instead
        # of raising KeyError.
        task_name = TASKS_LIST.get(row['task'], row['task'])
        metric_name = metrics_list.get(row['metric'], row['metric'])
        if '-' in lb:
            pair_lang = lb.split('-')
            src_lang = LANG_ISO2NAME.get(pair_lang[0], pair_lang[0])
            tgt_lang = LANG_ISO2NAME.get(pair_lang[1], pair_lang[1])
            return f"{task_name} <br> {src_lang} to {tgt_lang} <br> Metric: {metric_name}"
        return f"{task_name} <br> Metric: {metric_name}"

    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()

    score_cols = [col for col in table.columns if col != 'model']
    for col in score_cols:
        table[col] = table[col].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x
        )

    # Average the formatted cells with the shared helper.
    language_scores = table.apply(lambda row: cluster_average(row, score_cols), axis=1)
    table.insert(1, 'Language Score', language_scores.apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))

    # The score is computed; now show missing cells as "---" rather than "nan".
    for col in score_cols:
        table[col] = table[col].replace('nan', '---')

    # Rank and decorate with the shared helper used by the other tables.
    return add_medals_to_models(table, score_col='Language Score')