""" Constants and mappings for PazaBench. This module contains all mapping dictionaries and configuration constants that are shared across the application. """ from pathlib import Path # ============================================================================= # File Paths # ============================================================================= RESULTS_CSV_PATH = Path("results_summary.csv") RESULTS_CSV_FILENAME = "results_summary.csv" # ============================================================================= # Filter Configuration # ============================================================================= FILTER_COLUMN_ORDER = ["model", "language", "dataset_group"] FILTER_PARAM_MAP = { "model": "models", "language": "languages", "dataset_group": "dataset_groups", } # ============================================================================= # Display Configuration # ============================================================================= ASR_DISPLAY_COLUMNS = [ "model_family", "model", "dataset_group", "split", "language", "region", "cer", "wer", "rtfx", "duration_sec", "inference_time_sec", "num_samples", ] ASR_NUMERIC_COLUMNS = ["wer", "cer", "rtfx", "duration_sec", "inference_time_sec", "num_samples"] ASR_TEXT_COLUMNS = ["model_family", "model", "dataset_group", "split", "language", "region"] # ============================================================================= # Metric Configuration # ============================================================================= METRIC_CONFIGS = { "cer": {"label": "CER", "better": "lower", "fmt": "{:.2f}"}, "wer": {"label": "WER", "better": "lower", "fmt": "{:.2f}"}, "rtfx": {"label": "RTFx", "better": "higher", "fmt": "{:.2f}"}, } VIEW_MODE_COLUMNS = { "Model families": "model_family", "Individual models": "model", } DEFAULT_VIEW_MODE = "Model families" # ============================================================================= # Language Normalization # 
# =============================================================================
# Language Normalization
# =============================================================================

# Maps a language label to the canonical name used for display/aggregation.
# "Ganda" is folded into "Luganda"; the other entries map to themselves.
LANGUAGE_NAME_MAPPING: dict[str, str] = {
    "Ganda": "Luganda",
    "Luganda": "Luganda",
    "Dholuo": "Dholuo",
}

# =============================================================================
# Geographic Mappings
# =============================================================================

# Language to country mapping for Africa map (using ISO 3166-1 alpha-3 codes).
# Every code used here has an entry in COUNTRY_NAMES.
#
# NOTE(review): this table and LANGUAGE_TO_COUNTRIES_MAP (full country names)
# are maintained separately and do not list the same countries for every
# language (e.g. Swahili includes Uganda only in the name-based table, Yoruba
# includes Benin only here). Confirm whether that is intentional before
# reconciling them.
LANGUAGE_COUNTRY_MAP: dict[str, list[str]] = {
    "Afrikaans": ["ZAF"],
    "Amharic": ["ETH"],
    "Arabic": ["EGY", "MAR", "DZA", "TUN", "LBY", "SDN"],
    "Basaa": ["CMR"],
    "Dholuo": ["KEN"],
    "Dioula": ["BFA", "CIV"],
    "Ekoti": ["MOZ"],
    "Fula": ["SEN", "MLI", "NGA", "GIN", "CMR", "NER"],
    "Luganda": ["UGA"],
    "Hausa": ["NGA", "NER"],
    "Igbo": ["NGA"],
    "Kabyle": ["DZA"],
    "Kalenjin": ["KEN"],
    "Kamba": ["KEN"],
    "Kidaw'ida": ["KEN"],
    "Kikuyu": ["KEN"],
    "Kinyarwanda": ["RWA"],
    "Lingala": ["COD", "COG"],
    "Maasai": ["KEN"],
    "Northern Sotho": ["ZAF"],
    "Nyanja": ["MWI", "ZMB"],
    "Nyungwe": ["MOZ"],
    "Oromo": ["ETH"],
    "Sesotho": ["ZAF", "LSO"],
    "Setswana": ["ZAF", "BWA"],
    "Shona": ["ZWE"],
    "Somali": ["KEN", "SOM"],
    "Swahili": ["KEN", "TZA", "COD"],
    "Tamazight": ["MAR", "DZA"],
    "Tigre": ["ERI"],
    "Tigrinya": ["ERI", "ETH"],
    "Tshivenda": ["ZAF", "ZWE"],
    "Twi": ["GHA"],
    "Umbundu": ["AGO"],
    "Wolof": ["SEN", "GMB"],
    "Xhosa": ["ZAF"],
    "Xitsonga": ["ZAF", "MOZ"],
    "Yoruba": ["NGA", "BEN"],
    "Zulu": ["ZAF"],
}

# Country code to name mapping.
# Names are spelled exactly as the keys of COUNTRY_TO_REGION_MAP so that a
# code can be resolved to a region via its name ("COG" is therefore
# "Republic of Congo", matching the other tables in this module).
COUNTRY_NAMES: dict[str, str] = {
    "ZAF": "South Africa",
    "ETH": "Ethiopia",
    "EGY": "Egypt",
    "MAR": "Morocco",
    "DZA": "Algeria",
    "TUN": "Tunisia",
    "LBY": "Libya",
    "SDN": "Sudan",
    "CMR": "Cameroon",
    "MOZ": "Mozambique",
    "KEN": "Kenya",
    "BFA": "Burkina Faso",
    "CIV": "Côte d'Ivoire",
    "SEN": "Senegal",
    "MLI": "Mali",
    "NGA": "Nigeria",
    "GIN": "Guinea",
    "NER": "Niger",
    "UGA": "Uganda",
    "RWA": "Rwanda",
    "COD": "DR Congo",
    "COG": "Republic of Congo",
    "TZA": "Tanzania",
    "MWI": "Malawi",
    "ZMB": "Zambia",
    "LSO": "Lesotho",
    "BWA": "Botswana",
    "ZWE": "Zimbabwe",
    "SOM": "Somalia",
    "ERI": "Eritrea",
    "GHA": "Ghana",
    "AGO": "Angola",
    "GMB": "Gambia",
    "BEN": "Benin",
}

# Language to countries mapping (full country names)
# Used for language metadata and region lookups.
# Includes alternate labels for the same language ("Dholuo (Luo)", "Ganda",
# "Ganda (Luganda)"). Every country listed is a key of COUNTRY_TO_REGION_MAP.
LANGUAGE_TO_COUNTRIES_MAP: dict[str, list[str]] = {
    # East Africa
    "Swahili": ["Kenya", "Tanzania", "Uganda", "DR Congo"],
    "Dholuo": ["Kenya"],
    "Dholuo (Luo)": ["Kenya"],
    "Kalenjin": ["Kenya"],
    "Kikuyu": ["Kenya"],
    "Kamba": ["Kenya"],
    "Maasai": ["Kenya", "Tanzania"],
    "Kidaw'ida": ["Kenya", "Tanzania"],
    "Luganda": ["Uganda"],
    "Ganda": ["Uganda"],
    "Ganda (Luganda)": ["Uganda"],
    "Kinyarwanda": ["Rwanda"],
    "Somali": ["Somalia", "Kenya", "Ethiopia"],
    "Amharic": ["Ethiopia"],
    "Tigrinya": ["Eritrea", "Ethiopia"],
    "Tigre": ["Eritrea"],
    "Oromo": ["Ethiopia"],
    # Southern Africa
    "Afrikaans": ["South Africa", "Namibia"],
    "Zulu": ["South Africa"],
    "Xhosa": ["South Africa"],
    "Setswana": ["Botswana", "South Africa"],
    "Sesotho": ["South Africa", "Lesotho"],
    "Northern Sotho": ["South Africa"],
    "Xitsonga": ["South Africa", "Mozambique"],
    "Tshivenda": ["South Africa"],
    "Shona": ["Zimbabwe"],
    "Nyanja": ["Malawi", "Zambia"],
    "Nyungwe": ["Mozambique"],
    "Ekoti": ["Mozambique"],
    # West Africa
    "Yoruba": ["Nigeria"],
    "Igbo": ["Nigeria"],
    "Hausa": ["Nigeria", "Niger"],
    "Wolof": ["Senegal", "Gambia"],
    "Fula": ["Senegal", "Guinea", "Mali", "Nigeria"],
    "Twi": ["Ghana"],
    "Dioula": ["Burkina Faso", "Côte d'Ivoire", "Mali"],
    "Basaa": ["Cameroon"],
    # Central Africa
    "Lingala": ["DR Congo", "Republic of Congo"],
    "Umbundu": ["Angola"],
    # North Africa
    "Arabic": ["Egypt", "Libya", "Tunisia", "Algeria", "Morocco", "Sudan"],
    "Kabyle": ["Algeria"],
    "Tamazight": ["Morocco", "Algeria"],
}

# Country to African region mapping (geographical).
# Keyed by full country name; covers every name in COUNTRY_NAMES and every
# country in LANGUAGE_TO_COUNTRIES_MAP, plus some countries that no language
# in this module references.
COUNTRY_TO_REGION_MAP: dict[str, str] = {
    # East Africa
    "Kenya": "East Africa",
    "Tanzania": "East Africa",
    "Uganda": "East Africa",
    "Rwanda": "East Africa",
    "Burundi": "East Africa",
    "Ethiopia": "East Africa",
    "Eritrea": "East Africa",
    "Somalia": "East Africa",
    "Djibouti": "East Africa",
    # Southern Africa
    "South Africa": "Southern Africa",
    "Namibia": "Southern Africa",
    "Botswana": "Southern Africa",
    "Zimbabwe": "Southern Africa",
    "Zambia": "Southern Africa",
    "Malawi": "Southern Africa",
    "Mozambique": "Southern Africa",
    "Angola": "Southern Africa",
    "Lesotho": "Southern Africa",
    "Eswatini": "Southern Africa",
    # West Africa
    "Nigeria": "West Africa",
    "Ghana": "West Africa",
    "Senegal": "West Africa",
    "Gambia": "West Africa",
    "Guinea": "West Africa",
    "Mali": "West Africa",
    "Burkina Faso": "West Africa",
    "Côte d'Ivoire": "West Africa",
    "Niger": "West Africa",
    "Cameroon": "West Africa",
    # Benin is referenced by LANGUAGE_COUNTRY_MAP (Yoruba -> "BEN") and needs
    # a region like every other mapped country.
    "Benin": "West Africa",
    # Central Africa
    "DR Congo": "Central Africa",
    "Republic of Congo": "Central Africa",
    "Central African Republic": "Central Africa",
    "Gabon": "Central Africa",
    # North Africa
    "Egypt": "North Africa",
    "Libya": "North Africa",
    "Tunisia": "North Africa",
    "Algeria": "North Africa",
    "Morocco": "North Africa",
    "Sudan": "North Africa",
}

# =============================================================================
# Sample Counts
# =============================================================================

# Language sample count data from PazaBench (39 African languages)
# Note: These counts represent unique samples per language
# Total: 204,492 samples
#
# The dict has 40 keys because "Ganda" and "Luganda" are counted separately
# here even though LANGUAGE_NAME_MAPPING treats them as one language; the
# stated total is the sum over all 40 entries.
LANGUAGE_SAMPLE_COUNTS: dict[str, int] = {
    "Hausa": 22628,
    "Yoruba": 20612,
    "Igbo": 18582,
    "Kabyle": 15003,
    "Kinyarwanda": 14800,
    "Swahili": 14422,
    "Kikuyu": 12980,
    "Luganda": 11875,
    "Arabic": 10508,
    "Kalenjin": 8881,
    "Dholuo": 7111,
    "Setswana": 5633,
    "Somali": 5601,
    "Xhosa": 4679,
    "Xitsonga": 4394,
    "Zulu": 3880,
    "Sesotho": 3719,
    "Maasai": 2903,
    "Wolof": 2371,
    "Tshivenda": 1652,
    "Tigre": 1607,
    "Basaa": 1550,
    "Amharic": 1132,
    "Kidaw'ida": 1004,
    "Shona": 925,
    "Kamba": 827,
    "Northern Sotho": 790,
    "Nyanja": 761,
    "Ganda": 723,
    "Fula": 660,
    "Lingala": 478,
    "Ekoti": 414,
    "Afrikaans": 389,
    "Umbundu": 379,
    "Nyungwe": 248,
    "Tamazight": 230,
    "Dioula": 63,
    "Oromo": 41,
    "Twi": 21,
    "Tigrinya": 16,
}
# =============================================================================
# Visualization Interpretation Text
# =============================================================================

# Markdown bullet lists explaining how to read each chart, keyed by chart
# identifier. Each value starts and ends with a newline and holds one bullet
# per line.
INTERPRETATIONS: dict[str, str] = {
    "speed_accuracy": """
- Each bubble represents a specific model; **bubble size = model parameter count**
- **X-axis (WER)**: Left is better (more accurate)
- **Y-axis (RTFx)**: Up is better (faster processing)
- **Top-left quadrant (⭐)**: Ideal zone - fast AND accurate models
- Gray dashed lines show median values for reference
- Uses median values to reduce impact of outliers
- Hover over bubbles to see exact parameter counts (e.g., 1.5B, 300M)
""",
    "leaderboard": """
- **No language selected**: Shows model families (aggregated across all languages)
- **Language(s) selected**: Shows top 15 individual models for those languages
- The horizontal bars show the median Word Error Rate (WER)
- Lower WER values (left side) indicate better accuracy
- Error bars represent the standard deviation, showing variability
- Bar colors correspond to each model family's assigned color
- Hover over bars to see additional metrics like RTFx (speed) and total samples evaluated
- Uses median instead of mean to reduce impact of outliers
""",
    "cer_leaderboard": """
- **No language selected**: Shows model families (aggregated across all languages)
- **Language(s) selected**: Shows top 15 individual models for those languages
- The horizontal bars show the median Character Error Rate (CER)
- Lower CER values (left side) indicate better accuracy
- CER is especially important for agglutinative and low-resource languages
- Error bars represent the standard deviation, showing variability
- Bar colors correspond to each model family's assigned color
- Hover over bars to see additional metrics like WER, RTFx (speed) and total samples evaluated
- Uses median instead of mean to reduce impact of outliers
""",
    "correlation": """
- Each point represents one evaluation result
- Strong positive correlation means CER and WER move together
- Models with high character errors typically also have high word errors
- The trend line shows the overall relationship
- **Below the line**: Models make more accurate character-level predictions (phonetically closer errors)
- **Above the line**: Models make more severe character-level errors per word mistake
- Can be filtered by language to analyze specific language patterns
""",
    "consistency": """
- Coefficient of Variation (CV) = (Standard Deviation / Median) × 100%
- **Lower CV** = more consistent performance across different languages
- **Higher CV** = performance varies widely depending on the language
- Bar colors correspond to each model family's assigned color
- Important for production deployment - you want consistent models
- Outliers have been removed using IQR method for more robust analysis
- Uses median instead of mean for more robust central tendency measure
""",
}