# Spaces: microsoft / paza-bench (Running)
# File size: 11,828 Bytes
# 0aa9a49

"""
Constants and mappings for PazaBench.

This module contains all mapping dictionaries and configuration constants
that are shared across the application.
"""

from pathlib import Path

# =============================================================================
# File Paths
# =============================================================================

# Name of the results summary CSV; the Path constant below is built from it so
# the literal is spelled only once.
RESULTS_CSV_FILENAME = "results_summary.csv"
RESULTS_CSV_PATH = Path(RESULTS_CSV_FILENAME)

# =============================================================================
# Filter Configuration
# =============================================================================

# Filterable columns, in the order given here.
FILTER_COLUMN_ORDER: list[str] = [
    "model",
    "language",
    "dataset_group",
]

# Column name -> plural parameter name associated with that column's filter.
FILTER_PARAM_MAP: dict[str, str] = {
    "model": "models",
    "language": "languages",
    "dataset_group": "dataset_groups",
}

# =============================================================================
# Display Configuration
# =============================================================================

# Text-valued columns (identifiers and categories).
ASR_TEXT_COLUMNS = [
    "model_family",
    "model",
    "dataset_group",
    "split",
    "language",
    "region",
]

# Numeric columns. "wer" precedes "cer" here, while the display order below
# lists "cer" first.
ASR_NUMERIC_COLUMNS = [
    "wer",
    "cer",
    "rtfx",
    "duration_sec",
    "inference_time_sec",
    "num_samples",
]

# Display order: every text column first, then the metric/measurement columns.
# Built as a fresh list so mutating one constant never affects the other.
ASR_DISPLAY_COLUMNS = [
    *ASR_TEXT_COLUMNS,
    "cer",
    "wer",
    "rtfx",
    "duration_sec",
    "inference_time_sec",
    "num_samples",
]

# =============================================================================
# Metric Configuration
# =============================================================================

# Per-metric presentation settings, keyed by metric column name:
#   label  - human-readable metric name
#   better - "lower" or "higher", i.e. which direction is an improvement
#   fmt    - str.format template for a value (two decimal places)
METRIC_CONFIGS: dict[str, dict[str, str]] = {
    "cer": {
        "label": "CER",
        "better": "lower",
        "fmt": "{:.2f}",
    },
    "wer": {
        "label": "WER",
        "better": "lower",
        "fmt": "{:.2f}",
    },
    "rtfx": {
        "label": "RTFx",
        "better": "higher",
        "fmt": "{:.2f}",
    },
}

# View mode used by default; it is one of the keys of VIEW_MODE_COLUMNS.
DEFAULT_VIEW_MODE: str = "Model families"

# View-mode label -> column name associated with that view.
VIEW_MODE_COLUMNS: dict[str, str] = {
    "Model families": "model_family",
    "Individual models": "model",
}

# =============================================================================
# Language Normalization
# =============================================================================

# Language label -> canonical language name.
# "Ganda" is folded into "Luganda"; the other two entries map to themselves.
LANGUAGE_NAME_MAPPING: dict[str, str] = {
    "Ganda": "Luganda",
    "Luganda": "Luganda",
    "Dholuo": "Dholuo",
}

# =============================================================================
# Geographic Mappings
# =============================================================================

# Language to country mapping for the Africa map.
# Keys are canonical language names; values are lists of ISO 3166-1 alpha-3
# country codes. Every code used here has an entry in COUNTRY_NAMES.
# NOTE(review): for several languages (e.g. Swahili, Somali, Afrikaans, Fula)
# the countries listed here differ from LANGUAGE_TO_COUNTRIES_MAP — confirm
# whether the two tables are meant to agree.
LANGUAGE_COUNTRY_MAP: dict[str, list[str]] = {
    "Afrikaans": ["ZAF"],
    "Amharic": ["ETH"],
    "Arabic": ["EGY", "MAR", "DZA", "TUN", "LBY", "SDN"],
    "Basaa": ["CMR"],
    "Dholuo": ["KEN"],
    "Dioula": ["BFA", "CIV"],
    "Ekoti": ["MOZ"],
    "Fula": ["SEN", "MLI", "NGA", "GIN", "CMR", "NER"],
    # Not in alphabetical position (presumably kept where "Ganda" sorted).
    "Luganda": ["UGA"],
    "Hausa": ["NGA", "NER"],
    "Igbo": ["NGA"],
    "Kabyle": ["DZA"],
    "Kalenjin": ["KEN"],
    "Kamba": ["KEN"],
    "Kidaw'ida": ["KEN"],
    "Kikuyu": ["KEN"],
    "Kinyarwanda": ["RWA"],
    "Lingala": ["COD", "COG"],
    "Maasai": ["KEN"],
    "Northern Sotho": ["ZAF"],
    "Nyanja": ["MWI", "ZMB"],
    "Nyungwe": ["MOZ"],
    "Oromo": ["ETH"],
    "Sesotho": ["ZAF", "LSO"],
    "Setswana": ["ZAF", "BWA"],
    "Shona": ["ZWE"],
    "Somali": ["KEN", "SOM"],
    "Swahili": ["KEN", "TZA", "COD"],
    "Tamazight": ["MAR", "DZA"],
    "Tigre": ["ERI"],
    "Tigrinya": ["ERI", "ETH"],
    "Tshivenda": ["ZAF", "ZWE"],
    "Twi": ["GHA"],
    "Umbundu": ["AGO"],
    "Wolof": ["SEN", "GMB"],
    "Xhosa": ["ZAF"],
    "Xitsonga": ["ZAF", "MOZ"],
    "Yoruba": ["NGA", "BEN"],
    "Zulu": ["ZAF"],
}

# ISO 3166-1 alpha-3 country code -> short English country name.
# Covers every code that appears in LANGUAGE_COUNTRY_MAP.
COUNTRY_NAMES: dict[str, str] = {
    "ZAF": "South Africa",
    "ETH": "Ethiopia",
    "EGY": "Egypt",
    "MAR": "Morocco",
    "DZA": "Algeria",
    "TUN": "Tunisia",
    "LBY": "Libya",
    "SDN": "Sudan",
    "CMR": "Cameroon",
    "MOZ": "Mozambique",
    "KEN": "Kenya",
    "BFA": "Burkina Faso",
    "CIV": "Côte d'Ivoire",
    "SEN": "Senegal",
    "MLI": "Mali",
    "NGA": "Nigeria",
    "GIN": "Guinea",
    "NER": "Niger",
    "UGA": "Uganda",
    "RWA": "Rwanda",
    "COD": "DR Congo",
    "COG": "Congo",
    "TZA": "Tanzania",
    "MWI": "Malawi",
    "ZMB": "Zambia",
    "LSO": "Lesotho",
    "BWA": "Botswana",
    "ZWE": "Zimbabwe",
    "SOM": "Somalia",
    "ERI": "Eritrea",
    "GHA": "Ghana",
    "AGO": "Angola",
    "GMB": "Gambia",
    "BEN": "Benin",
}

# Language to countries mapping (full country names).
# Used for language metadata and region lookups: every country name that
# appears below is a key of COUNTRY_TO_REGION_MAP.
# The "# <region>" headers inside the dict are editorial grouping only; the
# region actually assigned to a country comes from COUNTRY_TO_REGION_MAP and
# differs from the header in places (see inline notes).
LANGUAGE_TO_COUNTRIES_MAP: dict[str, list[str]] = {
    # East Africa
    "Swahili": ["Kenya", "Tanzania", "Uganda", "DR Congo"],
    "Dholuo": ["Kenya"],
    # Alternate label for Dholuo; maps to the same country list.
    "Dholuo (Luo)": ["Kenya"],
    "Kalenjin": ["Kenya"],
    "Kikuyu": ["Kenya"],
    "Kamba": ["Kenya"],
    "Maasai": ["Kenya", "Tanzania"],
    "Kidaw'ida": ["Kenya", "Tanzania"],
    "Luganda": ["Uganda"],
    "Kinyarwanda": ["Rwanda"],
    "Somali": ["Somalia", "Kenya", "Ethiopia"],
    "Amharic": ["Ethiopia"],
    "Tigrinya": ["Eritrea", "Ethiopia"],
    "Tigre": ["Eritrea"],
    "Oromo": ["Ethiopia"],
    # Southern Africa
    "Afrikaans": ["South Africa", "Namibia"],
    "Zulu": ["South Africa"],
    "Xhosa": ["South Africa"],
    "Setswana": ["Botswana", "South Africa"],
    "Sesotho": ["South Africa", "Lesotho"],
    "Northern Sotho": ["South Africa"],
    "Xitsonga": ["South Africa", "Mozambique"],
    "Tshivenda": ["South Africa"],
    "Shona": ["Zimbabwe"],
    "Nyanja": ["Malawi", "Zambia"],
    "Nyungwe": ["Mozambique"],
    "Ekoti": ["Mozambique"],
    # West Africa (Cameroon is also "West Africa" in COUNTRY_TO_REGION_MAP)
    "Yoruba": ["Nigeria"],
    "Igbo": ["Nigeria"],
    "Hausa": ["Nigeria", "Niger"],
    "Wolof": ["Senegal", "Gambia"],
    "Fula": ["Senegal", "Guinea", "Mali", "Nigeria"],
    "Twi": ["Ghana"],
    "Dioula": ["Burkina Faso", "Côte d'Ivoire", "Mali"],
    "Basaa": ["Cameroon"],
    # Central Africa (note: COUNTRY_TO_REGION_MAP files Angola under
    # "Southern Africa", so Umbundu resolves to that region on lookup)
    "Lingala": ["DR Congo", "Republic of Congo"],
    "Umbundu": ["Angola"],
    # North Africa
    "Arabic": ["Egypt", "Libya", "Tunisia", "Algeria", "Morocco", "Sudan"],
    "Kabyle": ["Algeria"],
    "Tamazight": ["Morocco", "Algeria"],
}

# Country to African region mapping (geographical).
# Keys are full country names as used in LANGUAGE_TO_COUNTRIES_MAP, plus every
# name that COUNTRY_NAMES can produce, so a lookup succeeds whether the name
# came from the language table or from an ISO code. The table also lists
# countries that no benchmark language currently references (e.g. Burundi,
# Gabon), i.e. it is a general lookup rather than a list of covered countries.
COUNTRY_TO_REGION_MAP: dict[str, str] = {
    # East Africa
    "Kenya": "East Africa",
    "Tanzania": "East Africa",
    "Uganda": "East Africa",
    "Rwanda": "East Africa",
    "Burundi": "East Africa",
    "Ethiopia": "East Africa",
    "Eritrea": "East Africa",
    "Somalia": "East Africa",
    "Djibouti": "East Africa",
    # Southern Africa
    "South Africa": "Southern Africa",
    "Namibia": "Southern Africa",
    "Botswana": "Southern Africa",
    "Zimbabwe": "Southern Africa",
    "Zambia": "Southern Africa",
    "Malawi": "Southern Africa",
    "Mozambique": "Southern Africa",
    "Angola": "Southern Africa",
    "Lesotho": "Southern Africa",
    "Eswatini": "Southern Africa",
    # West Africa
    "Nigeria": "West Africa",
    "Ghana": "West Africa",
    "Senegal": "West Africa",
    "Gambia": "West Africa",
    "Guinea": "West Africa",
    "Mali": "West Africa",
    "Burkina Faso": "West Africa",
    "Côte d'Ivoire": "West Africa",
    "Niger": "West Africa",
    "Cameroon": "West Africa",
    # Benin is reachable via COUNTRY_NAMES["BEN"] (Yoruba in
    # LANGUAGE_COUNTRY_MAP) and previously had no region.
    "Benin": "West Africa",
    # Central Africa
    "DR Congo": "Central Africa",
    "Republic of Congo": "Central Africa",
    # Short-name alias: COUNTRY_NAMES["COG"] is "Congo", not "Republic of Congo".
    "Congo": "Central Africa",
    "Central African Republic": "Central Africa",
    "Gabon": "Central Africa",
    # North Africa
    "Egypt": "North Africa",
    "Libya": "North Africa",
    "Tunisia": "North Africa",
    "Algeria": "North Africa",
    "Morocco": "North Africa",
    "Sudan": "North Africa",
}

# =============================================================================
# Sample Counts
# =============================================================================

# Number of PazaBench samples per language (39 African languages).
# Counts are unique samples per language; entries are listed from largest to
# smallest and add up to 204,492 samples in total.
LANGUAGE_SAMPLE_COUNTS: dict[str, int] = {
    "Hausa": 22_628,
    "Yoruba": 20_612,
    "Igbo": 18_582,
    "Kabyle": 15_003,
    "Kinyarwanda": 14_800,
    "Swahili": 14_422,
    "Kikuyu": 12_980,
    "Luganda": 12_598,  # Luganda (11,875) + Ganda (723) combined
    "Arabic": 10_508,
    "Kalenjin": 8_881,
    "Dholuo": 7_111,
    "Setswana": 5_633,
    "Somali": 5_601,
    "Xhosa": 4_679,
    "Xitsonga": 4_394,
    "Zulu": 3_880,
    "Sesotho": 3_719,
    "Maasai": 2_903,
    "Wolof": 2_371,
    "Tshivenda": 1_652,
    "Tigre": 1_607,
    "Basaa": 1_550,
    "Amharic": 1_132,
    "Kidaw'ida": 1_004,
    "Shona": 925,
    "Kamba": 827,
    "Northern Sotho": 790,
    "Nyanja": 761,
    "Fula": 660,
    "Lingala": 478,
    "Ekoti": 414,
    "Afrikaans": 389,
    "Umbundu": 379,
    "Nyungwe": 248,
    "Tamazight": 230,
    "Dioula": 63,
    "Oromo": 41,
    "Twi": 21,
    "Tigrinya": 16,
}

# =============================================================================
# Visualization Interpretation Text
# =============================================================================

# Markdown bullet lists explaining how to read each chart, keyed by chart id.
# Each value starts with a newline, keeps a four-space indent on every bullet
# and ends with the indent of the closing quotes; that whitespace is part of
# the string.
INTERPRETATIONS: dict[str, str] = {
    # Speed (RTFx) versus accuracy (WER) bubble chart.
    "speed_accuracy": """
    - Each bubble represents a specific model; **bubble size = model parameter count**
    - **X-axis (WER)**: Left is better (more accurate)
    - **Y-axis (RTFx)**: Up is better (faster processing)
    - **Top-left quadrant (⭐)**: Ideal zone - fast AND accurate models
    - Gray dashed lines show median values for reference
    - Uses median values to reduce impact of outliers
    - Hover over bubbles to see exact parameter counts (e.g., 1.5B, 300M)
    """,
    # WER leaderboard bar chart.
    "leaderboard": """
    - **No language selected**: Shows model families (aggregated across all languages)
    - **Language(s) selected**: Shows top 15 individual models for those languages
    - The horizontal bars show the median Word Error Rate (WER)
    - Lower WER values (left side) indicate better accuracy
    - Error bars represent the standard deviation, showing variability
    - Bar colors correspond to each model family's assigned color
    - Hover over bars to see additional metrics like RTFx (speed) and total samples evaluated
    - Uses median instead of mean to reduce impact of outliers
    """,
    # CER leaderboard bar chart.
    "cer_leaderboard": """
    - **No language selected**: Shows model families (aggregated across all languages)
    - **Language(s) selected**: Shows top 15 individual models for those languages
    - The horizontal bars show the median Character Error Rate (CER)
    - Lower CER values (left side) indicate better accuracy
    - CER is especially important for agglutinative and low-resource languages
    - Error bars represent the standard deviation, showing variability
    - Bar colors correspond to each model family's assigned color
    - Hover over bars to see additional metrics like WER, RTFx (speed) and total samples evaluated
    - Uses median instead of mean to reduce impact of outliers
    """,
    # CER-versus-WER scatter plot with trend line.
    "correlation": """
    - Each point represents one evaluation result
    - Strong positive correlation means CER and WER move together
    - Models with high character errors typically also have high word errors
    - The trend line shows the overall relationship
    - **Below the line**: Models make more accurate character-level predictions (phonetically closer errors)
    - **Above the line**: Models make more severe character-level errors per word mistake
    - Can be filtered by language to analyze specific language patterns
    """,
    # Cross-language consistency (coefficient of variation) bar chart.
    "consistency": """
    - Coefficient of Variation (CV) = (Standard Deviation / Median) × 100%
    - **Lower CV** = more consistent performance across different languages
    - **Higher CV** = performance varies widely depending on the language
    - Bar colors correspond to each model family's assigned color
    - Important for production deployment - you want consistent models
    - Outliers have been removed using IQR method for more robust analysis
    - Uses median instead of mean for more robust central tendency measure
    """,
}