# Source: paza-bench/src/constants.py
# Last commit 53a73e0 (muchai-mercy): "update pazabench space"
"""
Constants and mappings for PazaBench.
This module contains all mapping dictionaries and configuration constants
that are shared across the application.
"""
from pathlib import Path
# =============================================================================
# File Paths
# =============================================================================
# The results file name is spelled out once; the Path form is derived from it
# so the two constants always agree. The path is relative.
RESULTS_CSV_FILENAME = "results_summary.csv"
RESULTS_CSV_PATH = Path(RESULTS_CSV_FILENAME)
# =============================================================================
# Filter Configuration
# =============================================================================
# Filterable columns, listed in a fixed order.
FILTER_COLUMN_ORDER: list[str] = [
    "model",
    "language",
    "dataset_group",
]

# Column name -> pluralised parameter name.
# NOTE(review): presumably the keyword/query parameter consumed by the
# filtering code -- confirm against callers.
FILTER_PARAM_MAP: dict[str, str] = {
    "model": "models",
    "language": "languages",
    "dataset_group": "dataset_groups",
}
# =============================================================================
# Display Configuration
# =============================================================================
# Descriptive (non-numeric) columns of an ASR result row.
ASR_TEXT_COLUMNS = [
    "model_family",
    "model",
    "dataset_group",
    "split",
    "language",
    "region",
]

# Numeric columns. Note the order here is "wer" before "cer", which differs
# from the display order below; both orders are kept exactly as they were.
ASR_NUMERIC_COLUMNS = [
    "wer",
    "cer",
    "rtfx",
    "duration_sec",
    "inference_time_sec",
    "num_samples",
]

# Full display order: every text column first, followed by the metrics with
# "cer" ahead of "wer". Unpacking builds a new list, so this stays independent
# of ASR_TEXT_COLUMNS.
ASR_DISPLAY_COLUMNS = [
    *ASR_TEXT_COLUMNS,
    "cer",
    "wer",
    "rtfx",
    "duration_sec",
    "inference_time_sec",
    "num_samples",
]
# =============================================================================
# Metric Configuration
# =============================================================================
# Per-metric presentation settings, keyed by metric column name:
#   "label"  - display label
#   "better" - which direction counts as an improvement ("lower" / "higher")
#   "fmt"    - str.format template (two decimal places for every metric)
METRIC_CONFIGS = {
    metric: {"label": label, "better": direction, "fmt": "{:.2f}"}
    for metric, label, direction in (
        ("cer", "CER", "lower"),
        ("wer", "WER", "lower"),
        ("rtfx", "RTFx", "higher"),
    )
}
# View-mode label -> name of the column associated with that mode.
VIEW_MODE_COLUMNS: dict[str, str] = {
    "Model families": "model_family",
    "Individual models": "model",
}

# Default view mode; its value is one of the VIEW_MODE_COLUMNS keys.
DEFAULT_VIEW_MODE: str = "Model families"
# =============================================================================
# Language Normalization
# =============================================================================
# Raw language name -> canonical language name. Canonical names map to
# themselves.
# NOTE(review): the labels "Dholuo (Luo)" and "Ganda (Luganda)" appear as keys
# in LANGUAGE_TO_COUNTRIES_MAP but have no entry here -- confirm whether they
# need normalising too.
LANGUAGE_NAME_MAPPING: dict[str, str] = {
    "Ganda": "Luganda",
    "Luganda": "Luganda",
    "Dholuo": "Dholuo",
}
# =============================================================================
# Geographic Mappings
# =============================================================================
# Language to country mapping for Africa map (using ISO 3166-1 alpha-3 codes)
LANGUAGE_COUNTRY_MAP = {
"Afrikaans": ["ZAF"],
"Amharic": ["ETH"],
"Arabic": ["EGY", "MAR", "DZA", "TUN", "LBY", "SDN"],
"Basaa": ["CMR"],
"Dholuo": ["KEN"],
"Dioula": ["BFA", "CIV"],
"Ekoti": ["MOZ"],
"Fula": ["SEN", "MLI", "NGA", "GIN", "CMR", "NER"],
"Luganda": ["UGA"],
"Hausa": ["NGA", "NER"],
"Igbo": ["NGA"],
"Kabyle": ["DZA"],
"Kalenjin": ["KEN"],
"Kamba": ["KEN"],
"Kidaw'ida": ["KEN"],
"Kikuyu": ["KEN"],
"Kinyarwanda": ["RWA"],
"Lingala": ["COD", "COG"],
"Maasai": ["KEN"],
"Northern Sotho": ["ZAF"],
"Nyanja": ["MWI", "ZMB"],
"Nyungwe": ["MOZ"],
"Oromo": ["ETH"],
"Sesotho": ["ZAF", "LSO"],
"Setswana": ["ZAF", "BWA"],
"Shona": ["ZWE"],
"Somali": ["KEN", "SOM"],
"Swahili": ["KEN", "TZA", "COD"],
"Tamazight": ["MAR", "DZA"],
"Tigre": ["ERI"],
"Tigrinya": ["ERI", "ETH"],
"Tshivenda": ["ZAF", "ZWE"],
"Twi": ["GHA"],
"Umbundu": ["AGO"],
"Wolof": ["SEN", "GMB"],
"Xhosa": ["ZAF"],
"Xitsonga": ["ZAF", "MOZ"],
"Yoruba": ["NGA", "BEN"],
"Zulu": ["ZAF"],
}
# ISO 3166-1 alpha-3 country code -> display name.
# NOTE(review): "COG" is named "Congo" here, while LANGUAGE_TO_COUNTRIES_MAP and
# COUNTRY_TO_REGION_MAP use "Republic of Congo"; "Benin" has no entry in
# COUNTRY_TO_REGION_MAP. Confirm whether any lookup chains these tables.
COUNTRY_NAMES: dict[str, str] = {
    "ZAF": "South Africa",
    "ETH": "Ethiopia",
    "EGY": "Egypt",
    "MAR": "Morocco",
    "DZA": "Algeria",
    "TUN": "Tunisia",
    "LBY": "Libya",
    "SDN": "Sudan",
    "CMR": "Cameroon",
    "MOZ": "Mozambique",
    "KEN": "Kenya",
    "BFA": "Burkina Faso",
    "CIV": "Côte d'Ivoire",
    "SEN": "Senegal",
    "MLI": "Mali",
    "NGA": "Nigeria",
    "GIN": "Guinea",
    "NER": "Niger",
    "UGA": "Uganda",
    "RWA": "Rwanda",
    "COD": "DR Congo",
    "COG": "Congo",
    "TZA": "Tanzania",
    "MWI": "Malawi",
    "ZMB": "Zambia",
    "LSO": "Lesotho",
    "BWA": "Botswana",
    "ZWE": "Zimbabwe",
    "SOM": "Somalia",
    "ERI": "Eritrea",
    "GHA": "Ghana",
    "AGO": "Angola",
    "GMB": "Gambia",
    "BEN": "Benin",
}
# Language -> full country names.
# Used for language metadata and region lookups.
# Alternate spellings of one language ("Dholuo (Luo)", "Ganda",
# "Ganda (Luganda)") are separate keys with their own lists.
# NOTE(review): the country lists here do not always match
# LANGUAGE_COUNTRY_MAP (e.g. "Swahili" includes Uganda only here, "Yoruba"
# includes Benin only there) -- confirm whether that is intentional.
LANGUAGE_TO_COUNTRIES_MAP: dict[str, list[str]] = {
    # --- East Africa ---
    "Swahili": ["Kenya", "Tanzania", "Uganda", "DR Congo"],
    "Dholuo": ["Kenya"],
    "Dholuo (Luo)": ["Kenya"],
    "Kalenjin": ["Kenya"],
    "Kikuyu": ["Kenya"],
    "Kamba": ["Kenya"],
    "Maasai": ["Kenya", "Tanzania"],
    "Kidaw'ida": ["Kenya", "Tanzania"],
    "Luganda": ["Uganda"],
    "Ganda": ["Uganda"],
    "Ganda (Luganda)": ["Uganda"],
    "Kinyarwanda": ["Rwanda"],
    "Somali": ["Somalia", "Kenya", "Ethiopia"],
    "Amharic": ["Ethiopia"],
    "Tigrinya": ["Eritrea", "Ethiopia"],
    "Tigre": ["Eritrea"],
    "Oromo": ["Ethiopia"],
    # --- Southern Africa ---
    "Afrikaans": ["South Africa", "Namibia"],
    "Zulu": ["South Africa"],
    "Xhosa": ["South Africa"],
    "Setswana": ["Botswana", "South Africa"],
    "Sesotho": ["South Africa", "Lesotho"],
    "Northern Sotho": ["South Africa"],
    "Xitsonga": ["South Africa", "Mozambique"],
    "Tshivenda": ["South Africa"],
    "Shona": ["Zimbabwe"],
    "Nyanja": ["Malawi", "Zambia"],
    "Nyungwe": ["Mozambique"],
    "Ekoti": ["Mozambique"],
    # --- West Africa ---
    "Yoruba": ["Nigeria"],
    "Igbo": ["Nigeria"],
    "Hausa": ["Nigeria", "Niger"],
    "Wolof": ["Senegal", "Gambia"],
    "Fula": ["Senegal", "Guinea", "Mali", "Nigeria"],
    "Twi": ["Ghana"],
    "Dioula": ["Burkina Faso", "Côte d'Ivoire", "Mali"],
    "Basaa": ["Cameroon"],
    # --- Central Africa ---
    "Lingala": ["DR Congo", "Republic of Congo"],
    "Umbundu": ["Angola"],
    # --- North Africa ---
    "Arabic": ["Egypt", "Libya", "Tunisia", "Algeria", "Morocco", "Sudan"],
    "Kabyle": ["Algeria"],
    "Tamazight": ["Morocco", "Algeria"],
}
# Country name -> African region (geographical).
# Built from region-grouped tuples so each region label is written once;
# insertion order is region by region, countries in the order listed.
COUNTRY_TO_REGION_MAP: dict[str, str] = {
    country: region
    for region, countries in (
        (
            "East Africa",
            (
                "Kenya",
                "Tanzania",
                "Uganda",
                "Rwanda",
                "Burundi",
                "Ethiopia",
                "Eritrea",
                "Somalia",
                "Djibouti",
            ),
        ),
        (
            "Southern Africa",
            (
                "South Africa",
                "Namibia",
                "Botswana",
                "Zimbabwe",
                "Zambia",
                "Malawi",
                "Mozambique",
                "Angola",
                "Lesotho",
                "Eswatini",
            ),
        ),
        (
            "West Africa",
            (
                "Nigeria",
                "Ghana",
                "Senegal",
                "Gambia",
                "Guinea",
                "Mali",
                "Burkina Faso",
                "Côte d'Ivoire",
                "Niger",
                "Cameroon",
            ),
        ),
        (
            "Central Africa",
            (
                "DR Congo",
                "Republic of Congo",
                "Central African Republic",
                "Gabon",
            ),
        ),
        (
            "North Africa",
            (
                "Egypt",
                "Libya",
                "Tunisia",
                "Algeria",
                "Morocco",
                "Sudan",
            ),
        ),
    )
    for country in countries
}
# =============================================================================
# Sample Counts
# =============================================================================
# Per-language sample counts from PazaBench.
# Note: These counts represent unique samples per language.
# The values sum to 204,492 samples.
# The table has 40 keys: "Luganda" and "Ganda" are listed separately.
# LANGUAGE_NAME_MAPPING maps "Ganda" to "Luganda", which is presumably why
# PazaBench is described as covering 39 African languages -- confirm.
LANGUAGE_SAMPLE_COUNTS: dict[str, int] = {
    "Hausa": 22_628,
    "Yoruba": 20_612,
    "Igbo": 18_582,
    "Kabyle": 15_003,
    "Kinyarwanda": 14_800,
    "Swahili": 14_422,
    "Kikuyu": 12_980,
    "Luganda": 11_875,
    "Arabic": 10_508,
    "Kalenjin": 8_881,
    "Dholuo": 7_111,
    "Setswana": 5_633,
    "Somali": 5_601,
    "Xhosa": 4_679,
    "Xitsonga": 4_394,
    "Zulu": 3_880,
    "Sesotho": 3_719,
    "Maasai": 2_903,
    "Wolof": 2_371,
    "Tshivenda": 1_652,
    "Tigre": 1_607,
    "Basaa": 1_550,
    "Amharic": 1_132,
    "Kidaw'ida": 1_004,
    "Shona": 925,
    "Kamba": 827,
    "Northern Sotho": 790,
    "Nyanja": 761,
    "Ganda": 723,
    "Fula": 660,
    "Lingala": 478,
    "Ekoti": 414,
    "Afrikaans": 389,
    "Umbundu": 379,
    "Nyungwe": 248,
    "Tamazight": 230,
    "Dioula": 63,
    "Oromo": 41,
    "Twi": 21,
    "Tigrinya": 16,
}
# =============================================================================
# Visualization Interpretation Text
# =============================================================================
# Markdown-style bullet lists explaining how to read each visualization,
# keyed by an identifier for the visualization.
# Every text starts with a newline and ends with a newline; adjacent string
# literals are concatenated so each bullet sits on its own source line.
INTERPRETATIONS: dict[str, str] = {
    # Speed vs. accuracy bubble chart (WER on X, RTFx on Y).
    "speed_accuracy": (
        "\n"
        "- Each bubble represents a specific model; **bubble size = model parameter count**\n"
        "- **X-axis (WER)**: Left is better (more accurate)\n"
        "- **Y-axis (RTFx)**: Up is better (faster processing)\n"
        "- **Top-left quadrant (⭐)**: Ideal zone - fast AND accurate models\n"
        "- Gray dashed lines show median values for reference\n"
        "- Uses median values to reduce impact of outliers\n"
        "- Hover over bubbles to see exact parameter counts (e.g., 1.5B, 300M)\n"
    ),
    # WER leaderboard bar chart.
    "leaderboard": (
        "\n"
        "- **No language selected**: Shows model families (aggregated across all languages)\n"
        "- **Language(s) selected**: Shows top 15 individual models for those languages\n"
        "- The horizontal bars show the median Word Error Rate (WER)\n"
        "- Lower WER values (left side) indicate better accuracy\n"
        "- Error bars represent the standard deviation, showing variability\n"
        "- Bar colors correspond to each model family's assigned color\n"
        "- Hover over bars to see additional metrics like RTFx (speed) and total samples evaluated\n"
        "- Uses median instead of mean to reduce impact of outliers\n"
    ),
    # CER leaderboard bar chart.
    "cer_leaderboard": (
        "\n"
        "- **No language selected**: Shows model families (aggregated across all languages)\n"
        "- **Language(s) selected**: Shows top 15 individual models for those languages\n"
        "- The horizontal bars show the median Character Error Rate (CER)\n"
        "- Lower CER values (left side) indicate better accuracy\n"
        "- CER is especially important for agglutinative and low-resource languages\n"
        "- Error bars represent the standard deviation, showing variability\n"
        "- Bar colors correspond to each model family's assigned color\n"
        "- Hover over bars to see additional metrics like WER, RTFx (speed) and total samples evaluated\n"
        "- Uses median instead of mean to reduce impact of outliers\n"
    ),
    # CER vs. WER correlation plot.
    "correlation": (
        "\n"
        "- Each point represents one evaluation result\n"
        "- Strong positive correlation means CER and WER move together\n"
        "- Models with high character errors typically also have high word errors\n"
        "- The trend line shows the overall relationship\n"
        "- **Below the line**: Models make more accurate character-level predictions (phonetically closer errors)\n"
        "- **Above the line**: Models make more severe character-level errors per word mistake\n"
        "- Can be filtered by language to analyze specific language patterns\n"
    ),
    # Cross-language consistency (coefficient of variation) chart.
    "consistency": (
        "\n"
        "- Coefficient of Variation (CV) = (Standard Deviation / Median) × 100%\n"
        "- **Lower CV** = more consistent performance across different languages\n"
        "- **Higher CV** = performance varies widely depending on the language\n"
        "- Bar colors correspond to each model family's assigned color\n"
        "- Important for production deployment - you want consistent models\n"
        "- Outliers have been removed using IQR method for more robust analysis\n"
        "- Uses median instead of mean for more robust central tendency measure\n"
    ),
}