# Spaces: Sleeping
# config.py
"""Configuration constants for the translation leaderboard."""

import os

# HuggingFace settings
# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
# Dataset identifiers (presumably Hugging Face Hub repo ids -- confirm).
# Casing is preserved exactly: SALT_DATASET uses a lowercase "sunbird"
# namespace, unlike the other two.
LEADERBOARD_DATASET = "Sunbird/salt-translation-leaderboard"
TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
SALT_DATASET = "sunbird/salt"
# Language settings - ALL UG40 LANGUAGES
# NOTE(review): ISO 639-3 lists Nyoro as "nyo"; confirm that "rny" matches
# the language codes actually used by the datasets.
ALL_UG40_LANGUAGES = ["ach", "eng", "lgg", "lug", "nyn", "rny", "teo", "swa"]

# Human-readable name for every code in ALL_UG40_LANGUAGES.
LANGUAGE_NAMES = {
    "ach": "Acholi",
    "eng": "English",
    "lgg": "Lugbara",
    "lug": "Luganda",
    "nyn": "Runyankole",
    "rny": "Runyoro",
    "teo": "Ateso",
    "swa": "Swahili",
}

# Google Translate supported subset (for fair comparison)
GOOGLE_SUPPORTED_LANGUAGES = ["lug", "ach", "swa", "eng"]
# EVALUATION TRACKS
# Maps a track id to its display name, description, language list and the
# per-pair sample minimum for that track.
# The "languages" entries are the module-level lists themselves (the same
# objects, not copies), so mutating one is visible through the other.
EVALUATION_TRACKS = {
    "google_comparable": {
        "name": "Google-Comparable Track",
        "description": "Models evaluated on language pairs supported by Google Translate for commercial comparison",
        "languages": GOOGLE_SUPPORTED_LANGUAGES,
        "min_samples_per_pair": 50,
    },
    "ug40_complete": {
        "name": "UG40-Complete Track",
        "description": "Models evaluated on all UG40 language pairs for comprehensive assessment",
        "languages": ALL_UG40_LANGUAGES,
        "min_samples_per_pair": 30,
    },
}
# MODEL CATEGORIES
# Maps a category id to its display name, description, example model ids
# and a hex color string.
MODEL_CATEGORIES = {
    "commercial": {
        "name": "Commercial Systems",
        "description": "Production translation systems",
        "examples": ["google_translate", "azure_translator"],
        "color": "#1f77b4",
    },
    "research": {
        "name": "Research Models",
        "description": "Academic and research institution models",
        "examples": ["nllb", "m2m100"],
        "color": "#ff7f0e",
    },
    "baseline": {
        "name": "Baseline Models",
        "description": "Simple baseline and reference models",
        "examples": ["word_lookup", "frequency_baseline"],
        "color": "#2ca02c",
    },
    "community": {
        "name": "Community Submissions",
        "description": "User-submitted models and fine-tuned variants",
        "examples": ["user_submission"],
        "color": "#d62728",
    },
}
# METRICS CONFIGURATION
# Metric names grouped as primary/secondary, plus display and
# confidence-interval settings.
METRICS_CONFIG = {
    "primary_metrics": ["bleu", "chrf", "quality_score"],
    "secondary_metrics": ["rouge1", "rougeL", "cer", "wer"],
    "display_precision": 4,  # presumably decimal places shown -- confirm
    "confidence_level": 0.95,
    "bootstrap_samples": 1000,
    "min_samples_for_ci": 20,
}
# VALIDATION REQUIREMENTS
VALIDATION_CONFIG = {
    # Keyed by the same track ids used in EVALUATION_TRACKS.
    "min_samples_per_track": {
        "google_comparable": 200,
        "ug40_complete": 400,
    },
    "max_missing_rate": 0.05,  # 5% missing predictions allowed
    "quality_thresholds": {
        # NOTE(review): 0.95 equals 1 - max_missing_rate; confirm whether
        # the two values are meant to be kept in sync.
        "min_valid_predictions": 0.95,
        "max_duplicate_rate": 0.1,
        "min_avg_length": 3,
        "max_avg_length": 500,
    },
}
# FILE FORMAT SPECIFICATIONS
# Column names and file extensions for prediction files, plus keyword lists
# per detection label under "category_detection".
PREDICTION_FORMAT = {
    "required_columns": ["sample_id", "prediction"],
    "optional_columns": ["model_name", "confidence", "category"],
    "file_types": [".csv", ".tsv", ".json"],
    # NOTE(review): these labels ("google", "nllb", "m2m", "baseline") are
    # not the same ids as the MODEL_CATEGORIES keys -- confirm the mapping.
    "category_detection": {
        "google": ["google", "translate"],
        "nllb": ["nllb", "meta"],
        "m2m": ["m2m", "facebook"],
        "baseline": ["baseline", "simple", "lookup"],
    },
}
# EVALUATION SETTINGS
MAX_TEST_SAMPLES = 500  # Per language pair
# NOTE(review): separate from the per-track "min_samples_per_pair" values
# (50 / 30) in EVALUATION_TRACKS -- confirm which limit applies where.
MIN_SAMPLES_PER_PAIR = 10  # Minimum for basic statistics
# CHART CONFIGURATION
CHART_CONFIG = {
    # Derived from MODEL_CATEGORIES at import time: category id -> hex color.
    "category_colors": {cat: info["color"] for cat, info in MODEL_CATEGORIES.items()},
    "height": 600,  # presumably pixels -- confirm against the plotting code
    "width": 800,
    # presumably left/right/top/bottom margins -- confirm against the
    # plotting library in use
    "margin": {"l": 100, "r": 50, "t": 50, "b": 100},
}