Spaces:

akera
/

leaderboard

Sleeping

App Files Community

akera committed on Jun 16, 2025

Commit

d5b83bc

verified ·

1 Parent(s): a729bca

Update config.py

Browse files

Files changed (1) hide show

config.py +177 -74

config.py CHANGED Viewed

@@ -7,96 +7,199 @@ LEADERBOARD_DATASET = "Sunbird/salt-translation-leaderboard"
 TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
 SALT_DATASET = "sunbird/salt"
-# Language settings - ALL UG40 LANGUAGES (Updated from SALT constants)
-ALL_UG40_LANGUAGES = [
-    'ach', 'eng', 'lgg', 'lug', 'nyn', 'rny', 'teo', 'swa'
-]
 LANGUAGE_NAMES = {
-    'ach': 'Acholi',
-    'eng': 'English',
-    'lgg': 'Lugbara',
-    'lug': 'Luganda',
-    'nyn': 'Runyankole',
-    'rny': 'Runyoro',
-    'teo': 'Ateso',
-    'swa': 'Swahili'
 }
-# Google Translate supported subset (for comparison)
-GOOGLE_SUPPORTED_LANGUAGES = ['lug', 'ach', 'swa', 'eng']
 # Google Translate language mapping
-GOOGLE_LANG_MAP = {
-    'lug': 'lg',
-    'ach': 'ach',
-    'swa': 'sw',
-    'eng': 'en'
-}
-# Evaluation settings
-MAX_TEST_SAMPLES = 500  # Per language pair
-MIN_SAMPLES_PER_PAIR = 10  # Minimum samples to be valid
-# UI settings
-TITLE = "🏆 SALT Translation Leaderboard"
-DESCRIPTION = """
-Evaluation of translation models on Ugandan languages
-Upload your model's predictions on our standardized test set to see how it performs across all UG40 language pairs.
-Compare against Google Translate baseline and other submitted models.
-"""
-# File format specifications
-PREDICTION_FORMAT = {
-    'required_columns': ['sample_id', 'prediction'],
-    'optional_columns': ['model_name', 'confidence'],
-    'file_types': ['.csv', '.tsv', '.json']
 }
-# Metrics configuration - Updated to match reference implementation
 METRICS_CONFIG = {
-    'primary_metrics': ['bleu', 'chrf', 'quality_score'],
-    'secondary_metrics': ['rouge1', 'rouge2', 'rougeL', 'cer', 'wer', 'len_ratio'],
-    'display_precision': 4,
-    'quality_score_components': [
-        'bleu',     # normalized to 0-1
-        'chrf',     # already 0-1
-        'cer',      # inverted (1-cer)
-        'wer',      # inverted (1-wer)
-        'rouge1',   # 0-1
-        'rougeL'    # 0-1
     ],
-    'error_metrics': ['cer', 'wer'],  # Lower is better
-    'score_metrics': ['bleu', 'chrf', 'quality_score', 'rouge1', 'rouge2', 'rougeL']  # Higher is better
 }
-# Display settings for leaderboard
-DISPLAY_CONFIG = {
-    'max_models_radar': 8,
-    'max_models_ranking': 15,
-    'max_language_pairs_detail': 20,
-    'decimal_places': {
-        'quality_score': 4,
-        'bleu': 2,
-        'chrf': 4,
-        'rouge1': 4,
-        'rouge2': 4,
-        'rougeL': 4,
-        'cer': 4,
-        'wer': 4,
-        'len_ratio': 3,
-        'coverage_rate': 1  # percentage
-    }
 }
-# Chart colors and styling
 CHART_CONFIG = {
-    'google_comparable_color': '#1f77b4',
-    'ug40_only_color': '#ff7f0e',
-    'primary_colorscale': 'Viridis',
-    'secondary_colorscale': 'Plasma',
-    'bar_height_per_model': 30,
-    'min_chart_height': 400,
-    'max_chart_height': 1000
 }

 TEST_SET_DATASET = "Sunbird/salt-translation-test-set"
 SALT_DATASET = "sunbird/salt"
+# Language settings - ALL UG40 LANGUAGES
+ALL_UG40_LANGUAGES = ["ach", "eng", "lgg", "lug", "nyn", "rny", "teo", "swa"]
 LANGUAGE_NAMES = {
+    "ach": "Acholi",
+    "eng": "English",
+    "lgg": "Lugbara",
+    "lug": "Luganda",
+    "nyn": "Runyankole",
+    "rny": "Runyoro",
+    "teo": "Ateso",
+    "swa": "Swahili",
 }
+# Google Translate supported subset (for fair comparison)
+GOOGLE_SUPPORTED_LANGUAGES = ["lug", "ach", "swa", "eng"]
 # Google Translate language mapping
+GOOGLE_LANG_MAP = {"lug": "lg", "ach": "ach", "swa": "sw", "eng": "en"}
+# SCIENTIFIC EVALUATION TRACKS
+EVALUATION_TRACKS = {
+    "google_comparable": {
+        "name": "Google-Comparable Track",
+        "description": "Models evaluated only on language pairs supported by Google Translate",
+        "languages": GOOGLE_SUPPORTED_LANGUAGES,
+        "min_samples_per_pair": 50,
+        "statistical_power": 0.8,
+        "significance_level": 0.05,
+    },
+    "ug40_complete": {
+        "name": "UG40-Complete Track",
+        "description": "Models evaluated on all UG40 language pairs",
+        "languages": ALL_UG40_LANGUAGES,
+        "min_samples_per_pair": 30,
+        "statistical_power": 0.8,
+        "significance_level": 0.05,
+    },
+    "language_pair_matrix": {
+        "name": "Language-Pair Matrix",
+        "description": "Individual language pair analysis with statistical significance",
+        "languages": ALL_UG40_LANGUAGES,
+        "min_samples_per_pair": 20,
+        "statistical_power": 0.7,
+        "significance_level": 0.05,
+    },
+}
+# MODEL CATEGORIES
+MODEL_CATEGORIES = {
+    "commercial": {
+        "name": "Commercial Systems",
+        "description": "Production translation systems",
+        "examples": ["google_translate", "azure_translator"],
+        "color": "#1f77b4",
+    },
+    "research": {
+        "name": "Research Models",
+        "description": "Academic and research institution models",
+        "examples": ["nllb", "m2m100"],
+        "color": "#ff7f0e",
+    },
+    "baseline": {
+        "name": "Baseline Models",
+        "description": "Simple baseline and reference models",
+        "examples": ["word_lookup", "frequency_baseline"],
+        "color": "#2ca02c",
+    },
+    "community": {
+        "name": "Community Submissions",
+        "description": "User-submitted models and fine-tuned variants",
+        "examples": ["user_submission"],
+        "color": "#d62728",
+    },
+}
+# STATISTICAL SETTINGS
+STATISTICAL_CONFIG = {
+    "confidence_level": 0.95,
+    "bootstrap_samples": 1000,
+    "min_samples_for_ci": 20,
+    "effect_size_thresholds": {
+        "small": 0.2,
+        "medium": 0.5,
+        "large": 0.8,
+    },
+    "multiple_testing_correction": "bonferroni",
+    "outlier_detection": {
+        "method": "iqr",
+        "factor": 1.5,
+    },
 }
+# METRICS CONFIGURATION - Enhanced for statistical analysis
 METRICS_CONFIG = {
+    "primary_metrics": ["bleu", "chrf", "quality_score"],
+    "secondary_metrics": ["rouge1", "rouge2", "rougeL", "cer", "wer", "len_ratio"],
+    "display_precision": 4,
+    "quality_score_components": ["bleu", "chrf", "cer", "wer", "rouge1", "rougeL"],
+    "error_metrics": ["cer", "wer"],  # Lower is better
+    "score_metrics": ["bleu", "chrf", "quality_score", "rouge1", "rouge2", "rougeL"],
+    "statistical_metrics": [
+        "mean",
+        "std",
+        "median",
+        "ci_lower",
+        "ci_upper",
+        "p_value",
+        "effect_size",
     ],
 }
+# VALIDATION REQUIREMENTS
+VALIDATION_CONFIG = {
+    "min_samples_per_track": {
+        "google_comparable": 200,
+        "ug40_complete": 400,
+        "language_pair_matrix": 50,
+    },
+    "max_missing_rate": 0.05,  # 5% missing predictions allowed
+    "quality_thresholds": {
+        "min_valid_predictions": 0.95,
+        "max_duplicate_rate": 0.1,
+        "min_avg_length": 3,
+        "max_avg_length": 500,
+    },
+}
+# UI CONFIGURATION
+UI_CONFIG = {
+    "title": "🏆 SALT Translation Leaderboard - Scientific Edition",
+    "description": """
+    Rigorous evaluation of translation models on Ugandan languages with statistical significance testing.
+    Three evaluation tracks ensure fair comparison across different model capabilities and language support.
+    """,
+    "tracks": {
+        "google_comparable": {
+            "tab_name": "🤖 Google-Comparable Track",
+            "icon": "🤖",
+            "color": "#1f77b4",
+        },
+        "ug40_complete": {
+            "tab_name": "🌍 UG40-Complete Track",
+            "icon": "🌍",
+            "color": "#ff7f0e",
+        },
+        "language_pair_matrix": {
+            "tab_name": "📊 Language-Pair Matrix",
+            "icon": "📊",
+            "color": "#2ca02c",
+        },
+    },
 }
+# CHART CONFIGURATION - Research-grade styling
 CHART_CONFIG = {
+    "statistical_colorscale": "RdYlBu_r",
+    "category_colors": {cat: info["color"] for cat, info in MODEL_CATEGORIES.items()},
+    "heatmap_config": {
+        "colorscale": "Viridis",
+        "show_values": True,
+        "font_size": 10,
+    },
+    "confidence_interval_config": {
+        "alpha": 0.3,
+        "line_width": 2,
+        "marker_size": 8,
+    },
+    "statistical_plot_config": {
+        "height": 600,
+        "width": 800,
+        "margin": {"l": 100, "r": 50, "t": 50, "b": 100},
+    },
+}
+# FILE FORMAT SPECIFICATIONS
+PREDICTION_FORMAT = {
+    "required_columns": ["sample_id", "prediction"],
+    "optional_columns": ["model_name", "confidence", "category"],
+    "file_types": [".csv", ".tsv", ".json"],
+    "category_detection": {
+        "google": ["google", "translate"],
+        "nllb": ["nllb", "meta"],
+        "m2m": ["m2m", "facebook"],
+        "baseline": ["baseline", "simple", "lookup"],
+    },
+}
+# EVALUATION SETTINGS
+MAX_TEST_SAMPLES = 500  # Per language pair
+MIN_SAMPLES_PER_PAIR = 10  # Minimum for basic statistics
+SAMPLE_SIZE_RECOMMENDATIONS = {
+    "basic_comparison": 50,
+    "statistical_significance": 100,
+    "publication_quality": 200,
 }