feat: auto-write eval/metrics.json on every eval run — gradio_app loads live metrics
Files changed:
- eval/evaluator.py   +21 -1
- eval/metrics.json   +26 -0
- gradio_app.py       +15 -4
eval/evaluator.py
CHANGED

@@ -4,6 +4,7 @@ Main evaluation orchestrator for bias detection framework.
 This module coordinates the evaluation process and provides the main interface
 for running evaluations.
 """
+import json
 from datetime import datetime
 from pathlib import Path
 from typing import List, Optional
@@ -139,7 +140,26 @@ class BiasEvaluationOrchestrator:
             csv_filename = f"f1_report_{timestamp}.csv"
             csv_path = self.results_writer.write_csv_report(csv_data, csv_filename)
             print(f"Report saved to: {csv_path}")
-
+
+            # Write metrics.json for gradio_app.py to load at startup
+            lang_map = {
+                Language.ENGLISH: "en", Language.SWAHILI: "sw",
+                Language.FRENCH: "fr", Language.GIKUYU: "ki",
+            }
+            metrics_out = {}
+            for r in results:
+                code = lang_map.get(r.language, r.language.value)
+                m = r.overall_metrics
+                metrics_out[code] = {
+                    "f1": round(m.f1_score, 3),
+                    "precision": round(m.precision, 3),
+                    "recall": round(m.recall, 3),
+                    "samples": m.true_positives + m.false_positives + m.false_negatives + m.true_negatives,
+                }
+            metrics_path = Path(__file__).resolve().parent / "metrics.json"
+            metrics_path.write_text(json.dumps(metrics_out, indent=2))
+            print(f"Metrics saved to: {metrics_path}")
+
         except Exception as e:
             print(f"Warning: Failed to save results: {e}")

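For context when reviewing: the new block assumes each entry in `results` carries a `language` enum value and an `overall_metrics` object exposing `f1_score`, `precision`, `recall`, and the four confusion-matrix counts (so `samples` is the full evaluated count, TP + FP + FN + TN). A minimal sketch of that assumed shape; the class names below are hypothetical stand-ins, only the attribute names come from the diff:

    from dataclasses import dataclass
    from enum import Enum

    class Language(Enum):          # stand-in for the framework's Language enum
        ENGLISH = "english"
        SWAHILI = "swahili"

    @dataclass
    class OverallMetrics:          # only the fields the metrics.json writer reads
        f1_score: float
        precision: float
        recall: float
        true_positives: int
        false_positives: int
        false_negatives: int
        true_negatives: int

    @dataclass
    class LanguageResult:          # one entry of `results`
        language: Language
        overall_metrics: OverallMetrics
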
eval/metrics.json
ADDED

@@ -0,0 +1,26 @@
+{
+  "en": {
+    "f1": 0.786,
+    "precision": 1.0,
+    "recall": 0.647,
+    "samples": 66
+  },
+  "sw": {
+    "f1": 0.816,
+    "precision": 0.733,
+    "recall": 0.92,
+    "samples": 64723
+  },
+  "fr": {
+    "f1": 0.542,
+    "precision": 1.0,
+    "recall": 0.371,
+    "samples": 50
+  },
+  "ki": {
+    "f1": 0.352,
+    "precision": 0.926,
+    "recall": 0.217,
+    "samples": 11848
+  }
+}

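As a quick sanity check, each stored `f1` is consistent with 2PR/(P+R) computed from the stored precision and recall (for `en`: 2 * 1.0 * 0.647 / 1.647 ≈ 0.786). A throwaway verification snippet, assuming the file sits at `eval/metrics.json` relative to the working directory:

    import json
    from pathlib import Path

    # Verify f1 == 2*P*R / (P+R) for every language entry, allowing slack
    # for the 3-decimal rounding applied by the writer.
    metrics = json.loads(Path("eval/metrics.json").read_text())
    for code, m in metrics.items():
        p, r = m["precision"], m["recall"]
        expected = 2 * p * r / (p + r) if (p + r) else 0.0
        assert abs(m["f1"] - expected) < 5e-3, (code, m["f1"], round(expected, 3))
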
gradio_app.py
CHANGED

@@ -36,15 +36,26 @@ LANGS = {
     "Gikuyu": ("ki", Language.GIKUYU),
 }
 
+# Load live metrics from eval/metrics.json if available (written by eval/evaluator.py)
+_METRICS_PATH = Path(__file__).parent / "eval" / "metrics.json"
+_LIVE_METRICS: dict = {}
+try:
+    _LIVE_METRICS = json.loads(_METRICS_PATH.read_text())
+except Exception:
+    pass
+
+def _m(code: str, field: str, fallback):
+    return _LIVE_METRICS.get(code, {}).get(field, fallback)
+
 # Per-model metrics: model_key -> lang_code -> metrics dict
 MODEL_METRICS = {
     "rules": {
         "label": "Rules-based (lexicon)",
         "description": "Deterministic lexicon rules across all 4 languages. High precision, no GPU needed.",
-        "en": dict(f1=0.885, precision=1.000, recall=0.794, tier="Pre-Bronze", samples=66),
-        "sw": dict(f1=0.821, precision=0.741, recall=0.919, tier="Gold (sample count)", samples=64_723),
-        "fr": dict(f1=0.793, precision=1.000, recall=0.657, tier="Pre-Bronze", samples=50),
-        "ki": dict(f1=0.368, precision=0.916, recall=0.231, tier="Bronze (sample count)", samples=11_848),
+        "en": dict(f1=_m("en","f1",0.885), precision=_m("en","precision",1.000), recall=_m("en","recall",0.794), tier="Pre-Bronze", samples=_m("en","samples",66)),
+        "sw": dict(f1=_m("sw","f1",0.821), precision=_m("sw","precision",0.741), recall=_m("sw","recall",0.919), tier="Gold (sample count)", samples=_m("sw","samples",64_723)),
+        "fr": dict(f1=_m("fr","f1",0.793), precision=_m("fr","precision",1.000), recall=_m("fr","recall",0.657), tier="Pre-Bronze", samples=_m("fr","samples",50)),
+        "ki": dict(f1=_m("ki","f1",0.368), precision=_m("ki","precision",0.916), recall=_m("ki","recall",0.231), tier="Bronze (sample count)", samples=_m("ki","samples",11_848)),
     },
     "ml_classifier": {
         "label": "sw-bias-classifier-v2 (ML)",
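
The `_m` helper makes the metrics table degrade gracefully: if `eval/metrics.json` is absent or unreadable, `_LIVE_METRICS` stays empty and every lookup returns its hard-coded fallback, so the app renders with the old baked-in numbers. A standalone rehearsal of that behavior (same logic as the diff, run outside the app):

    _LIVE_METRICS: dict = {}

    def _m(code: str, field: str, fallback):
        return _LIVE_METRICS.get(code, {}).get(field, fallback)

    assert _m("en", "f1", 0.885) == 0.885        # no metrics file: fallback wins
    _LIVE_METRICS.update({"en": {"f1": 0.786}})
    assert _m("en", "f1", 0.885) == 0.786        # live value overrides fallback
    assert _m("en", "recall", 0.794) == 0.794    # field missing from file: fallback
    assert _m("ki", "f1", 0.368) == 0.368        # language missing from file: fallback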