AcharO committed on
Commit
96de871
·
1 Parent(s): a1672c4

feat: auto-write eval/metrics.json on every eval run — gradio_app loads live metrics

Browse files
Files changed (3) hide show
  1. eval/evaluator.py +21 -1
  2. eval/metrics.json +26 -0
  3. gradio_app.py +15 -4
eval/evaluator.py CHANGED
@@ -4,6 +4,7 @@ Main evaluation orchestrator for bias detection framework.
4
  This module coordinates the evaluation process and provides the main interface
5
  for running evaluations.
6
  """
 
7
  from datetime import datetime
8
  from pathlib import Path
9
  from typing import List, Optional
@@ -139,7 +140,26 @@ class BiasEvaluationOrchestrator:
139
  csv_filename = f"f1_report_{timestamp}.csv"
140
  csv_path = self.results_writer.write_csv_report(csv_data, csv_filename)
141
  print(f"Report saved to: {csv_path}")
142
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  except Exception as e:
144
  print(f"Warning: Failed to save results: {e}")
145
 
 
4
  This module coordinates the evaluation process and provides the main interface
5
  for running evaluations.
6
  """
7
+ import json
8
  from datetime import datetime
9
  from pathlib import Path
10
  from typing import List, Optional
 
140
  csv_filename = f"f1_report_{timestamp}.csv"
141
  csv_path = self.results_writer.write_csv_report(csv_data, csv_filename)
142
  print(f"Report saved to: {csv_path}")
143
+
144
+ # Write metrics.json for gradio_app.py to load at startup
145
+ lang_map = {
146
+ Language.ENGLISH: "en", Language.SWAHILI: "sw",
147
+ Language.FRENCH: "fr", Language.GIKUYU: "ki",
148
+ }
149
+ metrics_out = {}
150
+ for r in results:
151
+ code = lang_map.get(r.language, r.language.value)
152
+ m = r.overall_metrics
153
+ metrics_out[code] = {
154
+ "f1": round(m.f1_score, 3),
155
+ "precision": round(m.precision, 3),
156
+ "recall": round(m.recall, 3),
157
+ "samples": m.true_positives + m.false_positives + m.false_negatives + m.true_negatives,
158
+ }
159
+ metrics_path = Path(__file__).resolve().parent / "metrics.json"
160
+ metrics_path.write_text(json.dumps(metrics_out, indent=2))
161
+ print(f"Metrics saved to: {metrics_path}")
162
+
163
  except Exception as e:
164
  print(f"Warning: Failed to save results: {e}")
165
 
eval/metrics.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": {
3
+ "f1": 0.786,
4
+ "precision": 1.0,
5
+ "recall": 0.647,
6
+ "samples": 66
7
+ },
8
+ "sw": {
9
+ "f1": 0.816,
10
+ "precision": 0.733,
11
+ "recall": 0.92,
12
+ "samples": 64723
13
+ },
14
+ "fr": {
15
+ "f1": 0.542,
16
+ "precision": 1.0,
17
+ "recall": 0.371,
18
+ "samples": 50
19
+ },
20
+ "ki": {
21
+ "f1": 0.352,
22
+ "precision": 0.926,
23
+ "recall": 0.217,
24
+ "samples": 11848
25
+ }
26
+ }
gradio_app.py CHANGED
@@ -36,15 +36,26 @@ LANGS = {
36
  "Gikuyu": ("ki", Language.GIKUYU),
37
  }
38
 
 
 
 
 
 
 
 
 
 
 
 
39
  # Per-model metrics: model_key -> lang_code -> metrics dict
40
  MODEL_METRICS = {
41
  "rules": {
42
  "label": "Rules-based (lexicon)",
43
  "description": "Deterministic lexicon rules across all 4 languages. High precision, no GPU needed.",
44
- "en": dict(f1=0.885, precision=1.000, recall=0.794, tier="Pre-Bronze", samples=66),
45
- "sw": dict(f1=0.821, precision=0.741, recall=0.919, tier="Gold (sample count)", samples=64_723),
46
- "fr": dict(f1=0.793, precision=1.000, recall=0.657, tier="Pre-Bronze", samples=50),
47
- "ki": dict(f1=0.368, precision=0.916, recall=0.231, tier="Bronze (sample count)", samples=11_848),
48
  },
49
  "ml_classifier": {
50
  "label": "sw-bias-classifier-v2 (ML)",
 
36
  "Gikuyu": ("ki", Language.GIKUYU),
37
  }
38
 
39
+ # Load live metrics from eval/metrics.json if available (written by eval/evaluator.py)
40
+ _METRICS_PATH = Path(__file__).parent / "eval" / "metrics.json"
41
+ _LIVE_METRICS: dict = {}
42
+ try:
43
+ _LIVE_METRICS = json.loads(_METRICS_PATH.read_text())
44
+ except Exception:
45
+ pass
46
+
47
+ def _m(code: str, field: str, fallback):
48
+ return _LIVE_METRICS.get(code, {}).get(field, fallback)
49
+
50
  # Per-model metrics: model_key -> lang_code -> metrics dict
51
  MODEL_METRICS = {
52
  "rules": {
53
  "label": "Rules-based (lexicon)",
54
  "description": "Deterministic lexicon rules across all 4 languages. High precision, no GPU needed.",
55
+ "en": dict(f1=_m("en","f1",0.885), precision=_m("en","precision",1.000), recall=_m("en","recall",0.794), tier="Pre-Bronze", samples=_m("en","samples",66)),
56
+ "sw": dict(f1=_m("sw","f1",0.821), precision=_m("sw","precision",0.741), recall=_m("sw","recall",0.919), tier="Gold (sample count)", samples=_m("sw","samples",64_723)),
57
+ "fr": dict(f1=_m("fr","f1",0.793), precision=_m("fr","precision",1.000), recall=_m("fr","recall",0.657), tier="Pre-Bronze", samples=_m("fr","samples",50)),
58
+ "ki": dict(f1=_m("ki","f1",0.368), precision=_m("ki","precision",0.916), recall=_m("ki","recall",0.231), tier="Bronze (sample count)", samples=_m("ki","samples",11_848)),
59
  },
60
  "ml_classifier": {
61
  "label": "sw-bias-classifier-v2 (ML)",