TheAarvee05 committed on
Commit
338aebc
·
verified ·
1 Parent(s): c030db3

Upload evaluation/metrics.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. evaluation/metrics.py +37 -0
evaluation/metrics.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ evaluation/metrics.py — Aggregate metrics across episodes and tasks.
3
+ """
4
+
5
+ from __future__ import annotations
6
+ from typing import List, Dict
7
+ import statistics
8
+
9
+
10
def summarise_results(results: List[Dict]) -> Dict:
    """Aggregate per-episode result dicts into a single summary dict.

    Each entry of ``results`` is read for the keys ``"score"``, ``"passed"``
    and ``"difficulty"``, plus an optional ``"breakdown"`` mapping of
    per-episode metric values.

    Args:
        results: Per-episode result dicts. May be empty.

    Returns:
        A dict with score statistics (``mean_score``, ``median_score``,
        ``min_score``, ``max_score``), ``pass_rate``, the mean of selected
        breakdown metrics, and ``by_difficulty`` (mean score for each of
        "easy", "medium", "hard" that occurs in ``results``). All values are
        rounded to 4 decimal places. A breakdown metric that no result
        reports is summarised as ``0.0``. For empty ``results`` every number
        is ``0.0`` and ``by_difficulty`` is empty.
    """
    if not results:
        # statistics.mean/median raise on empty data and the pass rate would
        # divide by zero, so return an all-zero summary instead of crashing.
        return {
            "mean_score": 0.0,
            "median_score": 0.0,
            "min_score": 0.0,
            "max_score": 0.0,
            "pass_rate": 0.0,
            "mean_gap_reduction": 0.0,
            "mean_signal_recovery": 0.0,
            "mean_roas_improvement": 0.0,
            "mean_action_efficiency": 0.0,
            "mean_redundancy_penalty": 0.0,
            "by_difficulty": {},
        }

    scores = [r["score"] for r in results]

    def _breakdown_values(key: str) -> list:
        """Collect the non-None values stored under ``key`` in each breakdown."""
        values = []
        for r in results:
            # Tolerate a missing or explicitly-None breakdown.
            breakdown = r.get("breakdown") or {}
            value = breakdown.get(key)
            if value is not None:
                values.append(value)
        return values

    def _mean_from_breakdown(*keys: str) -> float:
        """Return the rounded mean for the first of ``keys`` that any result reports.

        Later keys are fallbacks used only when an earlier key is absent from
        every breakdown; a legitimate mean of 0.0 does not trigger the fallback.
        """
        for key in keys:
            values = _breakdown_values(key)
            if values:
                return round(statistics.mean(values), 4)
        return 0.0

    by_difficulty = {}
    for diff in ("easy", "medium", "hard"):
        diff_scores = [r["score"] for r in results if r["difficulty"] == diff]
        # Difficulties with no episodes are omitted rather than reported as 0.
        if diff_scores:
            by_difficulty[diff] = round(statistics.mean(diff_scores), 4)

    passed_count = sum(1 for r in results if r["passed"])

    return {
        "mean_score": round(statistics.mean(scores), 4),
        "median_score": round(statistics.median(scores), 4),
        "min_score": round(min(scores), 4),
        "max_score": round(max(scores), 4),
        "pass_rate": round(passed_count / len(results), 4),
        "mean_gap_reduction": _mean_from_breakdown("gap_closed"),
        "mean_signal_recovery": _mean_from_breakdown("signal_recovery"),
        # "roas_improvement" is only a fallback name for "roas_gain".
        "mean_roas_improvement": _mean_from_breakdown("roas_gain", "roas_improvement"),
        "mean_action_efficiency": _mean_from_breakdown("action_efficiency"),
        "mean_redundancy_penalty": _mean_from_breakdown("redundant_action_penalty"),
        "by_difficulty": by_difficulty,
    }