# strategy-sync-ai/src/evaluation.py
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Tuple, Iterable, Set, Any
import math

from .alignment import AlignmentEngine
from .models import StrategicObjective, ActionTask


@dataclass
class EvalConfig:
    top_k: int = 5


def precision_recall_at_k(
    pred_ids: List[str], truth_ids: Set[str], k: int
) -> Tuple[float, float]:
    """Precision and recall of the top-k predicted IDs against a ground-truth set."""
    preds = pred_ids[:k]
    hits = sum(1 for pid in preds if pid in truth_ids)
    precision = hits / max(1, len(preds))
    recall = hits / max(1, len(truth_ids))
    return precision, recall


def average_precision(pred_ids: List[str], truth_ids: Set[str]) -> float:
    """Average precision of a ranked prediction list against a ground-truth set."""
    hits = 0
    ap_sum = 0.0
    for i, pid in enumerate(pred_ids, start=1):
        if pid in truth_ids:
            hits += 1
            ap_sum += hits / i
    # Normalise by the number of relevant items (standard AP), so strategies whose
    # relevant actions never appear in the ranking are penalised rather than
    # scored only on the hits that were retrieved.
    return ap_sum / max(1, len(truth_ids))


def ndcg_at_k(pred_ids: List[str], truth_ids: Set[str], k: int) -> float:
    """Normalised DCG over the top-k predictions with binary relevance (1 if in truth, else 0)."""
    dcg = 0.0
    for i, pid in enumerate(pred_ids[:k], start=1):
        rel = 1.0 if pid in truth_ids else 0.0
        dcg += rel / math.log2(i + 1)
    # Ideal DCG assumes all relevant items are ranked first
    ideal_rel_count = min(len(truth_ids), k)
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, ideal_rel_count + 1))
    return dcg / idcg if idcg > 0 else 0.0
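

# Worked example of the three ranking metrics above (IDs are illustrative only):
#   preds = ["A1", "A2", "A3"], truth = {"A2"}, k = 3
#   precision_recall_at_k -> precision = 1/3, recall = 1/1 = 1.0
#   average_precision     -> (1/2) / 1 = 0.5   (the only relevant item sits at rank 2)
#   ndcg_at_k             -> (1/log2(3)) / (1/log2(2)) ≈ 0.631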


@dataclass
class StrategyEval:
    strategy_id: str
    precision_at_k: float
    recall_at_k: float
    ap: float
    ndcg: float


@dataclass
class EvalSummary:
    top_k: int
    macro_precision: float
    macro_recall: float
    map: float
    mean_ndcg: float
    per_strategy: List[StrategyEval]
    similarity_summary: Dict[str, float] | None = None


def evaluate_alignment(
    engine: AlignmentEngine,
    strategies: Iterable[StrategicObjective],
    actions: Iterable[ActionTask],
    ground_truth: Dict[str, List[str]],
    config: EvalConfig | None = None,
) -> EvalSummary:
    """Run the alignment engine and score its rankings against a ground-truth mapping."""
    cfg = config or EvalConfig()
    # Run alignment retrieval
    result = engine.align(
        strategies=list(strategies), actions=list(actions), top_k=cfg.top_k
    )
    per_strategy: List[StrategyEval] = []
    p_list: List[float] = []
    r_list: List[float] = []
    ap_list: List[float] = []
    ndcg_list: List[float] = []
    for sres in result["strategy_results"]:
        sid = sres["strategy_id"]
        preds = [m["action_id"] for m in sres.get("top_matches", [])]
        truth = set(ground_truth.get(sid, []))
        p, r = precision_recall_at_k(preds, truth, cfg.top_k)
        ap = average_precision(preds, truth)
        nd = ndcg_at_k(preds, truth, cfg.top_k)
        per_strategy.append(
            StrategyEval(
                strategy_id=sid,
                precision_at_k=p,
                recall_at_k=r,
                ap=ap,
                ndcg=nd,
            )
        )
        p_list.append(p)
        r_list.append(r)
        ap_list.append(ap)
        ndcg_list.append(nd)
    summary = EvalSummary(
        top_k=cfg.top_k,
        macro_precision=sum(p_list) / max(1, len(p_list)),
        macro_recall=sum(r_list) / max(1, len(r_list)),
        map=sum(ap_list) / max(1, len(ap_list)),
        mean_ndcg=sum(ndcg_list) / max(1, len(ndcg_list)),
        per_strategy=per_strategy,
        similarity_summary=None,
    )
    return summary
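

# Sketch: EvalSummary is a plain dataclass, so it can be serialised for reporting
# with the standard library, e.g.
#   from dataclasses import asdict
#   import json
#   summary = evaluate_alignment(engine, strategies, actions, ground_truth)
#   print(json.dumps(asdict(summary), indent=2))
# Here `engine`, `strategies`, `actions`, and `ground_truth` are assumed to be
# built elsewhere (see .alignment and .models); this is an illustration, not part
# of the module's API.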


def precision_at_k(pred_ids: List[str], truth_ids: Set[str], k: int) -> float:
    p, _ = precision_recall_at_k(pred_ids, truth_ids, k)
    return p


def recall_at_k(pred_ids: List[str], truth_ids: Set[str], k: int) -> float:
    _, r = precision_recall_at_k(pred_ids, truth_ids, k)
    return r


def run_evaluation(
    alignment_result: Dict[str, Any], ground_truth_path: str | None, top_k: int = 5
) -> Dict[str, Any]:
    """Compute Precision@K, Recall@K and similarity summaries given alignment results.

    Ground truth format: {"S1": ["A3", "A9"], "S2": ["A2"], ...}
    """
    import json
    from pathlib import Path

    truth_map: Dict[str, List[str]] = {}
    if ground_truth_path:
        p = Path(ground_truth_path)
        if p.exists():
            with p.open("r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict):
                truth_map = {str(k): list(v or []) for k, v in data.items()}
    per_strategy: List[Dict[str, Any]] = []
    p_list: List[float] = []
    r_list: List[float] = []
    retrieved_sims: List[float] = []
    relevant_sims: List[float] = []
    for sres in alignment_result.get("strategy_results", []):
        sid = sres.get("strategy_id")
        preds = [m.get("action_id") for m in sres.get("top_matches", [])]
        sims = [float(m.get("similarity", 0.0)) for m in sres.get("top_matches", [])]
        truth = set(truth_map.get(str(sid), []))
        p, r = precision_recall_at_k(preds, truth, top_k)
        ap = average_precision(preds, truth)
        nd = ndcg_at_k(preds, truth, top_k)
        per_strategy.append(
            {
                "strategy_id": sid,
                "precision_at_k": p,
                "recall_at_k": r,
                "ap": ap,
                "ndcg": nd,
            }
        )
        p_list.append(p)
        r_list.append(r)
        # Similarity summaries
        retrieved_sims.extend(sims)
        # Relevant sims: similarity of matches that are in ground truth
        for m in sres.get("top_matches", []):
            if m.get("action_id") in truth:
                relevant_sims.append(float(m.get("similarity", 0.0)))
    eval_summary = {
        "top_k": top_k,
        "macro_precision": sum(p_list) / max(1, len(p_list)),
        "macro_recall": sum(r_list) / max(1, len(r_list)),
        "per_strategy": per_strategy,
        "similarity_summary": {
            "retrieved_mean": (sum(retrieved_sims) / max(1, len(retrieved_sims)))
            if retrieved_sims
            else 0.0,
            "relevant_mean": (sum(relevant_sims) / max(1, len(relevant_sims)))
            if relevant_sims
            else 0.0,
        },
    }
    return eval_summary
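

if __name__ == "__main__":
    # Minimal sketch of how run_evaluation consumes an alignment result.
    # The strategy/action IDs and similarity scores below are made up for
    # illustration; the dict only mirrors the shape this module already reads
    # (strategy_results -> top_matches with action_id and similarity).
    # Run as `python -m src.evaluation` so the package-relative imports resolve.
    demo_result = {
        "strategy_results": [
            {
                "strategy_id": "S1",
                "top_matches": [
                    {"action_id": "A3", "similarity": 0.82},
                    {"action_id": "A7", "similarity": 0.64},
                ],
            }
        ]
    }
    # No ground-truth file is supplied here, so precision/recall fall back to 0
    # and only the similarity summary carries information.
    print(run_evaluation(demo_result, ground_truth_path=None, top_k=5))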