"""Ranking-quality metrics for strategy-action alignment results.

Computes Precision@K, Recall@K, MAP, and nDCG against a ground-truth
mapping of strategy IDs to relevant action IDs.
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Set, Tuple

from .alignment import AlignmentEngine
from .models import ActionTask, StrategicObjective


@dataclass
class EvalConfig:
    top_k: int = 5


def precision_recall_at_k(
    pred_ids: List[str], truth_ids: Set[str], k: int
) -> Tuple[float, float]:
    """Precision and recall over the top-k predictions (binary relevance)."""
    preds = pred_ids[:k]
    hits = sum(1 for pid in preds if pid in truth_ids)
    precision = hits / max(1, len(preds))
    recall = hits / max(1, len(truth_ids))
    return precision, recall


def average_precision(pred_ids: List[str], truth_ids: Set[str]) -> float:
    """Average precision: mean of precision@i over each relevant rank i.

    Normalized by the number of relevant items (the standard AP definition),
    not by the number of hits; dividing by hits would report a perfect score
    whenever the only retrieved relevant item sits at rank 1.
    """
    hits = 0
    ap_sum = 0.0
    for i, pid in enumerate(pred_ids, start=1):
        if pid in truth_ids:
            hits += 1
            ap_sum += hits / i
    return ap_sum / max(1, len(truth_ids))


def ndcg_at_k(pred_ids: List[str], truth_ids: Set[str], k: int) -> float:
    """Normalized DCG at k. Relevance is binary: 1 if in truth, else 0."""
    dcg = 0.0
    for i, pid in enumerate(pred_ids[:k], start=1):
        rel = 1.0 if pid in truth_ids else 0.0
        dcg += rel / math.log2(i + 1)
    # Ideal DCG assumes all relevant items are ranked first.
    ideal_rel_count = min(len(truth_ids), k)
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, ideal_rel_count + 1))
    return dcg / idcg if idcg > 0 else 0.0


@dataclass
class StrategyEval:
    strategy_id: str
    precision_at_k: float
    recall_at_k: float
    ap: float
    ndcg: float


@dataclass
class EvalSummary:
    top_k: int
    macro_precision: float
    macro_recall: float
    map: float  # mean average precision across strategies
    mean_ndcg: float
    per_strategy: List[StrategyEval]
    similarity_summary: Dict[str, float] | None = None


def evaluate_alignment(
    engine: AlignmentEngine,
    strategies: Iterable[StrategicObjective],
    actions: Iterable[ActionTask],
    ground_truth: Dict[str, List[str]],
    config: EvalConfig | None = None,
) -> EvalSummary:
    """Run the alignment engine and score its rankings against ground truth."""
    cfg = config or EvalConfig()
    # Run alignment retrieval.
    result = engine.align(
        strategies=list(strategies), actions=list(actions), top_k=cfg.top_k
    )
    per_strategy: List[StrategyEval] = []
    p_list: List[float] = []
    r_list: List[float] = []
    ap_list: List[float] = []
    ndcg_list: List[float] = []
    for sres in result["strategy_results"]:
        sid = sres["strategy_id"]
        preds = [m["action_id"] for m in sres.get("top_matches", [])]
        truth = set(ground_truth.get(sid, []))
        p, r = precision_recall_at_k(preds, truth, cfg.top_k)
        ap = average_precision(preds, truth)
        nd = ndcg_at_k(preds, truth, cfg.top_k)
        per_strategy.append(
            StrategyEval(
                strategy_id=sid,
                precision_at_k=p,
                recall_at_k=r,
                ap=ap,
                ndcg=nd,
            )
        )
        p_list.append(p)
        r_list.append(r)
        ap_list.append(ap)
        ndcg_list.append(nd)
    return EvalSummary(
        top_k=cfg.top_k,
        macro_precision=sum(p_list) / max(1, len(p_list)),
        macro_recall=sum(r_list) / max(1, len(r_list)),
        map=sum(ap_list) / max(1, len(ap_list)),
        mean_ndcg=sum(ndcg_list) / max(1, len(ndcg_list)),
        per_strategy=per_strategy,
        similarity_summary=None,
    )


def precision_at_k(pred_ids: List[str], truth_ids: Set[str], k: int) -> float:
    p, _ = precision_recall_at_k(pred_ids, truth_ids, k)
    return p


def recall_at_k(pred_ids: List[str], truth_ids: Set[str], k: int) -> float:
    _, r = precision_recall_at_k(pred_ids, truth_ids, k)
    return r
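# Worked example of the metrics above (illustrative IDs, not from this repo's
# data):
#   preds = ["A1", "A2", "A3", "A4", "A5"], truth = {"A2", "A4", "A9"}, k = 5
#   precision@5 = 2 / 5 = 0.40              (A2 and A4 are hits)
#   recall@5    = 2 / 3 ≈ 0.667             (one relevant item, A9, was missed)
#   AP          = (1/2 + 2/4) / 3 ≈ 0.333   (precision at hit ranks 2 and 4,
#                                            normalized by |truth| = 3)
#   nDCG@5      = (1/log2(3) + 1/log2(5))
#               / (1/log2(2) + 1/log2(3) + 1/log2(4)) ≈ 0.498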
def run_evaluation(
    alignment_result: Dict[str, Any],
    ground_truth_path: str | None,
    top_k: int = 5,
) -> Dict[str, Any]:
    """Compute Precision@K, Recall@K, per-strategy AP/nDCG, and similarity
    summaries for the given alignment results.

    Ground truth format: {"S1": ["A3", "A9"], "S2": ["A2"], ...}
    """
    import json
    from pathlib import Path

    # Load the ground-truth mapping if a readable JSON file was given;
    # otherwise evaluate against an empty truth map (all metrics become 0).
    truth_map: Dict[str, List[str]] = {}
    if ground_truth_path:
        path = Path(ground_truth_path)
        if path.exists():
            with path.open("r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict):
                truth_map = {str(k): list(v or []) for k, v in data.items()}

    per_strategy: List[Dict[str, Any]] = []
    p_list: List[float] = []
    r_list: List[float] = []
    retrieved_sims: List[float] = []
    relevant_sims: List[float] = []
    for sres in alignment_result.get("strategy_results", []):
        sid = sres.get("strategy_id")
        matches = sres.get("top_matches", [])
        preds = [m.get("action_id") for m in matches]
        sims = [float(m.get("similarity", 0.0)) for m in matches]
        truth = set(truth_map.get(str(sid), []))
        p, r = precision_recall_at_k(preds, truth, top_k)
        ap = average_precision(preds, truth)
        nd = ndcg_at_k(preds, truth, top_k)
        per_strategy.append(
            {
                "strategy_id": sid,
                "precision_at_k": p,
                "recall_at_k": r,
                "ap": ap,
                "ndcg": nd,
            }
        )
        p_list.append(p)
        r_list.append(r)
        # Similarity summaries: all retrieved scores, plus the subset whose
        # matches appear in the ground truth.
        retrieved_sims.extend(sims)
        for m in matches:
            if m.get("action_id") in truth:
                relevant_sims.append(float(m.get("similarity", 0.0)))

    return {
        "top_k": top_k,
        "macro_precision": sum(p_list) / max(1, len(p_list)),
        "macro_recall": sum(r_list) / max(1, len(r_list)),
        "per_strategy": per_strategy,
        "similarity_summary": {
            "retrieved_mean": sum(retrieved_sims) / max(1, len(retrieved_sims)),
            "relevant_mean": sum(relevant_sims) / max(1, len(relevant_sims)),
        },
    }
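
if __name__ == "__main__":
    # Minimal usage sketch for run_evaluation. The alignment_result shape
    # below is an assumption inferred from the keys this module reads
    # ("strategy_results", "strategy_id", "top_matches", "action_id",
    # "similarity"), not from the actual AlignmentEngine output contract.
    # Run as a module within its package (the relative imports above
    # require it), e.g. `python -m <package>.<this_module>`.
    import json
    import tempfile

    demo_result = {
        "strategy_results": [
            {
                "strategy_id": "S1",
                "top_matches": [
                    {"action_id": "A2", "similarity": 0.91},
                    {"action_id": "A7", "similarity": 0.58},
                    {"action_id": "A4", "similarity": 0.44},
                ],
            }
        ]
    }
    # Write a throwaway ground-truth file in the documented format.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", delete=False, encoding="utf-8"
    ) as tmp:
        json.dump({"S1": ["A2", "A4", "A9"]}, tmp)

    print(json.dumps(run_evaluation(demo_result, tmp.name, top_k=3), indent=2))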