""" FR-19: src/evaluation/aggregator.py — Weighted Score Aggregation ================================================================ Combines scores from all evaluation modules into a single composite score using the fixed weights defined in SRS Section 8.2. Weights (must sum to 1.0): faithfulness : 0.35 (primary signal — DeBERTa NLI) entity_accuracy : 0.20 (SciSpaCy NER + RxNorm) source_credibility : 0.20 (evidence tier) contradiction_risk : 0.15 (1.0 - contradiction_score) ragas_composite : 0.10 (optional — 0.5 neutral if unavailable) Output: EvalResult with: module_name = "aggregator" score = weighted composite in [0, 1] details = {weights_used, weighted_composite, component_contributions} Usage: from src.evaluation.aggregator import aggregate agg_result = aggregate(faith_res, entity_res, source_res, contra_res, ragas_res) """ from __future__ import annotations import logging import time from typing import Optional from src.modules.base import EvalResult logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Default weights (SRS Section 8.2) # --------------------------------------------------------------------------- DEFAULT_WEIGHTS: dict[str, float] = { "faithfulness": 0.35, "entity_accuracy": 0.20, "source_credibility": 0.20, "contradiction_risk": 0.15, "ragas_composite": 0.10, } def aggregate( faithfulness_result: EvalResult, entity_result: EvalResult, source_result: EvalResult, contradiction_result: EvalResult, ragas_result: Optional[EvalResult] = None, weights: Optional[dict[str, float]] = None, ) -> EvalResult: """ Aggregate all module scores into a single composite evaluation result. Args: faithfulness_result : Output from faithfulness.score_faithfulness() entity_result : Output from entity_verifier.verify_entities() source_result : Output from source_credibility.score_source_credibility() contradiction_result : Output from contradiction.score_contradiction() ragas_result : Output from ragas_eval.score_ragas() (optional) weights : Override default weights (must sum to 1.0) Returns: EvalResult with module_name="aggregator" and composite score. """ t0 = time.perf_counter() w = weights or DEFAULT_WEIGHTS # Validate weights sum to 1.0 (tolerance 0.01) weight_sum = sum(w.values()) if abs(weight_sum - 1.0) > 0.01: logger.warning( "Weights sum to %.4f (expected 1.0) — normalising.", weight_sum ) w = {k: v / weight_sum for k, v in w.items()} # Extract scores — use 0.5 neutral for any unavailable module faith_score = faithfulness_result.score if not faithfulness_result.error else 0.5 entity_score = entity_result.score if not entity_result.error else 0.5 source_score = source_result.score if not source_result.error else 0.5 contra_score = contradiction_result.score if not contradiction_result.error else 1.0 ragas_score = (ragas_result.score if ragas_result and not ragas_result.error else 0.5) # Compute base weighted contributions contributions = { "faithfulness_contribution": round(faith_score * w["faithfulness"], 4), "entity_contribution": round(entity_score * w["entity_accuracy"], 4), "source_contribution": round(source_score * w["source_credibility"], 4), "contradiction_contribution": round(contra_score * w["contradiction_risk"], 4), "ragas_contribution": round(ragas_score * w["ragas_composite"], 4), } base_composite = sum(contributions.values()) # --- Non-linear Safety Penalties --- # Faithfulness penalty: applies when answer is not grounded in context. # Contradiction penalty: only applies when actual contradictions are detected # (score < 0.3). Score = 0.5 means "neutral/cannot verify" (refusal answers, # no keyword overlap) — these should NOT be double-penalized. penalty_multiplier = 1.0 if faith_score <= 0.6: penalty_multiplier *= 0.6 # 40% penalty for ungrounded claims if contra_score < 0.3: penalty_multiplier *= 0.6 # 40% penalty only for confirmed contradictions composite = base_composite * penalty_multiplier # HRS = round(100 × (1 - composite)), then map to risk band # Thresholds must match config.yaml aggregator.risk_bands _HRS_LOW = 30 _HRS_MODERATE = 60 _HRS_HIGH = 85 hrs = int(round(100 * (1.0 - composite))) hrs = max(0, min(100, hrs)) if hrs <= _HRS_LOW: risk_band = "LOW" elif hrs <= _HRS_MODERATE: risk_band = "MODERATE" elif hrs <= _HRS_HIGH: risk_band = "HIGH" else: risk_band = "CRITICAL" # Confidence level (based on composite, not HRS) if composite >= 0.80: confidence = "HIGH" elif composite >= 0.55: confidence = "MODERATE" else: confidence = "LOW" details = { "weights_used": {k: round(v, 4) for k, v in w.items()}, "component_scores": { "faithfulness": round(faith_score, 4), "entity_accuracy": round(entity_score, 4), "source_credibility": round(source_score, 4), "contradiction_risk": round(contra_score, 4), "ragas_composite": round(ragas_score, 4), }, "weighted_composite": round(composite, 4), "hrs": hrs, "risk_band": risk_band, "component_contributions": contributions, "confidence_level": confidence, "module_latencies_ms": { "faithfulness": faithfulness_result.latency_ms, "entity_verifier": entity_result.latency_ms, "source_credibility": source_result.latency_ms, "contradiction": contradiction_result.latency_ms, "ragas": ragas_result.latency_ms if ragas_result else 0, }, } latency_ms = int((time.perf_counter() - t0) * 1000) logger.info( "Aggregated score: %.3f (%s confidence) — " "faith=%.2f entity=%.2f source=%.2f contra=%.2f ragas=%.2f", composite, confidence, faith_score, entity_score, source_score, contra_score, ragas_score, ) return EvalResult( module_name="aggregator", score=composite, details=details, latency_ms=latency_ms, )