Spaces:
Running
Running
File size: 6,610 Bytes
b6f9fa8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | """
FR-19: src/evaluation/aggregator.py — Weighted Score Aggregation
================================================================
Combines scores from all evaluation modules into a single composite score
using the fixed weights defined in SRS Section 8.2.
Weights (must sum to 1.0):
faithfulness : 0.35 (primary signal — DeBERTa NLI)
entity_accuracy : 0.20 (SciSpaCy NER + RxNorm)
source_credibility : 0.20 (evidence tier)
contradiction_risk : 0.15 (1.0 - contradiction_score)
ragas_composite : 0.10 (optional — 0.5 neutral if unavailable)
Output:
EvalResult with:
module_name = "aggregator"
score = weighted composite after safety penalties, in [0, 1]
details = {weights_used, component_scores, weighted_composite, hrs, risk_band, component_contributions, confidence_level, module_latencies_ms}
Usage:
from src.evaluation.aggregator import aggregate
agg_result = aggregate(faith_res, entity_res, source_res, contra_res, ragas_res)
"""
from __future__ import annotations
import logging
import time
from typing import Optional
from src.modules.base import EvalResult
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Default weights (SRS Section 8.2)
# ---------------------------------------------------------------------------
# Per-module weights applied by aggregate() when the caller passes no override.
# The five values add up to 1.0; the keys are looked up by name in aggregate().
DEFAULT_WEIGHTS: dict[str, float] = {
    "faithfulness": 0.35,        # largest share of the composite
    "entity_accuracy": 0.20,
    "source_credibility": 0.20,
    "contradiction_risk": 0.15,
    "ragas_composite": 0.10,     # smallest share; module is optional
}
def _score_or_fallback(result: Optional[EvalResult], fallback: float) -> float:
    """Return ``result.score``, or *fallback* when the result is missing or errored."""
    if result is None or result.error:
        return fallback
    return result.score


def aggregate(
    faithfulness_result: EvalResult,
    entity_result: EvalResult,
    source_result: EvalResult,
    contradiction_result: EvalResult,
    ragas_result: Optional[EvalResult] = None,
    weights: Optional[dict[str, float]] = None,
) -> EvalResult:
    """
    Aggregate all module scores into a single composite evaluation result.

    The composite is the weighted sum of the five component scores, multiplied
    by 0.6 when the faithfulness score is <= 0.6 and by a further 0.6 when the
    contradiction score is < 0.3. It is then mapped to a 0-100 HRS value
    (``round(100 * (1 - composite))``), a risk band and a confidence level,
    all of which are reported in ``details``.

    Args:
        faithfulness_result  : Output from faithfulness.score_faithfulness()
        entity_result        : Output from entity_verifier.verify_entities()
        source_result        : Output from source_credibility.score_source_credibility()
        contradiction_result : Output from contradiction.score_contradiction()
        ragas_result         : Output from ragas_eval.score_ragas() (optional)
        weights              : Override default weights. A falsy value (None or
                               an empty dict) selects DEFAULT_WEIGHTS. If the
                               values do not sum to 1.0 (tolerance 0.01) they
                               are rescaled and a warning is logged.

    Returns:
        EvalResult with module_name="aggregator", the penalised composite as
        ``score`` and the per-component breakdown in ``details``.

    Raises:
        ValueError: If the weights sum to exactly 0 and so cannot be normalised.
        KeyError: If ``weights`` lacks one of the five component keys.
    """
    t0 = time.perf_counter()

    w = weights or DEFAULT_WEIGHTS

    # Validate weights sum to 1.0 (tolerance 0.01)
    weight_sum = sum(w.values())
    if weight_sum == 0:
        # Dividing by a zero sum below would surface as an unexplained
        # ZeroDivisionError; fail with a message that names the cause.
        raise ValueError("Weights sum to 0 and cannot be normalised.")
    if abs(weight_sum - 1.0) > 0.01:
        logger.warning(
            "Weights sum to %.4f (expected 1.0) — normalising.", weight_sum
        )
        w = {k: v / weight_sum for k, v in w.items()}

    # Extract scores; a module that reported an error contributes a fallback.
    faith_score = _score_or_fallback(faithfulness_result, 0.5)
    entity_score = _score_or_fallback(entity_result, 0.5)
    source_score = _score_or_fallback(source_result, 0.5)
    # NOTE(review): a failed contradiction module falls back to 1.0 (the most
    # favourable value, which also skips the contradiction penalty below),
    # not the 0.5 neutral used for the other modules — confirm this is intended.
    contra_score = _score_or_fallback(contradiction_result, 1.0)
    ragas_score = _score_or_fallback(ragas_result, 0.5)

    # Weighted contributions. The composite is summed from the unrounded
    # products so rounding error does not leak into the score; the 4-decimal
    # rounding is applied only to the values reported in ``details``.
    raw_contributions = {
        "faithfulness_contribution": faith_score * w["faithfulness"],
        "entity_contribution": entity_score * w["entity_accuracy"],
        "source_contribution": source_score * w["source_credibility"],
        "contradiction_contribution": contra_score * w["contradiction_risk"],
        "ragas_contribution": ragas_score * w["ragas_composite"],
    }
    contributions = {
        name: round(value, 4) for name, value in raw_contributions.items()
    }
    base_composite = sum(raw_contributions.values())

    # --- Non-linear Safety Penalties ---
    # Faithfulness penalty: applies when answer is not grounded in context.
    # Contradiction penalty: only applies when actual contradictions are detected
    # (score < 0.3). Score = 0.5 means "neutral/cannot verify" (refusal answers,
    # no keyword overlap) — these should NOT be double-penalized.
    penalty_multiplier = 1.0
    if faith_score <= 0.6:
        penalty_multiplier *= 0.6  # 40% penalty for ungrounded claims
    if contra_score < 0.3:
        penalty_multiplier *= 0.6  # 40% penalty only for confirmed contradictions
    composite = base_composite * penalty_multiplier

    # HRS = round(100 × (1 - composite)), then map to risk band
    # Thresholds must match config.yaml aggregator.risk_bands
    _HRS_LOW = 30
    _HRS_MODERATE = 60
    _HRS_HIGH = 85
    hrs = int(round(100 * (1.0 - composite)))
    hrs = max(0, min(100, hrs))  # keep HRS on the 0-100 scale
    if hrs <= _HRS_LOW:
        risk_band = "LOW"
    elif hrs <= _HRS_MODERATE:
        risk_band = "MODERATE"
    elif hrs <= _HRS_HIGH:
        risk_band = "HIGH"
    else:
        risk_band = "CRITICAL"

    # Confidence level (based on composite, not HRS)
    if composite >= 0.80:
        confidence = "HIGH"
    elif composite >= 0.55:
        confidence = "MODERATE"
    else:
        confidence = "LOW"

    details = {
        "weights_used": {k: round(v, 4) for k, v in w.items()},
        "component_scores": {
            "faithfulness": round(faith_score, 4),
            "entity_accuracy": round(entity_score, 4),
            "source_credibility": round(source_score, 4),
            "contradiction_risk": round(contra_score, 4),
            "ragas_composite": round(ragas_score, 4),
        },
        "weighted_composite": round(composite, 4),
        "hrs": hrs,
        "risk_band": risk_band,
        "component_contributions": contributions,
        "confidence_level": confidence,
        "module_latencies_ms": {
            "faithfulness": faithfulness_result.latency_ms,
            "entity_verifier": entity_result.latency_ms,
            "source_credibility": source_result.latency_ms,
            "contradiction": contradiction_result.latency_ms,
            "ragas": ragas_result.latency_ms if ragas_result is not None else 0,
        },
    }

    # Time spent in this function only; per-module latencies are in details.
    latency_ms = int((time.perf_counter() - t0) * 1000)
    logger.info(
        "Aggregated score: %.3f (%s confidence) — "
        "faith=%.2f entity=%.2f source=%.2f contra=%.2f ragas=%.2f",
        composite, confidence,
        faith_score, entity_score, source_score, contra_score, ragas_score,
    )
    return EvalResult(
        module_name="aggregator",
        score=composite,
        details=details,
        latency_ms=latency_ms,
    )
|