# ai-response-validator/eval/calibrate.py
# mbochniak01
# Fix compat, bugs, and types; expand retail KB
# e181667
"""
Threshold calibration: run L1 graders against golden-dataset expected answers.
Uses expected_answer as the model response (no generation, no HF API call).
Reports score distribution per metric and flags if current threshold cuts
more than 10% of known-good answers.
Usage:
cd backend && python ../eval/calibrate.py
"""
import statistics
import sys
from pathlib import Path
import yaml
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
from grader import (
FAITHFULNESS_THRESHOLD,
RELEVANCY_THRESHOLD,
grade_answer_relevancy,
grade_faithfulness,
)
# Golden dataset file, resolved relative to this script's own directory.
DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
AT_RISK_WARN = 0.10 # flag if >10% of good answers fall below threshold
def _load_pairs() -> list[dict]:
    """Load the entries stored under the ``pairs`` key of the golden dataset.

    Reads ``DATASET_PATH`` as UTF-8 text, parses it with ``yaml.safe_load``
    and returns whatever is stored under the top-level ``"pairs"`` key.

    Raises:
        OSError: if the dataset file cannot be read.
        KeyError: if the parsed mapping has no ``"pairs"`` key.
    """
    # Decode explicitly as UTF-8: without an encoding argument read_text()
    # uses the platform locale, which can mis-decode non-ASCII dataset text.
    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
    return data["pairs"]
def _percentile(values: list[float], p: float) -> float:
    """Return the *p*-th percentile of *values* using linear interpolation.

    The input is sorted internally, so callers may pass it in any order.
    *p* is expressed on a 0-100 scale and is not validated. An empty
    *values* list raises ``IndexError``.
    """
    ordered = sorted(values)
    last = len(ordered) - 1
    # Fractional position of the requested percentile within the sorted data.
    rank = last * p / 100
    below = int(rank)
    # Clamp the upper neighbour so p=100 (or a single value) stays in range.
    above = min(below + 1, last)
    fraction = rank - below
    return ordered[below] + (ordered[above] - ordered[below]) * fraction
def _report_metric(
    name: str,
    scores: list[float],
    threshold: float,
) -> None:
    """Print a one-line distribution summary for a single metric.

    The line shows min, 25th percentile, median, 75th percentile and max of
    *scores*, the *threshold* itself, and the fraction of scores strictly
    below the threshold ("at risk"). When that fraction exceeds
    ``AT_RISK_WARN`` the line is suffixed with a FLAG marker.

    An empty *scores* list prints a short placeholder line instead of
    raising, since none of the statistics are defined for it.
    """
    if not scores:
        # min()/max()/median are undefined here and the at-risk ratio would
        # divide by zero, so report the absence of data explicitly.
        print(f" {name:<22} no scores to report")
        return

    below_threshold = sum(1 for s in scores if s < threshold)
    at_risk = below_threshold / len(scores)
    flag = " ← FLAG" if at_risk > AT_RISK_WARN else ""
    print(
        f" {name:<22} "
        f"min={min(scores):.3f} "
        f"p25={_percentile(scores, 25):.3f} "
        f"median={statistics.median(scores):.3f} "
        f"p75={_percentile(scores, 75):.3f} "
        f"max={max(scores):.3f} "
        f"threshold={threshold} "
        f"at_risk={at_risk:.0%}{flag}"
    )
def run() -> None:
    """Grade every golden pair's expected answer and print score distributions.

    For each pair the stripped ``expected_answer`` is passed to
    ``grade_answer_relevancy`` (with the question) and ``grade_faithfulness``
    (with the ``expected_contains`` entries joined by blank lines as context).
    Relevancy scores are always collected; a faithfulness score is collected
    only when the grader's ``detail`` does not contain ``"Refusal"``, and such
    pairs are counted as skipped instead.

    Prints one summary line per metric via ``_report_metric``. Returns early
    with a message when the dataset contains no pairs.
    """
    # An empty ``pairs:`` key in the YAML parses as None; treat it as no pairs.
    pairs = _load_pairs() or []
    print(f"\nCalibrating against {len(pairs)} pairs from golden-dataset.yaml\n")
    if not pairs:
        # Without any scores the distribution report below has nothing to show.
        print("No pairs found in dataset; nothing to calibrate.\n")
        return

    relevancy_scores: list[float] = []
    faithfulness_scores: list[float] = []
    skipped_refusals = 0

    for pair in pairs:
        question = pair["question"]
        answer = pair["expected_answer"].strip()
        # ``expected_contains`` may be absent or present-but-empty (None).
        context = "\n\n".join(pair.get("expected_contains") or [])

        relevancy = grade_answer_relevancy(question, answer)
        relevancy_scores.append(relevancy.score)

        faithfulness = grade_faithfulness(answer, context)
        if "Refusal" in faithfulness.detail:
            # Grader reported a refusal; its score is excluded from the stats.
            skipped_refusals += 1
        else:
            faithfulness_scores.append(faithfulness.score)

    print("Score distributions (expected_answer used as model response):\n")
    _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
    if faithfulness_scores:
        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
    else:
        print(" faithfulness — all pairs triggered refusal detection, no NLI scores")
    if skipped_refusals:
        print(f"\n ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")
    print()
# Script entry point: run the calibration only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()