"""Threshold calibration: run L1 graders against golden-dataset expected answers.

Uses expected_answer as the model response (no generation, no HF API call).
Reports score distribution per metric and flags if current threshold cuts more
than 10% of known-good answers.

Usage:
    cd backend && python ../eval/calibrate.py
"""

import statistics
import sys
from pathlib import Path

import yaml

# Make the backend package importable before pulling in the grader module;
# this must run ahead of the `grader` import below.
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from grader import (
    FAITHFULNESS_THRESHOLD,
    RELEVANCY_THRESHOLD,
    grade_answer_relevancy,
    grade_faithfulness,
)

DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
AT_RISK_WARN = 0.10  # flag if >10% of good answers fall below threshold


def _load_pairs() -> list[dict]:
    """Load the entries stored under the top-level ``pairs`` key of the dataset.

    Raises:
        KeyError: if the YAML document has no ``pairs`` key.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
    return data["pairs"]


def _percentile(values: list[float], p: float) -> float:
    """Return the *p*-th percentile of *values* using linear interpolation.

    *p* is expressed on a 0-100 scale. The input is not modified.

    Raises:
        ValueError: if *values* is empty.
    """
    if not values:
        raise ValueError("cannot compute a percentile of an empty sequence")
    ordered = sorted(values)
    last = len(ordered) - 1
    # Fractional rank of the requested percentile within the sorted data.
    rank = last * p / 100
    lo = int(rank)
    hi = min(lo + 1, last)
    return ordered[lo] + (ordered[hi] - ordered[lo]) * (rank - lo)


def _report_metric(
    name: str,
    scores: list[float],
    threshold: float,
) -> None:
    """Print a one-line distribution summary of *scores* for metric *name*.

    The line shows min, p25, median, p75 and max, the threshold, and the
    share of scores strictly below the threshold. A flag marker is appended
    when that share exceeds ``AT_RISK_WARN``. With no scores, a placeholder
    line is printed instead of raising.
    """
    if not scores:
        # Nothing to summarise; avoid ZeroDivisionError / empty min() below.
        print(f" {name:<22} no scores collected")
        return

    at_risk = sum(1 for s in scores if s < threshold) / len(scores)
    flag = " ← FLAG" if at_risk > AT_RISK_WARN else ""
    print(
        f" {name:<22} "
        f"min={min(scores):.3f} "
        f"p25={_percentile(scores, 25):.3f} "
        f"median={statistics.median(scores):.3f} "
        f"p75={_percentile(scores, 75):.3f} "
        f"max={max(scores):.3f} "
        f"threshold={threshold} "
        f"at_risk={at_risk:.0%}{flag}"
    )


def run() -> None:
    """Grade every golden pair and print per-metric score distributions.

    Each pair's ``expected_answer`` is graded as if it were the model
    response. Faithfulness results whose ``detail`` contains ``"Refusal"``
    are counted separately and left out of the faithfulness distribution.
    """
    pairs = _load_pairs()
    print(f"\nCalibrating against {len(pairs)} pairs from golden-dataset.yaml\n")

    if not pairs:
        # An empty dataset would otherwise be misreported as "all refusals".
        print("No pairs to grade - nothing to calibrate.\n")
        return

    relevancy_scores: list[float] = []
    faithfulness_scores: list[float] = []
    skipped_refusals = 0

    for pair in pairs:
        question = pair["question"]
        answer = pair["expected_answer"].strip()
        # `or []` also covers a key that is present but null in the YAML.
        context = "\n\n".join(pair.get("expected_contains") or [])

        relevancy = grade_answer_relevancy(question, answer)
        relevancy_scores.append(relevancy.score)

        faithfulness = grade_faithfulness(answer, context)
        if "Refusal" in faithfulness.detail:
            skipped_refusals += 1
        else:
            faithfulness_scores.append(faithfulness.score)

    print("Score distributions (expected_answer used as model response):\n")
    _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
    if faithfulness_scores:
        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
    else:
        print(" faithfulness — all pairs triggered refusal detection, no NLI scores")

    if skipped_refusals:
        print(f"\n ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")
    print()


if __name__ == "__main__":
    run()