| """ |
| Threshold calibration: run L1 graders against golden-dataset expected answers. |
| |
| Uses expected_answer as the model response (no generation, no HF API call). |
| Reports score distribution per metric and flags if current threshold cuts |
| more than 10% of known-good answers. |
| |
| Usage: |
| cd backend && python ../eval/calibrate.py |
| """ |
|
|
| import statistics |
| import sys |
| from pathlib import Path |
|
|
| import yaml |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) |
|
|
| from grader import ( |
| FAITHFULNESS_THRESHOLD, |
| RELEVANCY_THRESHOLD, |
| grade_answer_relevancy, |
| grade_faithfulness, |
| ) |
|
|
# The golden dataset lives in the same directory as this script, so the path
# is resolved from __file__ rather than from the current working directory.
DATASET_PATH = Path(__file__).with_name("golden-dataset.yaml")

# Share of known-good answers (0.10 == 10%) that may score below a threshold
# before the metric gets flagged in the report.
AT_RISK_WARN = 0.10
|
|
|
|
def _load_pairs() -> list[dict]:
    """Load the question/answer pairs from the golden dataset.

    Reads ``DATASET_PATH`` and parses it with ``yaml.safe_load``.

    Returns:
        The value stored under the document's top-level ``pairs`` key.

    Raises:
        ValueError: If the parsed document is not a mapping (for example,
            the file is empty).
        KeyError: If the mapping has no ``pairs`` key.
    """
    # Decode explicitly as UTF-8: read_text() otherwise uses the locale's
    # preferred encoding, which differs between platforms.
    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(
            f"{DATASET_PATH} must contain a YAML mapping with a 'pairs' key, "
            f"got {type(data).__name__}"
        )
    return data["pairs"]
|
|
|
|
| def _percentile(values: list[float], p: float) -> float: |
| sorted_v = sorted(values) |
| idx = (len(sorted_v) - 1) * p / 100 |
| lo, hi = int(idx), min(int(idx) + 1, len(sorted_v) - 1) |
| return sorted_v[lo] + (sorted_v[hi] - sorted_v[lo]) * (idx - lo) |
|
|
|
|
| def _report_metric( |
| name: str, |
| scores: list[float], |
| threshold: float, |
| ) -> None: |
| at_risk = sum(1 for s in scores if s < threshold) / len(scores) |
| flag = " β FLAG" if at_risk > AT_RISK_WARN else "" |
| print( |
| f" {name:<22} " |
| f"min={min(scores):.3f} " |
| f"p25={_percentile(scores, 25):.3f} " |
| f"median={statistics.median(scores):.3f} " |
| f"p75={_percentile(scores, 75):.3f} " |
| f"max={max(scores):.3f} " |
| f"threshold={threshold} " |
| f"at_risk={at_risk:.0%}{flag}" |
| ) |
|
|
|
|
def run() -> None:
    """Grade every golden pair's expected answer and print score distributions.

    Each pair's ``expected_answer`` is graded as though it were the model
    response: relevancy against the question, and faithfulness against the
    pair's ``expected_contains`` entries joined into one context string.
    Pairs whose faithfulness detail contains "Refusal" are counted but kept
    out of the faithfulness distribution.
    """
    # `or []` covers a dataset whose `pairs` key is present but empty/null.
    pairs = _load_pairs() or []
    print(f"\nCalibrating against {len(pairs)} pairs from golden-dataset.yaml\n")

    if not pairs:
        # Nothing to grade; reporting on empty score lists would fail.
        print(" No pairs found in the dataset — nothing to calibrate.\n")
        return

    relevancy_scores: list[float] = []
    faithfulness_scores: list[float] = []
    skipped_refusals = 0

    for pair in pairs:
        question = pair["question"]
        answer = pair["expected_answer"].strip()
        # `or []` also handles an explicit `expected_contains: null`, which
        # the .get() default alone would pass through to join() as None.
        context = "\n\n".join(pair.get("expected_contains") or [])

        relevancy = grade_answer_relevancy(question, answer)
        relevancy_scores.append(relevancy.score)

        faithfulness = grade_faithfulness(answer, context)
        if "Refusal" in faithfulness.detail:
            skipped_refusals += 1
        else:
            faithfulness_scores.append(faithfulness.score)

    print("Score distributions (expected_answer used as model response):\n")
    _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
    if faithfulness_scores:
        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
    else:
        print(" faithfulness — all pairs triggered refusal detection, no NLI scores")

    if skipped_refusals:
        print(f"\n ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")

    print()


if __name__ == "__main__":
    run()
|
|