File size: 2,885 Bytes
907c06a
 
 
 
 
 
 
 
 
 
 
 
e181667
907c06a
 
 
 
 
 
 
e181667
 
907c06a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Threshold calibration: run L1 graders against golden-dataset expected answers.

Uses expected_answer as the model response (no generation, no HF API call).
Reports score distribution per metric and flags if current threshold cuts
more than 10% of known-good answers.

Usage:
    cd backend && python ../eval/calibrate.py
"""

import statistics
import sys
from pathlib import Path

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

from grader import (
    FAITHFULNESS_THRESHOLD,
    RELEVANCY_THRESHOLD,
    grade_answer_relevancy,
    grade_faithfulness,
)

# Golden dataset file, located in the same directory as this script.
DATASET_PATH = Path(__file__).with_name("golden-dataset.yaml")
# Share of known-good answers allowed below a threshold before it is flagged.
AT_RISK_WARN = 0.10


def _load_pairs() -> list[dict]:
    """Load the evaluation pairs from the golden dataset.

    The file is decoded explicitly as UTF-8 so the parsed text does not
    depend on the platform's default locale encoding.

    Returns:
        The value stored under the top-level ``pairs`` key of the YAML file.

    Raises:
        KeyError: If the parsed document has no ``pairs`` key.
    """
    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
    return data["pairs"]


def _percentile(values: list[float], p: float) -> float:
    """Return the *p*-th percentile of *values* using linear interpolation.

    The input is sorted into a new list; the fractional rank
    ``(n - 1) * p / 100`` is then interpolated between its two neighbouring
    elements.

    Args:
        values: Samples to summarise (not modified).
        p: Percentile on a 0-100 scale.
    """
    ordered = sorted(values)
    last = len(ordered) - 1
    rank = last * p / 100
    below = int(rank)
    # Clamp the upper neighbour so p=100 does not index past the end.
    above = min(below + 1, last)
    fraction = rank - below
    lower_value = ordered[below]
    return lower_value + (ordered[above] - lower_value) * fraction


def _report_metric(
    name: str,
    scores: list[float],
    threshold: float,
) -> None:
    """Print a one-line score distribution summary for one metric.

    The line shows min, p25, median, p75 and max of *scores*, the threshold,
    and the share of scores strictly below the threshold. A ``FLAG`` marker
    is appended when that share exceeds ``AT_RISK_WARN``.

    Args:
        name: Metric label, left-aligned in a 22-character column.
        scores: Scores collected for the metric; may be empty.
        threshold: Threshold the scores are compared against.
    """
    if not scores:
        # An empty list would otherwise divide by zero in the at-risk ratio
        # (and min()/max() would raise), so report the absence explicitly.
        print(f"  {name:<22} — no scores to report")
        return

    at_risk = sum(1 for s in scores if s < threshold) / len(scores)
    flag = " ← FLAG" if at_risk > AT_RISK_WARN else ""
    print(
        f"  {name:<22} "
        f"min={min(scores):.3f}  "
        f"p25={_percentile(scores, 25):.3f}  "
        f"median={statistics.median(scores):.3f}  "
        f"p75={_percentile(scores, 75):.3f}  "
        f"max={max(scores):.3f}  "
        f"threshold={threshold}  "
        f"at_risk={at_risk:.0%}{flag}"
    )


def run() -> None:
    """Grade every golden pair's expected answer and print the distributions.

    Each pair's ``expected_answer`` (stripped) is used in place of a model
    response. The faithfulness context is the pair's ``expected_contains``
    entries joined by blank lines. Pairs whose faithfulness result mentions
    ``"Refusal"`` in its detail are counted separately and excluded from the
    faithfulness distribution.

    Raises:
        SystemExit: If the dataset contains no pairs, so there is nothing to
            calibrate against.
    """
    pairs = _load_pairs()
    if not pairs:
        # Fail loudly with a non-zero exit status instead of crashing later
        # with an opaque error on the empty score lists.
        sys.exit(f"No pairs found in {DATASET_PATH.name}; nothing to calibrate.")
    print(f"\nCalibrating against {len(pairs)} pairs from {DATASET_PATH.name}\n")

    relevancy_scores: list[float] = []
    faithfulness_scores: list[float] = []
    skipped_refusals = 0

    for pair in pairs:
        question = pair["question"]
        answer = pair["expected_answer"].strip()
        # `expected_contains:` with no value parses to None in YAML, which
        # .get()'s default does not cover; fall back to an empty context.
        context = "\n\n".join(pair.get("expected_contains") or [])

        r = grade_answer_relevancy(question, answer)
        relevancy_scores.append(r.score)

        f = grade_faithfulness(answer, context)
        if "Refusal" in f.detail:
            skipped_refusals += 1
        else:
            faithfulness_scores.append(f.score)

    print("Score distributions (expected_answer used as model response):\n")
    _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
    if faithfulness_scores:
        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
    else:
        print("  faithfulness           — all pairs triggered refusal detection, no NLI scores")

    if skipped_refusals:
        print(f"\n  ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")

    print()


# Run the calibration only when executed as a script, not when imported.
if __name__ == "__main__":
    run()