Spaces:

below-threshold
/

ai-response-validator

Sleeping

ai-response-validator / eval /calibrate.py

mbochniak01

Fix compat, bugs, and types; expand retail KB

e181667 7 days ago

2.89 kB

	"""
	Threshold calibration: run L1 graders against golden-dataset expected answers.

	Uses expected_answer as the model response (no generation, no HF API call).
	Reports score distribution per metric and flags if current threshold cuts
	more than 10% of known-good answers.

	Usage:
	cd backend && python ../eval/calibrate.py
	"""

	import statistics
	import sys
	from pathlib import Path

	import yaml

	sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))

	from grader import (
	FAITHFULNESS_THRESHOLD,
	RELEVANCY_THRESHOLD,
	grade_answer_relevancy,
	grade_faithfulness,
	)

	# Golden dataset YAML, resolved relative to the directory containing this script.
	DATASET_PATH = Path(__file__).parent / "golden-dataset.yaml"
	# Share of scores below a metric's threshold above which the report line is flagged.
	AT_RISK_WARN = 0.10 # flag if >10% of good answers fall below threshold


	def _load_pairs() -> list[dict]:
	    """Load the golden-dataset pairs from DATASET_PATH.

	    Returns:
	        The value stored under the top-level ``pairs`` key of the YAML
	        document.

	    Raises:
	        KeyError: If the document has no ``pairs`` key.
	    """
	    # Decode explicitly as UTF-8: without an encoding argument, read_text()
	    # falls back to the locale's preferred encoding, which mis-decodes (or
	    # fails on) non-ASCII dataset text on platforms that do not default to
	    # UTF-8.
	    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
	    return data["pairs"]


	def _percentile(values: list[float], p: float) -> float:
	    """Return the ``p``-th percentile of ``values`` by linear interpolation.

	    The fractional rank ``(n - 1) * p / 100`` is located in a sorted copy of
	    ``values`` and the result is interpolated between the two neighbouring
	    elements. The input list itself is not modified.

	    Args:
	        values: Samples to take the percentile of.
	        p: Percentile on a 0-100 scale.
	    """
	    ordered = list(values)
	    ordered.sort()
	    last = len(ordered) - 1

	    rank = last * p / 100
	    below = int(rank)
	    # Clamp the upper neighbour so that p == 100 stays inside the list.
	    above = min(below + 1, last)

	    base = ordered[below]
	    return base + (ordered[above] - base) * (rank - below)


	def _report_metric(
	    name: str,
	    scores: list[float],
	    threshold: float,
	) -> None:
	    """Print a one-line distribution summary for one metric.

	    The line shows the min, 25th percentile, median, 75th percentile and max
	    of ``scores``, the threshold, and the share of scores strictly below the
	    threshold. The line is suffixed with "← FLAG" when that share exceeds
	    AT_RISK_WARN.

	    Args:
	        name: Metric label, left-aligned in a 22-character column.
	        scores: Scores to summarise; may be empty.
	        threshold: Cut-off each score is compared against.
	    """
	    if not scores:
	        # Without this guard the at-risk ratio divides by zero (and min/max
	        # would raise) when no scores were collected, e.g. an empty dataset.
	        print(f" {name:<22} no scores to report")
	        return

	    at_risk = sum(1 for s in scores if s < threshold) / len(scores)
	    flag = " ← FLAG" if at_risk > AT_RISK_WARN else ""
	    print(
	        f" {name:<22} "
	        f"min={min(scores):.3f} "
	        f"p25={_percentile(scores, 25):.3f} "
	        f"median={statistics.median(scores):.3f} "
	        f"p75={_percentile(scores, 75):.3f} "
	        f"max={max(scores):.3f} "
	        f"threshold={threshold} "
	        f"at_risk={at_risk:.0%}{flag}"
	    )


	def run() -> None:
	    """Grade every golden pair and print per-metric score distributions.

	    Each pair's ``expected_answer`` is graded as if it were the model
	    response. Faithfulness is graded against the pair's ``expected_contains``
	    entries joined into a single context string. Pairs whose faithfulness
	    detail contains "Refusal" are counted separately and left out of the
	    faithfulness distribution.
	    """
	    # A null `pairs:` entry in the YAML loads as None; treat it as empty.
	    pairs = _load_pairs() or []
	    print(f"\nCalibrating against {len(pairs)} pairs from golden-dataset.yaml\n")

	    relevancy_scores: list[float] = []
	    faithfulness_scores: list[float] = []
	    skipped_refusals = 0

	    for pair in pairs:
	        question = pair["question"]
	        answer = pair["expected_answer"].strip()
	        # `expected_contains` may be missing or explicitly null; both mean
	        # "no context" and must not reach str.join as None.
	        context = "\n\n".join(pair.get("expected_contains") or [])

	        relevancy = grade_answer_relevancy(question, answer)
	        relevancy_scores.append(relevancy.score)

	        faithfulness = grade_faithfulness(answer, context)
	        if "Refusal" in faithfulness.detail:
	            skipped_refusals += 1
	        else:
	            faithfulness_scores.append(faithfulness.score)

	    print("Score distributions (expected_answer used as model response):\n")
	    # Guard both metrics alike: reporting on an empty score list would
	    # divide by zero when computing the at-risk share.
	    if relevancy_scores:
	        _report_metric("answer_relevancy", relevancy_scores, RELEVANCY_THRESHOLD)
	    else:
	        print(" answer_relevancy — no pairs in dataset, no scores")
	    if faithfulness_scores:
	        _report_metric("faithfulness", faithfulness_scores, FAITHFULNESS_THRESHOLD)
	    else:
	        print(" faithfulness — all pairs triggered refusal detection, no NLI scores")

	    if skipped_refusals:
	        print(f"\n ({skipped_refusals} pairs skipped for faithfulness — refusal responses)")

	    print()


	# Run the calibration report only when executed as a script, not on import.
	if __name__ == "__main__":
	run()