sentence-uncertainty / scripts /fit_quantiles.py
rdisipio's picture
Extract shared modules to eliminate code duplication
1a07ac9 unverified
#!/usr/bin/env python3
"""
fit_quantiles.py

Read sentence-level raw uncertainty scores produced by a
compute_uncertainty_scores_*.py script, compute quantile boundaries, and write
them to a normalizer config file. The output path should reflect the backend:

    config/uncertainty_quantiles_mc_dropout.json
    config/uncertainty_quantiles_lora_laplace.json

Usage:
    python scripts/fit_quantiles.py \\
        --infile data/uncertainty_scores_mc_dropout.jsonl \\
        --outfile config/uncertainty_quantiles_mc_dropout.json \\
        --quantiles 0.0 0.25 0.5 0.75 1.0
"""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
# Directory two levels above this file (presumably the repository root, given
# the script is invoked as scripts/fit_quantiles.py). It is placed first on
# sys.path so the `src` package import that follows resolves when the script
# is run directly. This must stay ahead of that import.
ROOT_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT_DIR))
from src.lora_training import collect_scores_from_jsonl, fit_quantiles, save_quantiles
# Configure the root logger once at import time: INFO and above, with a
# "date T time" timestamp followed by level, logger name and message.
logging.basicConfig(
    datefmt="%Y-%m-%dT%H:%M:%S",
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    level=logging.INFO,
)

# Logger named after this module; used by main() for progress and errors.
logger = logging.getLogger(__name__)
def main(args: argparse.Namespace) -> None:
    """Fit quantile boundaries from a scores file and save them.

    Args:
        args: Parsed command-line namespace providing ``infile``,
            ``outfile`` and ``quantiles``.

    The scores are collected from ``args.infile``, boundaries are fitted at
    the points listed in ``args.quantiles``, and the result is passed to
    ``save_quantiles`` together with ``args.outfile``.

    If nothing is collected from ``args.infile``, an error is logged and the
    process exits with status 1.
    """
    sentence_scores = collect_scores_from_jsonl(args.infile)
    if not sentence_scores:
        # Nothing to fit on; stop before calling the fitting helper.
        logger.error("No uncertainty scores found in %r. Aborting.", args.infile)
        sys.exit(1)

    logger.info("Collected %d sentence-level scores", len(sentence_scores))

    cut_points = fit_quantiles(sentence_scores, quantile_points=args.quantiles)

    # Log with fixed six-decimal formatting so the values line up.
    formatted_cut_points = [format(value, ".6f") for value in cut_points]
    logger.info("Boundaries: %s", formatted_cut_points)

    save_quantiles(cut_points, args.outfile)
if __name__ == "__main__":
    # Command-line entry point: build the parser, parse sys.argv, and hand
    # the resulting namespace to main().
    parser = argparse.ArgumentParser(
        description="Fit quantile normalizer from scored pairs.",
    )
    parser.add_argument("--infile", required=True)
    parser.add_argument(
        "--outfile",
        required=True,
        help="e.g. config/uncertainty_quantiles_lora_laplace.json",
    )
    # One or more floats; defaults to the five quartile points.
    parser.add_argument(
        "--quantiles",
        nargs="+",
        type=float,
        default=[0.0, 0.25, 0.5, 0.75, 1.0],
    )
    cli_args = parser.parse_args()
    main(cli_args)