|
|
""" |
|
|
Minimal threshold tuning script that reuses existing pipeline code. |
|
|
Can be run from a Jupyter notebook. |
|
|
|
|
|
Note that tuning does NOT happen on the 'dev' set, which is considered to be |
|
|
an external, unseen dataset for objective performance measurement. Threshold |
|
|
tuning happens on the 'val' set (which is a small part of the SQuAD v2.0 training set).
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
from pathlib import Path |
|
|
from src.models.bert_based_model import BertBasedQAModel |
|
|
from src.etl.squad_v2_loader import load_squad_v2_df, df_to_examples_map |
|
|
from src.evaluation.evaluator import Evaluator |
|
|
from src.utils.constants import TRAIN_DATA_PATH, DEV_DATA_PATH |
|
|
from src.pipeline.qa_runner import split_by_title, DEFAULT_VAL_SET_FRACTION |
|
|
|
|
|
|
|
|
def tune_threshold_and_report_final_perf(
    experiment_dir: str | Path,
    config_class,
    device: str,
    threshold_range: np.ndarray | None = None,
):
    """
    Tune the no-answer threshold on the validation split, then report final
    performance on the dev set.

    Tuning only ever sees the 'val' split (carved out of the SQuAD v2.0
    training set); the dev set is used exactly once, for the final
    measurement.

    Args:
        experiment_dir: Directory containing the trained model artifacts.
        config_class: Config class used to restore the model.
        device: Device string passed through to model loading (e.g. "cpu").
        threshold_range: Candidate thresholds to sweep. Defaults to 9 evenly
            spaced values in [-2, 2]. A ``None`` sentinel is used instead of
            a module-level default array so the default is not a single
            mutable object shared across all calls.

    Returns:
        Tuple of (best_threshold, model).
    """
    # Resolve the default here rather than in the signature: a default
    # np.linspace(...) would be evaluated once at import time and shared.
    if threshold_range is None:
        threshold_range = np.linspace(-2, 2, 9)

    experiment_dir = Path(experiment_dir)
    best_threshold, _, _, model = _tune_threshold_on_validation(
        experiment_dir=experiment_dir,
        config_class=config_class,
        device=device,
        threshold_range=threshold_range,
    )

    # Final, one-shot measurement on the unseen dev set with the tuned threshold.
    dev_examples = df_to_examples_map(load_squad_v2_df(DEV_DATA_PATH))
    final_predictions = model.predict(dev_examples, threshold_override=best_threshold)
    final_metrics = Evaluator().evaluate(final_predictions, dev_examples)
    print(f"Final dev set performance: {final_metrics.export_for_exp_tracking()}")

    return best_threshold, model
|
|
|
|
|
|
|
|
def _tune_threshold_on_validation(
    experiment_dir: Path,
    config_class,
    device: str,
    threshold_range: np.ndarray,
    val_fraction: float = DEFAULT_VAL_SET_FRACTION,
):
    """
    Sweep candidate no-answer thresholds on the validation split and pick the
    one with the highest F1.

    Args:
        experiment_dir: Directory containing the trained model artifacts.
        config_class: Config class used to restore the model.
        device: Device string passed through to model loading.
        threshold_range: Candidate thresholds to evaluate.
        val_fraction: Fraction of the training data (split by title) held out
            as the validation set.

    Returns:
        Tuple of (best_threshold, best_metrics, results, model), where
        ``results`` is a list of ``{"threshold", "em", "f1"}`` dicts covering
        every threshold tried.
    """
    print("=" * 70)
    print("THRESHOLD TUNING ON VALIDATION SET")
    print("=" * 70)

    model = BertBasedQAModel.load_from_experiment(
        experiment_dir, config_class, device=device
    )

    # The validation split is carved out of the TRAINING data (split by
    # title), so tuning never touches the dev set.
    df = load_squad_v2_df(TRAIN_DATA_PATH)
    _, df_val = split_by_title(df, val_fraction)
    val_examples = df_to_examples_map(df_val)

    print(f"\nValidation set: {len(val_examples)} examples")
    print(
        f"Testing {len(threshold_range)} thresholds from {threshold_range.min():.1f} to {threshold_range.max():.1f}\n"
    )

    # Hoisted out of the loop: the evaluator is stateless across calls here,
    # so there is no need to re-instantiate it per threshold.
    evaluator = Evaluator()

    best_f1 = -1
    best_threshold = None
    best_metrics = None
    results = []

    for threshold in threshold_range:
        predictions = model.predict(val_examples, threshold_override=threshold)
        metrics = evaluator.evaluate(predictions, val_examples)

        results.append(
            {"threshold": threshold, "em": metrics.exact_score, "f1": metrics.f1_score}
        )

        print(
            f"Threshold: {threshold:6.2f} | EM: {metrics.exact_score:5.2f}% | F1: {metrics.f1_score:5.2f}%"
        )

        # Strict '>' keeps the earliest (lowest) threshold on F1 ties.
        if metrics.f1_score > best_f1:
            best_f1 = metrics.f1_score
            best_threshold = threshold
            best_metrics = metrics

    assert best_metrics is not None, "No thresholds tested!"

    print("\n" + "=" * 70)
    print("BEST THRESHOLD")
    print("=" * 70)
    print(f"Threshold: {best_threshold:.2f}")
    print(f"EM: {best_metrics.exact_score:.2f}%")
    print(f"F1: {best_metrics.f1_score:.2f}%")
    print("=" * 70)

    return best_threshold, best_metrics, results, model
|
|
|