CrescendAI
/

piano-eval

Model card Files Files and versions

piano-eval / models /calibration.py

Jai-D's picture

Upload folder using huggingface_hub

bfc6d2a verified 2 months ago

history blame contribute delete

4.03 kB

	"""MAESTRO-based calibration for performance predictions.

	Normalizes raw model predictions relative to professional MAESTRO recordings,
	making scores more interpretable for end users.
	"""

	import numpy as np
	from typing import Dict

	from constants import MAESTRO_CALIBRATION, PERCEPIANO_DIMENSIONS


	def calibrate_predictions(
	    raw_predictions: np.ndarray,
	    method: str = "percentile",
	) -> np.ndarray:
	    """Calibrate raw predictions using MAESTRO professional benchmarks.

	    Entry ``i`` of ``raw_predictions`` is calibrated with the statistics
	    stored in ``MAESTRO_CALIBRATION`` under ``PERCEPIANO_DIMENSIONS[i]``.

	    Args:
	        raw_predictions: Raw model outputs, one value per entry of
	            ``PERCEPIANO_DIMENSIONS``, nominally in the range ~[0, 1].
	        method: Calibration method:
	            - "percentile": Linear rescale where the MAESTRO 5th
	              percentile maps to 0 and the 95th percentile maps to 1,
	              so 0.5 is the midpoint of that band. Results are not
	              clipped and leave [0, 1] for scores outside the band.
	            - "zscore": ``(score - mean) / std`` relative to the
	              MAESTRO statistics.
	            Any other value passes the raw scores through unchanged.

	    Returns:
	        Floating-point array shaped like ``raw_predictions``. A floating
	        input keeps its dtype; any other input is calibrated as float64.
	        Dimensions without an entry in ``MAESTRO_CALIBRATION`` keep their
	        raw score. A non-positive percentile band yields 0.5 and a
	        non-positive standard deviation yields 0.0.
	    """
	    raw = np.asarray(raw_predictions)

	    # Always calibrate into a floating-point buffer: np.zeros_like would
	    # inherit an integer dtype from the input and silently truncate the
	    # calibrated values on assignment.
	    out_dtype = raw.dtype if np.issubdtype(raw.dtype, np.floating) else np.float64
	    calibrated = np.zeros(raw.shape, dtype=out_dtype)

	    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
	        raw_score = raw[i]

	        if dim not in MAESTRO_CALIBRATION:
	            # No reference statistics for this dimension: keep the raw
	            # score (not expected with properly configured data).
	            calibrated[i] = raw_score
	            continue

	        stats = MAESTRO_CALIBRATION[dim]

	        if method == "percentile":
	            # Scale so MAESTRO 5th percentile = 0 and 95th percentile = 1.
	            p5 = stats["p5"]
	            p95 = stats["p95"]
	            range_width = p95 - p5
	            # A degenerate band cannot be scaled; report the midpoint.
	            calibrated[i] = (raw_score - p5) / range_width if range_width > 0 else 0.5
	        elif method == "zscore":
	            # Standard score relative to the MAESTRO mean/std.
	            mean = stats["mean"]
	            std = stats["std"]
	            # Zero spread carries no information; report "at the mean".
	            calibrated[i] = (raw_score - mean) / std if std > 0 else 0.0
	        else:
	            # Unrecognised method: leave the score uncalibrated.
	            calibrated[i] = raw_score

	    return calibrated


	def predictions_to_calibrated_dict(
	    raw_predictions: np.ndarray,
	) -> Dict[str, Dict[str, float]]:
	    """Convert raw predictions to a dict with both raw and calibrated scores.

	    Scores are calibrated with ``calibrate_predictions`` using the
	    "percentile" method and reported per entry of
	    ``PERCEPIANO_DIMENSIONS``.

	    Args:
	        raw_predictions: Raw model outputs, one value per entry of
	            ``PERCEPIANO_DIMENSIONS``.

	    Returns:
	        Dict keyed by dimension name with structure:
	        {
	            "timing": {"raw": 0.65, "calibrated": 0.42, "percentile_rank": 42},
	            ...
	        }
	        "raw" is the input rounded to 4 decimals. "calibrated" is the
	        calibrated score clamped to [0, 1] and rounded to 4 decimals.
	        "percentile_rank" is the clamped score times 100, rounded to the
	        nearest integer; it is a display value on the linear calibration
	        scale, not a statistical percentile.
	    """
	    calibrated = calibrate_predictions(raw_predictions, method="percentile")
	    result = {}

	    for dim, raw_value, cal_value in zip(
	        PERCEPIANO_DIMENSIONS, raw_predictions, calibrated
	    ):
	        raw_score = float(raw_value)

	        # Clamp once so the reported score and the rank derive from the
	        # same value.
	        # NOTE(review): a NaN score comes out of this clamp as 1.0 because
	        # comparisons with NaN are False; confirm whether NaN predictions
	        # should be rejected upstream.
	        clamped = max(0.0, min(1.0, float(cal_value)))

	        # Round to nearest instead of truncating: binary float error makes
	        # e.g. 0.29 * 100 == 28.999999999999996, which int() would turn
	        # into 28 and contradict the reported calibrated score of 0.29.
	        percentile_rank = int(round(clamped * 100))

	        result[dim] = {
	            "raw": round(raw_score, 4),
	            "calibrated": round(clamped, 4),
	            "percentile_rank": percentile_rank,
	        }

	    return result


	def get_calibration_context() -> str:
	    """Return the fixed guide for reading calibrated scores.

	    The text is meant to be supplied to an LLM as context alongside the
	    calibrated scores.

	    Returns:
	        Multi-line string describing what 0.0, 0.5 and 1.0 mean on the
	        calibrated scale.
	    """
	    interpretation = """Score Interpretation (calibrated relative to 500 professional MAESTRO recordings):
	- 0.0 = Performance at the 5th percentile of professionals (lower end)
	- 0.5 = Performance at the 50th percentile of professionals (average professional level)
	- 1.0 = Performance at the 95th percentile of professionals (exceptional)
	- Scores can exceed [0, 1] for truly exceptional or below-average performances

	Note: These scores compare against competition-level professional pianists.
	A calibrated score of 0.5 represents professional-level competency."""
	    return interpretation