piano-eval / models /calibration.py
Jai-D's picture
Upload folder using huggingface_hub
bfc6d2a verified
"""MAESTRO-based calibration for performance predictions.
Normalizes raw model predictions relative to professional MAESTRO recordings,
making scores more interpretable for end users.
"""
import numpy as np
from typing import Dict
from constants import MAESTRO_CALIBRATION, PERCEPIANO_DIMENSIONS
def calibrate_predictions(
    raw_predictions: np.ndarray,
    method: str = "percentile",
) -> np.ndarray:
    """Calibrate raw predictions against the MAESTRO reference statistics.

    Each entry of ``PERCEPIANO_DIMENSIONS`` is looked up in
    ``MAESTRO_CALIBRATION`` and the score at the same index of
    ``raw_predictions`` is rescaled with that dimension's statistics.

    Args:
        raw_predictions: Raw model outputs, one score per entry of
            ``PERCEPIANO_DIMENSIONS`` (indexed in the same order).
        method: Calibration method:
            - "percentile": linear rescale so that the dimension's ``p5``
              statistic maps to 0 and its ``p95`` statistic maps to 1
              (0.5 is therefore the midpoint of that range). Results are
              not clamped and may fall outside [0, 1].
            - "zscore": ``(score - mean) / std`` using the dimension's
              ``mean`` and ``std`` statistics.
            Any other value leaves the raw scores unchanged.

    Returns:
        Array shaped like ``raw_predictions`` holding the calibrated scores.
        Floating-point inputs keep their dtype; any other dtype (for example
        integers) is promoted to float64 so results are not truncated.
    """
    raw = np.asanyarray(raw_predictions)
    # zeros_like() copies the input dtype by default; with an integer input
    # that would silently truncate every calibrated value on assignment.
    out_dtype = raw.dtype if np.issubdtype(raw.dtype, np.inexact) else np.float64
    calibrated = np.zeros_like(raw, dtype=out_dtype)

    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
        raw_score = raw[i]

        if dim not in MAESTRO_CALIBRATION:
            # No reference statistics for this dimension: pass the raw
            # score through rather than failing.
            calibrated[i] = raw_score
            continue

        stats = MAESTRO_CALIBRATION[dim]

        if method == "percentile":
            p5 = stats["p5"]
            range_width = stats["p95"] - p5
            if range_width > 0:
                calibrated[i] = (raw_score - p5) / range_width
            else:
                # Degenerate reference range: report the neutral midpoint.
                calibrated[i] = 0.5
        elif method == "zscore":
            std = stats["std"]
            if std > 0:
                calibrated[i] = (raw_score - stats["mean"]) / std
            else:
                # Zero spread in the reference data: no deviation to report.
                calibrated[i] = 0.0
        else:
            # Unknown method: fall back to the uncalibrated score.
            calibrated[i] = raw_score

    return calibrated
def predictions_to_calibrated_dict(
    raw_predictions: np.ndarray,
) -> Dict[str, Dict[str, float]]:
    """Convert raw predictions to a dict with both raw and calibrated scores.

    Scores are calibrated with the "percentile" method of
    ``calibrate_predictions`` and then clamped to [0, 1] for display.

    Args:
        raw_predictions: Raw model outputs, one score per entry of
            ``PERCEPIANO_DIMENSIONS`` (indexed in the same order).

    Returns:
        Dict keyed by dimension name, with structure:
        {
            "timing": {"raw": 0.65, "calibrated": 0.42, "percentile_rank": 42},
            ...
        }
        "raw" and "calibrated" are rounded to 4 decimals; "percentile_rank"
        is the clamped calibrated score expressed as an integer in [0, 100].
    """
    calibrated = calibrate_predictions(raw_predictions, method="percentile")

    result = {}
    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
        raw_score = float(raw_predictions[i])
        cal_score = float(calibrated[i])

        # Clamp to [0, 1] for display.
        # NOTE(review): a NaN score comes out of min()/max() as 1.0 because
        # comparisons with NaN are False; confirm whether non-finite
        # predictions should be rejected upstream.
        clamped = max(0.0, min(1.0, cal_score))

        # Round to the nearest integer instead of truncating: with int()
        # alone, float error turns e.g. 0.29 * 100 = 28.999... into 28,
        # which disagrees with the "calibrated" value reported next to it.
        percentile_rank = int(round(clamped * 100))

        result[dim] = {
            "raw": round(raw_score, 4),
            "calibrated": round(clamped, 4),
            "percentile_rank": percentile_rank,
        }

    return result
def get_calibration_context() -> str:
    """Return the fixed explanatory text on calibrated scores for LLM context.

    Returns:
        Multi-line string telling a reader how to interpret calibrated scores.
    """
    guide_lines = (
        "Score Interpretation (calibrated relative to 500 professional MAESTRO recordings):",
        "- 0.0 = Performance at the 5th percentile of professionals (lower end)",
        "- 0.5 = Performance at the 50th percentile of professionals (average professional level)",
        "- 1.0 = Performance at the 95th percentile of professionals (exceptional)",
        "- Scores can exceed [0, 1] for truly exceptional or below-average performances",
        "Note: These scores compare against competition-level professional pianists.",
        "A calibrated score of 0.5 represents professional-level competency.",
    )
    # Joined with newlines only; no trailing newline is appended.
    return "\n".join(guide_lines)