| """MAESTRO-based calibration for performance predictions. |
| |
| Normalizes raw model predictions relative to professional MAESTRO recordings, |
| making scores more interpretable for end users. |
| """ |
|
|
| import numpy as np |
| from typing import Dict |
|
|
| from constants import MAESTRO_CALIBRATION, PERCEPIANO_DIMENSIONS |
|
|
|
|
def calibrate_predictions(
    raw_predictions: np.ndarray,
    method: str = "percentile",
) -> np.ndarray:
    """Calibrate raw predictions against MAESTRO professional benchmarks.

    Each entry of ``raw_predictions`` is matched by position to the dimension
    at the same index in ``PERCEPIANO_DIMENSIONS`` and rescaled with that
    dimension's statistics from ``MAESTRO_CALIBRATION``.

    Args:
        raw_predictions: Raw model outputs, one score per dimension in
            ``PERCEPIANO_DIMENSIONS`` order (nominally in the range ~[0, 1]).
        method: Calibration method:
            - "percentile": Linear rescale so that 0 corresponds to the
              MAESTRO 5th percentile ("p5") and 1 to the 95th percentile
              ("p95"). The result is not clamped, so scores can fall
              outside [0, 1]. If p95 <= p5 the score is set to 0.5.
            - "zscore": ``(raw - mean) / std`` relative to the MAESTRO
              distribution. If std <= 0 the score is set to 0.0.
            Any other value leaves the raw scores unchanged.

    Returns:
        Array with the same shape as ``raw_predictions``. A floating input
        dtype is preserved; any other dtype is promoted to float64. For the
        "percentile" method, 0.5 is the midpoint between the p5 and p95
        values, which matches the typical professional level only when the
        distribution is roughly symmetric. Dimensions without an entry in
        ``MAESTRO_CALIBRATION`` keep their raw score.
    """
    # zeros_like would otherwise inherit an integer/bool dtype from the input
    # and silently truncate every calibrated value on assignment.
    source_dtype = np.asarray(raw_predictions).dtype
    if np.issubdtype(source_dtype, np.floating):
        out_dtype = source_dtype
    else:
        out_dtype = np.float64
    calibrated = np.zeros_like(raw_predictions, dtype=out_dtype)

    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
        raw_score = raw_predictions[i]

        if dim not in MAESTRO_CALIBRATION:
            # No benchmark statistics for this dimension: pass through.
            calibrated[i] = raw_score
            continue

        stats = MAESTRO_CALIBRATION[dim]

        if method == "percentile":
            p5 = stats["p5"]
            range_width = stats["p95"] - p5
            if range_width > 0:
                calibrated[i] = (raw_score - p5) / range_width
            else:
                # Degenerate range: report the midpoint instead of dividing
                # by zero.
                calibrated[i] = 0.5
        elif method == "zscore":
            std = stats["std"]
            if std > 0:
                calibrated[i] = (raw_score - stats["mean"]) / std
            else:
                # Zero spread: every score sits at the mean.
                calibrated[i] = 0.0
        else:
            # Unrecognised method: deliberately fall back to the raw score.
            calibrated[i] = raw_score

    return calibrated
|
|
|
|
def predictions_to_calibrated_dict(
    raw_predictions: np.ndarray,
) -> Dict[str, Dict[str, float]]:
    """Convert raw predictions to a dict with both raw and calibrated scores.

    Scores are calibrated with ``calibrate_predictions`` using the
    "percentile" method, then clamped to [0, 1] for reporting.

    Args:
        raw_predictions: Raw model outputs, one score per dimension in
            ``PERCEPIANO_DIMENSIONS`` order.

    Returns:
        Dict keyed by dimension name, with structure:
            {
                "timing": {"raw": 0.65, "calibrated": 0.42, "percentile_rank": 42},
                ...
            }
        "raw" and "calibrated" are rounded to 4 decimals. "percentile_rank"
        is the clamped calibrated score scaled to 0-100 and rounded to the
        nearest integer.

    Note:
        NaN scores are not detected here. Because of how Python's ``min`` /
        ``max`` compare against NaN, a NaN calibrated score is clamped to the
        upper bound (calibrated 1.0, rank 100); callers should validate
        model outputs beforehand.
    """
    calibrated = calibrate_predictions(raw_predictions, method="percentile")
    result = {}

    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
        raw_score = float(raw_predictions[i])
        cal_score = float(calibrated[i])

        # Clamp once so the reported score and the rank derive from the same
        # value.
        clamped = max(0.0, min(1.0, cal_score))

        # Round rather than truncate: int() would turn e.g. 0.57 * 100
        # (56.99999999999999 in floating point) into 56, disagreeing with
        # the reported calibrated value of 0.57.
        percentile_rank = int(round(clamped * 100))

        result[dim] = {
            "raw": round(raw_score, 4),
            "calibrated": round(clamped, 4),
            "percentile_rank": percentile_rank,
        }

    return result
|
|
|
|
def get_calibration_context() -> str:
    """Return the fixed guide for interpreting calibrated scores.

    The text is intended to be supplied to an LLM as context.

    Returns:
        Multi-line string describing what calibrated scores of 0.0, 0.5 and
        1.0 mean relative to the professional reference recordings.
    """
    interpretation_guide = """Score Interpretation (calibrated relative to 500 professional MAESTRO recordings):
- 0.0 = Performance at the 5th percentile of professionals (lower end)
- 0.5 = Performance at the 50th percentile of professionals (average professional level)
- 1.0 = Performance at the 95th percentile of professionals (exceptional)
- Scores can exceed [0, 1] for truly exceptional or below-average performances

Note: These scores compare against competition-level professional pianists.
A calibrated score of 0.5 represents professional-level competency."""
    return interpretation_guide
|
|