File size: 4,026 Bytes

bfc6d2a

"""MAESTRO-based calibration for performance predictions.

Normalizes raw model predictions relative to professional MAESTRO recordings,
making scores more interpretable for end users.
"""

from typing import Dict, Optional, Sequence

import numpy as np

from constants import MAESTRO_CALIBRATION, PERCEPIANO_DIMENSIONS


def calibrate_predictions(
    raw_predictions: np.ndarray,
    method: str = "percentile",
    *,
    calibration: Optional[Dict[str, Dict[str, float]]] = None,
    dimensions: Optional[Sequence[str]] = None,
) -> np.ndarray:
    """Calibrate raw predictions against reference (MAESTRO) statistics.

    Args:
        raw_predictions: Raw model outputs, one entry per dimension, in the
            same order as ``dimensions``. Array-likes are accepted.
        method: Calibration method:
            - "percentile": Linear rescale so that the reference 5th
              percentile maps to 0 and the 95th percentile maps to 1.
              Results are NOT clamped and can fall outside [0, 1].
            - "zscore": ``(raw - mean) / std`` of the reference distribution.
            - anything else: the raw score is passed through unchanged.
        calibration: Optional mapping ``dimension -> stats`` overriding
            ``MAESTRO_CALIBRATION``. Each stats mapping must provide
            ``"p5"``/``"p95"`` (percentile method) or ``"mean"``/``"std"``
            (zscore method).
        dimensions: Optional ordered dimension names overriding
            ``PERCEPIANO_DIMENSIONS``.

    Returns:
        Calibrated predictions with the same shape as the input. The result
        is always floating point: a floating input dtype is preserved, any
        other dtype is promoted to float64 so that fractional results are not
        truncated. For the "percentile" method, 0.5 is the midpoint between
        the reference 5th and 95th percentiles. Entries beyond
        ``len(dimensions)`` are left at zero.

    Raises:
        IndexError: If ``raw_predictions`` has fewer entries than there are
            dimensions.
        KeyError: If a stats mapping lacks the keys the chosen method needs.
    """
    # Resolve the defaults lazily so explicit overrides never touch the
    # module-level constants.
    if calibration is None:
        calibration = MAESTRO_CALIBRATION
    if dimensions is None:
        dimensions = PERCEPIANO_DIMENSIONS

    raw = np.asarray(raw_predictions)
    # zeros_like() would inherit an integer dtype from the input and silently
    # truncate every calibrated value; force a floating output instead.
    out_dtype = raw.dtype if np.issubdtype(raw.dtype, np.floating) else np.float64
    calibrated = np.zeros(raw.shape, dtype=out_dtype)

    for i, dim in enumerate(dimensions):
        raw_score = raw[i]

        stats = calibration.get(dim)
        if stats is None:
            # No reference stats for this dimension: fall back to the raw
            # score (should not happen with properly configured data).
            calibrated[i] = raw_score
            continue

        if method == "percentile":
            p5 = stats["p5"]
            range_width = stats["p95"] - p5
            # A degenerate (zero or negative) band cannot be rescaled;
            # report the midpoint instead of dividing by zero.
            calibrated[i] = (raw_score - p5) / range_width if range_width > 0 else 0.5

        elif method == "zscore":
            std = stats["std"]
            # Zero spread: every score sits exactly "at the mean".
            calibrated[i] = (raw_score - stats["mean"]) / std if std > 0 else 0.0

        else:
            # Unrecognized method: pass the raw score through unchanged.
            calibrated[i] = raw_score

    return calibrated


def predictions_to_calibrated_dict(
    raw_predictions: np.ndarray,
) -> Dict[str, Dict[str, float]]:
    """Convert raw predictions to a dict with both raw and calibrated scores.

    Scores are calibrated with the "percentile" method of
    ``calibrate_predictions`` and then clamped to [0, 1] for display.

    Args:
        raw_predictions: Raw model outputs, one entry per name in
            ``PERCEPIANO_DIMENSIONS`` and in the same order.

    Returns:
        Dict keyed by dimension name with structure:
        {
            "timing": {"raw": 0.65, "calibrated": 0.42, "percentile_rank": 42},
            ...
        }
        where ``raw`` and ``calibrated`` are rounded to 4 decimals and
        ``percentile_rank`` is the clamped calibrated score expressed as an
        integer in [0, 100].
    """
    calibrated = calibrate_predictions(raw_predictions, method="percentile")
    result: Dict[str, Dict[str, float]] = {}

    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
        raw_score = float(raw_predictions[i])
        # Clamp once and derive both display values from the same number so
        # they can never disagree.
        clamped = max(0.0, min(1.0, float(calibrated[i])))

        # Round to nearest instead of truncating: binary float artifacts
        # (0.29 * 100 == 28.999999999999996, or float32 model outputs such as
        # 0.41999998) would otherwise show a rank one below the calibrated
        # score.
        # NOTE(review): calibrated 0.0 / 1.0 correspond to the reference 5th /
        # 95th percentiles, so this is the position inside that band scaled to
        # 0-100, not a true population percentile - confirm the label with
        # consumers.
        percentile_rank = int(round(clamped * 100))

        result[dim] = {
            "raw": round(raw_score, 4),
            "calibrated": round(clamped, 4),
            "percentile_rank": percentile_rank,
        }

    return result


def get_calibration_context() -> str:
    """Return the fixed score-interpretation text used as LLM context.

    Returns:
        Multi-line string explaining how calibrated scores should be read.
    """
    # Kept as one entry per output line; the empty entry is the blank line
    # separating the scale description from the closing note.
    context_lines = (
        "Score Interpretation (calibrated relative to 500 professional MAESTRO recordings):",
        "- 0.0 = Performance at the 5th percentile of professionals (lower end)",
        "- 0.5 = Performance at the 50th percentile of professionals (average professional level)",
        "- 1.0 = Performance at the 95th percentile of professionals (exceptional)",
        "- Scores can exceed [0, 1] for truly exceptional or below-average performances",
        "",
        "Note: These scores compare against competition-level professional pianists.",
        "A calibrated score of 0.5 represents professional-level competency.",
    )
    return "\n".join(context_lines)