File size: 2,269 Bytes
bfc6d2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""Constants for A1-Max MuQ LoRA inference handler."""

# The six performance dimensions, by name. The same six names are the keys of
# MAESTRO_CALIBRATION, and MODEL_CONFIG["num_labels"] equals the list length.
# NOTE(review): the order presumably lines up with the model's output
# columns -- confirm before reordering.
PERCEPIANO_DIMENSIONS = [
    "dynamics", "timing", "pedaling",
    "articulation", "phrasing", "interpretation",
]

# Configuration for the A1-Max model.
# Pipeline: MuQ embeddings (1024-dim) -> attention pooling -> encoder
# -> regression head.
MODEL_CONFIG = {
    # --- MuQ: which hidden layers are averaged ---
    # The end index is exclusive, so layers 9, 10, 11 and 12 are used.
    "muq_layer_start": 9,
    "muq_layer_end": 13,
    # Hidden size of each MuQ layer; equal to the head's "input_dim".
    "muq_dim": 1024,
    # --- Head ---
    "input_dim": 1024,
    "hidden_dim": 512,
    # Same count as the entries in PERCEPIANO_DIMENSIONS.
    "num_labels": 6,
    "dropout": 0.2,
    # --- Audio processing ---
    # NOTE(review): presumably the sample rate (Hz) audio is resampled to
    # -- confirm against the handler.
    "target_sr": 24000,
    # NOTE(review): presumably a cap on frames per input -- confirm.
    "max_frames": 1000,
}

# Descriptive metadata about the model, intended to be included in responses.
MODEL_INFO = {
    "name": "A1-Max MuQ LoRA",
    "type": "audio-muq-lora",
    # NOTE(review): presumably a pairwise ranking accuracy -- confirm.
    "pairwise": 0.7872,
    "description": "A1-Max: MuQ + LoRA with ListMLE, CCC, mixup, hard negative mining",
    "architecture": (
        "MuQLoRAMaxModel "
        "(MuQ L9-12 avg -> attn pool -> encoder -> 6-dim regression)"
    ),
    # NOTE(review): this tag says "L7-12" while "architecture" above and
    # MODEL_CONFIG (muq_layer_start = 9) say layers 9-12 -- confirm which
    # one is correct.
    "best_config": "A1max_r32_L7-12_ls0.1",
}

# Ensemble size: how many folds make up the ensemble.
N_FOLDS: int = 4

# Calibration statistics from MAESTRO: for every dimension, the distribution
# over 24,321 professional segments, stored as mean, std and the
# p5/p25/p50/p75/p95 points.
# Produced by model/scripts/compute_maestro_calibration.py with the A1-Max
# 4-fold ensemble.
MAESTRO_CALIBRATION = {
    "dynamics": {
        "mean": 0.560947,
        "std": 0.021063,
        "p5": 0.526612,
        "p25": 0.546136,
        "p50": 0.560859,
        "p75": 0.575372,
        "p95": 0.59573,
    },
    "timing": {
        "mean": 0.531883,
        "std": 0.028791,
        "p5": 0.480467,
        "p25": 0.512976,
        "p50": 0.534302,
        "p75": 0.552652,
        "p95": 0.575376,
    },
    "pedaling": {
        "mean": 0.590465,
        "std": 0.030438,
        "p5": 0.534399,
        "p25": 0.572243,
        "p50": 0.593731,
        "p75": 0.611854,
        "p95": 0.635053,
    },
    "articulation": {
        "mean": 0.553624,
        "std": 0.014287,
        "p5": 0.53023,
        "p25": 0.543792,
        "p50": 0.553554,
        "p75": 0.563275,
        "p95": 0.577426,
    },
    "phrasing": {
        "mean": 0.550866,
        "std": 0.013717,
        "p5": 0.528466,
        "p25": 0.541541,
        "p50": 0.550801,
        "p75": 0.560116,
        "p95": 0.573567,
    },
    "interpretation": {
        "mean": 0.564377,
        "std": 0.023457,
        "p5": 0.522434,
        "p25": 0.549302,
        "p50": 0.566195,
        "p75": 0.580981,
        "p95": 0.599733,
    },
}