Upload 17 files

- TaikoChartEstimator/__init__.py +0 -0
- TaikoChartEstimator/constants.py +105 -0
- TaikoChartEstimator/data/__init__.py +29 -0
- TaikoChartEstimator/data/audio.py +231 -0
- TaikoChartEstimator/data/dataset.py +427 -0
- TaikoChartEstimator/data/tokenizer.py +337 -0
- TaikoChartEstimator/eval/__init__.py +21 -0
- TaikoChartEstimator/eval/evaluator.py +476 -0
- TaikoChartEstimator/eval/metrics.py +501 -0
- TaikoChartEstimator/model/__init__.py +36 -0
- TaikoChartEstimator/model/aggregator.py +383 -0
- TaikoChartEstimator/model/encoder.py +348 -0
- TaikoChartEstimator/model/heads.py +398 -0
- TaikoChartEstimator/model/losses.py +431 -0
- TaikoChartEstimator/model/model.py +374 -0
- TaikoChartEstimator/train/__init__.py +7 -0
- TaikoChartEstimator/train/__main__.py +808 -0
TaikoChartEstimator/__init__.py
ADDED
(empty file)
TaikoChartEstimator/constants.py
ADDED
@@ -0,0 +1,105 @@

```python
"""
Centralized Constants for TaikoChartEstimator

Consolidates all difficulty mappings, note types, and star ranges
to avoid duplication across modules.
"""

from typing import Dict, Tuple

# =============================================================================
# Note Types
# =============================================================================

NOTE_TYPES = [
    "Don",         # 0
    "Ka",          # 1
    "DonBig",      # 2
    "KaBig",       # 3
    "Roll",        # 4
    "RollBig",     # 5
    "Balloon",     # 6
    "BalloonAlt",  # 7
    "EndOf",       # 8
]

NOTE_TYPE_TO_ID: Dict[str, int] = {
    note_type: i for i, note_type in enumerate(NOTE_TYPES)
}
NUM_NOTE_TYPES = len(NOTE_TYPES)
PAD_TOKEN_ID = NUM_NOTE_TYPES  # 9 for padding

# =============================================================================
# Difficulty Classes
# =============================================================================

# Original 5 classes
DIFFICULTY_CLASSES = ["easy", "normal", "hard", "oni", "ura"]

# Merged classes (ura -> oni)
DIFFICULTY_CLASSES_MERGED = ["easy", "normal", "hard", "oni_ura"]
NUM_DIFFICULTY_CLASSES = len(DIFFICULTY_CLASSES)
NUM_DIFFICULTY_CLASSES_MERGED = len(DIFFICULTY_CLASSES_MERGED)

# Difficulty name -> class ID mapping (handles both cases)
DIFFICULTY_TO_ID: Dict[str, int] = {}
for i, d in enumerate(DIFFICULTY_CLASSES):
    DIFFICULTY_TO_ID[d] = i
    DIFFICULTY_TO_ID[d.capitalize()] = i

# Difficulty ordering for ranking comparisons
DIFFICULTY_ORDER: Dict[str, int] = {
    "easy": 0,
    "Easy": 0,
    "normal": 1,
    "Normal": 1,
    "hard": 2,
    "Hard": 2,
    "oni": 3,
    "Oni": 3,
    "ura": 4,
    "Ura": 4,
}

# =============================================================================
# Star Ranges per Difficulty
# =============================================================================

# Star ranges by difficulty index
STAR_RANGES_BY_ID: Dict[int, Tuple[int, int]] = {
    0: (1, 5),   # easy
    1: (1, 7),   # normal
    2: (1, 8),   # hard
    3: (1, 10),  # oni
    4: (1, 10),  # ura
}

# Star ranges by difficulty name (includes capitalized versions)
STAR_RANGES_BY_NAME: Dict[str, Tuple[int, int]] = {
    "easy": (1, 5),
    "Easy": (1, 5),
    "normal": (1, 7),
    "Normal": (1, 7),
    "hard": (1, 8),
    "Hard": (1, 8),
    "oni": (1, 10),
    "Oni": (1, 10),
    "ura": (1, 10),
    "Ura": (1, 10),
}

# =============================================================================
# Helper Functions
# =============================================================================


def merge_difficulty_class(class_id: int) -> int:
    """Merge ura (4) into oni (3) for classification."""
    return 3 if class_id == 4 else class_id


def get_difficulty_name(class_id: int, merged: bool = False) -> str:
    """Get difficulty name from class ID."""
    if merged:
        return DIFFICULTY_CLASSES_MERGED[min(class_id, 3)]
    return DIFFICULTY_CLASSES[class_id]
```
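For reference, a minimal usage sketch of these constants. It assumes the package is importable as `TaikoChartEstimator`; the printed values follow directly from the tables above.

```python
from TaikoChartEstimator.constants import (
    DIFFICULTY_TO_ID,
    STAR_RANGES_BY_NAME,
    get_difficulty_name,
    merge_difficulty_class,
)

class_id = DIFFICULTY_TO_ID["Ura"]                 # 4; capitalized keys are registered too
print(merge_difficulty_class(class_id))            # 3 -> "ura" folds into "oni"
print(get_difficulty_name(class_id, merged=True))  # "oni_ura"
print(STAR_RANGES_BY_NAME["easy"])                 # (1, 5)
```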
TaikoChartEstimator/data/__init__.py
ADDED
@@ -0,0 +1,29 @@

```python
"""
TaikoChartEstimator Data Pipeline

Provides event tokenization, dataset loading, and audio processing for
MIL-based Taiko chart difficulty estimation.
"""

from .audio import AudioProcessor
from .dataset import (
    ChartBag,
    SongGroup,
    TaikoChartDataset,
    WithinSongPairSampler,
    collate_chart_bags,
)
from .tokenizer import NOTE_TYPE_TO_ID, NOTE_TYPES, EventToken, EventTokenizer

__all__ = [
    "EventToken",
    "EventTokenizer",
    "NOTE_TYPES",
    "NOTE_TYPE_TO_ID",
    "TaikoChartDataset",
    "ChartBag",
    "SongGroup",
    "WithinSongPairSampler",
    "collate_chart_bags",
    "AudioProcessor",
]
```
TaikoChartEstimator/data/audio.py
ADDED
@@ -0,0 +1,231 @@

```python
"""
Audio Processing for Taiko Chart Estimation

Handles mel spectrogram extraction and alignment with chart events.
"""

from typing import Optional

import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T


class AudioProcessor:
    """
    Processes audio waveforms into mel spectrograms for model input.

    Features:
    - Mel spectrogram extraction with configurable parameters
    - Window extraction aligned with chart timing
    - Optional augmentation (time stretch, pitch shift)
    """

    def __init__(
        self,
        sample_rate: int = 22050,
        n_mels: int = 128,
        n_fft: int = 2048,
        hop_length: int = 512,
        f_min: float = 20.0,
        f_max: float = 8000.0,
        normalize: bool = True,
    ):
        """
        Initialize audio processor.

        Args:
            sample_rate: Target sample rate for audio
            n_mels: Number of mel frequency bins
            n_fft: FFT window size
            hop_length: Hop length for STFT
            f_min: Minimum frequency for mel filterbank
            f_max: Maximum frequency for mel filterbank
            normalize: Whether to normalize spectrograms
        """
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max
        self.normalize = normalize

        # Mel spectrogram transform
        self.mel_transform = T.MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length,
            f_min=f_min,
            f_max=f_max,
            power=2.0,
        )

        # Amplitude to dB
        self.amplitude_to_db = T.AmplitudeToDB(stype="power", top_db=80)

        # Resampler cache
        self._resamplers: dict[int, T.Resample] = {}

    def _get_resampler(self, orig_sr: int) -> T.Resample:
        """Get or create a resampler for the given source sample rate."""
        if orig_sr not in self._resamplers:
            self._resamplers[orig_sr] = T.Resample(orig_sr, self.sample_rate)
        return self._resamplers[orig_sr]

    def process_audio(
        self,
        waveform: np.ndarray | torch.Tensor,
        orig_sample_rate: int,
    ) -> torch.Tensor:
        """
        Process raw audio waveform to mel spectrogram.

        Args:
            waveform: Audio waveform array [samples] or [channels, samples]
            orig_sample_rate: Original sample rate of the audio

        Returns:
            Mel spectrogram tensor [n_mels, time_frames]
        """
        # Convert to tensor if needed
        if isinstance(waveform, np.ndarray):
            waveform = torch.from_numpy(waveform).float()

        # Ensure 2D [channels, samples]
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Convert stereo to mono
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample if needed
        if orig_sample_rate != self.sample_rate:
            resampler = self._get_resampler(orig_sample_rate)
            waveform = resampler(waveform)

        # Compute mel spectrogram
        mel_spec = self.mel_transform(waveform)

        # Convert to dB scale
        mel_spec_db = self.amplitude_to_db(mel_spec)

        # Remove channel dimension
        mel_spec_db = mel_spec_db.squeeze(0)

        # Normalize if requested
        if self.normalize:
            mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / (
                mel_spec_db.std() + 1e-8
            )

        return mel_spec_db

    def time_to_frame(self, time_sec: float) -> int:
        """Convert time in seconds to frame index."""
        return int(time_sec * self.sample_rate / self.hop_length)

    def frame_to_time(self, frame_idx: int) -> float:
        """Convert frame index to time in seconds."""
        return frame_idx * self.hop_length / self.sample_rate

    def extract_window(
        self,
        mel_spec: torch.Tensor,
        start_time: float,
        end_time: float,
        pad_value: float = 0.0,
    ) -> torch.Tensor:
        """
        Extract a time window from mel spectrogram.

        Args:
            mel_spec: Full mel spectrogram [n_mels, time_frames]
            start_time: Window start time in seconds
            end_time: Window end time in seconds
            pad_value: Value for padding if window extends beyond spectrogram

        Returns:
            Window tensor [n_mels, window_frames]
        """
        start_frame = self.time_to_frame(start_time)
        end_frame = self.time_to_frame(end_time)

        # Clamp to valid range
        start_frame = max(0, start_frame)
        end_frame = min(mel_spec.size(1), end_frame)

        window = mel_spec[:, start_frame:end_frame]

        # Pad if window is shorter than expected
        expected_frames = self.time_to_frame(end_time - start_time)
        if window.size(1) < expected_frames:
            pad_size = expected_frames - window.size(1)
            window = F.pad(window, (0, pad_size), value=pad_value)

        return window

    def extract_windows_for_instances(
        self,
        mel_spec: torch.Tensor,
        instance_times: list[tuple[float, float]],
        fixed_frames: Optional[int] = None,
    ) -> list[torch.Tensor]:
        """
        Extract mel spectrogram windows aligned with chart instances.

        Args:
            mel_spec: Full mel spectrogram [n_mels, time_frames]
            instance_times: List of (start_time, end_time) for each instance
            fixed_frames: If provided, resize all windows to this frame count

        Returns:
            List of window tensors
        """
        windows = []

        for start_time, end_time in instance_times:
            window = self.extract_window(mel_spec, start_time, end_time)

            if fixed_frames is not None and window.size(1) != fixed_frames:
                # Resize to fixed frame count
                window = F.interpolate(
                    window.unsqueeze(0),
                    size=fixed_frames,
                    mode="linear",
                    align_corners=False,
                ).squeeze(0)

            windows.append(window)

        return windows

    def compute_onset_strength(self, mel_spec: torch.Tensor) -> torch.Tensor:
        """
        Compute onset strength envelope from mel spectrogram.

        Useful for beat tracking and rhythm analysis.

        Args:
            mel_spec: Mel spectrogram [n_mels, time_frames]

        Returns:
            Onset strength envelope [time_frames]
        """
        # Compute first-order difference
        diff = torch.diff(mel_spec, dim=1)

        # Half-wave rectify (keep only positive changes)
        diff = F.relu(diff)

        # Sum across frequency bins
        onset_env = diff.sum(dim=0)

        # Pad to match original length
        onset_env = F.pad(onset_env, (1, 0))

        return onset_env
```
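A minimal sketch exercising `AudioProcessor` end to end on a synthetic waveform. The 440 Hz tone and the mismatched source rate are illustrative choices to force the resampling path; the frame counts follow from the default 22050 Hz / hop 512 settings.

```python
import torch

from TaikoChartEstimator.data.audio import AudioProcessor

proc = AudioProcessor()  # defaults: 22050 Hz target, 128 mels, hop 512

# 3 s of a 440 Hz sine at a mismatched source rate (exercises resampling)
orig_sr = 44100
t = torch.arange(3 * orig_sr) / orig_sr
waveform = torch.sin(2 * torch.pi * 440.0 * t)

mel = proc.process_audio(waveform, orig_sample_rate=orig_sr)
print(mel.shape)  # [128, ~130]: n_mels x time_frames after resampling to 22050 Hz

# Extract one second starting at t = 0.5 s; time_to_frame(1.0) -> 43 frames
window = proc.extract_window(mel, start_time=0.5, end_time=1.5)
print(window.shape)  # [128, 43]
```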
TaikoChartEstimator/data/dataset.py
ADDED
@@ -0,0 +1,427 @@

```python
"""
Taiko Chart Dataset for MIL-based Difficulty Estimation

Loads data from JacobLinCool/taiko-1000-parsed and provides:
- ChartBag: A single chart with its instances (windows)
- SongGroup: All difficulty charts for a single song (for ranking loss)
- Within-song pair sampling for training
"""

from dataclasses import dataclass, field
from typing import Iterator, Optional

import numpy as np
import torch
from datasets import Dataset as HFDataset
from datasets import load_dataset
from torch.utils.data import Dataset, Sampler

# Import from centralized constants
from ..constants import (
    DIFFICULTY_CLASSES,
    DIFFICULTY_ORDER,
    NOTE_TYPE_TO_ID,
)
from ..constants import (
    DIFFICULTY_TO_ID as DIFFICULTY_TO_CLASS_ID,
)
from ..constants import (
    STAR_RANGES_BY_NAME as STAR_RANGES,
)
from .audio import AudioProcessor
from .tokenizer import EventToken, EventTokenizer


@dataclass
class ChartBag:
    """
    A single chart represented as a bag of instances for MIL.

    Attributes:
        song_id: Unique identifier for the song
        difficulty: Difficulty level (easy/normal/hard/oni/ura)
        difficulty_class_id: Integer class ID for difficulty
        star: Star rating from label (1-10)
        is_right_censored: True if star == max for difficulty (label is lower bound)
        is_left_censored: True if star == min for difficulty (label is upper bound)
        instances: List of token tensors for each window
        instance_masks: Attention masks for each instance
        instance_times: (start, end) time for each instance
        audio_mel: Optional full mel spectrogram for the song
    """

    song_id: str
    difficulty: str
    difficulty_class_id: int
    star: int
    is_right_censored: bool
    is_left_censored: bool
    instances: list[torch.Tensor] = field(default_factory=list)
    instance_masks: list[torch.Tensor] = field(default_factory=list)
    instance_times: list[tuple[float, float]] = field(default_factory=list)
    audio_mel: Optional[torch.Tensor] = None

    def __len__(self) -> int:
        return len(self.instances)


@dataclass
class SongGroup:
    """
    All charts for a single song, for within-song ranking loss.

    Charts are ordered by difficulty (easy < normal < hard < oni < ura).
    """

    song_id: str
    charts: list[ChartBag] = field(default_factory=list)

    def get_ranking_pairs(self) -> list[tuple[ChartBag, ChartBag]]:
        """
        Get all adjacent difficulty pairs for ranking loss.

        Returns:
            List of (easier_chart, harder_chart) tuples
        """
        # Sort by difficulty order
        sorted_charts = sorted(
            self.charts, key=lambda c: DIFFICULTY_ORDER.get(c.difficulty, 0)
        )

        pairs = []
        for i in range(len(sorted_charts) - 1):
            pairs.append((sorted_charts[i], sorted_charts[i + 1]))

        return pairs


class TaikoChartDataset(Dataset):
    """
    PyTorch Dataset for Taiko chart difficulty estimation.

    Loads from HuggingFace dataset and provides ChartBag instances.
    Supports multi-scale windowing and optional audio features.
    """

    def __init__(
        self,
        split: str = "train",
        dataset_name: str = "JacobLinCool/taiko-1000-parsed",
        window_measures: list[int] = [2, 4],
        hop_measures: int = 2,
        max_instances_per_chart: int = 64,
        max_tokens_per_instance: int = 128,
        include_audio: bool = False,
        cache_dir: Optional[str] = None,
    ):
        """
        Initialize dataset.

        Args:
            split: Dataset split ("train" or "test")
            dataset_name: HuggingFace dataset name
            window_measures: Window sizes in measures for multi-scale
            hop_measures: Hop size in measures
            max_instances_per_chart: Maximum instances to keep per chart
            max_tokens_per_instance: Maximum tokens per instance
            include_audio: Whether to load and process audio
            cache_dir: Cache directory for dataset
        """
        self.split = split
        self.window_measures = window_measures
        self.hop_measures = hop_measures
        self.max_instances_per_chart = max_instances_per_chart
        self.max_tokens_per_instance = max_tokens_per_instance
        self.include_audio = include_audio

        # Initialize processors
        self.tokenizer = EventTokenizer()
        self.audio_processor = AudioProcessor() if include_audio else None

        # Load HuggingFace dataset
        self.hf_dataset = load_dataset(
            dataset_name,
            split=split,
            cache_dir=cache_dir,
        )

        # Build index of all charts (song_idx, difficulty)
        self._build_chart_index()

    def _build_chart_index(self):
        """Build an index of all available charts across songs."""
        self.chart_index: list[tuple[int, str]] = []  # (song_idx, difficulty)
        self.song_groups: dict[int, SongGroup] = {}  # song_idx -> SongGroup

        difficulties = ["easy", "normal", "hard", "oni", "ura"]

        for song_idx in range(len(self.hf_dataset)):
            song = self.hf_dataset[song_idx]
            song_id = f"song_{song_idx}"

            # Check which difficulties are available
            available_diffs = []
            for diff in difficulties:
                if diff in song and song[diff] is not None:
                    diff_data = song[diff]
                    # Check if it has valid segments
                    if diff_data.get("segments") and len(diff_data["segments"]) > 0:
                        self.chart_index.append((song_idx, diff))
                        available_diffs.append(diff)

            # Create song group
            if available_diffs:
                self.song_groups[song_idx] = SongGroup(song_id=song_id)

    def __len__(self) -> int:
        return len(self.chart_index)

    def _process_chart(
        self,
        song_data: dict,
        song_idx: int,
        difficulty: str,
    ) -> ChartBag:
        """Process a single chart into a ChartBag."""
        song_id = f"song_{song_idx}"
        diff_data = song_data[difficulty]

        # Get star rating and censoring info
        star = diff_data.get("level", 5)  # Default to 5 if missing
        min_star, max_star = STAR_RANGES.get(difficulty, (1, 10))
        is_right_censored = star >= max_star
        is_left_censored = star <= min_star

        # Get difficulty class ID
        diff_class_id = DIFFICULTY_TO_CLASS_ID.get(difficulty, 0)

        # Tokenize chart notes
        segments = diff_data.get("segments", [])
        tokens = self.tokenizer.tokenize_chart(segments)

        # Create multi-scale windows
        all_instances = []
        all_masks = []
        all_times = []

        for window_size in self.window_measures:
            windows = self.tokenizer.create_windows(
                tokens,
                window_measures=window_size,
                hop_measures=self.hop_measures,
            )

            for window_tokens in windows:
                if not window_tokens:
                    continue

                # Convert to tensor
                tensor, mask = self.tokenizer.tokens_to_tensor(
                    window_tokens,
                    max_length=self.max_tokens_per_instance,
                )

                # Pad to max length
                tensor, mask = self.tokenizer.pad_sequence(
                    tensor, mask, self.max_tokens_per_instance
                )

                # Record time range
                start_time = window_tokens[0].timestamp
                end_time = window_tokens[-1].timestamp

                all_instances.append(tensor)
                all_masks.append(mask)
                all_times.append((start_time, end_time))

        # Limit number of instances
        if len(all_instances) > self.max_instances_per_chart:
            # Sample uniformly
            indices = np.linspace(
                0, len(all_instances) - 1, self.max_instances_per_chart, dtype=int
            )
            all_instances = [all_instances[i] for i in indices]
            all_masks = [all_masks[i] for i in indices]
            all_times = [all_times[i] for i in indices]

        # Process audio if requested
        audio_mel = None
        if self.include_audio and "audio" in song_data:
            audio_data = song_data["audio"]
            if audio_data is not None:
                waveform = audio_data.get("array")
                sr = audio_data.get("sampling_rate", 22050)
                if waveform is not None:
                    audio_mel = self.audio_processor.process_audio(waveform, sr)

        return ChartBag(
            song_id=song_id,
            difficulty=difficulty,
            difficulty_class_id=diff_class_id,
            star=star,
            is_right_censored=is_right_censored,
            is_left_censored=is_left_censored,
            instances=all_instances,
            instance_masks=all_masks,
            instance_times=all_times,
            audio_mel=audio_mel,
        )

    def __getitem__(self, idx: int) -> ChartBag:
        song_idx, difficulty = self.chart_index[idx]
        song_data = self.hf_dataset[song_idx]
        return self._process_chart(song_data, song_idx, difficulty)

    def get_song_group(self, song_idx: int) -> SongGroup:
        """
        Get all charts for a song as a SongGroup.

        Args:
            song_idx: Index in the HuggingFace dataset

        Returns:
            SongGroup with all available difficulty charts
        """
        song_data = self.hf_dataset[song_idx]
        song_id = f"song_{song_idx}"
        group = SongGroup(song_id=song_id)

        for diff in DIFFICULTY_CLASSES:
            if diff in song_data and song_data[diff] is not None:
                diff_data = song_data[diff]
                if diff_data.get("segments") and len(diff_data["segments"]) > 0:
                    chart = self._process_chart(song_data, song_idx, diff)
                    group.charts.append(chart)

        return group

    def get_all_song_indices(self) -> list[int]:
        """Get list of unique song indices in the dataset."""
        return list(self.song_groups.keys())


class WithinSongBatchSampler(Sampler[list[int]]):
    """
    BatchSampler that ensures each batch contains complete song groups.

    This prevents ranking loss from being broken by batch boundaries that
    split charts from the same song into different batches.
    """

    def __init__(
        self,
        dataset: TaikoChartDataset,
        min_batch_size: int = 16,
        shuffle: bool = True,
        seed: int = 2025,
    ):
        """
        Initialize batch sampler.

        Args:
            dataset: The TaikoChartDataset
            min_batch_size: Minimum number of charts per batch
            shuffle: Whether to shuffle songs each epoch
            seed: Random seed
        """
        self.dataset = dataset
        self.min_batch_size = min_batch_size
        self.shuffle = shuffle
        self.rng = np.random.default_rng(seed)

        # Build song to chart indices mapping
        self.song_to_charts: dict[int, list[int]] = {}
        for chart_idx, (song_idx, diff) in enumerate(dataset.chart_index):
            if song_idx not in self.song_to_charts:
                self.song_to_charts[song_idx] = []
            self.song_to_charts[song_idx].append(chart_idx)

        self.song_indices = list(self.song_to_charts.keys())

    def __iter__(self) -> Iterator[list[int]]:
        """Yield batches of chart indices, with complete song groups."""
        song_order = self.song_indices.copy()
        if self.shuffle:
            self.rng.shuffle(song_order)

        current_batch: list[int] = []

        for song_idx in song_order:
            chart_indices = self.song_to_charts[song_idx].copy()
            if self.shuffle:
                self.rng.shuffle(chart_indices)

            # Add all charts from this song to current batch
            current_batch.extend(chart_indices)

            # Yield batch when we have enough samples
            if len(current_batch) >= self.min_batch_size:
                yield current_batch
                current_batch = []

        # Yield remaining samples
        if current_batch:
            yield current_batch

    def __len__(self) -> int:
        # Approximate number of batches
        total_charts = len(self.dataset)
        return max(1, total_charts // self.min_batch_size)


# Keep old class name as alias for backward compatibility
WithinSongPairSampler = WithinSongBatchSampler


def collate_chart_bags(bags: list[ChartBag], max_seq_len: int = 128) -> dict:
    """
    Collate function for ChartBag instances.

    Args:
        bags: List of ChartBag instances to collate
        max_seq_len: Fallback sequence length for padding empty instances

    Returns a dictionary suitable for model input.
    """
    # Stack instances: need to handle variable numbers
    max_instances = max(len(b.instances) for b in bags)

    # Infer sequence length from first non-empty bag, or use parameter
    inferred_seq_len = max_seq_len
    for bag in bags:
        if bag.instances:
            inferred_seq_len = bag.instances[0].shape[0]
            break

    # Pad instances to same count
    batch_instances = []
    batch_masks = []
    instance_counts = []

    for bag in bags:
        instances = bag.instances
        masks = bag.instance_masks

        # Pad to max_instances
        n_pad = max_instances - len(instances)
        if n_pad > 0:
            # Infer shape from existing instances or use fallback
            pad_shape = instances[0].shape if instances else (inferred_seq_len, 6)
            instances = instances + [torch.zeros(pad_shape) for _ in range(n_pad)]
            masks = masks + [torch.zeros(pad_shape[0]) for _ in range(n_pad)]

        batch_instances.append(torch.stack(instances))
        batch_masks.append(torch.stack(masks))
        instance_counts.append(len(bag.instances))

    return {
        "instances": torch.stack(batch_instances),  # [B, N, L, 6]
        "instance_masks": torch.stack(batch_masks),  # [B, N, L]
        "instance_counts": torch.tensor(instance_counts),  # [B]
        "difficulty_class": torch.tensor([b.difficulty_class_id for b in bags]),  # [B]
        "star": torch.tensor([b.star for b in bags], dtype=torch.float32),  # [B]
        "is_right_censored": torch.tensor([b.is_right_censored for b in bags]),  # [B]
        "is_left_censored": torch.tensor([b.is_left_censored for b in bags]),  # [B]
        "song_ids": [b.song_id for b in bags],  # List[str]
        "difficulties": [b.difficulty for b in bags],  # List[str]
    }
```
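A minimal wiring sketch for the pieces above: `WithinSongBatchSampler` hands the `DataLoader` whole song groups, and `collate_chart_bags` pads them into one batch dict. It assumes network access to the HuggingFace Hub for the first download; the batch and window settings are illustrative.

```python
from torch.utils.data import DataLoader

from TaikoChartEstimator.data.dataset import (
    TaikoChartDataset,
    WithinSongBatchSampler,
    collate_chart_bags,
)

dataset = TaikoChartDataset(split="train", window_measures=[2, 4], hop_measures=2)
sampler = WithinSongBatchSampler(dataset, min_batch_size=16, shuffle=True)

loader = DataLoader(
    dataset,
    batch_sampler=sampler,        # keeps every chart of a song in the same batch
    collate_fn=collate_chart_bags,
)

batch = next(iter(loader))
print(batch["instances"].shape)   # [B, N, L, 6]
print(batch["instance_counts"])   # real (unpadded) instance count per chart
```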
TaikoChartEstimator/data/tokenizer.py
ADDED
@@ -0,0 +1,337 @@

```python
"""
Event Tokenizer for Taiko Chart Notes

Converts raw chart note data into event tokens suitable for sequence modeling.
Handles 9 note types with continuous features (BPM, scroll, timestamp, duration).
"""

from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch

# Import from centralized constants
from ..constants import (
    DIFFICULTY_ORDER,
    NOTE_TYPE_TO_ID,
    NOTE_TYPES,
    PAD_TOKEN_ID,
)
from ..constants import (
    STAR_RANGES_BY_NAME as STAR_RANGES,
)


@dataclass
class EventToken:
    """A single event token representing a note or event in the chart."""

    timestamp: float  # Absolute time in seconds
    beat_position: float  # Position within the measure (0-1)
    note_type: int  # ID from NOTE_TYPE_TO_ID
    duration: float  # Duration for rolls/balloons (0 for regular notes)
    bpm: float  # Current BPM at this event
    scroll: float  # Scroll speed multiplier
    gogo: bool  # Whether in GOGO time (increased scoring)

    def to_tensor(self) -> torch.Tensor:
        """Convert to tensor representation [type_id, beat_pos, duration, bpm, scroll, gogo]."""
        return torch.tensor(
            [
                self.note_type,
                self.beat_position,
                self.duration,
                self.bpm,
                self.scroll,
                float(self.gogo),
            ],
            dtype=torch.float32,
        )


class EventTokenizer:
    """
    Tokenizes Taiko chart data into event token sequences.

    Features:
    - Extracts note events from segments
    - Computes beat-relative positions
    - Normalizes continuous features (BPM, scroll)
    - Creates beat-aligned windows for MIL instances
    """

    def __init__(
        self,
        bpm_mean: float = 150.0,
        bpm_std: float = 50.0,
        scroll_mean: float = 1.0,
        scroll_std: float = 0.5,
        max_duration: float = 4.0,  # Max roll/balloon duration in beats
    ):
        self.bpm_mean = bpm_mean
        self.bpm_std = bpm_std
        self.scroll_mean = scroll_mean
        self.scroll_std = scroll_std
        self.max_duration = max_duration

    def tokenize_chart(self, segments: list[dict]) -> list[EventToken]:
        """
        Convert chart segments to a list of EventTokens.

        Args:
            segments: List of segment dicts from the dataset

        Returns:
            List of EventToken objects, sorted by timestamp
        """
        tokens = []

        for segment in segments:
            segment_start = segment["timestamp"]
            measure_num = segment.get("measure_num", 4)
            measure_den = segment.get("measure_den", 4)
            notes = segment.get("notes", [])

            for note in notes:
                note_type_str = note.get("note_type", "Don")
                if note_type_str not in NOTE_TYPE_TO_ID:
                    continue  # Skip unknown note types

                # Calculate beat position within measure
                note_time = note.get("timestamp", segment_start)

                # Estimate beat position (simplified - assuming 4/4)
                beat_in_measure = (
                    (note_time - segment_start) * note.get("bpm", 120) / 60
                ) % measure_num
                beat_position = (
                    beat_in_measure / measure_num if measure_num > 0 else 0.0
                )

                # Calculate duration for long notes
                duration = 0.0
                if note_type_str in ["Roll", "RollBig", "Balloon", "BalloonAlt"]:
                    # Duration will be until EndOf, but we estimate from context
                    duration = note.get("delay", 0.0)  # Use delay as duration hint

                token = EventToken(
                    timestamp=note_time,
                    beat_position=beat_position,
                    note_type=NOTE_TYPE_TO_ID[note_type_str],
                    duration=min(duration, self.max_duration),
                    bpm=note.get("bpm", 120.0),
                    scroll=note.get("scroll", 1.0),
                    gogo=note.get("gogo", False),
                )
                tokens.append(token)

        # Sort by timestamp
        tokens.sort(key=lambda t: t.timestamp)
        return tokens

    def compute_note_density(
        self, tokens: list[EventToken], window_sec: float = 1.0
    ) -> list[float]:
        """
        Compute local note density for each token (notes per second in window).

        Args:
            tokens: List of EventTokens
            window_sec: Window size in seconds for density calculation

        Returns:
            List of density values, one per token
        """
        if not tokens:
            return []

        timestamps = np.array([t.timestamp for t in tokens])
        densities = []

        for i, t in enumerate(tokens):
            # Count notes in window centered on this note
            window_start = t.timestamp - window_sec / 2
            window_end = t.timestamp + window_sec / 2
            count = np.sum((timestamps >= window_start) & (timestamps <= window_end))
            density = count / window_sec
            densities.append(density)

        return densities

    def create_windows(
        self,
        tokens: list[EventToken],
        window_measures: int = 4,
        hop_measures: int = 2,
        default_bpm: float = 120.0,
    ) -> list[list[EventToken]]:
        """
        Create beat-aligned windows from token sequence, respecting BPM changes.

        Windows are created within BPM-consistent segments to ensure proper
        beat alignment. This prevents window boundaries from falling on
        off-beats when BPM changes occur.

        Args:
            tokens: List of EventTokens
            window_measures: Window size in measures
            hop_measures: Hop size in measures
            default_bpm: Default BPM if not available

        Returns:
            List of token subsequences (windows)
        """
        if not tokens:
            return []

        # Split tokens by BPM changes
        segments = self._split_by_bpm(tokens, threshold=5.0)

        all_windows = []
        for segment_tokens in segments:
            if not segment_tokens:
                continue

            # Use this segment's BPM for window calculation
            segment_bpm = (
                segment_tokens[0].bpm if segment_tokens[0].bpm > 0 else default_bpm
            )
            beats_per_measure = 4  # Assuming 4/4 time
            measure_duration = (beats_per_measure * 60) / segment_bpm

            window_duration = window_measures * measure_duration
            hop_duration = hop_measures * measure_duration

            # Create windows within this segment
            start_time = segment_tokens[0].timestamp
            end_time = segment_tokens[-1].timestamp
            current_start = start_time

            while current_start < end_time:
                window_end = current_start + window_duration

                # Get tokens in this window
                window_tokens = [
                    t
                    for t in segment_tokens
                    if current_start <= t.timestamp < window_end
                ]

                if window_tokens:  # Only add non-empty windows
                    all_windows.append(window_tokens)

                current_start += hop_duration

        return all_windows

    def _split_by_bpm(
        self,
        tokens: list[EventToken],
        threshold: float = 5.0,
    ) -> list[list[EventToken]]:
        """
        Split token list into segments with consistent BPM.

        Args:
            tokens: List of EventTokens sorted by timestamp
            threshold: BPM difference threshold to trigger a new segment

        Returns:
            List of token lists, one per BPM segment
        """
        if not tokens:
            return []

        segments = []
        current_segment = [tokens[0]]
        current_bpm = tokens[0].bpm

        for token in tokens[1:]:
            if abs(token.bpm - current_bpm) > threshold:
                # BPM changed significantly, start new segment
                if current_segment:
                    segments.append(current_segment)
                current_segment = [token]
                current_bpm = token.bpm
            else:
                current_segment.append(token)

        # Don't forget the last segment
        if current_segment:
            segments.append(current_segment)

        return segments

    def tokens_to_tensor(
        self,
        tokens: list[EventToken],
        max_length: Optional[int] = None,
        normalize: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Convert token list to padded tensor batch.

        Args:
            tokens: List of EventTokens
            max_length: Maximum sequence length (None = no limit)
            normalize: Whether to normalize continuous features

        Returns:
            Tuple of (token_tensor, attention_mask)
            token_tensor: [seq_len, 6] - [type, beat_pos, duration, bpm, scroll, gogo]
            attention_mask: [seq_len] - 1 for real tokens, 0 for padding
        """
        if not tokens:
            # Return empty tensors
            return torch.zeros(1, 6), torch.zeros(1)

        # Truncate if needed
        if max_length is not None and len(tokens) > max_length:
            tokens = tokens[:max_length]

        # Stack token tensors
        tensor = torch.stack([t.to_tensor() for t in tokens])

        if normalize:
            # Normalize BPM (column 3)
            tensor[:, 3] = (tensor[:, 3] - self.bpm_mean) / self.bpm_std
            # Normalize scroll (column 4)
            tensor[:, 4] = (tensor[:, 4] - self.scroll_mean) / self.scroll_std

        # Create attention mask (all 1s for real tokens)
        mask = torch.ones(len(tokens))

        return tensor, mask

    def pad_sequence(
        self,
        tensor: torch.Tensor,
        mask: torch.Tensor,
        target_length: int,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Pad tensor and mask to target length.

        Args:
            tensor: [seq_len, 6] token tensor
            mask: [seq_len] attention mask
            target_length: Target sequence length

        Returns:
            Padded tensor and mask
        """
        current_length = tensor.size(0)

        if current_length >= target_length:
            return tensor[:target_length], mask[:target_length]

        # Pad tensor
        pad_length = target_length - current_length
        pad_tensor = torch.zeros(pad_length, tensor.size(1))
        pad_tensor[:, 0] = PAD_TOKEN_ID  # Set type to PAD

        padded_tensor = torch.cat([tensor, pad_tensor], dim=0)
        padded_mask = torch.cat([mask, torch.zeros(pad_length)], dim=0)

        return padded_tensor, padded_mask
```
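A minimal round-trip sketch for the tokenizer on a hand-written segment. The dict layout mirrors what `tokenize_chart` reads above; the concrete notes and timings are illustrative. At 150 BPM a 2-measure window spans 3.2 s, so all three notes land in a single window.

```python
from TaikoChartEstimator.data.tokenizer import EventTokenizer

segments = [
    {
        "timestamp": 0.0,
        "measure_num": 4,
        "measure_den": 4,
        "notes": [
            {"note_type": "Don", "timestamp": 0.0, "bpm": 150.0, "scroll": 1.0, "gogo": False},
            {"note_type": "Ka", "timestamp": 0.4, "bpm": 150.0, "scroll": 1.0, "gogo": False},
            {"note_type": "Roll", "timestamp": 0.8, "bpm": 150.0, "scroll": 1.0, "gogo": True, "delay": 1.0},
        ],
    }
]

tok = EventTokenizer()
tokens = tok.tokenize_chart(segments)
windows = tok.create_windows(tokens, window_measures=2, hop_measures=1)

tensor, mask = tok.tokens_to_tensor(windows[0], max_length=128)
tensor, mask = tok.pad_sequence(tensor, mask, target_length=128)
print(tensor.shape, int(mask.sum()))  # torch.Size([128, 6]) 3
```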
TaikoChartEstimator/eval/__init__.py
ADDED
@@ -0,0 +1,21 @@

```python
"""
TaikoChartEstimator Evaluation Package
"""

from .evaluator import Evaluator
from .metrics import (
    DecompressionMetrics,
    DifficultyMetrics,
    MILHealthMetrics,
    MonotonicityMetrics,
    StarMetrics,
)

__all__ = [
    "DifficultyMetrics",
    "StarMetrics",
    "MonotonicityMetrics",
    "DecompressionMetrics",
    "MILHealthMetrics",
    "Evaluator",
]
```
TaikoChartEstimator/eval/evaluator.py
ADDED
@@ -0,0 +1,476 @@

```python
"""
Evaluator for TaikoChartEstimator

Orchestrates evaluation across all metric types and generates reports.
"""

import argparse
import json
from datetime import datetime
from pathlib import Path
from typing import Optional

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from ..data import TaikoChartDataset, collate_chart_bags
from ..model import ModelConfig, TaikoChartEstimator
from .metrics import (
    DecompressionMetrics,
    DifficultyMetrics,
    MILHealthMetrics,
    MonotonicityMetrics,
    StarMetrics,
)


class Evaluator:
    """
    Comprehensive evaluator for TaikoChartEstimator.

    Runs all metrics and generates detailed reports.
    """

    def __init__(
        self,
        model: TaikoChartEstimator,
        device: torch.device = torch.device("cpu"),
    ):
        self.model = model
        self.device = device

        # Initialize metric calculators
        self.difficulty_metrics = DifficultyMetrics()
        self.star_metrics = StarMetrics()
        self.monotonicity_metrics = MonotonicityMetrics()
        self.decompression_metrics = DecompressionMetrics()
        self.mil_health_metrics = MILHealthMetrics()

    @torch.no_grad()
    def run_inference(
        self,
        dataloader: DataLoader,
    ) -> dict:
        """
        Run inference on entire dataset and collect predictions.

        Returns:
            Dict with all predictions and metadata
        """
        self.model.eval()

        results = {
            "pred_difficulty_class": [],
            "true_difficulty_class": [],
            "pred_star": [],
            "true_star": [],
            "raw_score": [],
            "song_ids": [],
            "difficulties": [],
            "is_right_censored": [],
            "is_left_censored": [],
            "attention_weights": [],
            "instance_counts": [],
        }

        for batch in tqdm(dataloader, desc="Running inference"):
            instances = batch["instances"].to(self.device)
            instance_masks = batch["instance_masks"].to(self.device)
            instance_counts = batch["instance_counts"].to(self.device)
            difficulty_class = batch["difficulty_class"].to(self.device)

            output = self.model(
                instances,
                instance_masks,
                instance_counts,
                difficulty_hint=difficulty_class,
                return_attention=True,
            )

            # Collect predictions
            results["pred_difficulty_class"].extend(
                output.difficulty_logits.argmax(dim=-1).cpu().numpy()
            )
            results["true_difficulty_class"].extend(batch["difficulty_class"].numpy())
            results["pred_star"].extend(output.raw_star.cpu().numpy())
            results["true_star"].extend(batch["star"].numpy())
            results["raw_score"].extend(output.raw_score.cpu().numpy())
            results["song_ids"].extend(batch["song_ids"])
            results["difficulties"].extend(batch["difficulties"])
            results["is_right_censored"].extend(batch["is_right_censored"].numpy())
            results["is_left_censored"].extend(batch["is_left_censored"].numpy())
            results["instance_counts"].extend(instance_counts.cpu().numpy())

            # Collect attention weights (average across branches if multi-branch)
            if "average_attention" in output.attention_info:
                results["attention_weights"].extend(
                    output.attention_info["average_attention"].cpu().numpy()
                )

        # Convert to numpy arrays
        for key in [
            "pred_difficulty_class",
            "true_difficulty_class",
            "pred_star",
            "true_star",
            "raw_score",
            "is_right_censored",
            "is_left_censored",
            "instance_counts",
        ]:
            results[key] = np.array(results[key])

        if results["attention_weights"]:
            results["attention_weights"] = np.stack(results["attention_weights"])

        return results

    def compute_all_metrics(self, results: dict) -> dict:
        """
        Compute all metrics from inference results.

        Returns:
            Dict with all metrics organized by category
        """
        all_metrics = {}

        # Difficulty classification metrics
        all_metrics["difficulty"] = self.difficulty_metrics.compute(
            results["pred_difficulty_class"],
            results["true_difficulty_class"],
        )

        # Star regression metrics
        all_metrics["star"] = self.star_metrics.compute(
            results["pred_star"],
            results["true_star"],
            results["true_difficulty_class"],
            results["is_right_censored"],
            results["is_left_censored"],
        )

        # Monotonicity metrics
        all_metrics["monotonicity"] = self.monotonicity_metrics.compute(
            results["raw_score"],
            results["song_ids"],
            results["difficulties"],
        )

        # Decompression metrics
        all_metrics["decompression"] = self.decompression_metrics.compute(
            results["pred_star"],
            results["true_star"],
            results["true_difficulty_class"],
        )

        # MIL health metrics
        if len(results.get("attention_weights", [])) > 0:
            all_metrics["mil_health"] = self.mil_health_metrics.compute(
                results["attention_weights"],
                results["instance_counts"],
            )

        return all_metrics

    def generate_report(
        self,
        metrics: dict,
        output_path: Optional[Path] = None,
    ) -> str:
        """
        Generate a human-readable report from metrics.

        Returns:
            Report as markdown string
        """
        lines = []
        lines.append("# TaikoChartEstimator Evaluation Report")
        lines.append(f"\nGenerated: {datetime.now().isoformat()}\n")

        # Difficulty Classification
        lines.append("## Difficulty Classification")
        lines.append("")
        d_metrics = metrics.get("difficulty", {})
        lines.append(f"- **Accuracy**: {d_metrics.get('accuracy', 0):.4f}")
```
|
| 197 |
+
lines.append(
|
| 198 |
+
f"- **Balanced Accuracy**: {d_metrics.get('balanced_accuracy', 0):.4f}"
|
| 199 |
+
)
|
| 200 |
+
lines.append(f"- **Macro F1**: {d_metrics.get('macro_f1', 0):.4f}")
|
| 201 |
+
lines.append(
|
| 202 |
+
f"- **±1 Accuracy**: {d_metrics.get('plus_minus_1_accuracy', 0):.4f}"
|
| 203 |
+
)
|
| 204 |
+
lines.append("")
|
| 205 |
+
|
| 206 |
+
# Per-class F1
|
| 207 |
+
lines.append("### Per-Class F1")
|
| 208 |
+
for cls in ["easy", "normal", "hard", "oni", "ura"]:
|
| 209 |
+
f1 = d_metrics.get(f"f1_{cls}", 0)
|
| 210 |
+
lines.append(f"- {cls.capitalize()}: {f1:.4f}")
|
| 211 |
+
lines.append("")
|
| 212 |
+
|
| 213 |
+
# Star Regression
|
| 214 |
+
lines.append("## Star Rating Prediction")
|
| 215 |
+
lines.append("")
|
| 216 |
+
s_metrics = metrics.get("star", {})
|
| 217 |
+
lines.append("### Overall")
|
| 218 |
+
lines.append(f"- **MAE**: {s_metrics.get('mae', 0):.4f}")
|
| 219 |
+
lines.append(f"- **RMSE**: {s_metrics.get('rmse', 0):.4f}")
|
| 220 |
+
lines.append(f"- **Spearman ρ**: {s_metrics.get('spearman_rho', 0):.4f}")
|
| 221 |
+
lines.append("")
|
| 222 |
+
|
| 223 |
+
lines.append("### Uncensored Samples")
|
| 224 |
+
lines.append(f"- **MAE**: {s_metrics.get('mae_uncensored', 0):.4f}")
|
| 225 |
+
lines.append(
|
| 226 |
+
f"- **Spearman ρ**: {s_metrics.get('spearman_rho_uncensored', 0):.4f}"
|
| 227 |
+
)
|
| 228 |
+
lines.append("")
|
| 229 |
+
|
| 230 |
+
lines.append("### Censoring Consistency")
|
| 231 |
+
lines.append(
|
| 232 |
+
f"- **Right Censor Violation Rate**: {s_metrics.get('right_censor_violation_rate', 0):.4f}"
|
| 233 |
+
)
|
| 234 |
+
lines.append(
|
| 235 |
+
f"- **Right Censor Mean Shortfall**: {s_metrics.get('right_censor_mean_shortfall', 0):.4f}"
|
| 236 |
+
)
|
| 237 |
+
lines.append(
|
| 238 |
+
f"- **Left Censor Violation Rate**: {s_metrics.get('left_censor_violation_rate', 0):.4f}"
|
| 239 |
+
)
|
| 240 |
+
lines.append("")
|
| 241 |
+
|
| 242 |
+
# Monotonicity
|
| 243 |
+
lines.append("## Within-Song Monotonicity")
|
| 244 |
+
lines.append("")
|
| 245 |
+
m_metrics = metrics.get("monotonicity", {})
|
| 246 |
+
lines.append(
|
| 247 |
+
f"- **Violation Rate**: {m_metrics.get('violation_rate', 0):.4f} ({m_metrics.get('n_violations', 0)}/{m_metrics.get('n_pairs', 0)} pairs)"
|
| 248 |
+
)
|
| 249 |
+
lines.append(
|
| 250 |
+
f"- **Mean Violation Margin**: {m_metrics.get('mean_violation_margin', 0):.4f}"
|
| 251 |
+
)
|
| 252 |
+
lines.append(
|
| 253 |
+
f"- **Mean Kendall τ (within-song)**: {m_metrics.get('mean_kendall_tau_within_song', 0):.4f}"
|
| 254 |
+
)
|
| 255 |
+
lines.append("")
|
| 256 |
+
|
| 257 |
+
# Decompression
|
| 258 |
+
lines.append("## 10-Star Decompression")
|
| 259 |
+
lines.append("")
|
| 260 |
+
dec_metrics = metrics.get("decompression", {})
|
| 261 |
+
lines.append(
|
| 262 |
+
f"- **Std (10-star predictions)**: {dec_metrics.get('std_10star', 0):.4f}"
|
| 263 |
+
)
|
| 264 |
+
lines.append(
|
| 265 |
+
f"- **Range (10-star predictions)**: {dec_metrics.get('range_10star', 0):.4f}"
|
| 266 |
+
)
|
| 267 |
+
if "p90_p50_10star" in dec_metrics:
|
| 268 |
+
lines.append(f"- **P90 - P50**: {dec_metrics.get('p90_p50_10star', 0):.4f}")
|
| 269 |
+
lines.append(f"- **P99 - P90**: {dec_metrics.get('p99_p90_10star', 0):.4f}")
|
| 270 |
+
lines.append("")
|
| 271 |
+
|
| 272 |
+
# MIL Health
|
| 273 |
+
if "mil_health" in metrics:
|
| 274 |
+
lines.append("## MIL Attention Health")
|
| 275 |
+
lines.append("")
|
| 276 |
+
mil_metrics = metrics["mil_health"]
|
| 277 |
+
lines.append(
|
| 278 |
+
f"- **Mean Attention Entropy**: {mil_metrics.get('mean_attention_entropy', 0):.4f}"
|
| 279 |
+
)
|
| 280 |
+
lines.append(
|
| 281 |
+
f"- **Mean Effective Instances**: {mil_metrics.get('mean_effective_instances', 0):.4f}"
|
| 282 |
+
)
|
| 283 |
+
lines.append(
|
| 284 |
+
f"- **Mean Top-5% Mass**: {mil_metrics.get('mean_top5_mass', 0):.4f}"
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
if mil_metrics.get("attention_collapse_warning", False):
|
| 288 |
+
lines.append("")
|
| 289 |
+
lines.append(
|
| 290 |
+
"> ⚠️ **Warning**: Attention collapse detected! "
|
| 291 |
+
"Model may be relying on too few instances."
|
| 292 |
+
)
|
| 293 |
+
lines.append("")
|
| 294 |
+
|
| 295 |
+
report = "\n".join(lines)
|
| 296 |
+
|
| 297 |
+
if output_path:
|
| 298 |
+
output_path.write_text(report)
|
| 299 |
+
|
| 300 |
+
return report
|
| 301 |
+
|
| 302 |
+
def evaluate(
|
| 303 |
+
self,
|
| 304 |
+
dataloader: DataLoader,
|
| 305 |
+
output_dir: Optional[Path] = None,
|
| 306 |
+
) -> dict:
|
| 307 |
+
"""
|
| 308 |
+
Run full evaluation pipeline.
|
| 309 |
+
|
| 310 |
+
Args:
|
| 311 |
+
dataloader: DataLoader for evaluation data
|
| 312 |
+
output_dir: Optional directory to save results
|
| 313 |
+
|
| 314 |
+
Returns:
|
| 315 |
+
Dict with all metrics
|
| 316 |
+
"""
|
| 317 |
+
# Run inference
|
| 318 |
+
results = self.run_inference(dataloader)
|
| 319 |
+
|
| 320 |
+
# Compute metrics
|
| 321 |
+
metrics = self.compute_all_metrics(results)
|
| 322 |
+
|
| 323 |
+
# Generate report
|
| 324 |
+
report = self.generate_report(metrics)
|
| 325 |
+
|
| 326 |
+
if output_dir:
|
| 327 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 328 |
+
|
| 329 |
+
# Save metrics as JSON
|
| 330 |
+
# Convert numpy types for JSON serialization
|
| 331 |
+
def convert_numpy(obj):
|
| 332 |
+
if isinstance(obj, np.ndarray):
|
| 333 |
+
return obj.tolist()
|
| 334 |
+
elif isinstance(obj, np.integer):
|
| 335 |
+
return int(obj)
|
| 336 |
+
elif isinstance(obj, np.floating):
|
| 337 |
+
return float(obj)
|
| 338 |
+
elif isinstance(obj, (np.bool_, bool)):
|
| 339 |
+
return bool(obj)
|
| 340 |
+
elif isinstance(obj, dict):
|
| 341 |
+
return {k: convert_numpy(v) for k, v in obj.items()}
|
| 342 |
+
elif isinstance(obj, list):
|
| 343 |
+
return [convert_numpy(v) for v in obj]
|
| 344 |
+
return obj
|
| 345 |
+
|
| 346 |
+
metrics_serializable = convert_numpy(metrics)
|
| 347 |
+
with open(output_dir / "metrics.json", "w") as f:
|
| 348 |
+
json.dump(metrics_serializable, f, indent=2)
|
| 349 |
+
|
| 350 |
+
# Save report
|
| 351 |
+
(output_dir / "report.md").write_text(report)
|
| 352 |
+
|
| 353 |
+
print(f"Results saved to {output_dir}")
|
| 354 |
+
|
| 355 |
+
return metrics
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def load_model_from_checkpoint(
|
| 359 |
+
checkpoint_path: Path,
|
| 360 |
+
device: torch.device,
|
| 361 |
+
) -> TaikoChartEstimator:
|
| 362 |
+
"""
|
| 363 |
+
Load model from checkpoint.
|
| 364 |
+
|
| 365 |
+
Supports two formats:
|
| 366 |
+
1. Traditional .pt checkpoint file (contains model_state_dict and config)
|
| 367 |
+
2. HuggingFace save_pretrained directory (saved via model.save_pretrained())
|
| 368 |
+
|
| 369 |
+
Args:
|
| 370 |
+
checkpoint_path: Path to checkpoint file or pretrained directory
|
| 371 |
+
device: Device to load model to
|
| 372 |
+
|
| 373 |
+
Returns:
|
| 374 |
+
Loaded TaikoChartEstimator model
|
| 375 |
+
"""
|
| 376 |
+
checkpoint_path = Path(checkpoint_path)
|
| 377 |
+
|
| 378 |
+
if checkpoint_path.is_dir():
|
| 379 |
+
# HuggingFace pretrained directory format
|
| 380 |
+
model = TaikoChartEstimator.from_pretrained(
|
| 381 |
+
checkpoint_path,
|
| 382 |
+
).to(device)
|
| 383 |
+
else:
|
| 384 |
+
# Traditional .pt checkpoint format
|
| 385 |
+
checkpoint = torch.load(checkpoint_path, map_location=device)
|
| 386 |
+
config = ModelConfig(**checkpoint["config"])
|
| 387 |
+
model = TaikoChartEstimator(config)
|
| 388 |
+
model.load_state_dict(checkpoint["model_state_dict"])
|
| 389 |
+
|
| 390 |
+
model = model.to(device)
|
| 391 |
+
model.eval()
|
| 392 |
+
|
| 393 |
+
return model
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
def main():
|
| 397 |
+
parser = argparse.ArgumentParser(description="Evaluate TaikoChartEstimator")
|
| 398 |
+
parser.add_argument(
|
| 399 |
+
"--checkpoint", type=str, required=True, help="Path to model checkpoint"
|
| 400 |
+
)
|
| 401 |
+
parser.add_argument(
|
| 402 |
+
"--dataset",
|
| 403 |
+
type=str,
|
| 404 |
+
default="JacobLinCool/taiko-1000-parsed",
|
| 405 |
+
help="HuggingFace dataset name",
|
| 406 |
+
)
|
| 407 |
+
parser.add_argument(
|
| 408 |
+
"--split", type=str, default="test", help="Dataset split to evaluate"
|
| 409 |
+
)
|
| 410 |
+
parser.add_argument("--batch-size", type=int, default=16, help="Batch size")
|
| 411 |
+
parser.add_argument(
|
| 412 |
+
"--output-dir",
|
| 413 |
+
type=str,
|
| 414 |
+
default="eval_results",
|
| 415 |
+
help="Output directory for results",
|
| 416 |
+
)
|
| 417 |
+
parser.add_argument(
|
| 418 |
+
"--device",
|
| 419 |
+
type=str,
|
| 420 |
+
default="cuda" if torch.cuda.is_available() else "cpu",
|
| 421 |
+
help="Device to use",
|
| 422 |
+
)
|
| 423 |
+
parser.add_argument(
|
| 424 |
+
"--num-workers", type=int, default=4, help="Number of data loader workers"
|
| 425 |
+
)
|
| 426 |
+
|
| 427 |
+
args = parser.parse_args()
|
| 428 |
+
|
| 429 |
+
device = torch.device(args.device)
|
| 430 |
+
|
| 431 |
+
# Load model
|
| 432 |
+
print(f"Loading model from {args.checkpoint}")
|
| 433 |
+
model = load_model_from_checkpoint(Path(args.checkpoint), device)
|
| 434 |
+
|
| 435 |
+
# Load dataset
|
| 436 |
+
print(f"Loading {args.split} dataset...")
|
| 437 |
+
dataset = TaikoChartDataset(
|
| 438 |
+
split=args.split,
|
| 439 |
+
dataset_name=args.dataset,
|
| 440 |
+
)
|
| 441 |
+
|
| 442 |
+
dataloader = DataLoader(
|
| 443 |
+
dataset,
|
| 444 |
+
batch_size=args.batch_size,
|
| 445 |
+
shuffle=False,
|
| 446 |
+
collate_fn=collate_chart_bags,
|
| 447 |
+
num_workers=args.num_workers,
|
| 448 |
+
)
|
| 449 |
+
|
| 450 |
+
print(f"Evaluating on {len(dataset)} samples...")
|
| 451 |
+
|
| 452 |
+
# Run evaluation
|
| 453 |
+
evaluator = Evaluator(model, device)
|
| 454 |
+
metrics = evaluator.evaluate(
|
| 455 |
+
dataloader,
|
| 456 |
+
output_dir=Path(args.output_dir),
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
# Print summary
|
| 460 |
+
print("\n" + "=" * 50)
|
| 461 |
+
print("EVALUATION SUMMARY")
|
| 462 |
+
print("=" * 50)
|
| 463 |
+
print(f"Difficulty Macro-F1: {metrics['difficulty']['macro_f1']:.4f}")
|
| 464 |
+
print(f"Star MAE (uncensored): {metrics['star']['mae_uncensored']:.4f}")
|
| 465 |
+
print(f"Star Spearman ρ: {metrics['star']['spearman_rho']:.4f}")
|
| 466 |
+
print(
|
| 467 |
+
f"Monotonicity Violation Rate: {metrics['monotonicity']['violation_rate']:.4f}"
|
| 468 |
+
)
|
| 469 |
+
print(
|
| 470 |
+
f"10-Star Decompression Std: {metrics['decompression'].get('std_10star', 0):.4f}"
|
| 471 |
+
)
|
| 472 |
+
print("=" * 50)
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
if __name__ == "__main__":
|
| 476 |
+
main()
|
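For orientation, here is a minimal sketch of driving the evaluator above programmatically instead of through the CLI. The checkpoint path and output directory are placeholders (not files that ship with this upload), and the top-level `TaikoChartEstimator.*` import paths assume the package layout shown in this commit:

from pathlib import Path

import torch
from torch.utils.data import DataLoader

from TaikoChartEstimator.data import TaikoChartDataset, collate_chart_bags
from TaikoChartEstimator.eval.evaluator import Evaluator, load_model_from_checkpoint

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model_from_checkpoint(Path("checkpoints/best.pt"), device)  # placeholder path

# Uses the default HuggingFace dataset configured in the Dataset class.
dataset = TaikoChartDataset(split="test")
loader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=collate_chart_bags)

metrics = Evaluator(model, device).evaluate(loader, output_dir=Path("eval_results"))
print(metrics["star"]["mae_uncensored"])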
TaikoChartEstimator/eval/metrics.py
ADDED
@@ -0,0 +1,501 @@
"""
Evaluation Metrics for TaikoChartEstimator

Comprehensive metrics covering:
- Difficulty classification
- Star rating regression (with censoring awareness)
- Monotonicity constraints
- 10-star decompression
- MIL attention health
"""

from dataclasses import dataclass, field
from typing import Optional

import numpy as np
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
)

from ..constants import STAR_RANGES_BY_ID


@dataclass
class DifficultyMetrics:
    """
    Metrics for difficulty classification (easy/normal/hard/oni/ura).

    Includes ordinal-aware metrics since difficulties are ordered.
    Note: ura (4) and oni (3) are treated as the same class for metrics.
    """

    merge_ura_oni: bool = True  # Treat ura and oni as the same class

    def _merge_classes(self, arr: np.ndarray) -> np.ndarray:
        """Merge ura (4) into oni (3) class."""
        if self.merge_ura_oni:
            arr = arr.copy()
            arr[arr == 4] = 3  # Map ura -> oni
        return arr

    def compute(
        self,
        predictions: np.ndarray,
        targets: np.ndarray,
    ) -> dict:
        """
        Compute classification metrics.

        Args:
            predictions: Predicted difficulty class indices [N]
            targets: True difficulty class indices [N]

        Returns:
            Dict with all metrics
        """
        metrics = {}

        # Merge ura and oni classes if enabled
        predictions = self._merge_classes(predictions)
        targets = self._merge_classes(targets)

        # Standard classification metrics
        metrics["accuracy"] = accuracy_score(targets, predictions)
        metrics["balanced_accuracy"] = balanced_accuracy_score(targets, predictions)
        metrics["macro_f1"] = f1_score(targets, predictions, average="macro")
        metrics["weighted_f1"] = f1_score(targets, predictions, average="weighted")

        # Per-class F1 (4 classes when merged: easy, normal, hard, oni/ura)
        per_class_f1 = f1_score(targets, predictions, average=None)
        if self.merge_ura_oni:
            class_names = ["easy", "normal", "hard", "oni_ura"]
        else:
            class_names = ["easy", "normal", "hard", "oni", "ura"]
        for i, name in enumerate(class_names):
            if i < len(per_class_f1):
                metrics[f"f1_{name}"] = per_class_f1[i]

        # Ordinal-aware metrics (difficulties are ordered)
        abs_diff = np.abs(predictions - targets)
        metrics["mean_absolute_error_ordinal"] = abs_diff.mean()
        metrics["plus_minus_1_accuracy"] = (abs_diff <= 1).mean()
        metrics["plus_minus_2_accuracy"] = (abs_diff <= 2).mean()

        # Confusion matrix
        metrics["confusion_matrix"] = confusion_matrix(targets, predictions)

        return metrics


@dataclass
class StarMetrics:
    """
    Metrics for star rating prediction with censoring awareness.

    Separates metrics for:
    - Uncensored samples (true regression quality)
    - Right-censored samples (10-star boundary)
    - Left-censored samples (1-star boundary)
    """

    star_ranges: dict = field(default_factory=lambda: STAR_RANGES_BY_ID.copy())

    def compute(
        self,
        predictions: np.ndarray,
        targets: np.ndarray,
        difficulties: np.ndarray,
        is_right_censored: Optional[np.ndarray] = None,
        is_left_censored: Optional[np.ndarray] = None,
    ) -> dict:
        """
        Compute star regression metrics.

        Args:
            predictions: Predicted star ratings [N]
            targets: Target star labels [N]
            difficulties: Difficulty class indices [N]
            is_right_censored: Boolean mask for right-censored samples
            is_left_censored: Boolean mask for left-censored samples

        Returns:
            Dict with all metrics
        """
        metrics = {}

        # Auto-detect censoring if not provided
        if is_right_censored is None or is_left_censored is None:
            is_right_censored = np.zeros(len(predictions), dtype=bool)
            is_left_censored = np.zeros(len(predictions), dtype=bool)

            for diff_idx, (min_star, max_star) in self.star_ranges.items():
                mask = difficulties == diff_idx
                is_right_censored[mask] = targets[mask] >= max_star
                is_left_censored[mask] = targets[mask] <= min_star

        # Overall metrics
        metrics["mae"] = np.abs(predictions - targets).mean()
        metrics["rmse"] = np.sqrt(((predictions - targets) ** 2).mean())

        if len(predictions) > 1:
            rho, p_value = spearmanr(predictions, targets)
            metrics["spearman_rho"] = rho
            metrics["spearman_pvalue"] = p_value
        else:
            metrics["spearman_rho"] = 0.0
            metrics["spearman_pvalue"] = 1.0

        # Uncensored samples: true regression quality
        uncensored_mask = ~(is_right_censored | is_left_censored)
        if uncensored_mask.sum() > 0:
            uncensored_preds = predictions[uncensored_mask]
            uncensored_targets = targets[uncensored_mask]

            metrics["mae_uncensored"] = np.abs(
                uncensored_preds - uncensored_targets
            ).mean()
            metrics["rmse_uncensored"] = np.sqrt(
                ((uncensored_preds - uncensored_targets) ** 2).mean()
            )

            if len(uncensored_preds) > 1:
                rho, _ = spearmanr(uncensored_preds, uncensored_targets)
                metrics["spearman_rho_uncensored"] = rho
            else:
                metrics["spearman_rho_uncensored"] = 0.0
        else:
            metrics["mae_uncensored"] = 0.0
            metrics["rmse_uncensored"] = 0.0
            metrics["spearman_rho_uncensored"] = 0.0

        # Right-censored (at max star): check violation
        if is_right_censored.sum() > 0:
            right_preds = predictions[is_right_censored]
            right_targets = targets[is_right_censored]

            # Violation: prediction below the max star bound
            violation_mask = right_preds < right_targets
            metrics["right_censor_violation_rate"] = violation_mask.mean()

            if violation_mask.sum() > 0:
                metrics["right_censor_mean_shortfall"] = (
                    right_targets[violation_mask] - right_preds[violation_mask]
                ).mean()
            else:
                metrics["right_censor_mean_shortfall"] = 0.0

            metrics["right_censor_count"] = is_right_censored.sum()
        else:
            metrics["right_censor_violation_rate"] = 0.0
            metrics["right_censor_mean_shortfall"] = 0.0
            metrics["right_censor_count"] = 0

        # Left-censored (at min star): check violation
        if is_left_censored.sum() > 0:
            left_preds = predictions[is_left_censored]
            left_targets = targets[is_left_censored]

            # Violation: prediction above the min star bound
            violation_mask = left_preds > left_targets
            metrics["left_censor_violation_rate"] = violation_mask.mean()

            if violation_mask.sum() > 0:
                metrics["left_censor_mean_overshoot"] = (
                    left_preds[violation_mask] - left_targets[violation_mask]
                ).mean()
            else:
                metrics["left_censor_mean_overshoot"] = 0.0

            metrics["left_censor_count"] = is_left_censored.sum()
        else:
            metrics["left_censor_violation_rate"] = 0.0
            metrics["left_censor_mean_overshoot"] = 0.0
            metrics["left_censor_count"] = 0

        return metrics


@dataclass
class MonotonicityMetrics:
    """
    Metrics for within-song monotonicity constraint.

    Checks that harder difficulties have higher scores/stars
    within the same song.
    """

    difficulty_order: dict = field(
        default_factory=lambda: {
            "easy": 0,
            "Easy": 0,
            "normal": 1,
            "Normal": 1,
            "hard": 2,
            "Hard": 2,
            "oni": 3,
            "Oni": 3,
            "ura": 4,
            "Ura": 4,
        }
    )

    def compute(
        self,
        raw_scores: np.ndarray,
        song_ids: list[str],
        difficulties: list[str],
    ) -> dict:
        """
        Compute monotonicity metrics.

        Args:
            raw_scores: Raw difficulty scores [N]
            song_ids: Song identifiers
            difficulties: Difficulty names

        Returns:
            Dict with metrics
        """
        metrics = {}

        # Group by song
        song_groups: dict[str, list] = {}
        for i, song_id in enumerate(song_ids):
            if song_id not in song_groups:
                song_groups[song_id] = []
            song_groups[song_id].append(
                {
                    "idx": i,
                    "difficulty": difficulties[i],
                    "score": raw_scores[i],
                }
            )

        # Count violations
        n_violations = 0
        n_pairs = 0
        violation_margins = []
        per_song_kendall_tau = []

        for song_id, charts in song_groups.items():
            if len(charts) < 2:
                continue

            # Sort by difficulty order
            sorted_charts = sorted(
                charts, key=lambda c: self.difficulty_order.get(c["difficulty"], 0)
            )

            # Check adjacent pairs
            for i in range(len(sorted_charts) - 1):
                n_pairs += 1
                score_easier = sorted_charts[i]["score"]
                score_harder = sorted_charts[i + 1]["score"]

                if score_easier >= score_harder:
                    n_violations += 1
                    violation_margins.append(score_easier - score_harder)

            # Compute Kendall's tau within song
            if len(sorted_charts) >= 2:
                actual_scores = [c["score"] for c in sorted_charts]
                expected_ranks = list(range(len(sorted_charts)))

                tau, _ = kendalltau(actual_scores, expected_ranks)
                if not np.isnan(tau):
                    per_song_kendall_tau.append(tau)

        # Aggregate metrics
        metrics["n_pairs"] = n_pairs
        metrics["n_violations"] = n_violations
        metrics["violation_rate"] = n_violations / n_pairs if n_pairs > 0 else 0.0

        if violation_margins:
            metrics["mean_violation_margin"] = np.mean(violation_margins)
            metrics["max_violation_margin"] = np.max(violation_margins)
        else:
            metrics["mean_violation_margin"] = 0.0
            metrics["max_violation_margin"] = 0.0

        if per_song_kendall_tau:
            metrics["mean_kendall_tau_within_song"] = np.mean(per_song_kendall_tau)
            metrics["min_kendall_tau_within_song"] = np.min(per_song_kendall_tau)
        else:
            metrics["mean_kendall_tau_within_song"] = 0.0
            metrics["min_kendall_tau_within_song"] = 0.0

        return metrics


@dataclass
class DecompressionMetrics:
    """
    Metrics for 10-star decompression.

    Checks if the model learns to distinguish between different
    10-star charts (which vary widely in actual difficulty).
    """

    def compute(
        self,
        predictions: np.ndarray,
        targets: np.ndarray,
        difficulties: np.ndarray,
    ) -> dict:
        """
        Compute decompression metrics for max-star samples.

        Args:
            predictions: Predicted star ratings (can exceed range)
            targets: Target star labels
            difficulties: Difficulty indices

        Returns:
            Dict with metrics
        """
        metrics = {}

        # Star ranges per difficulty
        max_stars = {0: 5, 1: 7, 2: 8, 3: 10, 4: 10}

        for diff_idx, max_star in max_stars.items():
            mask = (difficulties == diff_idx) & (targets >= max_star)

            if mask.sum() < 2:
                continue

            preds_at_max = predictions[mask]
            diff_name = ["easy", "normal", "hard", "oni", "ura"][diff_idx]

            # Spread of predictions
            metrics[f"std_{diff_name}_max"] = preds_at_max.std()

            # Percentile gaps
            if len(preds_at_max) >= 10:
                p50 = np.percentile(preds_at_max, 50)
                p90 = np.percentile(preds_at_max, 90)
                p99 = np.percentile(preds_at_max, 99)

                metrics[f"p90_p50_{diff_name}"] = p90 - p50
                metrics[f"p99_p90_{diff_name}"] = p99 - p90

            # Range
            metrics[f"range_{diff_name}_max"] = preds_at_max.max() - preds_at_max.min()
            metrics[f"n_samples_{diff_name}_max"] = mask.sum()

        # Overall 10-star decompression (oni + ura combined)
        max_10_mask = (targets >= 10) & ((difficulties == 3) | (difficulties == 4))
        if max_10_mask.sum() >= 2:
            preds_10star = predictions[max_10_mask]

            metrics["std_10star"] = preds_10star.std()
            metrics["range_10star"] = preds_10star.max() - preds_10star.min()
            metrics["n_samples_10star"] = max_10_mask.sum()

            if len(preds_10star) >= 10:
                metrics["p90_p50_10star"] = np.percentile(
                    preds_10star, 90
                ) - np.percentile(preds_10star, 50)
                metrics["p99_p90_10star"] = np.percentile(
                    preds_10star, 99
                ) - np.percentile(preds_10star, 90)

        return metrics


@dataclass
class MILHealthMetrics:
    """
    Metrics for MIL attention health.

    Monitors attention distribution to detect collapse
    (model focusing on too few instances).
    """

    def compute(
        self,
        attention_weights: np.ndarray,
        instance_counts: Optional[np.ndarray] = None,
    ) -> dict:
        """
        Compute MIL attention health metrics.

        Args:
            attention_weights: Attention weights [N_samples, N_instances]
            instance_counts: Number of valid instances per sample

        Returns:
            Dict with metrics
        """
        metrics = {}
        n_samples, n_instances = attention_weights.shape

        # Mask invalid instances if counts provided
        if instance_counts is not None:
            mask = np.arange(n_instances)[None, :] < instance_counts[:, None]
        else:
            mask = np.ones_like(attention_weights, dtype=bool)

        # Attention entropy per sample
        # Higher entropy = more distributed attention (good for MIL)
        entropies = []
        effective_ns = []
        top5_masses = []

        for i in range(n_samples):
            attn = attention_weights[i, mask[i]]
            if len(attn) == 0:
                continue

            # Normalize to sum to 1
            attn = attn / (attn.sum() + 1e-8)

            # Entropy
            entropy = -np.sum(attn * np.log(attn + 1e-8))
            entropies.append(entropy)

            # Effective number of instances (inverse of concentration)
            effective_n = 1.0 / (np.sum(attn**2) + 1e-8)
            effective_ns.append(effective_n)

            # Top-5% mass
            k = max(1, int(len(attn) * 0.05))
            top5_mass = np.sort(attn)[-k:].sum()
            top5_masses.append(top5_mass)

        if entropies:
            metrics["mean_attention_entropy"] = np.mean(entropies)
            metrics["min_attention_entropy"] = np.min(entropies)
            metrics["std_attention_entropy"] = np.std(entropies)

        if effective_ns:
            metrics["mean_effective_instances"] = np.mean(effective_ns)
            metrics["min_effective_instances"] = np.min(effective_ns)

        if top5_masses:
            metrics["mean_top5_mass"] = np.mean(top5_masses)
            metrics["max_top5_mass"] = np.max(top5_masses)

        # Health assessment
        # Collapse warning if too few effective instances
        if effective_ns:
            if instance_counts is not None:
                mean_count = float(np.mean(instance_counts))
            else:
                mean_count = float(n_instances)
            collapse_ratio = np.mean(effective_ns) / mean_count
            metrics["health_ratio"] = collapse_ratio
            metrics["attention_collapse_warning"] = (
                collapse_ratio < 0.1
            )  # Less than 10% of instances used

        return metrics
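As a sanity check of the monotonicity convention above (a harder chart of the same song must score strictly higher), here is a small self-contained example with toy numbers rather than real model output; the import path assumes the package layout of this upload:

import numpy as np

from TaikoChartEstimator.eval.metrics import MonotonicityMetrics

# One song with four charts; the oni score dips below hard -> exactly one
# violation among the three adjacent difficulty pairs.
raw_scores = np.array([1.0, 2.0, 3.5, 3.0])
song_ids = ["song_a", "song_a", "song_a", "song_a"]
difficulties = ["easy", "normal", "hard", "oni"]

m = MonotonicityMetrics().compute(raw_scores, song_ids, difficulties)
assert m["n_pairs"] == 3 and m["n_violations"] == 1
print(f"violation rate: {m['violation_rate']:.3f}")  # 0.333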
TaikoChartEstimator/model/__init__.py
ADDED
@@ -0,0 +1,36 @@
"""
TaikoChartEstimator Model Package

Provides the MIL-based difficulty estimation model with:
- Instance encoder (Transformer-based)
- MIL aggregator with multi-branch attention
- Multi-head outputs (raw score, difficulty class, star rating)
"""

from .aggregator import GatedMILAggregator, MILAggregator
from .encoder import InstanceEncoder, TCNInstanceEncoder
from .heads import DifficultyClassifier, MonotonicCalibrator, RawScoreHead
from .losses import (
    CensoredRegressionLoss,
    CurriculumScheduler,
    TotalLoss,
    WithinSongRankingLoss,
)
from .model import ModelConfig, ModelOutput, TaikoChartEstimator

__all__ = [
    "InstanceEncoder",
    "TCNInstanceEncoder",
    "MILAggregator",
    "GatedMILAggregator",
    "RawScoreHead",
    "DifficultyClassifier",
    "MonotonicCalibrator",
    "TaikoChartEstimator",
    "ModelConfig",
    "ModelOutput",
    "WithinSongRankingLoss",
    "CensoredRegressionLoss",
    "TotalLoss",
    "CurriculumScheduler",
]
TaikoChartEstimator/model/aggregator.py
ADDED
@@ -0,0 +1,383 @@
"""
MIL Bag Aggregator for Taiko Chart Estimation

Implements Multiple Instance Learning aggregation with:
- Three-way pooling (mean, top-k, attention)
- Multi-branch attention (ACMIL-inspired)
- Stochastic top-k masking to prevent attention collapse
"""

from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class AttentionBranch(nn.Module):
    """Single attention branch for multi-branch attention."""

    def __init__(self, d_instance: int, d_hidden: int = 64):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(d_instance, d_hidden),
            nn.Tanh(),
            nn.Linear(d_hidden, 1),
        )

    def forward(
        self,
        instances: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            instances: [batch, n_instances, d_instance]
            mask: [batch, n_instances], 1 for valid, 0 for padding

        Returns:
            pooled: [batch, d_instance]
            attention_weights: [batch, n_instances]
        """
        # Compute attention scores
        scores = self.attention(instances).squeeze(-1)  # [batch, n_instances]

        # Apply mask
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))

        # Softmax
        attn_weights = F.softmax(scores, dim=-1)

        # Handle all-masked case
        if mask is not None:
            attn_weights = attn_weights.masked_fill(mask == 0, 0.0)

        # Weighted sum
        pooled = (instances * attn_weights.unsqueeze(-1)).sum(dim=1)

        return pooled, attn_weights


class MILAggregator(nn.Module):
    """
    Multiple Instance Learning aggregator with ACMIL-inspired design.

    Combines three complementary pooling strategies:
    1. Mean pooling: Captures overall difficulty/stamina
    2. Top-K pooling: Captures peak difficulty segments
    3. Multi-branch attention: Learns multiple discriminative patterns

    Features stochastic top-k masking during training to prevent
    the model from relying on only a few "hardest" instances.
    """

    def __init__(
        self,
        d_instance: int = 256,
        n_branches: int = 3,
        top_k_ratio: float = 0.1,
        stochastic_mask_prob: float = 0.3,
        dropout: float = 0.1,
    ):
        """
        Initialize MIL aggregator.

        Args:
            d_instance: Dimension of instance embeddings
            n_branches: Number of attention branches
            top_k_ratio: Fraction of instances for top-k pooling
            stochastic_mask_prob: Probability of masking top instances during training
            dropout: Dropout rate
        """
        super().__init__()

        self.d_instance = d_instance
        self.n_branches = n_branches
        self.top_k_ratio = top_k_ratio
        self.stochastic_mask_prob = stochastic_mask_prob

        # Top-K scoring network
        self.topk_scorer = nn.Sequential(
            nn.Linear(d_instance, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

        # Multi-branch attention
        self.attention_branches = nn.ModuleList(
            [AttentionBranch(d_instance, d_hidden=64) for _ in range(n_branches)]
        )

        # Fusion layer: combines mean (1) + topk (1) + branches (n_branches) = 2 + n_branches
        n_pooled = 2 + n_branches
        self.fusion = nn.Sequential(
            nn.Linear(d_instance * n_pooled, d_instance * 2),
            nn.LayerNorm(d_instance * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_instance * 2, d_instance * 2),
        )

        self.output_dim = d_instance * 2

    def _mean_pool(
        self,
        instances: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Mean pooling over instances."""
        if mask is not None:
            mask_expanded = mask.unsqueeze(-1)
            pooled = (instances * mask_expanded).sum(dim=1)
            pooled = pooled / mask_expanded.sum(dim=1).clamp(min=1)
        else:
            pooled = instances.mean(dim=1)
        return pooled

    def _topk_pool(
        self,
        instances: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Top-K pooling based on learned scores.

        Returns:
            pooled: [batch, d_instance]
            topk_mask: [batch, n_instances] binary mask of selected instances
        """
        batch_size, n_instances, _ = instances.shape

        # Compute scores
        scores = self.topk_scorer(instances).squeeze(-1)  # [batch, n_instances]

        # Apply mask
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))

        # Determine k
        if mask is not None:
            valid_counts = mask.sum(dim=1)  # [batch]
            k = (valid_counts * self.top_k_ratio).clamp(min=1).long()
            max_k = k.max().item()
        else:
            k = max(1, int(n_instances * self.top_k_ratio))
            max_k = k

        # Get top-k indices
        _, topk_indices = scores.topk(max_k, dim=1)  # [batch, max_k]

        # Create topk mask
        topk_mask = torch.zeros_like(mask if mask is not None else scores)
        topk_mask.scatter_(1, topk_indices, 1.0)

        # Pool top-k instances
        if mask is not None:
            combined_mask = topk_mask * mask
        else:
            combined_mask = topk_mask

        mask_expanded = combined_mask.unsqueeze(-1)
        pooled = (instances * mask_expanded).sum(dim=1)
        pooled = pooled / mask_expanded.sum(dim=1).clamp(min=1)

        return pooled, topk_mask

    def _stochastic_topk_mask(
        self,
        instances: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Create stochastic mask that randomly drops top instances.

        This prevents attention collapse by forcing the model to
        learn from non-peak instances during training.
        """
        if not self.training:
            return mask

        batch_size, n_instances, _ = instances.shape

        # Get top-k scores
        with torch.no_grad():
            scores = self.topk_scorer(instances).squeeze(-1)
            if mask is not None:
                scores = scores.masked_fill(mask == 0, float("-inf"))

            k = max(1, int(n_instances * self.top_k_ratio))
            _, topk_indices = scores.topk(k, dim=1)

        # Create mask that drops top instances with some probability
        drop_mask = torch.ones_like(mask if mask is not None else scores)

        # For each batch, randomly decide whether to drop top instances
        drop_decision = (
            torch.rand(batch_size, device=instances.device) < self.stochastic_mask_prob
        )

        for i in range(batch_size):
            if drop_decision[i]:
                drop_mask[i, topk_indices[i]] = 0.0

        if mask is not None:
            return mask * drop_mask
        return drop_mask

    def forward(
        self,
        instances: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        return_attention: bool = True,
    ) -> tuple[torch.Tensor, dict]:
        """
        Aggregate instance embeddings to bag embedding.

        Args:
            instances: [batch, n_instances, d_instance]
            mask: [batch, n_instances], 1 for valid, 0 for padding
            return_attention: Whether to return attention weights for analysis

        Returns:
            bag_embedding: [batch, output_dim]
            attention_info: Dict with attention weights and metrics
        """
        # Apply stochastic top-k masking during training
        if self.training:
            stoch_mask = self._stochastic_topk_mask(instances, mask)
        else:
            stoch_mask = mask

        # 1. Mean pooling (stamina/overall representation)
        mean_pooled = self._mean_pool(instances, mask)

        # 2. Top-K pooling (peak difficulty)
        topk_pooled, topk_mask = self._topk_pool(instances, mask)

        # 3. Multi-branch attention pooling
        branch_outputs = []
        branch_attns = []

        for branch in self.attention_branches:
            pooled, attn = branch(instances, stoch_mask)
            branch_outputs.append(pooled)
            branch_attns.append(attn)

        # Concatenate all pooled representations
        all_pooled = [mean_pooled, topk_pooled] + branch_outputs
        concatenated = torch.cat(
            all_pooled, dim=-1
        )  # [batch, d_instance * (2 + n_branches)]

        # Fuse
        bag_embedding = self.fusion(concatenated)

        # Compute attention health metrics
        attention_info = {}
        if return_attention:
            # Stack all attention weights
            all_attn = torch.stack(
                branch_attns, dim=1
            )  # [batch, n_branches, n_instances]

            # Average attention across branches
            avg_attn = all_attn.mean(dim=1)  # [batch, n_instances]

            # Attention entropy (higher = more distributed)
            entropy = -(avg_attn * (avg_attn + 1e-8).log()).sum(dim=-1)

            # Effective number of instances (inverse of concentration)
            effective_n = 1.0 / (avg_attn**2).sum(dim=-1)

            # Top-5% mass
            k = max(1, int(instances.size(1) * 0.05))
            top5_mass = avg_attn.topk(k, dim=-1).values.sum(dim=-1)

            attention_info = {
                "branch_attentions": all_attn,  # [batch, n_branches, n_instances]
                "average_attention": avg_attn,  # [batch, n_instances]
                "topk_mask": topk_mask,  # [batch, n_instances]
                "entropy": entropy,  # [batch]
                "effective_n": effective_n,  # [batch]
                "top5_mass": top5_mass,  # [batch]
            }

        return bag_embedding, attention_info


class GatedMILAggregator(nn.Module):
    """
    Alternative MIL aggregator using gated attention.

    Allows instance embeddings to modulate attention via gating,
    which can capture more nuanced importance patterns.
    """

    def __init__(
        self,
        d_instance: int = 256,
        d_hidden: int = 128,
        dropout: float = 0.1,
    ):
        super().__init__()

        self.attention_v = nn.Sequential(
            nn.Linear(d_instance, d_hidden),
            nn.Tanh(),
        )

        self.attention_u = nn.Sequential(
            nn.Linear(d_instance, d_hidden),
            nn.Sigmoid(),
        )

        self.attention_w = nn.Linear(d_hidden, 1)

        self.output_proj = nn.Sequential(
            nn.Linear(d_instance, d_instance * 2),
            nn.LayerNorm(d_instance * 2),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.output_dim = d_instance * 2

    def forward(
        self,
        instances: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        return_attention: bool = True,
    ) -> tuple[torch.Tensor, dict]:
        """
        Args:
            instances: [batch, n_instances, d_instance]
            mask: [batch, n_instances]

        Returns:
            bag_embedding: [batch, output_dim]
            attention_info: Dict with attention weights
        """
        # Gated attention
        v = self.attention_v(instances)  # [batch, n_instances, d_hidden]
        u = self.attention_u(instances)  # [batch, n_instances, d_hidden]

        scores = self.attention_w(v * u).squeeze(-1)  # [batch, n_instances]

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))

        attn_weights = F.softmax(scores, dim=-1)

        if mask is not None:
            attn_weights = attn_weights.masked_fill(mask == 0, 0.0)

        # Weighted sum
        pooled = (instances * attn_weights.unsqueeze(-1)).sum(dim=1)

        # Project to output
        bag_embedding = self.output_proj(pooled)

        attention_info = {"attention": attn_weights} if return_attention else {}

        return bag_embedding, attention_info
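To make MILAggregator's tensor contract concrete, here is a short shape check on random inputs; a sketch, assuming the package layout of this upload. Eval mode keeps the stochastic top-k masking inactive, and the fused bag embedding comes out at d_instance * 2:

import torch

from TaikoChartEstimator.model.aggregator import MILAggregator

agg = MILAggregator(d_instance=256, n_branches=3).eval()  # no stochastic masking

instances = torch.randn(4, 50, 256)  # [batch, n_instances, d_instance]
mask = torch.ones(4, 50)             # float mask: 1 = valid, 0 = padding
mask[0, 30:] = 0.0                   # first bag has only 30 valid instances

bag, info = agg(instances, mask)
assert bag.shape == (4, 512)                       # output_dim == d_instance * 2
assert info["average_attention"].shape == (4, 50)  # one weight per instance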
TaikoChartEstimator/model/encoder.py
ADDED
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
Instance Encoder for Taiko Chart MIL

Encodes a sequence of event tokens into a fixed-size vector representation.
Uses Transformer encoder for capturing rhythm patterns and dependencies.
"""

import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for sequences."""

    def __init__(self, d_model: int, max_len: int = 512, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create positional encoding matrix
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer("pe", pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor of shape [batch, seq_len, d_model]
        """
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)
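
# Illustrative shape note (assumes the defaults above): `pe` is buffered as
# [1, max_len, d_model], and forward() broadcasts its first seq_len rows over
# the batch dimension, e.g.
#
#   pe = PositionalEncoding(d_model=256)
#   out = pe(torch.randn(8, 100, 256))   # -> [8, 100, 256]
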
class ContinuousFeatureEncoder(nn.Module):
    """
    Encodes continuous features (BPM, scroll, beat_pos, duration) to d_model dimension.
    Uses learned linear projections with optional normalization.
    """

    def __init__(
        self,
        n_continuous: int = 5,  # beat_pos, duration, bpm, scroll, gogo
        d_model: int = 256,
        use_layernorm: bool = True,
    ):
        super().__init__()
        self.projection = nn.Linear(n_continuous, d_model)
        self.layernorm = nn.LayerNorm(d_model) if use_layernorm else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Continuous features [batch, seq_len, n_continuous]
        """
        return self.layernorm(self.projection(x))


class InstanceEncoder(nn.Module):
    """
    Encodes a sequence of event tokens to a fixed-size vector.

    Input: Token sequence [batch, seq_len, 6]
        - Column 0: note_type (discrete, 0-9)
        - Column 1: beat_position (continuous, 0-1)
        - Column 2: duration (continuous, normalized)
        - Column 3: bpm (continuous, normalized)
        - Column 4: scroll (continuous, normalized)
        - Column 5: gogo (binary, 0/1)

    Output: Instance embedding [batch, d_model]
    """

    def __init__(
        self,
        d_model: int = 256,
        n_heads: int = 4,
        n_layers: int = 4,
        d_feedforward: int = 512,
        dropout: float = 0.1,
        n_note_types: int = 10,  # 9 types + padding
        max_seq_len: int = 128,
        pooling: str = "cls",  # "cls", "mean", or "max"
    ):
        """
        Initialize instance encoder.

        Args:
            d_model: Model dimension
            n_heads: Number of attention heads
            n_layers: Number of transformer layers
            d_feedforward: Feedforward dimension
            dropout: Dropout rate
            n_note_types: Number of note type categories
            max_seq_len: Maximum sequence length
            pooling: Pooling strategy for sequence to vector
        """
        super().__init__()

        self.d_model = d_model
        self.pooling = pooling

        # Discrete feature embedding (note type)
        self.type_embedding = nn.Embedding(n_note_types, d_model, padding_idx=9)

        # Continuous feature encoder
        self.continuous_encoder = ContinuousFeatureEncoder(
            n_continuous=5,  # beat_pos, duration, bpm, scroll, gogo
            d_model=d_model,
        )

        # Feature fusion
        self.fusion = nn.Linear(d_model * 2, d_model)
        self.fusion_norm = nn.LayerNorm(d_model)

        # Positional encoding (max_len+1 to accommodate CLS token)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len + 1, dropout)

        # CLS token for pooling
        if pooling == "cls":
            self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_feedforward,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,  # Pre-LN for stability
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=n_layers,
        )

        # Output projection
        self.output_norm = nn.LayerNorm(d_model)

    def forward(
        self,
        tokens: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Encode token sequence to vector.

        Args:
            tokens: Token tensor [batch, seq_len, 6]
            mask: Attention mask [batch, seq_len], 1 for valid, 0 for padding

        Returns:
            Instance embedding [batch, d_model]
        """
        batch_size, seq_len, _ = tokens.shape

        # Split discrete and continuous features
        note_types = tokens[:, :, 0].long()  # [batch, seq_len]
        continuous_feats = tokens[:, :, 1:]  # [batch, seq_len, 5]

        # Embed discrete features
        type_emb = self.type_embedding(note_types)  # [batch, seq_len, d_model]

        # Encode continuous features
        cont_emb = self.continuous_encoder(
            continuous_feats
        )  # [batch, seq_len, d_model]

        # Fuse embeddings
        fused = self.fusion(torch.cat([type_emb, cont_emb], dim=-1))
        fused = self.fusion_norm(fused)  # [batch, seq_len, d_model]

        # Add CLS token if using CLS pooling
        if self.pooling == "cls":
            cls_tokens = self.cls_token.expand(batch_size, -1, -1)
            fused = torch.cat([cls_tokens, fused], dim=1)  # [batch, 1+seq_len, d_model]

            # Extend mask for CLS token
            if mask is not None:
                cls_mask = torch.ones(
                    batch_size, 1, device=mask.device, dtype=mask.dtype
                )
                mask = torch.cat([cls_mask, mask], dim=1)

        # Add positional encoding
        fused = self.pos_encoder(fused)

        # Create attention mask for transformer (True = ignore)
        if mask is not None:
            attn_mask = mask == 0  # Invert: 0 -> True (ignore)
        else:
            attn_mask = None

        # Apply transformer
        encoded = self.transformer(fused, src_key_padding_mask=attn_mask)

        # Pool to vector
        if self.pooling == "cls":
            output = encoded[:, 0]  # CLS token
        elif self.pooling == "mean":
            if mask is not None:
                # Masked mean (exclude padding)
                mask_expanded = mask.unsqueeze(-1)  # [batch, seq_len, 1]
                output = (encoded * mask_expanded).sum(dim=1) / mask_expanded.sum(
                    dim=1
                ).clamp(min=1)
            else:
                output = encoded.mean(dim=1)
        elif self.pooling == "max":
            if mask is not None:
                # Masked max (set padding to -inf)
                mask_expanded = mask.unsqueeze(-1)
                encoded = encoded.masked_fill(mask_expanded == 0, float("-inf"))
            output = encoded.max(dim=1).values
        else:
            raise ValueError(f"Unknown pooling method: {self.pooling}")

        return self.output_norm(output)
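
# Illustrative call (assumed shapes): column 0 of each token row holds the
# note-type id, columns 1-5 the continuous features described in the class
# docstring.
#
#   enc = InstanceEncoder(d_model=256, pooling="cls")
#   tokens = torch.zeros(4, 128, 6)   # [batch, seq_len, 6]
#   mask = torch.ones(4, 128)         # 1 = valid token
#   emb = enc(tokens, mask)           # -> [4, 256]
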
class TCNBlock(nn.Module):
    """Temporal Convolutional Network block with residual connection."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        dilation: int = 1,
        dropout: float = 0.1,
    ):
        super().__init__()

        padding = (kernel_size - 1) * dilation // 2

        self.conv1 = nn.Conv1d(
            in_channels, out_channels, kernel_size, padding=padding, dilation=dilation
        )
        self.conv2 = nn.Conv1d(
            out_channels, out_channels, kernel_size, padding=padding, dilation=dilation
        )

        self.norm1 = nn.BatchNorm1d(out_channels)
        self.norm2 = nn.BatchNorm1d(out_channels)

        self.dropout = nn.Dropout(dropout)

        # Residual connection
        self.residual = (
            nn.Conv1d(in_channels, out_channels, 1)
            if in_channels != out_channels
            else nn.Identity()
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch, channels, seq_len]
        """
        residual = self.residual(x)

        out = F.gelu(self.norm1(self.conv1(x)))
        out = self.dropout(out)
        out = F.gelu(self.norm2(self.conv2(out)))
        out = self.dropout(out)

        return out + residual
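
# Illustrative receptive-field note (assumes kernel_size=3 and the 2**i
# dilations used below): each TCNBlock applies two convolutions, each widening
# the receptive field by (kernel_size - 1) * dilation, so four blocks with
# dilations 1, 2, 4, 8 see 1 + 2 * 2 * (1 + 2 + 4 + 8) = 61 timesteps.
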
class TCNInstanceEncoder(nn.Module):
    """
    Alternative instance encoder using Temporal Convolutional Network.
    Faster than Transformer with stronger local inductive bias.
    """

    def __init__(
        self,
        d_model: int = 256,
        n_layers: int = 4,
        kernel_size: int = 3,
        dropout: float = 0.1,
        n_note_types: int = 10,
    ):
        super().__init__()

        self.d_model = d_model

        # Input projection
        self.type_embedding = nn.Embedding(n_note_types, d_model // 2, padding_idx=9)
        self.continuous_proj = nn.Linear(5, d_model // 2)

        # TCN layers with exponentially increasing dilation
        self.tcn_layers = nn.ModuleList(
            [
                TCNBlock(d_model, d_model, kernel_size, dilation=2**i, dropout=dropout)
                for i in range(n_layers)
            ]
        )

        self.output_norm = nn.LayerNorm(d_model)

    def forward(
        self,
        tokens: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            tokens: [batch, seq_len, 6]
            mask: [batch, seq_len]

        Returns:
            [batch, d_model]
        """
        # Embed inputs
        note_types = tokens[:, :, 0].long()
        continuous = tokens[:, :, 1:]

        type_emb = self.type_embedding(note_types)
        cont_emb = self.continuous_proj(continuous)

        x = torch.cat([type_emb, cont_emb], dim=-1)  # [batch, seq_len, d_model]

        # Convert to channels-first for conv
        x = x.transpose(1, 2)  # [batch, d_model, seq_len]

        # Apply TCN layers
        for layer in self.tcn_layers:
            x = layer(x)

        # Global average pooling
        if mask is not None:
            mask_expanded = mask.unsqueeze(1)  # [batch, 1, seq_len]
            x = (x * mask_expanded).sum(dim=-1) / mask_expanded.sum(dim=-1).clamp(min=1)
        else:
            x = x.mean(dim=-1)

        return self.output_norm(x)
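
Both encoders expose the same (tokens, mask) -> [batch, d_model] interface, so model.py below can switch between them via config alone. A minimal sketch (illustrative shapes only):

import torch

tokens = torch.zeros(4, 128, 6)   # [batch, seq_len, 6]
mask = torch.ones(4, 128)

emb_tf = InstanceEncoder(d_model=256)(tokens, mask)      # -> [4, 256]
emb_tcn = TCNInstanceEncoder(d_model=256)(tokens, mask)  # -> [4, 256]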
TaikoChartEstimator/model/heads.py
ADDED
@@ -0,0 +1,398 @@
"""
Output Heads for Taiko Chart Estimation

Three heads for multi-task learning:
- Head A: Raw difficulty score (unbounded)
- Head B: Difficulty classification (4-5 classes)
- Head C: Monotonic star calibration
"""

from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class RawScoreHead(nn.Module):
    """
    Head A: Unbounded raw difficulty score.

    Outputs s ∈ ℝ, the "true" continuous difficulty scale
    before mapping to display star ratings.
    """

    def __init__(
        self,
        d_input: int = 512,
        d_hidden: int = 128,
        dropout: float = 0.1,
    ):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Linear(d_input, d_hidden),
            nn.LayerNorm(d_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_hidden, d_hidden // 2),
            nn.GELU(),
            nn.Linear(d_hidden // 2, 1),
        )

        # Initialize to output reasonable range (~1-10)
        self._init_weights()

    def _init_weights(self):
        """Initialize to output values centered around 5."""
        with torch.no_grad():
            # Bias the final layer to output ~5
            self.mlp[-1].bias.fill_(5.0)
            self.mlp[-1].weight.fill_(0.01)

    def forward(self, bag_embedding: torch.Tensor) -> torch.Tensor:
        """
        Args:
            bag_embedding: [batch, d_input]

        Returns:
            raw_score: [batch] unbounded difficulty score
        """
        return self.mlp(bag_embedding).squeeze(-1)
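
# Illustrative effect of _init_weights above: with the final bias at 5.0 and
# the final weights at 0.01, a freshly constructed head emits scores near the
# middle of the 1-10 scale, e.g. (assumed shapes):
#
#   head = RawScoreHead(d_input=512)
#   head(torch.randn(3, 512))   # -> values close to 5.0 before training
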
class DifficultyClassifier(nn.Module):
    """
    Head B: Difficulty classification.

    Predicts difficulty class: easy, normal, hard, oni, ura (5 classes)
    or merged oni_ura (4 classes).
    """

    def __init__(
        self,
        d_input: int = 512,
        n_classes: int = 5,
        d_hidden: int = 128,
        dropout: float = 0.1,
    ):
        super().__init__()

        self.n_classes = n_classes

        self.mlp = nn.Sequential(
            nn.Linear(d_input, d_hidden),
            nn.LayerNorm(d_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_hidden, n_classes),
        )

    def forward(self, bag_embedding: torch.Tensor) -> torch.Tensor:
        """
        Args:
            bag_embedding: [batch, d_input]

        Returns:
            logits: [batch, n_classes] classification logits
        """
        return self.mlp(bag_embedding)

    def predict(self, bag_embedding: torch.Tensor) -> torch.Tensor:
        """Get predicted class indices."""
        logits = self.forward(bag_embedding)
        return logits.argmax(dim=-1)


class MonotonicSpline(nn.Module):
    """
    Monotonic spline for mapping raw score to star rating.

    Uses I-splines (integrated B-splines) to guarantee monotonicity.
    Learnable coefficients are constrained to be positive.
    """

    def __init__(
        self,
        n_knots: int = 8,
        input_range: tuple[float, float] = (0, 15),
        output_range: tuple[float, float] = (1, 10),
    ):
        super().__init__()

        self.n_knots = n_knots
        self.input_range = input_range
        self.output_range = output_range

        # Knot positions (fixed)
        knots = torch.linspace(input_range[0], input_range[1], n_knots)
        self.register_buffer("knots", knots)

        # Learnable positive coefficients (using softplus for positivity)
        self.raw_coefficients = nn.Parameter(torch.ones(n_knots))

        # Learnable offset
        self.offset = nn.Parameter(torch.tensor(float(output_range[0])))

    def _compute_basis(self, x: torch.Tensor) -> torch.Tensor:
        """Compute I-spline basis functions with clamping for stability."""
        # Clamp input to reasonable range to prevent output explosion
        x_clamped = x.clamp(self.input_range[0], self.input_range[1])
        x_clamped = x_clamped.unsqueeze(-1)  # [batch, 1]
        knots = self.knots.unsqueeze(0)  # [1, n_knots]

        # Compute distance to each knot
        diff = x_clamped - knots  # [batch, n_knots]

        # ReLU with cap to prevent unbounded growth
        # Cap at input_range width for reasonable behavior
        max_value = self.input_range[1] - self.input_range[0]
        basis = F.relu(diff).clamp(max=max_value)  # [batch, n_knots]

        return basis

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Map raw score to star rating (monotonically).

        Args:
            x: Raw scores [batch]

        Returns:
            Star ratings [batch]
        """
        # Ensure positive coefficients
        coefficients = F.softplus(self.raw_coefficients)

        # Normalize coefficients to control output scale
        coefficients = coefficients / coefficients.sum()
        scale = self.output_range[1] - self.output_range[0]
        coefficients = coefficients * scale

        # Compute basis
        basis = self._compute_basis(x)  # [batch, n_knots]

        # Weighted sum
        output = (basis * coefficients).sum(dim=-1) + self.offset

        return output
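
# Why the spline above is monotone (illustrative note): each basis function
# b_j(x) = min(max(x - t_j, 0), W) is non-decreasing in x, and softplus keeps
# every coefficient c_j > 0, so f(x) = sum_j c_j * b_j(x) + offset is a sum of
# non-decreasing functions and therefore non-decreasing itself.
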
class MonotonicMLP(nn.Module):
    """
    Monotonic MLP using positive weight constraints.

    Ensures f(x1) >= f(x2) whenever x1 >= x2 by constraining
    all weights to be positive and using monotonic activations.
    """

    def __init__(
        self,
        d_hidden: int = 64,
        n_layers: int = 3,
    ):
        super().__init__()

        layers = []
        in_dim = 1

        for i in range(n_layers):
            out_dim = d_hidden if i < n_layers - 1 else 1
            layers.append(nn.Linear(in_dim, out_dim))
            if i < n_layers - 1:
                layers.append(nn.Softplus())  # Monotonic activation
            in_dim = out_dim

        self.layers = nn.ModuleList(
            [layer for layer in layers if isinstance(layer, nn.Linear)]
        )
        self.activations = [nn.Softplus() for _ in range(n_layers - 1)] + [
            nn.Identity()
        ]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Raw scores [batch]

        Returns:
            Calibrated scores [batch]
        """
        out = x.unsqueeze(-1)  # [batch, 1]

        for layer, activation in zip(self.layers, self.activations):
            # Apply absolute value to weights for monotonicity
            weight = layer.weight.abs()
            out = F.linear(out, weight, layer.bias)
            out = activation(out)

        return out.squeeze(-1)


class MonotonicCalibrator(nn.Module):
    """
    Head C: Monotonic calibration from raw score to star rating.

    Maintains separate calibrators per difficulty level, since
    the star ranges differ (easy: 1-5, normal: 1-7, etc.)

    Guarantees:
    - Output is monotonically increasing with input
    - Can output values outside the nominal range (for decompression)
    """

    def __init__(
        self,
        method: str = "spline",  # "spline" or "mlp"
        n_difficulties: int = 5,
        star_ranges: Optional[dict] = None,
    ):
        """
        Args:
            method: Calibration method ("spline" or "mlp")
            n_difficulties: Number of difficulty classes
            star_ranges: Dict mapping difficulty index to (min, max) star range
        """
        super().__init__()

        self.method = method
        self.n_difficulties = n_difficulties

        # Default star ranges per difficulty
        if star_ranges is None:
            star_ranges = {
                0: (1, 5),  # easy
                1: (1, 7),  # normal
                2: (1, 8),  # hard
                3: (1, 10),  # oni
                4: (1, 10),  # ura
            }
        self.star_ranges = star_ranges

        # Create calibrators per difficulty
        if method == "spline":
            self.calibrators = nn.ModuleList(
                [
                    MonotonicSpline(
                        n_knots=8,
                        input_range=(0, 15),
                        output_range=star_ranges.get(i, (1, 10)),
                    )
                    for i in range(n_difficulties)
                ]
            )
        else:
            self.calibrators = nn.ModuleList(
                [MonotonicMLP(d_hidden=32, n_layers=3) for i in range(n_difficulties)]
            )

            # Add scaling parameters for MLP
            self.scales = nn.ParameterList(
                [
                    nn.Parameter(
                        torch.tensor(
                            float(
                                star_ranges.get(i, (1, 10))[1]
                                - star_ranges.get(i, (1, 10))[0]
                            )
                        )
                    )
                    for i in range(n_difficulties)
                ]
            )
            self.offsets = nn.ParameterList(
                [
                    nn.Parameter(torch.tensor(float(star_ranges.get(i, (1, 10))[0])))
                    for i in range(n_difficulties)
                ]
            )

    def forward(
        self,
        raw_score: torch.Tensor,
        difficulty: torch.Tensor,
    ) -> torch.Tensor:
        """
        Map raw scores to star ratings based on difficulty.

        Args:
            raw_score: [batch] raw difficulty scores
            difficulty: [batch] difficulty class indices

        Returns:
            star_rating: [batch] calibrated star ratings (can be < min or > max)
        """
        batch_size = raw_score.size(0)
        star_ratings = torch.zeros_like(raw_score)

        # Process each difficulty class
        for diff_idx in range(self.n_difficulties):
            mask = difficulty == diff_idx
            if mask.any():
                calibrator = self.calibrators[diff_idx]

                if self.method == "spline":
                    star_ratings[mask] = calibrator(raw_score[mask])
                else:
                    # MLP with scaling
                    normalized = calibrator(raw_score[mask])
                    star_ratings[mask] = (
                        normalized * self.scales[diff_idx] + self.offsets[diff_idx]
                    )

        return star_ratings

    def forward_all(
        self,
        raw_score: torch.Tensor,
    ) -> torch.Tensor:
        """
        Compute star ratings for all difficulties at once.

        Args:
            raw_score: [batch] raw scores

        Returns:
            star_ratings: [batch, n_difficulties] star per difficulty
        """
        batch_size = raw_score.size(0)
        all_stars = []

        for diff_idx in range(self.n_difficulties):
            calibrator = self.calibrators[diff_idx]

            if self.method == "spline":
                stars = calibrator(raw_score)
            else:
                normalized = calibrator(raw_score)
                stars = normalized * self.scales[diff_idx] + self.offsets[diff_idx]

            all_stars.append(stars)

        return torch.stack(all_stars, dim=-1)

    def clip_to_display(
        self,
        star_rating: torch.Tensor,
        difficulty: torch.Tensor,
    ) -> torch.Tensor:
        """
        Clip star ratings to display range for UI.

        Args:
            star_rating: [batch] raw star ratings (can be outside range)
            difficulty: [batch] difficulty indices

        Returns:
            display_star: [batch] clipped to valid range per difficulty
        """
        display_star = star_rating.clone()

        for diff_idx in range(self.n_difficulties):
            mask = difficulty == diff_idx
            if mask.any():
                min_star, max_star = self.star_ranges[diff_idx]
                display_star[mask] = display_star[mask].clamp(min_star, max_star)

        return display_star
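
A minimal sketch chaining the three heads defined above (illustrative; the 512-dim bag embedding and 5-class setup follow the defaults in this file):

import torch

bag = torch.randn(4, 512)                        # bag embeddings from the aggregator
score = RawScoreHead(d_input=512)(bag)           # [4] unbounded raw scores
logits = DifficultyClassifier(d_input=512)(bag)  # [4, 5] class logits
diff = logits.argmax(dim=-1)                     # predicted difficulty ids
stars = MonotonicCalibrator(method="spline")(score, diff)  # [4] star ratings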
TaikoChartEstimator/model/losses.py
ADDED
@@ -0,0 +1,431 @@
"""
Loss Functions for Taiko Chart Estimation

Implements:
- Within-song ranking loss (monotonicity constraint)
- Censored regression loss (handles star boundary labels)
- Multi-task loss combiner with curriculum scheduling
"""

from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..constants import STAR_RANGES_BY_ID as STAR_RANGES


class WithinSongRankingLoss(nn.Module):
    """
    Ranking loss for enforcing within-song monotonicity.

    For charts from the same song, harder difficulties must have
    higher raw scores: s_harder > s_easier.

    Uses hinge loss: L = max(0, margin - (s_harder - s_easier))
    """

    def __init__(self, margin: float = 0.5):
        """
        Args:
            margin: Minimum required difference between difficulty levels
        """
        super().__init__()
        self.margin = margin

    def forward(
        self,
        s_easier: torch.Tensor,
        s_harder: torch.Tensor,
    ) -> torch.Tensor:
        """
        Compute ranking loss for pairs.

        Args:
            s_easier: [n_pairs] scores for easier charts
            s_harder: [n_pairs] scores for harder charts

        Returns:
            Scalar loss value
        """
        if s_easier.numel() == 0:
            return torch.tensor(0.0, device=s_easier.device)

        # Hinge loss
        violations = F.relu(self.margin - (s_harder - s_easier))

        return violations.mean()

    def compute_violation_rate(
        self,
        s_easier: torch.Tensor,
        s_harder: torch.Tensor,
    ) -> float:
        """Compute fraction of pairs that violate monotonicity."""
        if s_easier.numel() == 0:
            return 0.0

        violations = (s_easier >= s_harder).float()
        return violations.mean().item()
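
# Worked example (illustrative): with margin=0.5, a pair scored
# (s_easier=4.0, s_harder=4.2) incurs max(0, 0.5 - 0.2) = 0.3, while
# (s_easier=4.0, s_harder=5.0) incurs 0 -- only gaps below the margin are
# penalized.
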
class CensoredRegressionLoss(nn.Module):
    """
    Censored regression loss for star ratings.

    Handles the fact that boundary labels (1, 10) are censored:
    - label == max_star: true value is >= max_star (right-censored)
    - label == min_star: true value is <= min_star (left-censored)

    For censored samples, we only penalize predictions that
    violate the bound, not predictions that exceed it.
    """

    def __init__(
        self,
        uncensored_loss: str = "huber",  # "huber", "mse", "mae"
        huber_delta: float = 0.5,
        star_ranges: Optional[dict] = None,
    ):
        """
        Args:
            uncensored_loss: Loss type for uncensored samples
            huber_delta: Delta for Huber loss
            star_ranges: Dict mapping difficulty index to (min, max) range
        """
        super().__init__()

        self.uncensored_loss = uncensored_loss
        self.huber_delta = huber_delta
        self.star_ranges = star_ranges if star_ranges is not None else STAR_RANGES

    def _uncensored_loss(
        self,
        pred: torch.Tensor,
        target: torch.Tensor,
    ) -> torch.Tensor:
        """Compute loss for uncensored samples."""
        if self.uncensored_loss == "huber":
            return F.huber_loss(pred, target, delta=self.huber_delta, reduction="none")
        elif self.uncensored_loss == "mse":
            return F.mse_loss(pred, target, reduction="none")
        elif self.uncensored_loss == "mae":
            return F.l1_loss(pred, target, reduction="none")
        else:
            raise ValueError(f"Unknown loss type: {self.uncensored_loss}")

    def forward(
        self,
        pred_star: torch.Tensor,
        target_star: torch.Tensor,
        difficulty: torch.Tensor,
        is_right_censored: Optional[torch.Tensor] = None,
        is_left_censored: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Compute censored regression loss.

        Args:
            pred_star: [batch] predicted star ratings
            target_star: [batch] target star labels
            difficulty: [batch] difficulty class indices
            is_right_censored: [batch] bool, True if label is at max (right-censored)
            is_left_censored: [batch] bool, True if label is at min (left-censored)

        Returns:
            Scalar loss value
        """
        batch_size = pred_star.size(0)

        # Auto-detect censoring if not provided
        if is_right_censored is None or is_left_censored is None:
            is_right_censored = torch.zeros(
                batch_size, dtype=torch.bool, device=pred_star.device
            )
            is_left_censored = torch.zeros(
                batch_size, dtype=torch.bool, device=pred_star.device
            )

            for diff_idx, (min_star, max_star) in self.star_ranges.items():
                mask = difficulty == diff_idx
                is_right_censored[mask] = target_star[mask] >= max_star
                is_left_censored[mask] = target_star[mask] <= min_star

        # Compute losses per sample
        losses = torch.zeros_like(pred_star)

        # Right-censored: only penalize if pred < target
        right_mask = is_right_censored
        if right_mask.any():
            shortfall = F.relu(target_star[right_mask] - pred_star[right_mask])
            losses[right_mask] = shortfall

        # Left-censored: only penalize if pred > target
        left_mask = is_left_censored
        if left_mask.any():
            overshoot = F.relu(pred_star[left_mask] - target_star[left_mask])
            losses[left_mask] = overshoot

        # Uncensored: standard loss
        uncensored_mask = ~(is_right_censored | is_left_censored)
        if uncensored_mask.any():
            losses[uncensored_mask] = self._uncensored_loss(
                pred_star[uncensored_mask],
                target_star[uncensored_mask],
            )

        return losses.mean()

    def compute_censoring_metrics(
        self,
        pred_star: torch.Tensor,
        target_star: torch.Tensor,
        difficulty: torch.Tensor,
    ) -> dict:
        """
        Compute censoring-related metrics.

        Returns:
            Dict with violation rates and shortfall/overshoot stats
        """
        metrics = {}

        for diff_idx, (min_star, max_star) in self.star_ranges.items():
            mask = difficulty == diff_idx
            if not mask.any():
                continue

            preds = pred_star[mask]
            targets = target_star[mask]

            # Right-censored samples (at max)
            right_mask = targets >= max_star
            if right_mask.any():
                right_preds = preds[right_mask]
                violation_rate = (right_preds < max_star).float().mean().item()
                mean_shortfall = F.relu(max_star - right_preds).mean().item()

                metrics[f"right_violation_rate_{diff_idx}"] = violation_rate
                metrics[f"mean_shortfall_{diff_idx}"] = mean_shortfall

            # Left-censored samples (at min)
            left_mask = targets <= min_star
            if left_mask.any():
                left_preds = preds[left_mask]
                violation_rate = (left_preds > min_star).float().mean().item()
                mean_overshoot = F.relu(left_preds - min_star).mean().item()

                metrics[f"left_violation_rate_{diff_idx}"] = violation_rate
                metrics[f"mean_overshoot_{diff_idx}"] = mean_overshoot

        return metrics
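
# Worked example (illustrative): an oni chart labeled 10 stars is
# right-censored, so predicting 11.3 costs nothing (the true difficulty may
# exceed 10), while predicting 9.2 costs relu(10 - 9.2) = 0.8.
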
class TotalLoss(nn.Module):
    """
    Multi-task loss combiner for difficulty estimation.

    Combines:
    - Classification loss (difficulty prediction)
    - Censored star regression loss
    - Within-song ranking loss (monotonicity)

    Supports curriculum learning with schedulable weights.
    Note: When merge_ura_oni=True, ura (4) and oni (3) are treated as the same class.
    """

    def __init__(
        self,
        lambda_cls: float = 1.0,
        lambda_star: float = 1.0,
        lambda_rank: float = 1.0,
        class_weights: Optional[torch.Tensor] = None,
        ranking_margin: float = 0.5,
        star_loss_type: str = "huber",
        merge_ura_oni: bool = True,
    ):
        """
        Args:
            lambda_cls: Weight for classification loss
            lambda_star: Weight for star regression loss
            lambda_rank: Weight for ranking loss
            class_weights: Optional class weights for classification
            ranking_margin: Margin for ranking hinge loss
            star_loss_type: Loss type for star regression
            merge_ura_oni: If True, treat ura (4) as oni (3) for classification
        """
        super().__init__()

        self.lambda_cls = lambda_cls
        self.lambda_star = lambda_star
        self.lambda_rank = lambda_rank
        self.merge_ura_oni = merge_ura_oni

        # Classification loss
        self.cls_loss = nn.CrossEntropyLoss(weight=class_weights)

        # Star regression loss
        self.star_loss = CensoredRegressionLoss(uncensored_loss=star_loss_type)

        # Ranking loss
        self.rank_loss = WithinSongRankingLoss(margin=ranking_margin)

    def set_weights(
        self,
        lambda_cls: Optional[float] = None,
        lambda_star: Optional[float] = None,
        lambda_rank: Optional[float] = None,
    ):
        """Update loss weights (for curriculum learning)."""
        if lambda_cls is not None:
            self.lambda_cls = lambda_cls
        if lambda_star is not None:
            self.lambda_star = lambda_star
        if lambda_rank is not None:
            self.lambda_rank = lambda_rank

    def forward(
        self,
        difficulty_logits: torch.Tensor,
        pred_star: torch.Tensor,
        target_difficulty: torch.Tensor,
        target_star: torch.Tensor,
        is_right_censored: Optional[torch.Tensor] = None,
        is_left_censored: Optional[torch.Tensor] = None,
        ranking_pairs: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> dict[str, torch.Tensor]:
        """
        Compute total loss with breakdown.

        Args:
            difficulty_logits: [batch, n_classes] difficulty predictions
            pred_star: [batch] predicted star ratings
            target_difficulty: [batch] target difficulty classes
            target_star: [batch] target star labels
            is_right_censored: [batch] right-censoring flags
            is_left_censored: [batch] left-censoring flags
            ranking_pairs: Optional (s_easier, s_harder) for ranking loss

        Returns:
            Dict with total loss and breakdown:
            - "total": Combined weighted loss
            - "cls": Classification loss
            - "star": Star regression loss
            - "rank": Ranking loss (if pairs provided)
        """
        losses = {}

        # Classification loss
        # Merge ura (4) and oni (3) if enabled
        if self.merge_ura_oni:
            # Merge target: map ura (class 4) to oni (class 3)
            target_difficulty_merged = target_difficulty.clone()
            target_difficulty_merged[target_difficulty_merged == 4] = 3

            # Correct merging: use logsumexp in log-probability space
            # This correctly computes P(oni OR ura) = P(oni) + P(ura)
            log_probs = F.log_softmax(difficulty_logits, dim=-1)  # [batch, 5]
            log_probs_merged = log_probs[:, :4].clone()  # [batch, 4]
            # logsumexp(log P(oni), log P(ura)) = log(P(oni) + P(ura))
            log_probs_merged[:, 3] = torch.logsumexp(log_probs[:, 3:5], dim=-1)

            cls_loss = F.nll_loss(
                log_probs_merged,
                target_difficulty_merged,
                weight=self.cls_loss.weight,
            )
        else:
            cls_loss = self.cls_loss(difficulty_logits, target_difficulty)
        losses["cls"] = cls_loss

        # Star regression loss
        star_loss = self.star_loss(
            pred_star,
            target_star,
            target_difficulty,
            is_right_censored,
            is_left_censored,
        )
        losses["star"] = star_loss

        # Ranking loss (if pairs provided)
        if ranking_pairs is not None:
            s_easier, s_harder = ranking_pairs
            rank_loss = self.rank_loss(s_easier, s_harder)
            losses["rank"] = rank_loss
        else:
            rank_loss = torch.tensor(0.0, device=pred_star.device)
            losses["rank"] = rank_loss

        # Combine with weights
        total = (
            self.lambda_cls * cls_loss
            + self.lambda_star * star_loss
            + self.lambda_rank * rank_loss
        )
        losses["total"] = total

        return losses
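
# Illustrative check of the logsumexp merge above: for one sample with softmax
# probabilities [0.1, 0.1, 0.2, 0.3, 0.3] over (easy, normal, hard, oni, ura),
# the merged distribution is [0.1, 0.1, 0.2, 0.6], because
# logsumexp(log 0.3, log 0.3) = log 0.6.
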
class CurriculumScheduler:
    """
    Scheduler for curriculum learning of loss weights.

    Early training: focus on classification (coarse alignment)
    Later training: increase ranking + star loss (fine-grained)
    """

    def __init__(
        self,
        total_steps: int,
        warmup_fraction: float = 0.2,
        cls_start: float = 2.0,
        cls_end: float = 0.5,
        rank_start: float = 0.1,
        rank_end: float = 1.5,
        star_start: float = 0.5,
        star_end: float = 1.5,
    ):
        """
        Args:
            total_steps: Total training steps
            warmup_fraction: Fraction of training for warmup
            *_start/*_end: Start and end values for each loss weight
        """
        self.total_steps = total_steps
        self.warmup_steps = int(total_steps * warmup_fraction)

        self.cls_start = cls_start
        self.cls_end = cls_end
        self.rank_start = rank_start
        self.rank_end = rank_end
        self.star_start = star_start
        self.star_end = star_end

    def get_weights(self, step: int) -> dict[str, float]:
        """
        Get loss weights for current step.

        Returns:
            Dict with lambda_cls, lambda_star, lambda_rank
        """
        if step < self.warmup_steps:
            # During warmup: interpolate from start to mid
            t = 0.5 * step / self.warmup_steps
        else:
            # After warmup: continue from mid to end (keeps t continuous
            # at the warmup boundary)
            t = 0.5 + 0.5 * (step - self.warmup_steps) / (
                self.total_steps - self.warmup_steps
            )
            t = min(1.0, t)  # Clamp at 1

        # Linear interpolation
        lambda_cls = self.cls_start + t * (self.cls_end - self.cls_start)
        lambda_rank = self.rank_start + t * (self.rank_end - self.rank_start)
        lambda_star = self.star_start + t * (self.star_end - self.star_start)

        return {
            "lambda_cls": lambda_cls,
            "lambda_star": lambda_star,
            "lambda_rank": lambda_rank,
        }
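
A short sketch of how the scheduler and combiner above are meant to interact in a training loop (illustrative wiring; the real loop presumably lives in train/__main__.py):

scheduler = CurriculumScheduler(total_steps=10_000)
criterion = TotalLoss()

for step in range(10_000):
    criterion.set_weights(**scheduler.get_weights(step))
    # forward pass elsewhere, then:
    # losses = criterion(difficulty_logits, pred_star, target_difficulty,
    #                    target_star, ranking_pairs=pairs)
    # losses["total"].backward()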
TaikoChartEstimator/model/model.py
ADDED
@@ -0,0 +1,374 @@
"""
Main TaikoChartEstimator Model

Combines instance encoder, MIL aggregator, and output heads
into a unified model for difficulty estimation.
"""

from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

from ..data.tokenizer import DIFFICULTY_ORDER
from .aggregator import GatedMILAggregator, MILAggregator
from .encoder import InstanceEncoder, TCNInstanceEncoder
from .heads import DifficultyClassifier, MonotonicCalibrator, RawScoreHead


@dataclass
class ModelConfig:
    """Configuration for TaikoChartEstimator."""

    # Instance encoder config
    encoder_type: str = "transformer"  # "transformer" or "tcn"
    d_model: int = 256
    n_encoder_layers: int = 4
    n_heads: int = 4
    d_feedforward: int = 512
    encoder_dropout: float = 0.1
    max_seq_len: int = 128
    encoder_pooling: str = "cls"

    # MIL aggregator config
    aggregator_type: str = "multibranch"  # "multibranch" or "gated"
    n_attention_branches: int = 3
    top_k_ratio: float = 0.1
    stochastic_mask_prob: float = 0.3
    aggregator_dropout: float = 0.1

    # Head config
    n_difficulty_classes: int = 5  # easy, normal, hard, oni, ura
    head_hidden_dim: int = 128
    head_dropout: float = 0.1
    calibrator_method: str = "spline"  # "spline" or "mlp"

    # Star ranges per difficulty
    star_ranges: dict = None

    def __post_init__(self):
        if self.star_ranges is None:
            self.star_ranges = {
                0: (1, 5),  # easy
                1: (1, 7),  # normal
                2: (1, 8),  # hard
                3: (1, 10),  # oni
                4: (1, 10),  # ura
            }
        else:
            # Fix JSON serialization issue: keys become strings, values become lists
            # Convert back to int keys and tuple values
            self.star_ranges = {
                int(k): tuple(v) if isinstance(v, list) else v
                for k, v in self.star_ranges.items()
            }
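
# Illustrative round-trip motivating the else branch above: JSON serialization
# turns {3: (1, 10)} into {"3": [1, 10]}, so configs reloaded from a hub
# checkpoint arrive with string keys and list values; __post_init__ coerces
# them back, e.g. ModelConfig(star_ranges={"3": [1, 10]}).star_ranges
# == {3: (1, 10)}.
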
@dataclass
class ModelOutput:
    """Output from TaikoChartEstimator forward pass."""

    raw_score: torch.Tensor  # [batch] unbounded difficulty score
    difficulty_logits: torch.Tensor  # [batch, n_classes] difficulty logits
    raw_star: torch.Tensor  # [batch] star rating (can be < 1 or > 10)
    display_star: torch.Tensor  # [batch] star rating clipped to range
    attention_info: dict  # MIL attention weights and metrics
    instance_embeddings: torch.Tensor  # [batch, n_instances, d_model] for analysis


class TaikoChartEstimator(nn.Module, PyTorchModelHubMixin):
    """
    MIL-based Taiko chart difficulty estimation model.

    Takes a bag of chart instances (beat-aligned windows) and predicts:
    1. Raw difficulty score (unbounded, ℝ)
    2. Difficulty class (easy/normal/hard/oni/ura)
    3. Star rating (per difficulty, can exceed nominal range)

    Architecture:
    - Instance Encoder: Transformer or TCN to encode each window
    - MIL Aggregator: Multi-branch attention pooling
    - Output Heads: Raw score, classifier, monotonic calibrator
    """

    def __init__(self, config: Optional[ModelConfig] = None):
        """
        Initialize model.

        Args:
            config: Model configuration (uses defaults if None)
        """
        super().__init__()

        if config is None:
            config = ModelConfig()
        self.config = config

        # Build instance encoder
        if config.encoder_type == "transformer":
            self.instance_encoder = InstanceEncoder(
                d_model=config.d_model,
                n_heads=config.n_heads,
                n_layers=config.n_encoder_layers,
                d_feedforward=config.d_feedforward,
                dropout=config.encoder_dropout,
                max_seq_len=config.max_seq_len,
                pooling=config.encoder_pooling,
            )
        else:
            self.instance_encoder = TCNInstanceEncoder(
                d_model=config.d_model,
                n_layers=config.n_encoder_layers,
                dropout=config.encoder_dropout,
            )

        # Build MIL aggregator
        if config.aggregator_type == "multibranch":
            self.aggregator = MILAggregator(
                d_instance=config.d_model,
                n_branches=config.n_attention_branches,
                top_k_ratio=config.top_k_ratio,
                stochastic_mask_prob=config.stochastic_mask_prob,
                dropout=config.aggregator_dropout,
            )
        else:
            self.aggregator = GatedMILAggregator(
                d_instance=config.d_model,
                dropout=config.aggregator_dropout,
            )

        # Output heads
        bag_dim = self.aggregator.output_dim

        self.raw_score_head = RawScoreHead(
            d_input=bag_dim,
            d_hidden=config.head_hidden_dim,
            dropout=config.head_dropout,
        )

        self.difficulty_classifier = DifficultyClassifier(
            d_input=bag_dim,
            n_classes=config.n_difficulty_classes,
            d_hidden=config.head_hidden_dim,
            dropout=config.head_dropout,
        )

        self.calibrator = MonotonicCalibrator(
            method=config.calibrator_method,
            n_difficulties=config.n_difficulty_classes,
            star_ranges=config.star_ranges,
        )

    def encode_instances(
        self,
        instances: torch.Tensor,
        instance_masks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Encode all instances in a batch.

        Args:
            instances: [batch, n_instances, seq_len, 6] token sequences
            instance_masks: [batch, n_instances, seq_len] attention masks

        Returns:
            instance_embeddings: [batch, n_instances, d_model]
        """
        batch_size, n_instances, seq_len, n_features = instances.shape

        # Flatten batch and instances
        flat_instances = instances.view(batch_size * n_instances, seq_len, n_features)
        flat_masks = instance_masks.view(batch_size * n_instances, seq_len)

        # Encode
        flat_embeddings = self.instance_encoder(flat_instances, flat_masks)

        # Reshape back
        instance_embeddings = flat_embeddings.view(batch_size, n_instances, -1)

        return instance_embeddings

    def forward(
        self,
        instances: torch.Tensor,
        instance_masks: torch.Tensor,
        instance_counts: Optional[torch.Tensor] = None,
        difficulty_hint: Optional[torch.Tensor] = None,
        return_attention: bool = True,
    ) -> ModelOutput:
        """
        Forward pass through the model.

        Args:
            instances: [batch, n_instances, seq_len, 6] token sequences
            instance_masks: [batch, n_instances, seq_len] token masks
            instance_counts: [batch] number of valid instances per sample
            difficulty_hint: [batch] difficulty class for calibration (uses predicted if None)
            return_attention: Whether to return attention weights

        Returns:
            ModelOutput with all predictions
        """
        batch_size, n_instances, seq_len, _ = instances.shape

        # Create instance-level mask from counts
        if instance_counts is not None:
            bag_mask = torch.arange(n_instances, device=instances.device).unsqueeze(0)
            bag_mask = (bag_mask < instance_counts.unsqueeze(1)).float()
        else:
            # Infer from instance masks (if any token is valid, instance is valid)
            bag_mask = (instance_masks.sum(dim=-1) > 0).float()

        # Encode instances
        instance_embeddings = self.encode_instances(instances, instance_masks)

        # Aggregate to bag embedding
        bag_embedding, attention_info = self.aggregator(
            instance_embeddings,
            bag_mask,
            return_attention=return_attention,
        )

        # Raw score prediction (unbounded)
        raw_score = self.raw_score_head(bag_embedding)

        # Difficulty classification
        difficulty_logits = self.difficulty_classifier(bag_embedding)

        # Determine difficulty for calibration
        if difficulty_hint is not None:
            calibration_diff = difficulty_hint
        else:
            calibration_diff = difficulty_logits.argmax(dim=-1)

        # Calibrate to star rating
        raw_star = self.calibrator(raw_score, calibration_diff)
        display_star = self.calibrator.clip_to_display(raw_star, calibration_diff)

        return ModelOutput(
            raw_score=raw_score,
            difficulty_logits=difficulty_logits,
            raw_star=raw_star,
            display_star=display_star,
            attention_info=attention_info,
            instance_embeddings=instance_embeddings,
        )

    def predict(
        self,
        instances: torch.Tensor,
        instance_masks: torch.Tensor,
        instance_counts: Optional[torch.Tensor] = None,
    ) -> dict:
        """
        Convenience method for inference.

        Returns dict with human-readable outputs:
        - difficulty_class: Predicted difficulty name
        - raw_score: Unbounded difficulty score
        - raw_star: Star rating (may exceed range)
        - display_star: Star rating for display (clipped)
        """
        output = self.forward(
            instances,
            instance_masks,
            instance_counts,
            difficulty_hint=None,
            return_attention=False,
        )

        difficulty_names = ["easy", "normal", "hard", "oni", "ura"]
        predicted_class = output.difficulty_logits.argmax(dim=-1)

        return {
            "difficulty_class": [difficulty_names[c] for c in predicted_class.tolist()],
            "difficulty_class_id": predicted_class,
            "raw_score": output.raw_score,
            "raw_star": output.raw_star,
            "display_star": output.display_star,
        }

    def get_ranking_pairs_from_batch(
        self,
        raw_scores: torch.Tensor,
        song_ids: list[str],
        difficulties: list[str],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Extract within-song ranking pairs from a batch.

        Args:
            raw_scores: [batch] raw difficulty scores
            song_ids: List of song IDs
            difficulties: List of difficulty names

        Returns:
            (s_easier, s_harder) tensors for ranking loss
        """

        # Group by song
|
| 312 |
+
song_to_indices: dict[str, list[int]] = {}
|
| 313 |
+
for i, song_id in enumerate(song_ids):
|
| 314 |
+
if song_id not in song_to_indices:
|
| 315 |
+
song_to_indices[song_id] = []
|
| 316 |
+
song_to_indices[song_id].append(i)
|
| 317 |
+
|
| 318 |
+
easier_scores = []
|
| 319 |
+
harder_scores = []
|
| 320 |
+
|
| 321 |
+
for song_id, indices in song_to_indices.items():
|
| 322 |
+
if len(indices) < 2:
|
| 323 |
+
continue
|
| 324 |
+
|
| 325 |
+
# Sort by difficulty
|
| 326 |
+
sorted_indices = sorted(
|
| 327 |
+
indices, key=lambda i: DIFFICULTY_ORDER.get(difficulties[i], 0)
|
| 328 |
+
)
|
| 329 |
+
|
| 330 |
+
# Create pairs
|
| 331 |
+
for i in range(len(sorted_indices) - 1):
|
| 332 |
+
easier_idx = sorted_indices[i]
|
| 333 |
+
harder_idx = sorted_indices[i + 1]
|
| 334 |
+
|
| 335 |
+
easier_scores.append(raw_scores[easier_idx])
|
| 336 |
+
harder_scores.append(raw_scores[harder_idx])
|
| 337 |
+
|
| 338 |
+
if not easier_scores:
|
| 339 |
+
return (
|
| 340 |
+
torch.tensor([], device=raw_scores.device),
|
| 341 |
+
torch.tensor([], device=raw_scores.device),
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
return (
|
| 345 |
+
torch.stack(easier_scores),
|
| 346 |
+
torch.stack(harder_scores),
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
def create_model(
|
| 351 |
+
d_model: int = 256,
|
| 352 |
+
n_layers: int = 4,
|
| 353 |
+
encoder_type: str = "transformer",
|
| 354 |
+
**kwargs,
|
| 355 |
+
) -> TaikoChartEstimator:
|
| 356 |
+
"""
|
| 357 |
+
Factory function to create model with common configurations.
|
| 358 |
+
|
| 359 |
+
Args:
|
| 360 |
+
d_model: Model dimension
|
| 361 |
+
n_layers: Number of encoder layers
|
| 362 |
+
encoder_type: "transformer" or "tcn"
|
| 363 |
+
**kwargs: Additional config overrides
|
| 364 |
+
|
| 365 |
+
Returns:
|
| 366 |
+
Configured TaikoChartEstimator
|
| 367 |
+
"""
|
| 368 |
+
config = ModelConfig(
|
| 369 |
+
encoder_type=encoder_type,
|
| 370 |
+
d_model=d_model,
|
| 371 |
+
n_encoder_layers=n_layers,
|
| 372 |
+
**kwargs,
|
| 373 |
+
)
|
| 374 |
+
return TaikoChartEstimator(config)
|
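For readers skimming the diff, here is a minimal smoke-test sketch of the inference path defined above. It is not part of the upload: the import path, the use of zero-valued float features, and the dummy shapes are assumptions based only on the docstrings (6 token features per step, mask value 1 = valid); if the encoder embeds integer token IDs, the dtypes would need adjusting.

# Hypothetical smoke test for the model above (not part of this commit).
import torch

from TaikoChartEstimator.model.model import create_model

model = create_model(d_model=256, n_layers=4, encoder_type="transformer")
model.eval()

batch, n_instances, seq_len = 2, 8, 128
instances = torch.zeros(batch, n_instances, seq_len, 6)   # [B, I, T, 6] token features
instance_masks = torch.ones(batch, n_instances, seq_len)  # 1 = valid token
instance_counts = torch.tensor([8, 5])                    # valid instances per sample

with torch.no_grad():
    preds = model.predict(instances, instance_masks, instance_counts)
print(preds["difficulty_class"], preds["display_star"])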
TaikoChartEstimator/train/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""
TaikoChartEstimator Training Package
"""

from . import __main__

__all__ = ["__main__"]
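Because this `__init__.py` eagerly imports `__main__`, importing the training package also imports the training script; the `if __name__ == "__main__"` guard at the bottom of `__main__.py` (next file) is what keeps `main()` from running on that import. The entry point is then launched as a module. A hypothetical invocation, with flags taken from the argparse setup below and illustrative values:

# python -m TaikoChartEstimator.train --epochs 100 --batch-size 16 --lr 1e-4 --use-curriculum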
TaikoChartEstimator/train/__main__.py
ADDED
@@ -0,0 +1,808 @@
"""
Training Script for TaikoChartEstimator

Main entry point for training the MIL-based difficulty estimation model.
Supports:
- Multi-task learning (classification + regression + ranking)
- Curriculum learning for loss weights
- TensorBoard logging
- Multi-objective checkpoint selection
"""

import argparse
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional

import numpy as np
import torch
import torch.optim as optim
from scipy.stats import spearmanr
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from torch.utils.data import DataLoader, Subset
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from ..data import TaikoChartDataset, WithinSongPairSampler, collate_chart_bags
from ..data.tokenizer import DIFFICULTY_ORDER
from ..model import CurriculumScheduler, ModelConfig, TaikoChartEstimator, TotalLoss


def parse_args():
    parser = argparse.ArgumentParser(description="Train TaikoChartEstimator")

    # Data arguments
    parser.add_argument(
        "--dataset",
        type=str,
        default="JacobLinCool/taiko-1000-parsed",
        help="HuggingFace dataset name",
    )
    parser.add_argument(
        "--cache-dir", type=str, default=None, help="Cache directory for dataset"
    )
    parser.add_argument(
        "--include-audio", action="store_true", help="Include audio features (slower)"
    )

    # Model arguments
    parser.add_argument("--d-model", type=int, default=256, help="Model dimension")
    parser.add_argument(
        "--n-layers", type=int, default=4, help="Number of encoder layers"
    )
    parser.add_argument(
        "--encoder-type",
        type=str,
        default="transformer",
        choices=["transformer", "tcn"],
        help="Instance encoder type",
    )
    parser.add_argument(
        "--n-branches", type=int, default=3, help="Number of attention branches in MIL"
    )

    # Training arguments
    parser.add_argument(
        "--epochs", type=int, default=100, help="Number of training epochs"
    )
    parser.add_argument("--batch-size", type=int, default=16, help="Batch size")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--weight-decay", type=float, default=0.01, help="Weight decay")
    parser.add_argument(
        "--grad-clip", type=float, default=1.0, help="Gradient clipping norm"
    )

    # Loss weights
    parser.add_argument(
        "--lambda-cls", type=float, default=1.0, help="Classification loss weight"
    )
    parser.add_argument(
        "--lambda-star", type=float, default=1.0, help="Star regression loss weight"
    )
    parser.add_argument(
        "--lambda-rank", type=float, default=1.0, help="Ranking loss weight"
    )
    parser.add_argument(
        "--use-curriculum",
        action="store_true",
        help="Use curriculum learning for loss weights",
    )

    # Checkpointing and logging
    parser.add_argument(
        "--output-dir", type=str, default="outputs", help="Output directory"
    )
    parser.add_argument(
        "--tensorboard-dir", type=str, default="runs", help="TensorBoard log directory"
    )
    parser.add_argument(
        "--save-every", type=int, default=5, help="Save checkpoint every N epochs"
    )
    parser.add_argument(
        "--eval-every", type=int, default=1, help="Evaluate every N epochs"
    )

    # Misc
    parser.add_argument("--seed", type=int, default=2025, help="Random seed")
    parser.add_argument(
        "--device",
        type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device to use",
    )
    parser.add_argument(
        "--overfit-batch",
        action="store_true",
        help="Overfit on a single batch (for debugging)",
    )
    parser.add_argument(
        "--num-workers", type=int, default=16, help="Number of data loader workers"
    )

    return parser.parse_args()


def set_seed(seed: int):
    """Set random seeds for reproducibility."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)


def compute_class_weights(
    dataset: TaikoChartDataset, merge_ura_oni: bool = True
) -> torch.Tensor:
    """Compute class weights based on class frequencies.

    Args:
        dataset: The training dataset
        merge_ura_oni: If True, treat ura and oni as the same class (4 classes total)

    Returns:
        Class weights tensor (4 or 5 weights depending on merge_ura_oni)
    """
    n_classes = 4 if merge_ura_oni else 5
    class_counts = [0] * n_classes

    for song_idx, diff in dataset.chart_index:
        diff_id = {"easy": 0, "normal": 1, "hard": 2, "oni": 3, "ura": 4}.get(diff, 0)
        # Merge ura into oni if enabled
        if merge_ura_oni and diff_id == 4:
            diff_id = 3
        class_counts[diff_id] += 1

    total = sum(class_counts)
    weights = [
        total / (n_classes * count) if count > 0 else 1.0 for count in class_counts
    ]

    return torch.tensor(weights, dtype=torch.float32)


def extract_ranking_pairs(
    batch: dict, raw_scores: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """Extract within-song ranking pairs from a batch."""
    song_ids = batch["song_ids"]
    difficulties = batch["difficulties"]

    # Group by song
    song_to_indices: dict[str, list[int]] = {}
    for i, song_id in enumerate(song_ids):
        if song_id not in song_to_indices:
            song_to_indices[song_id] = []
        song_to_indices[song_id].append(i)

    easier_scores = []
    harder_scores = []

    for song_id, indices in song_to_indices.items():
        if len(indices) < 2:
            continue

        # Sort by difficulty
        sorted_indices = sorted(
            indices, key=lambda i: DIFFICULTY_ORDER.get(difficulties[i], 0)
        )

        # Create adjacent pairs
        for i in range(len(sorted_indices) - 1):
            easier_idx = sorted_indices[i]
            harder_idx = sorted_indices[i + 1]

            easier_scores.append(raw_scores[easier_idx])
            harder_scores.append(raw_scores[harder_idx])

    if not easier_scores:
        return (
            torch.tensor([], device=raw_scores.device),
            torch.tensor([], device=raw_scores.device),
        )

    return torch.stack(easier_scores), torch.stack(harder_scores)


def train_epoch(
    model: TaikoChartEstimator,
    dataloader: DataLoader,
    criterion: TotalLoss,
    optimizer: optim.Optimizer,
    scheduler: Optional[optim.lr_scheduler._LRScheduler],
    device: torch.device,
    epoch: int,
    writer: Optional[SummaryWriter] = None,
    curriculum: Optional[CurriculumScheduler] = None,
    grad_clip: float = 1.0,
) -> dict:
    """Train for one epoch."""
    model.train()

    total_loss = 0.0
    total_cls_loss = 0.0
    total_star_loss = 0.0
    total_rank_loss = 0.0
    n_batches = 0
    n_ranking_pairs = 0

    pbar = tqdm(dataloader, desc=f"Epoch {epoch}")

    for batch_idx, batch in enumerate(pbar):
        global_step = epoch * len(dataloader) + batch_idx

        # Update curriculum weights
        if curriculum is not None:
            weights = curriculum.get_weights(global_step)
            criterion.set_weights(**weights)

        # Move batch to device
        instances = batch["instances"].to(device)
        instance_masks = batch["instance_masks"].to(device)
        instance_counts = batch["instance_counts"].to(device)
        difficulty_class = batch["difficulty_class"].to(device)
        star = batch["star"].to(device)
        is_right_censored = batch["is_right_censored"].to(device)
        is_left_censored = batch["is_left_censored"].to(device)

        # Forward pass
        output = model(
            instances,
            instance_masks,
            instance_counts,
            difficulty_hint=difficulty_class,  # Use ground truth for training
        )

        # Extract ranking pairs
        s_easier, s_harder = extract_ranking_pairs(batch, output.raw_score)
        ranking_pairs = (s_easier, s_harder) if s_easier.numel() > 0 else None
        n_ranking_pairs += s_easier.numel()

        # Compute losses
        losses = criterion(
            difficulty_logits=output.difficulty_logits,
            pred_star=output.raw_star,
            target_difficulty=difficulty_class,
            target_star=star,
            is_right_censored=is_right_censored,
            is_left_censored=is_left_censored,
            ranking_pairs=ranking_pairs,
        )

        # Backward pass
        optimizer.zero_grad()
        losses["total"].backward()

        # Gradient clipping
        if grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        optimizer.step()

        # Track losses
        total_loss += losses["total"].item()
        total_cls_loss += losses["cls"].item()
        total_star_loss += losses["star"].item()
        total_rank_loss += losses["rank"].item()
        n_batches += 1

        # Update progress bar
        pbar.set_postfix(
            {
                "loss": f"{losses['total'].item():.4f}",
                "cls": f"{losses['cls'].item():.4f}",
                "star": f"{losses['star'].item():.4f}",
                "rank": f"{losses['rank'].item():.4f}",
            }
        )

        # Log to TensorBoard
        if writer is not None and batch_idx % 10 == 0:
            writer.add_scalar("train/loss_total", losses["total"].item(), global_step)
            writer.add_scalar("train/loss_cls", losses["cls"].item(), global_step)
            writer.add_scalar("train/loss_star", losses["star"].item(), global_step)
            writer.add_scalar("train/loss_rank", losses["rank"].item(), global_step)

            # Log attention health metrics
            if "entropy" in output.attention_info:
                writer.add_scalar(
                    "train/attention_entropy",
                    output.attention_info["entropy"].mean().item(),
                    global_step,
                )
            if "effective_n" in output.attention_info:
                writer.add_scalar(
                    "train/effective_instances",
                    output.attention_info["effective_n"].mean().item(),
                    global_step,
                )
            if "top5_mass" in output.attention_info:
                writer.add_scalar(
                    "train/top5_attention_mass",
                    output.attention_info["top5_mass"].mean().item(),
                    global_step,
                )

    if scheduler is not None:
        scheduler.step()

    return {
        "loss": total_loss / n_batches,
        "cls_loss": total_cls_loss / n_batches,
        "star_loss": total_star_loss / n_batches,
        "rank_loss": total_rank_loss / n_batches,
        "n_ranking_pairs": n_ranking_pairs,
    }


@torch.no_grad()
def evaluate(
    model: TaikoChartEstimator,
    dataloader: DataLoader,
    criterion: TotalLoss,
    device: torch.device,
) -> dict:
    """Evaluate model on validation set."""
    model.eval()

    all_pred_class = []
    all_true_class = []
    all_pred_star = []
    all_true_star = []
    all_raw_scores = []
    all_difficulties = []
    all_song_ids = []
    all_is_right_censored = []

    total_loss = 0.0
    n_batches = 0

    for batch in tqdm(dataloader, desc="Evaluating"):
        instances = batch["instances"].to(device)
        instance_masks = batch["instance_masks"].to(device)
        instance_counts = batch["instance_counts"].to(device)
        difficulty_class = batch["difficulty_class"].to(device)
        star = batch["star"].to(device)
        is_right_censored = batch["is_right_censored"].to(device)
        is_left_censored = batch["is_left_censored"].to(device)

        output = model(
            instances,
            instance_masks,
            instance_counts,
            difficulty_hint=difficulty_class,
        )

        # Compute loss
        losses = criterion(
            difficulty_logits=output.difficulty_logits,
            pred_star=output.raw_star,
            target_difficulty=difficulty_class,
            target_star=star,
            is_right_censored=is_right_censored,
            is_left_censored=is_left_censored,
        )

        total_loss += losses["total"].item()
        n_batches += 1

        # Collect predictions
        all_pred_class.extend(output.difficulty_logits.argmax(dim=-1).cpu().tolist())
        all_true_class.extend(difficulty_class.cpu().tolist())
        all_pred_star.extend(output.raw_star.cpu().tolist())
        all_true_star.extend(star.cpu().tolist())
        all_raw_scores.extend(output.raw_score.cpu().tolist())
        all_difficulties.extend(batch["difficulties"])
        all_song_ids.extend(batch["song_ids"])
        all_is_right_censored.extend(is_right_censored.cpu().tolist())

    # Compute metrics
    all_pred_class = np.array(all_pred_class)
    all_true_class = np.array(all_true_class)
    all_pred_star = np.array(all_pred_star)
    all_true_star = np.array(all_true_star)
    all_raw_scores = np.array(all_raw_scores)
    # Ensure boolean dtype so `~` below negates logically rather than bitwise
    all_is_right_censored = np.array(all_is_right_censored, dtype=bool)

    # Merge ura (4) and oni (3) for classification metrics;
    # they are essentially the same difficulty level
    all_pred_class_merged = all_pred_class.copy()
    all_true_class_merged = all_true_class.copy()
    all_pred_class_merged[all_pred_class_merged == 4] = 3  # Map ura -> oni
    all_true_class_merged[all_true_class_merged == 4] = 3  # Map ura -> oni

    # Classification metrics (using merged classes)
    macro_f1 = f1_score(all_true_class_merged, all_pred_class_merged, average="macro")
    balanced_acc = balanced_accuracy_score(all_true_class_merged, all_pred_class_merged)
    plus_minus_1_acc = (
        np.abs(all_pred_class_merged - all_true_class_merged) <= 1
    ).mean()

    # Per-difficulty classification metrics (precision, recall, F1)
    diff_names_cls = ["easy", "normal", "hard", "oni_ura"]
    per_diff_cls_metrics = {}

    per_class_f1 = f1_score(
        all_true_class_merged, all_pred_class_merged, average=None, labels=[0, 1, 2, 3]
    )
    per_class_precision = precision_score(
        all_true_class_merged,
        all_pred_class_merged,
        average=None,
        labels=[0, 1, 2, 3],
        zero_division=0,
    )
    per_class_recall = recall_score(
        all_true_class_merged,
        all_pred_class_merged,
        average=None,
        labels=[0, 1, 2, 3],
        zero_division=0,
    )

    for i, name in enumerate(diff_names_cls):
        if i < len(per_class_f1):
            per_diff_cls_metrics[f"f1_{name}"] = per_class_f1[i]
            per_diff_cls_metrics[f"precision_{name}"] = per_class_precision[i]
            per_diff_cls_metrics[f"recall_{name}"] = per_class_recall[i]

    # Star regression metrics (MAE on uncensored samples; Spearman over all samples)
    uncensored_mask = ~all_is_right_censored
    if uncensored_mask.sum() > 0:
        mae_uncensored = np.abs(
            all_pred_star[uncensored_mask] - all_true_star[uncensored_mask]
        ).mean()
        spearman_rho, _ = spearmanr(all_pred_star, all_true_star)
    else:
        mae_uncensored = 0.0
        spearman_rho = 0.0

    # Per-difficulty star MAE & RMSE (treating merged oni/ura as the same class)
    diff_names_merged = ["easy", "normal", "hard", "oni_ura"]
    per_diff_star_metrics = {}

    for diff_idx, diff_name in enumerate(diff_names_merged):
        if diff_idx == 3:
            # oni_ura: merge classes 3 and 4
            mask = (all_true_class == 3) | (all_true_class == 4)
        else:
            mask = all_true_class == diff_idx

        if mask.sum() > 0:
            diff_pred = all_pred_star[mask]
            diff_true = all_true_star[mask]
            diff_errors = diff_pred - diff_true

            per_diff_star_metrics[f"mae_star_{diff_name}"] = np.abs(diff_errors).mean()
            per_diff_star_metrics[f"rmse_star_{diff_name}"] = np.sqrt(
                (diff_errors**2).mean()
            )
        else:
            per_diff_star_metrics[f"mae_star_{diff_name}"] = 0.0
            per_diff_star_metrics[f"rmse_star_{diff_name}"] = 0.0

    # Monotonicity metrics
    song_groups: dict[str, list] = {}
    for i, song_id in enumerate(all_song_ids):
        if song_id not in song_groups:
            song_groups[song_id] = []
        song_groups[song_id].append(
            {
                "difficulty": all_difficulties[i],
                "raw_score": all_raw_scores[i],
            }
        )

    n_violations = 0
    n_pairs = 0

    for song_id, charts in song_groups.items():
        if len(charts) < 2:
            continue

        sorted_charts = sorted(
            charts, key=lambda c: DIFFICULTY_ORDER.get(c["difficulty"], 0)
        )

        for i in range(len(sorted_charts) - 1):
            n_pairs += 1
            if sorted_charts[i]["raw_score"] >= sorted_charts[i + 1]["raw_score"]:
                n_violations += 1

    violation_rate = n_violations / n_pairs if n_pairs > 0 else 0.0

    # Decompression metrics (for 10-star samples)
    max_star_mask = all_true_star >= 10.0
    if max_star_mask.sum() > 1:
        pred_10star = all_pred_star[max_star_mask]
        decompression_std = pred_10star.std()
        p90_p50 = np.percentile(pred_10star, 90) - np.percentile(pred_10star, 50)
    else:
        decompression_std = 0.0
        p90_p50 = 0.0

    result = {
        "loss": total_loss / n_batches,
        "macro_f1": macro_f1,
        "balanced_accuracy": balanced_acc,
        "plus_minus_1_accuracy": plus_minus_1_acc,
        "mae_uncensored": mae_uncensored,
        "spearman_rho": spearman_rho,
        "monotonicity_violation_rate": violation_rate,
        "decompression_std": decompression_std,
        "decompression_p90_p50": p90_p50,
    }
    # Add per-difficulty classification metrics
    result.update(per_diff_cls_metrics)
    # Add per-difficulty star metrics
    result.update(per_diff_star_metrics)

    return result


def save_checkpoint(
    model: TaikoChartEstimator,
    optimizer: optim.Optimizer,
    epoch: int,
    metrics: dict,
    output_dir: Path,
    name: str = "checkpoint",
):
    """Save model checkpoint."""
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "metrics": metrics,
        "config": model.config.__dict__,
    }

    pretrained_path = output_dir / "pretrained" / name
    model.save_pretrained(pretrained_path)

    path = output_dir / f"{name}_epoch{epoch}.pt"
    torch.save(checkpoint, path)

    # Also save as latest
    latest_path = output_dir / f"{name}_latest.pt"
    torch.save(checkpoint, latest_path)

    return path


def main():
    args = parse_args()
    set_seed(args.seed)

    # Create output directories
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path(args.output_dir) / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    tensorboard_dir = Path(args.tensorboard_dir) / timestamp
    writer = SummaryWriter(tensorboard_dir)

    # Save args
    with open(output_dir / "args.json", "w") as f:
        json.dump(vars(args), f, indent=2)

    print(f"Output directory: {output_dir}")
    print(f"TensorBoard directory: {tensorboard_dir}")

    # Load datasets
    print("Loading datasets...")
    train_dataset = TaikoChartDataset(
        split="train",
        dataset_name=args.dataset,
        include_audio=args.include_audio,
        cache_dir=args.cache_dir,
    )

    val_dataset = TaikoChartDataset(
        split="test",
        dataset_name=args.dataset,
        include_audio=args.include_audio,
        cache_dir=args.cache_dir,
    )

    print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

    # Create data loaders
    if args.overfit_batch:
        # Take a small subset for debugging
        train_dataset = Subset(train_dataset, list(range(min(32, len(train_dataset)))))
        val_dataset = Subset(val_dataset, list(range(min(8, len(val_dataset)))))

    train_sampler = WithinSongPairSampler(
        train_dataset
        if not isinstance(train_dataset, torch.utils.data.Subset)
        else train_dataset.dataset,
        min_batch_size=args.batch_size,
        shuffle=True,
        seed=args.seed,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler if not args.overfit_batch else None,
        batch_size=args.batch_size if args.overfit_batch else 1,
        shuffle=args.overfit_batch,
        collate_fn=collate_chart_bags,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        collate_fn=collate_chart_bags,
        num_workers=args.num_workers,
        pin_memory=True,
    )

    # Create model
    print("Creating model...")
    config = ModelConfig(
        encoder_type=args.encoder_type,
        d_model=args.d_model,
        n_encoder_layers=args.n_layers,
        n_attention_branches=args.n_branches,
    )
    model = TaikoChartEstimator(config)
    model = model.to(args.device)

    # Count parameters
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Model parameters: {n_params:,}")

    # Create optimizer and scheduler
    optimizer = optim.AdamW(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay,
    )

    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=args.epochs,
        eta_min=args.lr * 0.01,
    )

    # Create loss function
    class_weights = compute_class_weights(
        train_dataset
        if not isinstance(train_dataset, torch.utils.data.Subset)
        else train_dataset.dataset
    ).to(args.device)

    criterion = TotalLoss(
        lambda_cls=args.lambda_cls,
        lambda_star=args.lambda_star,
        lambda_rank=args.lambda_rank,
        class_weights=class_weights,
    )

    # Curriculum scheduler
    curriculum = None
    if args.use_curriculum:
        total_steps = args.epochs * len(train_loader)
        curriculum = CurriculumScheduler(total_steps)

    # Composite score function for model selection
    def compute_composite_score(metrics: dict) -> float:
        """
        Compute a weighted composite score for model selection.

        Weights prioritize Spearman (star ranking) as the core objective.
        - Spearman ρ: 60% (star prediction ranking accuracy)
        - Macro-F1: 25% (difficulty classification)
        - Violation Rate: 15% (monotonicity constraint)
        """
        # Clamp to reasonable ranges observed in training
        f1 = max(0.70, min(0.90, metrics["macro_f1"]))
        spearman = max(0.80, min(0.98, metrics["spearman_rho"]))
        violation = max(0.0, min(0.10, metrics["monotonicity_violation_rate"]))

        # Normalize to 0-1
        f1_norm = (f1 - 0.70) / 0.20
        spearman_norm = (spearman - 0.80) / 0.18
        violation_norm = 1.0 - violation / 0.10  # Lower is better

        return 0.6 * spearman_norm + 0.25 * f1_norm + 0.15 * violation_norm

    # Training loop
    print("Starting training...")
    best_metrics = {
        "macro_f1": 0.0,
        "spearman_rho": 0.0,
        "monotonicity_violation_rate": 1.0,
    }
    best_composite_score = 0.0

    for epoch in range(1, args.epochs + 1):
        # Train
        train_metrics = train_epoch(
            model=model,
            dataloader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            device=torch.device(args.device),
            epoch=epoch,
            writer=writer,
            curriculum=curriculum,
            grad_clip=args.grad_clip,
        )

        print(
            f"Epoch {epoch} - Train Loss: {train_metrics['loss']:.4f}, "
            f"Cls: {train_metrics['cls_loss']:.4f}, Star: {train_metrics['star_loss']:.4f}, "
            f"Rank: {train_metrics['rank_loss']:.4f} ({train_metrics['n_ranking_pairs']} pairs)"
        )

        # Log training metrics
        writer.add_scalar("epoch/train_loss", train_metrics["loss"], epoch)
        writer.add_scalar("epoch/learning_rate", scheduler.get_last_lr()[0], epoch)

        # Evaluate
        if epoch % args.eval_every == 0:
            val_metrics = evaluate(
                model=model,
                dataloader=val_loader,
                criterion=criterion,
                device=torch.device(args.device),
            )

            # Compute composite score
            composite_score = compute_composite_score(val_metrics)

            print(
                f"Epoch {epoch} - Val Loss: {val_metrics['loss']:.4f}, "
                f"Macro-F1: {val_metrics['macro_f1']:.4f}, "
                f"Spearman: {val_metrics['spearman_rho']:.4f}, "
                f"Violation Rate: {val_metrics['monotonicity_violation_rate']:.4f}, "
                f"Decomp Std: {val_metrics['decompression_std']:.4f}, "
                f"Composite: {composite_score:.4f}"
            )

            # Log validation metrics
            for key, value in val_metrics.items():
                writer.add_scalar(f"val/{key}", value, epoch)
            writer.add_scalar("val/composite_score", composite_score, epoch)

            # Save best model based on composite score
            if composite_score > best_composite_score:
                best_composite_score = composite_score
                best_metrics = val_metrics
                save_checkpoint(
                    model, optimizer, epoch, val_metrics, output_dir, "best"
                )
                print(f" -> New best model saved! (Composite: {composite_score:.4f})")

        # Periodic checkpoint
        if epoch % args.save_every == 0:
            save_checkpoint(
                model, optimizer, epoch, train_metrics, output_dir, "checkpoint"
            )

    # Save final model
    save_checkpoint(model, optimizer, args.epochs, best_metrics, output_dir, "final")

    print("\nTraining complete!")
    print(f"Best Composite Score: {best_composite_score:.4f}")
    print(f"  - Macro-F1: {best_metrics['macro_f1']:.4f}")
    print(f"  - Spearman: {best_metrics['spearman_rho']:.4f}")
    print(f"  - Violation Rate: {best_metrics['monotonicity_violation_rate']:.4f}")
    print(f"Checkpoints saved to: {output_dir}")

    writer.close()


if __name__ == "__main__":
    main()