OliBomby committed on
Commit
7b967f9
·
verified ·
1 Parent(s): 9bbf03d

Add CM3P model

Browse files
audio_feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
4
+ },
5
+ "chunk_length": 30,
6
+ "dither": 0.0,
7
+ "feature_extractor_type": "WhisperFeatureExtractor",
8
+ "feature_size": 80,
9
+ "hop_length": 160,
10
+ "n_fft": 400,
11
+ "n_samples": 480000,
12
+ "nb_max_frames": 3000,
13
+ "padding_side": "right",
14
+ "padding_value": 0.0,
15
+ "processor_class": "CM3PProcessor",
16
+ "return_attention_mask": false,
17
+ "sampling_rate": 16000
18
+ }
beatmap_parser/preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_distances": false,
3
+ "add_hitsounds": true,
4
+ "add_kiai": true,
5
+ "add_mania_sv": true,
6
+ "add_positions": true,
7
+ "add_snapping": false,
8
+ "add_sv": true,
9
+ "add_timing": true,
10
+ "add_timing_points": true,
11
+ "auto_map": {
12
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
13
+ },
14
+ "feature_extractor_type": "CM3PBeatmapParser",
15
+ "mania_bpm_normalized_scroll_speed": true,
16
+ "processor_class": "CM3PProcessor",
17
+ "slider_version": 2
18
+ }
beatmap_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[AUDIO_BOS]",
4
+ "[AUDIO_EOS]",
5
+ "[AUDIO]"
6
+ ],
7
+ "bos_token": {
8
+ "content": "[BOS]",
9
+ "lstrip": false,
10
+ "normalized": false,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "cls_token": {
15
+ "content": "[CLS]",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "eos_token": {
22
+ "content": "[EOS]",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "mask_token": {
29
+ "content": "[MASK]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "[PAD]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ },
42
+ "sep_token": {
43
+ "content": "[SEP]",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "unk_token": {
50
+ "content": "[UNK]",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ }
56
+ }
beatmap_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cls_token": true,
3
+ "added_tokens_decoder": {
4
+ "3958": {
5
+ "content": "[BOS]",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "3959": {
13
+ "content": "[EOS]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "3960": {
21
+ "content": "[UNK]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3961": {
29
+ "content": "[SEP]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "3962": {
37
+ "content": "[PAD]",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "3963": {
45
+ "content": "[CLS]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "3964": {
53
+ "content": "[MASK]",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "3965": {
61
+ "content": "[AUDIO_BOS]",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "3966": {
69
+ "content": "[AUDIO_EOS]",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "3967": {
77
+ "content": "[AUDIO]",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ }
84
+ },
85
+ "additional_special_tokens": [
86
+ "[AUDIO_BOS]",
87
+ "[AUDIO_EOS]",
88
+ "[AUDIO]"
89
+ ],
90
+ "auto_map": {
91
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
92
+ },
93
+ "bos_token": "[BOS]",
94
+ "clean_up_tokenization_spaces": false,
95
+ "cls_token": "[CLS]",
96
+ "distance_step": 4,
97
+ "eos_token": "[EOS]",
98
+ "extra_special_tokens": {},
99
+ "mask_token": "[MASK]",
100
+ "max_distance": 640,
101
+ "max_time": 16000,
102
+ "min_time": 0,
103
+ "model_max_length": 1000000000000000019884624838656,
104
+ "pad_token": "[PAD]",
105
+ "position_range": [
106
+ -256,
107
+ 768,
108
+ -256,
109
+ 640
110
+ ],
111
+ "position_split_axes": true,
112
+ "position_step": 4,
113
+ "processor_class": "CM3PProcessor",
114
+ "sep_token": "[SEP]",
115
+ "separate_new_combo_token": false,
116
+ "time_step": 10,
117
+ "tokenizer_class": "CM3PBeatmapTokenizer",
118
+ "unk_token": "[UNK]"
119
+ }
beatmap_tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
metadata_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "[DIFFICULTY_UNK]",
4
+ "[YEAR_UNK]",
5
+ "[MODE_UNK]",
6
+ "[STATUS_UNK]",
7
+ "[MAPPER_UNK]",
8
+ "[CS_UNK]",
9
+ "[HITSOUNDED_UNK]",
10
+ "[SONG_LENGTH_UNK]",
11
+ "[SONG_POSITION_UNK]",
12
+ "[GLOBAL_SV_UNK]",
13
+ "[MANIA_KEYCOUNT_UNK]",
14
+ "[HOLD_NOTE_RATIO_UNK]",
15
+ "[SCROLL_SPEED_RATIO_UNK]",
16
+ "[TAG_UNK]"
17
+ ],
18
+ "bos_token": {
19
+ "content": "[BOS]",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "cls_token": {
26
+ "content": "[CLS]",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "eos_token": {
33
+ "content": "[EOS]",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "[PAD]",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ }
46
+ }
metadata_tokenizer/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
metadata_tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
parsing_cm3p.py ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from datetime import timedelta
3
+ from enum import Enum
4
+ from os import PathLike
5
+ from typing import Optional, Union, IO
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ from slider import Beatmap, Circle, Slider, Spinner, HoldNote, TimingPoint
10
+ from slider.curve import Linear, Catmull, Perfect, MultiBezier
11
+ from transformers import FeatureExtractionMixin, AutoFeatureExtractor
12
+
13
+ from .configuration_cm3p import CM3PConfig
14
+
15
+
16
class EventType(Enum):
    """Token event types emitted by the beatmap parser.

    Covers hit objects of all four osu! game modes (standard, taiko, catch,
    mania) plus timing and metadata markers (beats, measures, timing points,
    kiai toggles and scroll-speed changes).
    """
    CIRCLE = "circle"
    SPINNER = "spinner"
    SPINNER_END = "spinner_end"
    SLIDER_HEAD = "slider_head"
    BEZIER_ANCHOR = "bezier_anchor"
    PERFECT_ANCHOR = "perfect_anchor"
    CATMULL_ANCHOR = "catmull_anchor"
    RED_ANCHOR = "red_anchor"
    LAST_ANCHOR = "last_anchor"
    SLIDER_END = "slider_end"
    REPEAT_END = "repeat_end"
    BEAT = "beat"
    MEASURE = "measure"
    TIMING_POINT = "timing_point"
    KIAI_ON = "kiai_on"
    KIAI_OFF = "kiai_off"
    HOLD_NOTE = "hold_note"
    HOLD_NOTE_END = "hold_note_end"
    SCROLL_SPEED_CHANGE = "scroll_speed_change"
    DRUMROLL = "drumroll"
    DRUMROLL_END = "drumroll_end"
    DENDEN = "denden"
    DENDEN_END = "denden_end"
40
+
41
+
42
# Event types whose groups may carry a new-combo flag (objects that can start
# a new combo).
EVENT_TYPES_WITH_NEW_COMBO = [
    EventType.CIRCLE,
    EventType.SLIDER_HEAD,
]
46
+
47
+
48
@dataclasses.dataclass
class Group:
    """One parsed event together with all of its associated token data."""
    event_type: EventType = None
    time: int = 0  # event time in milliseconds
    has_time: bool = False  # whether `time` is meaningful for this event
    snapping: int = None  # beat-snap divisor, when snapping is enabled
    distance: int = None  # distance from the previous object, when enabled
    x: int = None  # playfield x coordinate, when positions are enabled
    y: int = None  # playfield y coordinate, when positions are enabled
    mania_column: int = None  # osu!mania column index, when applicable
    new_combo: bool = False
    hitsounds: list[int] = dataclasses.field(default_factory=list)  # per-edge hitsound bitmasks
    samplesets: list[int] = dataclasses.field(default_factory=list)  # per-edge sample sets
    additions: list[int] = dataclasses.field(default_factory=list)  # per-edge addition sets
    volumes: list[int] = dataclasses.field(default_factory=list)  # per-edge volumes (0-100)
    scroll_speed: float = None  # SV / scroll-speed multiplier, when applicable
64
+
65
+
66
def merge_groups(groups1: list[Group], groups2: list[Group]) -> list[Group]:
    """Merge two lists of groups in a time sorted manner. Assumes both lists are sorted by time.

    Groups whose ``time`` is ``None`` inherit the comparison time of the
    previous group in their own list, so they stay adjacent to it.

    Args:
        groups1: List of groups, sorted by time.
        groups2: List of groups, sorted by time.

    Returns:
        merged_groups: Merged list of groups.
    """
    merged_groups = []
    i = 0
    j = 0
    t1 = -np.inf
    t2 = -np.inf

    while i < len(groups1) and j < len(groups2):
        # Explicit None check: a time of 0 ms is falsy but is a valid
        # timestamp and must not fall back to the previous time.
        t1 = groups1[i].time if groups1[i].time is not None else t1
        t2 = groups2[j].time if groups2[j].time is not None else t2

        if t1 <= t2:
            merged_groups.append(groups1[i])
            i += 1
        else:
            merged_groups.append(groups2[j])
            j += 1

    # Add remaining groups from both lists
    merged_groups.extend(groups1[i:])
    merged_groups.extend(groups2[j:])
    return merged_groups
97
+
98
+
99
def speed_groups(groups: list[Group], speed: float) -> list[Group]:
    """Scale all group times by a speed multiplier.

    Note: group times are rewritten in place; the returned list contains the
    same (mutated) group objects.

    Args:
        groups: List of groups.
        speed: Speed multiplier.

    Returns:
        sped_groups: List of groups with adjusted times.
    """
    adjusted = []
    for group in groups:
        group.time = int(group.time / speed)
        adjusted.append(group)

    return adjusted
115
+
116
+
117
def get_median_mpb_beatmap(beatmap: Beatmap) -> float:
    """Compute the duration-weighted median ms-per-beat of a beatmap.

    The reference end time is the start time of the last hit object (hold
    notes use their end time).
    """
    # Not include last slider's end time
    last_time = max(ho.end_time if isinstance(ho, HoldNote) else ho.time for ho in beatmap.hit_objects(stacking=False))
    # total_seconds() keeps millisecond precision; timedelta.seconds would
    # truncate to whole seconds and ignore any day component.
    last_time = int(last_time.total_seconds() * 1000)
    return get_median_mpb(beatmap.timing_points, last_time)
122
+
123
+
124
def get_median_mpb(timing_points: list[TimingPoint], last_time: float) -> float:
    """Compute the ms-per-beat value active for the longest total duration.

    Walks the timing points backwards, attributing the span between each
    uninherited point and the previous end time to that point's beat length.

    Args:
        timing_points: All timing points of the beatmap, in file order.
        last_time: End of the considered range, in milliseconds.

    Returns:
        The beat length (ms per beat) covering the largest share of the map,
        or 0 if no uninherited timing point applies.
    """
    # This is identical to osu! stable implementation
    this_beat_length = 0

    bpm_durations = {}

    for i in range(len(timing_points) - 1, -1, -1):
        tp = timing_points[i]
        # total_seconds() preserves millisecond precision (timedelta.seconds
        # would truncate to whole seconds).
        offset = int(tp.offset.total_seconds() * 1000)

        if tp.parent is None:
            this_beat_length = tp.ms_per_beat

        # Skip inherited points (except a leading one) and points past the end.
        if this_beat_length == 0 or offset > last_time or (tp.parent is not None and i > 0):
            continue

        start = 0 if i == 0 else offset
        bpm_durations[this_beat_length] = bpm_durations.get(this_beat_length, 0) + int(last_time - start)

        last_time = offset

    longest_time = 0
    median = 0

    for bpm, duration in bpm_durations.items():
        if duration > longest_time:
            longest_time = duration
            median = bpm

    return median
156
+
157
+
158
def load_beatmap(beatmap: Union[str, PathLike, IO[str], Beatmap]) -> Beatmap:
    """Load a beatmap from a file path, file object, or Beatmap object.

    Args:
        beatmap: Beatmap file path, file object, or Beatmap object.

    Returns:
        beatmap: Loaded Beatmap object.
    """
    if isinstance(beatmap, (str, PathLike)):
        beatmap = Beatmap.from_path(beatmap)
    elif not isinstance(beatmap, Beatmap):
        # typing.IO is not reliable with isinstance() (real file objects are
        # not subclasses of it), so treat any remaining non-Beatmap value as
        # an open file object and hand it to slider directly.
        beatmap = Beatmap.from_file(beatmap)
    return beatmap
172
+
173
+
174
def get_song_length(
    samples: np.ndarray = None,
    sample_rate: int = None,
    beatmap: Optional[Union[Beatmap, list[TimingPoint]]] = None,
) -> float:
    """Determine the song length in seconds.

    Prefers the audio data when given; otherwise falls back to the last hit
    object, then the last timing point, of the beatmap.

    Args:
        samples: Audio samples (optional).
        sample_rate: Audio sample rate in Hz (optional).
        beatmap: Beatmap object or raw list of timing points (optional).

    Returns:
        Song length in seconds, or 0 when it cannot be determined.
    """
    if samples is not None and sample_rate is not None:
        return len(samples) / sample_rate

    if beatmap is None:
        return 0

    if isinstance(beatmap, Beatmap):
        hit_objects = beatmap.hit_objects(stacking=False)
        if len(hit_objects) > 0:
            last_ho = hit_objects[-1]
            last_time = last_ho.end_time if hasattr(last_ho, "end_time") else last_ho.time
            return last_time.total_seconds() + 0.000999  # Add a small buffer to the last time

    timing = beatmap.timing_points if isinstance(beatmap, Beatmap) else beatmap
    if len(timing) == 0:
        return 0

    return timing[-1].offset.total_seconds() + 0.01
195
+
196
+
197
class CM3PBeatmapParser(FeatureExtractionMixin):
    """
    A class to parse CM3P beatmap files.
    """
    def __init__(
        self,
        add_timing: bool = True,
        add_snapping: bool = True,
        add_timing_points: bool = True,
        add_hitsounds: bool = True,
        add_distances: bool = True,
        add_positions: bool = True,
        add_kiai: bool = True,
        add_sv: bool = True,
        add_mania_sv: bool = True,
        mania_bpm_normalized_scroll_speed: bool = True,
        slider_version: int = 2,
        **kwargs,
    ):
        # Feature toggles controlling which event kinds the parser emits.
        self.add_timing = add_timing  # beat/measure/timing-point events
        self.add_snapping = add_snapping  # beat-snap divisors on timed events
        self.add_timing_points = add_timing_points  # TIMING_POINT events at BPM changes
        self.add_hitsounds = add_hitsounds  # hitsound/sampleset/volume data
        self.add_distances = add_distances  # distance from the previous object
        self.add_positions = add_positions  # absolute x/y coordinates
        self.add_kiai = add_kiai  # kiai on/off events
        self.add_sv = add_sv  # slider velocity on slider heads
        self.add_mania_sv = add_mania_sv  # scroll-speed events in mania
        self.mania_bpm_normalized_scroll_speed = mania_bpm_normalized_scroll_speed
        # 1: anchors carry interpolated times; 2: timeless anchors + LAST_ANCHOR
        self.slider_version = slider_version
        super().__init__(**kwargs)
228
+
229
    def parse_beatmap(
        self,
        beatmap: Union[str, PathLike, IO[str], Beatmap],
        speed: float = 1.0,
        song_length: Optional[float] = None
    ) -> list[Group]:
        """Parse an .osu beatmap.

        Each hit object is parsed into a list of Event objects, in order of its
        appearance in the beatmap. In other words, in ascending order of time.

        Args:
            beatmap: Beatmap file path, file object, or Beatmap object.
            speed: Speed multiplier for the beatmap.
            song_length: Length of the song in seconds. If not provided, it will be calculated from the beatmap.

        Returns:
            result: Time-sorted list of Group objects, including (depending on
            configuration) scroll-speed, kiai and timing groups.
        """
        beatmap = load_beatmap(beatmap)
        hit_objects = beatmap.hit_objects(stacking=False)
        last_pos = np.array((256, 192))  # playfield centre; initial distance reference
        groups = []

        for hit_object in hit_objects:
            if isinstance(hit_object, Circle):
                last_pos = self._parse_circle(hit_object, groups, last_pos, beatmap)
            elif isinstance(hit_object, Slider):
                # mode 1 (taiko) represents sliders as drumrolls
                if beatmap.mode == 1:
                    self._parse_drumroll(hit_object, groups, beatmap)
                else:
                    last_pos = self._parse_slider(hit_object, groups, last_pos, beatmap)
            elif isinstance(hit_object, Spinner):
                # mode 1 (taiko) represents spinners as dendens
                if beatmap.mode == 1:
                    self._parse_denden(hit_object, groups, beatmap)
                else:
                    last_pos = self._parse_spinner(hit_object, groups, beatmap)
            elif isinstance(hit_object, HoldNote):
                last_pos = self._parse_hold_note(hit_object, groups, beatmap)

        # Sort groups by time
        if len(groups) > 0:
            groups = sorted(groups, key=lambda x: x.time)
        result = list(groups)

        if self.add_mania_sv and beatmap.mode == 3:
            scroll_speed_events = self.parse_scroll_speeds(beatmap)
            result = merge_groups(scroll_speed_events, result)

        if self.add_kiai:
            kiai_events = self.parse_kiai(beatmap)
            result = merge_groups(kiai_events, result)

        if self.add_timing:
            timing_events = self.parse_timing(beatmap, song_length=song_length)
            result = merge_groups(timing_events, result)

        if speed != 1.0:
            result = speed_groups(result, speed)

        return result
291
+
292
    def parse_scroll_speeds(self, beatmap: Beatmap, speed: float = 1.0) -> list[Group]:
        """Extract all BPM-normalized scroll speed changes from a beatmap.

        Args:
            beatmap: Beatmap object.
            speed: Speed multiplier applied to event times.

        Returns:
            groups: SCROLL_SPEED_CHANGE groups, one per effective change.
        """
        normalized = self.mania_bpm_normalized_scroll_speed
        groups = []
        median_mpb = get_median_mpb_beatmap(beatmap)
        mpb = median_mpb
        last_normalized_scroll_speed = -1  # sentinel: no speed emitted yet

        for i, tp in enumerate(beatmap.timing_points):
            if tp.parent is None:
                # Uninherited point: updates the BPM and resets SV to 1x.
                mpb = tp.ms_per_beat
                scroll_speed = 1
            else:
                # Inherited point: negative ms_per_beat encodes the SV multiplier.
                scroll_speed = -100 / tp.ms_per_beat

            # Only emit for the last timing point at a given offset.
            if i == len(beatmap.timing_points) - 1 or beatmap.timing_points[i + 1].offset > tp.offset:
                normalized_scroll_speed = scroll_speed * median_mpb / mpb if normalized else scroll_speed

                if normalized_scroll_speed != last_normalized_scroll_speed or last_normalized_scroll_speed == -1:
                    self._add_group(
                        EventType.SCROLL_SPEED_CHANGE,
                        groups,
                        time=tp.offset,
                        beatmap=beatmap,
                        scroll_speed=normalized_scroll_speed,
                    )
                    last_normalized_scroll_speed = normalized_scroll_speed

        if speed != 1.0:
            groups = speed_groups(groups, speed)

        return groups
324
+
325
+ def parse_kiai(self, beatmap: Beatmap, speed: float = 1.0) -> list[Group]:
326
+ """Extract all kiai information from a beatmap."""
327
+ groups = []
328
+ kiai = False
329
+
330
+ for tp in beatmap.timing_points:
331
+ if tp.kiai_mode == kiai:
332
+ continue
333
+
334
+ self._add_group(
335
+ EventType.KIAI_ON if tp.kiai_mode else EventType.KIAI_OFF,
336
+ groups,
337
+ time=tp.offset,
338
+ beatmap=beatmap,
339
+ )
340
+ kiai = tp.kiai_mode
341
+
342
+ if speed != 1.0:
343
+ groups = speed_groups(groups, speed)
344
+
345
+ return groups
346
+
347
    def parse_timing(self, beatmap: Beatmap | list[TimingPoint], speed: float = 1.0, song_length: Optional[float] = None) -> list[Group]:
        """Extract all timing information from a beatmap.

        Emits TIMING_POINT / MEASURE / BEAT groups on every beat between each
        BPM change and the next one (or the end of the song).

        Args:
            beatmap: Beatmap object or a raw list of timing points.
            speed: Speed multiplier applied to event times.
            song_length: Song length in seconds; derived from the beatmap when omitted.

        Returns:
            groups: List of timing groups.

        Raises:
            AssertionError: If there are no timing points.
        """
        timing = beatmap.timing_points if isinstance(beatmap, Beatmap) else beatmap
        assert len(timing) > 0, "No timing points found in beatmap."

        groups = []
        last_time = song_length or get_song_length(beatmap=beatmap)
        last_time = int(last_time * 1000)  # milliseconds

        # Get all timing points with BPM changes
        timing_points = [tp for tp in timing if tp.bpm]

        for i, tp in enumerate(timing_points):
            # Generate beat and measure events until the next timing point
            # (stop 10 ms early so the next TIMING_POINT is not duplicated by a beat).
            next_tp = timing_points[i + 1] if i + 1 < len(timing_points) else None
            next_time = next_tp.offset.total_seconds() * 1000 - 10 if next_tp else last_time
            start_time = tp.offset.total_seconds() * 1000
            time = start_time
            measure_counter = 0
            beat_delta = tp.ms_per_beat
            while time <= next_time:
                if self.add_timing_points and measure_counter == 0:
                    event_type = EventType.TIMING_POINT
                elif measure_counter % tp.meter == 0:
                    event_type = EventType.MEASURE
                else:
                    event_type = EventType.BEAT

                self._add_group(
                    event_type,
                    groups,
                    time=timedelta(milliseconds=time),
                    add_snap=False,
                )

                # Exit early if the beat_delta is too small to avoid infinite loops
                if beat_delta <= 10:
                    break

                measure_counter += 1
                time = start_time + measure_counter * beat_delta

        if speed != 1.0:
            groups = speed_groups(groups, speed)

        return groups
393
+
394
+ @staticmethod
395
+ def uninherited_point_at(time: timedelta, beatmap: Beatmap):
396
+ tp = beatmap.timing_point_at(time)
397
+ return tp if tp.parent is None else tp.parent
398
+
399
+ @staticmethod
400
+ def hitsound_point_at(time: timedelta, beatmap: Beatmap):
401
+ hs_query = time + timedelta(milliseconds=5)
402
+ return beatmap.timing_point_at(hs_query)
403
+
404
+ def scroll_speed_at(self, time: timedelta, beatmap: Beatmap) -> float:
405
+ query = time
406
+ tp = beatmap.timing_point_at(query)
407
+ return self.tp_to_scroll_speed(tp)
408
+
409
+ def tp_to_scroll_speed(self, tp: TimingPoint) -> float:
410
+ if tp.parent is None or tp.ms_per_beat >= 0 or np.isnan(tp.ms_per_beat):
411
+ return 1
412
+ else:
413
+ return np.clip(-100 / tp.ms_per_beat, 0.01, 10)
414
+
415
    def _get_snapping(self, time: timedelta, beatmap: Beatmap, add_snap: bool = True) -> int:
        """Compute the beat-snap divisor for a time.

        Args:
            time: Time to compute the snapping for.
            beatmap: Beatmap object.
            add_snap: Whether snapping should be computed at all.

        Returns:
            The smallest divisor (1-16) whose grid lies within 2 ms of ``time``,
            0 if the time is not snapped, or None when snapping is disabled.
        """
        if not add_snap or not self.add_snapping:
            return None

        tp = self.uninherited_point_at(time, beatmap)
        # Position of `time` in beats relative to the red line's offset.
        beats = (time - tp.offset).total_seconds() * 1000 / tp.ms_per_beat
        snapping = 0
        for i in range(1, 17):
            # If the difference between the time and the snapped time is less than 2 ms, that is the correct snapping
            if abs(beats - round(beats * i) / i) * tp.ms_per_beat < 2:
                snapping = i
                break

        return snapping
436
+
437
    def _get_hitsounds(self, time: timedelta, hitsound: int, addition: str, beatmap: Beatmap) -> tuple[int, int, int, int]:
        """Resolve effective hitsound data for one hit-object edge.

        Args:
            time: Reference time of the edge.
            hitsound: Raw hitsound bitmask from the .osu file.
            addition: Raw addition string ("sampleset:additionset[:index:volume:file]").
            beatmap: Beatmap object.

        Returns:
            (hitsound, sample_set, addition_set, volume) with values inherited
            from the governing timing point where the raw values are 0.
        """
        tp = self.hitsound_point_at(time, beatmap)
        tp_sample_set = tp.sample_type if tp.sample_type != 0 else 2  # Inherit to soft sample set
        addition_split = addition.split(":")
        sample_set = int(addition_split[0]) if addition_split[0] != "0" else tp_sample_set
        addition_set = int(addition_split[1]) if addition_split[1] != "0" else sample_set
        volume = int(addition_split[3]) if len(addition_split) > 3 and addition_split[3] != "0" else tp.volume

        sample_set = sample_set if 0 < sample_set < 4 else 1  # Overflow default to normal sample set
        addition_set = addition_set if 0 < addition_set < 4 else 1  # Overflow default to normal sample set
        hitsound = hitsound & 14  # Only take the bits for whistle, finish, and clap
        volume = np.clip(volume, 0, 100)

        return hitsound, sample_set, addition_set, volume
451
+
452
+ def _get_position(self, pos: npt.NDArray, last_pos: npt.NDArray) -> tuple[int, int, int, npt.NDArray]:
453
+ x, y, dist = None, None, None
454
+
455
+ if self.add_distances:
456
+ dist = int(np.linalg.norm(pos - last_pos))
457
+
458
+ if self.add_positions:
459
+ x = int(pos[0])
460
+ y = int(pos[1])
461
+
462
+ return x, y, dist, pos
463
+
464
+ def _get_mania_column(self, pos: npt.NDArray, columns: int) -> int:
465
+ column = int(np.clip(pos[0] / 512 * columns, 0, columns - 1))
466
+ return column
467
+
468
+ def _add_group(
469
+ self,
470
+ event_type: EventType,
471
+ groups: list[Group],
472
+ time: timedelta,
473
+ *,
474
+ beatmap: Beatmap = None,
475
+ add_snap: bool = True,
476
+ has_time: bool = True,
477
+ pos: npt.NDArray = None,
478
+ last_pos: npt.NDArray = None,
479
+ new_combo: bool = False,
480
+ hitsound_ref_times: list[timedelta] = None,
481
+ hitsounds: list[int] = None,
482
+ additions: list[str] = None,
483
+ scroll_speed: Optional[float] = None,
484
+ ) -> npt.NDArray:
485
+ """Add a group of events to the event list."""
486
+ group = Group(
487
+ event_type=event_type,
488
+ time=int(time.total_seconds() * 1000 + 1e-5)
489
+ )
490
+
491
+ if has_time:
492
+ group.has_time = True
493
+ group.snapping = self._get_snapping(time, beatmap, add_snap)
494
+ if pos is not None:
495
+ if beatmap.mode in [0, 2]:
496
+ x, y, dist, last_pos = self._get_position(pos, last_pos)
497
+ group.x = x
498
+ group.y = y
499
+ group.distance = dist
500
+ elif beatmap.mode == 3:
501
+ group.column = self._get_mania_column(pos, int(beatmap.circle_size))
502
+ if new_combo and beatmap.mode in [0, 2]:
503
+ group.new_combo = True
504
+ if scroll_speed is not None:
505
+ group.scroll_speed = scroll_speed
506
+ if hitsound_ref_times is not None and self.add_hitsounds:
507
+ for i, ref_time in enumerate(hitsound_ref_times):
508
+ hitsound, sample_set, addition_set, volume = self._get_hitsounds(ref_time, hitsounds[i], additions[i], beatmap)
509
+ group.hitsounds.append(hitsound)
510
+ group.samplesets.append(sample_set)
511
+ group.additions.append(addition_set)
512
+ group.volumes.append(volume)
513
+
514
+ groups.append(group)
515
+
516
+ return last_pos
517
+
518
    def _parse_circle(self, circle: Circle, groups: list[Group], last_pos: npt.NDArray, beatmap: Beatmap) -> npt.NDArray:
        """Parse a circle hit object.

        Args:
            circle: Circle object.
            groups: List of groups to add to.
            last_pos: Last position of the hit objects.
            beatmap: Beatmap the circle belongs to.

        Returns:
            pos: Position of the circle.
        """
        return self._add_group(
            EventType.CIRCLE,
            groups,
            time=circle.time,
            beatmap=beatmap,
            pos=np.array(circle.position),
            last_pos=last_pos,
            new_combo=circle.new_combo,
            hitsound_ref_times=[circle.time],
            hitsounds=[circle.hitsound],
            additions=[circle.addition],
            # Taiko (mode 1) notes carry the current scroll speed.
            scroll_speed=self.scroll_speed_at(circle.time, beatmap) if beatmap.mode == 1 else None,
        )
542
+
543
    def _parse_slider(self, slider: Slider, groups: list[Group], last_pos: npt.NDArray, beatmap: Beatmap) -> npt.NDArray:
        """Parse a slider hit object.

        Emits SLIDER_HEAD, one anchor group per interior control point,
        (for slider_version 2) a LAST_ANCHOR, a SLIDER_END carrying body and
        repeat hitsounds, and a REPEAT_END at the slider's end time.

        Args:
            slider: Slider object.
            groups: List of groups to add to.
            last_pos: Last position of the hit objects.
            beatmap: Beatmap the slider belongs to.

        Returns:
            pos: Last position of the slider.
        """
        # Ignore sliders which are too big
        if len(slider.curve.points) >= 100:
            return last_pos

        last_pos = self._add_group(
            EventType.SLIDER_HEAD,
            groups,
            time=slider.time,
            beatmap=beatmap,
            pos=np.array(slider.position),
            last_pos=last_pos,
            new_combo=slider.new_combo,
            hitsound_ref_times=[slider.time],
            hitsounds=[slider.edge_sounds[0] if len(slider.edge_sounds) > 0 else 0],
            additions=[slider.edge_additions[0] if len(slider.edge_additions) > 0 else '0:0'],
            scroll_speed=self.scroll_speed_at(slider.time, beatmap) if self.add_sv else None,
        )

        # Duration of a single span (head to tail, not counting repeats).
        duration: timedelta = (slider.end_time - slider.time) / slider.repeat
        control_point_count = len(slider.curve.points)

        def append_control_points(event_type: EventType, last_pos: npt.NDArray = last_pos) -> npt.NDArray:
            # Anchors exclude the head (index 0) and tail (index -1).
            for i in range(1, control_point_count - 1):
                last_pos = add_anchor(event_type, i, last_pos)

            return last_pos

        def add_anchor(event_type: EventType, i: int, last_pos: npt.NDArray) -> npt.NDArray:
            # slider_version 1 spreads anchor times across the span;
            # version 2 keeps anchors timeless at the head time.
            return self._add_group(
                event_type,
                groups,
                time=slider.time + i / (control_point_count - 1) * duration if self.slider_version == 1 else slider.time,
                beatmap=beatmap,
                has_time=False,
                pos=np.array(slider.curve.points[i]),
                last_pos=last_pos,
            )

        if isinstance(slider.curve, Linear):
            last_pos = append_control_points(EventType.RED_ANCHOR, last_pos)
        elif isinstance(slider.curve, Catmull):
            last_pos = append_control_points(EventType.CATMULL_ANCHOR, last_pos)
        elif isinstance(slider.curve, Perfect):
            last_pos = append_control_points(EventType.PERFECT_ANCHOR, last_pos)
        elif isinstance(slider.curve, MultiBezier):
            for i in range(1, control_point_count - 1):
                # A duplicated control point marks a red (sharp) anchor.
                if slider.curve.points[i] == slider.curve.points[i + 1]:
                    last_pos = add_anchor(EventType.RED_ANCHOR, i, last_pos)
                elif slider.curve.points[i] != slider.curve.points[i - 1]:
                    last_pos = add_anchor(EventType.BEZIER_ANCHOR, i, last_pos)

        if self.slider_version == 2:
            # Add last control point without time
            last_pos = self._add_group(
                EventType.LAST_ANCHOR,
                groups,
                time=slider.time,
                beatmap=beatmap,
                has_time=False,
                pos=np.array(slider.curve.points[-1]),
                last_pos=last_pos,
            )

        # Add body hitsounds and remaining edge hitsounds
        last_pos = self._add_group(
            EventType.SLIDER_END,
            groups,
            time=slider.time + duration,
            beatmap=beatmap,
            pos=np.array(slider.curve.points[-1]) if self.slider_version == 1 else None,
            last_pos=last_pos,
            hitsound_ref_times=[slider.time + timedelta(milliseconds=1)] + [slider.time + i * duration for i in range(1, slider.repeat)],
            hitsounds=[slider.hitsound] + [slider.edge_sounds[i] if len(slider.edge_sounds) > i else 0 for i in range(1, slider.repeat)],
            additions=[slider.addition] + [slider.edge_additions[i] if len(slider.edge_additions) > i else '0:0' for i in range(1, slider.repeat)],
        )

        return self._add_group(
            EventType.REPEAT_END,
            groups,
            time=slider.end_time,
            beatmap=beatmap,
            pos=np.array(slider.curve(1)),
            last_pos=last_pos,
            hitsound_ref_times=[slider.end_time],
            hitsounds=[slider.edge_sounds[-1] if len(slider.edge_sounds) > 0 else 0],
            additions=[slider.edge_additions[-1] if len(slider.edge_additions) > 0 else '0:0'],
        )
641
+
642
    def _parse_spinner(self, spinner: Spinner, groups: list[Group], beatmap: Beatmap) -> npt.NDArray:
        """Parse a spinner hit object.

        Args:
            spinner: Spinner object.
            groups: List of groups to add to.
            beatmap: Beatmap the spinner belongs to.

        Returns:
            pos: Playfield centre (256, 192), the reference position after a spinner.
        """
        self._add_group(
            EventType.SPINNER,
            groups,
            time=spinner.time,
            beatmap=beatmap,
        )

        # Hitsounds play on the spinner's end, not its start.
        self._add_group(
            EventType.SPINNER_END,
            groups,
            time=spinner.end_time,
            beatmap=beatmap,
            hitsound_ref_times=[spinner.end_time],
            hitsounds=[spinner.hitsound],
            additions=[spinner.addition],
        )

        return np.array((256, 192))
670
+
671
    def _parse_hold_note(self, hold_note: HoldNote, groups: list[Group], beatmap: Beatmap) -> npt.NDArray:
        """Parse a hold note hit object (osu!mania).

        Args:
            hold_note: Hold note object.
            groups: List of groups to add to.
            beatmap: Beatmap the hold note belongs to.

        Returns:
            pos: Position of the hold note.
        """
        pos = np.array(hold_note.position)

        self._add_group(
            EventType.HOLD_NOTE,
            groups,
            time=hold_note.time,
            beatmap=beatmap,
            pos=pos,
            hitsound_ref_times=[hold_note.time],
            hitsounds=[hold_note.hitsound],
            additions=[hold_note.addition],
        )

        # The end event repeats the position so the column can be derived.
        self._add_group(
            EventType.HOLD_NOTE_END,
            groups,
            time=hold_note.end_time,
            beatmap=beatmap,
            pos=pos,
        )

        return pos
703
+
704
    def _parse_drumroll(self, slider: Slider, groups: list[Group], beatmap: Beatmap):
        """Parse a drumroll hit object (osu!taiko slider).

        Args:
            slider: Slider object.
            groups: List of groups to add to.
            beatmap: Beatmap the drumroll belongs to.
        """
        self._add_group(
            EventType.DRUMROLL,
            groups,
            time=slider.time,
            beatmap=beatmap,
            hitsound_ref_times=[slider.time],
            hitsounds=[slider.hitsound],  # Edge hitsounds are not supported in drumrolls
            additions=[slider.addition],
            scroll_speed=self.scroll_speed_at(slider.time, beatmap),
        )

        self._add_group(
            EventType.DRUMROLL_END,
            groups,
            time=slider.end_time,
            beatmap=beatmap,
        )
728
+
729
    def _parse_denden(self, spinner: Spinner, groups: list[Group], beatmap: Beatmap):
        """Parse a denden hit object (osu!taiko spinner).

        Args:
            spinner: Spinner object.
            groups: List of groups to add to.
            beatmap: Beatmap the denden belongs to.
        """
        self._add_group(
            EventType.DENDEN,
            groups,
            time=spinner.time,
            beatmap=beatmap,
            hitsound_ref_times=[spinner.time],
            hitsounds=[spinner.hitsound],
            additions=[spinner.addition],
            scroll_speed=self.scroll_speed_at(spinner.time, beatmap),
        )

        self._add_group(
            EventType.DENDEN_END,
            groups,
            time=spinner.end_time,
            beatmap=beatmap,
        )
753
+
754
+
755
# Register the parser so AutoFeatureExtractor resolves CM3P checkpoints to it.
AutoFeatureExtractor.register(CM3PConfig, CM3PBeatmapParser)

__all__ = ["CM3PBeatmapParser", "EventType", "Group", "load_beatmap", "get_song_length", "EVENT_TYPES_WITH_NEW_COMBO"]
processing_cm3p.py ADDED
@@ -0,0 +1,835 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import itertools
3
+ import math
4
+ import os
5
+ from os import PathLike
6
+ from pathlib import Path
7
+ from typing import Optional, Union, IO, TypedDict
8
+
9
+ import numpy as np
10
+ from huggingface_hub.errors import HfHubHTTPError
11
+ from pandas import Series
12
+ from slider import Beatmap, HoldNote
13
+ from transformers import WhisperFeatureExtractor, AutoProcessor, BatchEncoding
14
+ from transformers.dynamic_module_utils import custom_object_save
15
+ from transformers.tokenization_utils_base import TruncationStrategy, PreTrainedTokenizerBase
16
+ from transformers.utils import is_torch_available, PaddingStrategy, PROCESSOR_NAME, logging
17
+ from huggingface_hub import CommitOperationAdd, create_branch, create_commit
18
+
19
+ from .configuration_cm3p import CM3PConfig
20
+ from .parsing_cm3p import CM3PBeatmapParser, load_beatmap, get_song_length
21
+ from .tokenization_cm3p import CM3PBeatmapTokenizer, CM3PMetadataTokenizer, CM3PMetadata, merge_metadata_dicts
22
+
23
+ if is_torch_available():
24
+ import torch
25
+
26
+ from transformers.audio_utils import AudioInput, make_list_of_audio, load_audio
27
+ from transformers.feature_extraction_utils import BatchFeature
28
+ from transformers.processing_utils import AudioKwargs, ProcessorMixin, CommonKwargs
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ def get_hold_note_ratio(beatmap: Beatmap) -> Optional[float]:
34
+ notes = beatmap.hit_objects(stacking=False)
35
+
36
+ if len(notes) == 0:
37
+ return None
38
+
39
+ hold_note_count = 0
40
+ for note in notes:
41
+ if isinstance(note, HoldNote):
42
+ hold_note_count += 1
43
+ return hold_note_count / len(notes)
44
+
45
+
46
+ def get_scroll_speed_ratio(beatmap: Beatmap) -> Optional[float]:
47
+ # Number of scroll speed changes divided by number of distinct hit object times
48
+ notes = beatmap.hit_objects(stacking=False)
49
+
50
+ if len(notes) == 0:
51
+ return None
52
+
53
+ last_time = -1
54
+ num_note_times = 0
55
+ for note in notes:
56
+ if note.time != last_time:
57
+ num_note_times += 1
58
+ last_time = note.time
59
+ last_scroll_speed = -1
60
+ num_scroll_speed_changes = 0
61
+ for timing_point in beatmap.timing_points:
62
+ if timing_point.parent is None:
63
+ last_scroll_speed = 1
64
+ else:
65
+ scroll_speed = -100 / timing_point.ms_per_beat
66
+ if scroll_speed != last_scroll_speed and last_scroll_speed != -1:
67
+ num_scroll_speed_changes += 1
68
+ last_scroll_speed = scroll_speed
69
+ return num_scroll_speed_changes / num_note_times
70
+
71
+
72
+ def get_hitsounded_status(beatmap: Beatmap) -> bool:
73
+ notes = beatmap.hit_objects(stacking=False)
74
+ for note in notes:
75
+ if note.hitsound != 0:
76
+ return True
77
+ return False
78
+
79
+
80
+ def get_difficulty(beatmap_metadata: Series, speed: float = 1.0) -> float:
81
+ # StarRating is an array that gives the difficulty for the speeds:
82
+ # 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0
83
+ # Linearly interpolate between the two closest speeds
84
+ star_ratings = beatmap_metadata["StarRating"]
85
+ speed_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
86
+ return np.interp(speed, speed_ratios, star_ratings)
87
+
88
+
89
+ def get_metadata(
90
+ beatmap_metadata: Series = None,
91
+ beatmap: Beatmap = None,
92
+ audio_samples: np.ndarray = None,
93
+ sampling_rate: int = None,
94
+ speed: float = 1.0,
95
+ song_position: Optional[float] = None,
96
+ ) -> CM3PMetadata:
97
+ mode = beatmap.mode if beatmap is not None else beatmap_metadata["ModeInt"] if beatmap_metadata is not None else None
98
+ circle_size = beatmap.circle_size if beatmap is not None else beatmap_metadata["Cs"] if beatmap_metadata is not None else None
99
+ song_length = get_song_length(audio_samples, sampling_rate, beatmap)
100
+ return CM3PMetadata(
101
+ difficulty=get_difficulty(beatmap_metadata, speed) if beatmap_metadata is not None else None,
102
+ year=beatmap_metadata["SubmittedDate"].year if beatmap_metadata is not None else None,
103
+ mode=mode,
104
+ status=beatmap_metadata["Status"] if beatmap_metadata is not None else None,
105
+ mapper=beatmap_metadata["UserId"] if beatmap_metadata is not None else None,
106
+ cs=circle_size if mode in [0, 2] is not None else None,
107
+ hitsounded=get_hitsounded_status(beatmap) if beatmap is not None else None,
108
+ song_length=song_length,
109
+ song_position=song_position,
110
+ global_sv=beatmap.slider_multiplier if mode in [0, 2] and beatmap is not None else None,
111
+ mania_keycount=int(circle_size) if mode == 3 and beatmap is not None else None,
112
+ hold_note_ratio=get_hold_note_ratio(beatmap) if mode == 3 and beatmap is not None else None,
113
+ scroll_speed_ratio=get_scroll_speed_ratio(beatmap) if mode in [1, 3] and beatmap is not None else None,
114
+ tags=beatmap_metadata["TopTagIds"].tolist() if beatmap_metadata is not None else None,
115
+ )
116
+
117
+
118
+ class CM3PTokenizerKwargs(TypedDict, total=False):
119
+ add_special_tokens: Optional[bool]
120
+ padding: Union[bool, str, PaddingStrategy]
121
+ truncation: Union[bool, str, TruncationStrategy]
122
+ max_length: Optional[int]
123
+ pad_to_multiple_of: Optional[int]
124
+ return_token_type_ids: Optional[bool]
125
+ return_attention_mask: Optional[bool]
126
+ return_overflowing_tokens: Optional[bool]
127
+ return_special_tokens_mask: Optional[bool]
128
+ return_offsets_mapping: Optional[bool]
129
+ return_length: Optional[bool]
130
+ verbose: Optional[bool]
131
+ padding_side: Optional[str]
132
+ return_mm_token_type_ids: Optional[bool]
133
+
134
+
135
+ class CM3PBeatmapKwargs(CM3PTokenizerKwargs, total=False):
136
+ window_length_sec: float
137
+ window_stride_sec: float
138
+
139
+
140
+ class CM3PAudioKwargs(AudioKwargs, total=False):
141
+ max_source_positions: Optional[int]
142
+ hop_length: Optional[int]
143
+ window_size: Optional[int]
144
+ audio_length_per_tok: Optional[int]
145
+ device: Optional[str]
146
+
147
+
148
+ # noinspection PyTypedDict
149
+ class CM3PProcessorKwargs(CommonKwargs, CM3PBeatmapKwargs, CM3PTokenizerKwargs, CM3PAudioKwargs, total=False):
150
+ _defaults = {
151
+ "beatmap_kwargs": {
152
+ "max_length": 8000,
153
+ "padding": PaddingStrategy.LONGEST,
154
+ "truncation": TruncationStrategy.LONGEST_FIRST,
155
+ "window_length_sec": 30.0,
156
+ "window_stride_sec": 30.0,
157
+ "min_window_length_sec": 1.0,
158
+ },
159
+ "metadata_kwargs": {
160
+ "max_length": 128,
161
+ "padding": PaddingStrategy.LONGEST,
162
+ "truncation": TruncationStrategy.LONGEST_FIRST,
163
+ },
164
+ "audio_kwargs": {
165
+ "sampling_rate": 16000,
166
+ "padding": True,
167
+ "truncation": False,
168
+ "pad_to_multiple_of": 480000,
169
+ "max_source_positions": 3000,
170
+ "hop_length": 160,
171
+ "window_size": 400,
172
+ "audio_length_per_tok": 8,
173
+ "device": "cpu",
174
+ },
175
+ "common_kwargs": {
176
+ "return_tensors": "pt",
177
+ },
178
+ }
179
+
180
+ common_kwargs: CommonKwargs = {
181
+ **CommonKwargs.__annotations__,
182
+ }
183
+ beatmap_kwargs: CM3PBeatmapKwargs = {
184
+ **CM3PTokenizerKwargs.__annotations__,
185
+ }
186
+ metadata_kwargs: CM3PTokenizerKwargs = {
187
+ **CM3PTokenizerKwargs.__annotations__,
188
+ }
189
+ audio_kwargs: CM3PAudioKwargs = {
190
+ **CM3PAudioKwargs.__annotations__,
191
+ }
192
+
193
+
194
+ class CM3PProcessor(ProcessorMixin):
195
+ r"""
196
+ Constructs a CM3P processor which wraps [`WhisperFeatureExtractor`] and
197
+ [`MistralCommonTokenizer`] into a single processor that inherits both the audio feature extraction and
198
+ tokenizer functionalities.
199
+
200
+ Args:
201
+ audio_feature_extractor ([`WhisperFeatureExtractor`]):
202
+ The feature extractor is a required input.
203
+ beatmap_parser ([`CM3PBeatmapParser`]):
204
+ The beatmap parser is a required input.
205
+ beatmap_tokenizer ([`CM3PBeatmapTokenizer`]):
206
+ The beatmap tokenizer is a required input.
207
+ metadata_tokenizer ([`CM3PMetadataTokenizer`]):
208
+ The metadata tokenizer is a required input.
209
+ default_kwargs (`CM3PProcessorKwargs`, *optional*):
210
+ Default keyword arguments for the processor. If not provided, the processor will use its own defaults
211
+ """
212
+
213
+ attributes = ["audio_feature_extractor", "beatmap_parser", "beatmap_tokenizer", "metadata_tokenizer"]
214
+ audio_feature_extractor_class = "WhisperFeatureExtractor"
215
+ beatmap_parser_class = "CM3PBeatmapParser"
216
+ beatmap_tokenizer_class = "CM3PBeatmapTokenizer"
217
+ metadata_tokenizer_class = "CM3PMetadataTokenizer"
218
+
219
+ def __init__(
220
+ self,
221
+ audio_feature_extractor: WhisperFeatureExtractor,
222
+ beatmap_parser: CM3PBeatmapParser,
223
+ beatmap_tokenizer: CM3PBeatmapTokenizer,
224
+ metadata_tokenizer: CM3PMetadataTokenizer,
225
+ default_kwargs: Optional[CM3PProcessorKwargs] = None,
226
+ ):
227
+ self.audio_feature_extractor = audio_feature_extractor
228
+ self.beatmap_parser = beatmap_parser
229
+ self.beatmap_tokenizer = beatmap_tokenizer
230
+ self.metadata_tokenizer = metadata_tokenizer
231
+ self.audio_token = beatmap_tokenizer.audio_token
232
+
233
+ # noinspection PyProtectedMember
234
+ self.default_kwargs = default_kwargs or copy.deepcopy(CM3PProcessorKwargs._defaults)
235
+
236
+ super().__init__(audio_feature_extractor, beatmap_parser, beatmap_tokenizer, metadata_tokenizer)
237
+
238
+ def _pad_audio(
239
+ self,
240
+ audio_array: np.ndarray,
241
+ window_size: int = 400,
242
+ pad_to_multiple_of: Optional[int] = 480000,
243
+ **_,
244
+ ) -> np.ndarray:
245
+ r"""Pad the audio array to the desired length.
246
+
247
+ Args:
248
+ audio_array: Audio data as a numpy array.
249
+ sampling_rate: Sampling rate of the audio.
250
+
251
+ Returns:
252
+ Padded audio array.
253
+ """
254
+ if pad_to_multiple_of:
255
+ next_multiple_of_chunk_frames = math.ceil(audio_array.shape[-1] / pad_to_multiple_of) * pad_to_multiple_of
256
+ audio_array = np.pad(audio_array, (0, next_multiple_of_chunk_frames - audio_array.shape[-1]))
257
+ elif audio_array.shape[-1] < window_size:
258
+ # minimum length for audios is at least one spectrogram frame
259
+ audio_array = np.pad(audio_array, (0, window_size - audio_array.shape[-1]))
260
+
261
+ return audio_array
262
+
263
+ def _encode_audio(
264
+ self,
265
+ audio: np.ndarray,
266
+ hop_length: int = 160,
267
+ audio_length_per_tok: int = 8,
268
+ **kwargs,
269
+ ) -> tuple[np.ndarray, int]:
270
+ audio = self._pad_audio(audio, **kwargs)
271
+ signal_length = audio.shape[0]
272
+
273
+ # for spectrogram-based models, the waveform is downsampled by the hop_length when computing the log-mel
274
+ if signal_length % hop_length != 0:
275
+ signal_length = math.ceil(signal_length / hop_length - 1)
276
+ else:
277
+ signal_length = signal_length // hop_length
278
+
279
+ num_audio_tokens = math.ceil(signal_length / audio_length_per_tok)
280
+
281
+ return audio, num_audio_tokens
282
+
283
+ def _retrieve_input_features(self, audio, max_source_positions, **kwargs) -> Union[torch.Tensor, np.ndarray]:
284
+ """
285
+ Handles specific logic of CM3P expected input features: audio arrays should be padded to next multiple of 480000 (duration is a multiple of 30s), see CM3PProcessorKwargs' default audio_kwargs.
286
+ Then mel input features are extracted and stacked along batch dimension, splitting into chunks of max_source_positions.
287
+ """
288
+ return_tensors = kwargs.get("return_tensors", "pt")
289
+ input_features_list = []
290
+ for audio_array in audio:
291
+ audio_inputs = self.audio_feature_extractor(audio_array, **kwargs)
292
+
293
+ # let's split into chunks of max_source_positions, and then stack them along batch dimension
294
+ input_features = audio_inputs["input_features"].reshape(
295
+ self.audio_feature_extractor.feature_size, -1, max_source_positions
296
+ )
297
+
298
+ input_features_list.append(input_features.swapaxes(0, 1))
299
+
300
+ if return_tensors == "pt":
301
+ return torch.cat(input_features_list)
302
+
303
+ return np.concatenate(input_features_list)
304
+
305
+ def _load_audio(
306
+ self,
307
+ sampling_rate: int,
308
+ audio: Union[str, list[str], Path, list[Path], AudioInput],
309
+ audio_sampling_rate: Optional[Union[int, list[int]]] = None,
310
+ speed: float = 1.0,
311
+ ) -> list[np.ndarray]:
312
+ """
313
+ Helper method to load audio from various formats and return a list of audio buffers.
314
+ """
315
+
316
+ # convert Path objects to str
317
+ if isinstance(audio, Path):
318
+ audio = str(audio)
319
+ if isinstance(audio, list) and all(isinstance(el, Path) for el in audio):
320
+ audio = [str(el) for el in audio]
321
+
322
+ # validate audio input
323
+ is_str = isinstance(audio, str)
324
+ is_list_of_str = isinstance(audio, list) and all(isinstance(el, str) for el in audio)
325
+ is_list_of_audio = not (is_str or is_list_of_str)
326
+
327
+ if is_list_of_audio:
328
+ if audio_sampling_rate is None:
329
+ # noinspection PyUnresolvedReferences
330
+ logger.warning_once(
331
+ f"You've provided audio without specifying the sampling rate. It will be assumed to be {sampling_rate}, which can result in silent errors."
332
+ )
333
+ audio_sampling_rate = sampling_rate
334
+
335
+ if is_str:
336
+ audio = [load_audio(audio, sampling_rate=int(sampling_rate // speed))]
337
+ audio_sampling_rate = sampling_rate
338
+ elif is_list_of_str:
339
+ audio = [load_audio(el, sampling_rate=int(sampling_rate // speed)) for el in audio]
340
+ audio_sampling_rate = sampling_rate
341
+
342
+ audio = make_list_of_audio(audio)
343
+
344
+ if isinstance(audio_sampling_rate, int):
345
+ audio_sampling_rate = [audio_sampling_rate] * len(audio)
346
+
347
+ audio_buffers = []
348
+ for array, s in zip(audio, audio_sampling_rate):
349
+ array = np.asarray(array)
350
+ # Convert to mono if needed
351
+ if array.ndim == 2:
352
+ array = array.mean(axis=1)
353
+ # Resample if the sampling rate is different from the expected one
354
+ if s != sampling_rate:
355
+ import soxr
356
+ array = soxr.resample(array, s, sampling_rate, quality="HQ")
357
+ audio_buffers.append(array)
358
+
359
+ return audio_buffers
360
+
361
+ # noinspection PyTypedDict
362
+ def _merge_kwargs(self, **kwargs) -> CM3PProcessorKwargs:
363
+ output_kwargs = CM3PProcessorKwargs()
364
+ nested_modalities = ["beatmap_kwargs", "metadata_kwargs", "audio_kwargs", "common_kwargs"]
365
+ possible_modality_keywords = {"beatmap", "metadata", "audio"}
366
+ used_keys = set()
367
+
368
+ # pass defaults to output dictionary
369
+ output_kwargs.update(copy.deepcopy(self.default_kwargs))
370
+
371
+ # update modality kwargs with passed kwargs
372
+ non_modality_kwargs = set(kwargs) - set(output_kwargs)
373
+ for modality, output_kwarg in output_kwargs.items():
374
+ for modality_key in CM3PProcessorKwargs.__annotations__[modality].__annotations__:
375
+ # check if we received a structured kwarg dict or not to handle it correctly
376
+ if modality in kwargs:
377
+ kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
378
+ # check if this key was passed as a flat kwarg.
379
+ if kwarg_value != "__empty__" and modality_key in non_modality_kwargs:
380
+ raise ValueError(
381
+ f"Keyword argument {modality_key} was passed two times:\n"
382
+ f"in a dictionary for {modality} and as a **kwarg."
383
+ )
384
+ elif modality_key in kwargs:
385
+ # we get a modality_key instead of popping it because modality-specific processors
386
+ # can have overlapping kwargs
387
+ kwarg_value = kwargs.get(modality_key, "__empty__")
388
+ else:
389
+ kwarg_value = "__empty__"
390
+ if not isinstance(kwarg_value, str) or kwarg_value != "__empty__":
391
+ output_kwarg[modality_key] = kwarg_value
392
+ used_keys.add(modality_key)
393
+
394
+ # Determine if kwargs is a flat dictionary or contains nested dictionaries
395
+ if any(key in nested_modalities for key in kwargs):
396
+ # kwargs is dictionary-based, and some keys match modality names
397
+ for modality, subdict in kwargs.items():
398
+ if modality in nested_modalities:
399
+ for subkey, subvalue in subdict.items():
400
+ if subkey not in used_keys:
401
+ output_kwargs[modality][subkey] = subvalue
402
+ used_keys.add(subkey)
403
+ else:
404
+ # kwargs is a flat dictionary
405
+ for key, kwarg in kwargs.items():
406
+ if key not in used_keys:
407
+ if key in CM3PProcessorKwargs.__annotations__["common_kwargs"].__annotations__:
408
+ output_kwargs["common_kwargs"][key] = kwarg
409
+ elif key not in possible_modality_keywords:
410
+ # noinspection PyUnresolvedReferences
411
+ logger.warning_once(
412
+ f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
413
+ )
414
+
415
+ # all modality-specific kwargs are updated with common kwargs
416
+ for kwarg in output_kwargs.values():
417
+ kwarg.update(output_kwargs["common_kwargs"])
418
+ return output_kwargs
419
+
420
+ def __call__(
421
+ self,
422
+ metadata: Optional[Union[CM3PMetadata, list[CM3PMetadata]]] = None,
423
+ beatmap: Optional[Union[str, list[str], PathLike, list[PathLike], IO[str], list[IO[str]], Beatmap, list[Beatmap]]] = None,
424
+ audio: Optional[Union[str, list[str], Path, list[Path], AudioInput]] = None,
425
+ audio_sampling_rate: Optional[Union[int, list[int]]] = None,
426
+ speed: float = 1.0,
427
+ multiply_metadata: bool = False,
428
+ populate_metadata: bool = False,
429
+ metadata_dropout_prob: float = 0.0,
430
+ metadata_variations: int = 1,
431
+ **kwargs,
432
+ ):
433
+ output_kwargs = self._merge_kwargs(**kwargs)
434
+
435
+ beatmap_kwargs: CM3PTokenizerKwargs = output_kwargs["beatmap_kwargs"]
436
+ metadata_kwargs: CM3PTokenizerKwargs = output_kwargs["metadata_kwargs"]
437
+ audio_kwargs: CM3PAudioKwargs = output_kwargs["audio_kwargs"]
438
+ common_kwargs: CommonKwargs = output_kwargs["common_kwargs"]
439
+
440
+ window_length_sec = beatmap_kwargs.pop("window_length_sec")
441
+ window_stride_sec = beatmap_kwargs.pop("window_stride_sec")
442
+ min_window_length_sec = beatmap_kwargs.pop("min_window_length_sec", 1.0)
443
+ max_length = beatmap_kwargs.get("max_length", 8000)
444
+ metadata_max_length = metadata_kwargs.get("max_length", 128)
445
+ sampling_rate = audio_kwargs["sampling_rate"]
446
+ max_source_positions = audio_kwargs.get("max_source_positions", 3000)
447
+ audio_kwargs["padding"] = False
448
+ return_tensors = common_kwargs["return_tensors"]
449
+
450
+ metadata_encoding, beatmap_encoding, num_audio_tokens, metadata_variation_classes = None, None, None, None
451
+
452
+ if return_tensors is not None and return_tensors != "pt":
453
+ raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'` or `return_tensors=None`.")
454
+
455
+ if metadata is None and beatmap is None:
456
+ raise ValueError("You have to specify either metadata or beatmap. Both cannot be none.")
457
+
458
+ if audio is not None:
459
+ audio = self._load_audio(
460
+ sampling_rate,
461
+ audio,
462
+ audio_sampling_rate=audio_sampling_rate,
463
+ )
464
+
465
+ if beatmap is not None:
466
+ if not isinstance(beatmap, list):
467
+ beatmap = [beatmap]
468
+
469
+ if audio is not None:
470
+ if len(beatmap) != len(audio):
471
+ raise ValueError(
472
+ f"The number of beatmaps ({len(beatmap)}) must match the number of audio ({len(audio)})"
473
+ )
474
+ else:
475
+ audio = [None] * len(beatmap)
476
+
477
+ if multiply_metadata or populate_metadata and metadata is not None:
478
+ matched_metadata = metadata
479
+ if not isinstance(matched_metadata, list):
480
+ matched_metadata = [matched_metadata]
481
+ if (multiply_metadata or populate_metadata) and len(matched_metadata) != len(beatmap):
482
+ raise ValueError(
483
+ f"The number of metadata entries ({len(matched_metadata)}) must match the number of beatmaps ({len(beatmap)})"
484
+ "` if multiply_metadata` or `populate_metadata` is set to True."
485
+ )
486
+ else:
487
+ matched_metadata = [CM3PMetadata()] * len(beatmap) if populate_metadata else [None] * len(beatmap)
488
+
489
+ new_metadata = []
490
+ batch_start_ms = []
491
+ batch_groups = []
492
+ batch_audio = []
493
+ batch_num_audio_tokens = []
494
+ for b, m, audio_array in zip(beatmap, matched_metadata, audio):
495
+ b: Beatmap = load_beatmap(b)
496
+ song_length = get_song_length(audio_array, sampling_rate, b)
497
+ beatmap_groups = self.beatmap_parser.parse_beatmap(b, speed=speed, song_length=song_length)
498
+
499
+ def add_metadata(song_position: Optional[float] = None):
500
+ if populate_metadata:
501
+ new_metadata.append(merge_metadata_dicts(m, get_metadata(
502
+ beatmap=b,
503
+ audio_samples=audio_array,
504
+ sampling_rate=sampling_rate,
505
+ speed=speed,
506
+ song_position=song_position,
507
+ )))
508
+ else:
509
+ new_metadata.append(m)
510
+
511
+ if not multiply_metadata:
512
+ add_metadata()
513
+
514
+ # Loop through with sliding window
515
+ groups_search_index = 0
516
+ for start_sec in np.arange(0, song_length - min_window_length_sec, window_stride_sec):
517
+ end_sec = start_sec + window_length_sec
518
+
519
+ if audio_array is not None:
520
+ # Slice audio waveform
521
+ start_frame = int(start_sec * sampling_rate)
522
+ end_frame = int(end_sec * sampling_rate)
523
+ audio_slice = audio_array[start_frame:end_frame]
524
+ # Pad the audio array and calculate the number of audio tokens
525
+ audio_slice, num_audio_tokens = self._encode_audio(audio_slice, **audio_kwargs)
526
+ else:
527
+ audio_slice = None
528
+ num_audio_tokens = 0
529
+
530
+ # Find groups that fall within the current window
531
+ # Groups are sorted by time, so we can use a simple linear search from the last index
532
+ start_ms = start_sec * 1000
533
+ end_ms = end_sec * 1000
534
+ next_start_ms = (start_sec + window_stride_sec) * 1000
535
+ window_groups = []
536
+ for group in itertools.islice(beatmap_groups, groups_search_index, None):
537
+ if group.time < next_start_ms:
538
+ groups_search_index += 1
539
+
540
+ if group.time < start_ms:
541
+ continue
542
+ elif group.time < end_ms:
543
+ window_groups.append(group)
544
+ else:
545
+ break
546
+
547
+ batch_start_ms.append(start_ms)
548
+ batch_groups.append(window_groups)
549
+ batch_audio.append(audio_slice)
550
+ batch_num_audio_tokens.append(num_audio_tokens)
551
+
552
+ if multiply_metadata:
553
+ add_metadata(start_sec / song_length)
554
+
555
+ if populate_metadata or multiply_metadata:
556
+ metadata = new_metadata
557
+
558
+ if len(batch_groups) > 0:
559
+ beatmap_encoding = self.beatmap_tokenizer(
560
+ groups=batch_groups,
561
+ window_start_ms=batch_start_ms,
562
+ num_audio_tokens=batch_num_audio_tokens,
563
+ **beatmap_kwargs,
564
+ )
565
+
566
+ if audio is not None:
567
+ data = dict(beatmap_encoding)
568
+ data["input_features"] = self._retrieve_input_features(batch_audio, **audio_kwargs)
569
+ beatmap_encoding = BatchFeature(data, tensor_type=return_tensors)
570
+ else:
571
+ # No windows with hit objects were found, return empty encoding
572
+ logger.warning("Warning: No windows with hit objects were found in the provided beatmap(s). Returning empty encoding.")
573
+ beatmap_encoding = BatchEncoding(
574
+ {
575
+ "input_ids": torch.zeros((0, max_length), dtype=torch.long) if return_tensors == "pt" else [],
576
+ "attention_mask": torch.zeros((0, max_length), dtype=torch.long) if return_tensors == "pt" else [],
577
+ },
578
+ tensor_type=return_tensors,
579
+ )
580
+ if audio is not None:
581
+ data = dict(beatmap_encoding)
582
+ data["input_features"] = torch.zeros((0, self.audio_feature_extractor.feature_size, max_source_positions), dtype=torch.float) if return_tensors == "pt" else []
583
+ beatmap_encoding = BatchFeature(data, tensor_type=return_tensors)
584
+
585
+ if metadata is not None and not (isinstance(metadata, list) and any(m is None for m in metadata)):
586
+ if not isinstance(metadata, list):
587
+ metadata = [metadata]
588
+
589
+ if metadata_dropout_prob > 0.0:
590
+ for m in metadata:
591
+ # Randomly drop out metadata fields
592
+ for key, value in m.items():
593
+ if value is not None and np.random.rand() < metadata_dropout_prob:
594
+ # noinspection PyTypedDict
595
+ m[key] = None
596
+
597
+ if metadata_variations > 1:
598
+ extended_metadata = []
599
+ metadata_variation_classes = []
600
+ for m in metadata:
601
+ m_vars, m_classes = zip(*self.metadata_tokenizer.metadata_variations(m, metadata_variations - 1))
602
+ extended_metadata.append(m)
603
+ extended_metadata.extend(m_vars)
604
+ metadata_variation_classes.append([0] + list(m_classes)) # Class 0 is the original metadata
605
+
606
+ assert len(extended_metadata) == len(metadata) * metadata_variations
607
+ metadata = extended_metadata
608
+
609
+ if len(metadata) > 0:
610
+ metadata_encoding = self.metadata_tokenizer(
611
+ metadata,
612
+ **metadata_kwargs,
613
+ )
614
+ if metadata_variations > 1:
615
+ # Reshape to (batch_size, variations, seq_len)
616
+ for k, v in metadata_encoding.items():
617
+ if return_tensors == "pt":
618
+ v = v.view(len(metadata) // metadata_variations, metadata_variations, -1)
619
+ else:
620
+ v = [v[i:i + metadata_variations] for i in range(0, len(v), metadata_variations)]
621
+ metadata_encoding[k] = v
622
+ if metadata_variation_classes is not None:
623
+ metadata_encoding["metadata_variation_classes"] = torch.tensor(metadata_variation_classes, dtype=torch.long) if return_tensors == "pt" else metadata_variation_classes
624
+ else:
625
+ metadata_encoding = BatchEncoding(
626
+ {
627
+ "input_ids": torch.zeros((0, metadata_max_length), dtype=torch.long) if return_tensors == "pt" else [],
628
+ "attention_mask": torch.zeros((0, metadata_max_length), dtype=torch.long) if return_tensors == "pt" else [],
629
+ },
630
+ tensor_type=return_tensors,
631
+ )
632
+
633
+ if metadata_encoding is not None and beatmap_encoding is not None:
634
+ beatmap_encoding["metadata_ids"] = metadata_encoding["input_ids"]
635
+ beatmap_encoding["metadata_attention_mask"] = metadata_encoding["attention_mask"]
636
+ if "metadata_variation_classes" in metadata_encoding:
637
+ beatmap_encoding["metadata_variation_classes"] = metadata_encoding["metadata_variation_classes"]
638
+ return beatmap_encoding
639
+ elif beatmap_encoding is not None:
640
+ return beatmap_encoding
641
+ else:
642
+ return metadata_encoding
643
+
644
+ def batch_decode(self, *args, **kwargs):
645
+ """
646
+ This method forwards all its arguments to CM3PBeatmapTokenizer's [`~CM3PBeatmapTokenizer.batch_decode`]. Please
647
+ refer to the docstring of this method for more information.
648
+ """
649
+ return self.beatmap_tokenizer.batch_decode(*args, **kwargs)
650
+
651
+ def decode(self, *args, **kwargs):
652
+ """
653
+ This method forwards all its arguments to CM3PBeatmapTokenizer's [`~CM3PBeatmapTokenizer.decode`]. Please refer to
654
+ the docstring of this method for more information.
655
+ """
656
+ return self.beatmap_tokenizer.decode(*args, **kwargs)
657
+
658
+ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
659
+ """
660
+ Save processor and its sub-components, with support for AutoProcessor remote code.
661
+
662
+ This is a lightly adapted version of ProcessorMixin.save_pretrained:
663
+ - child attributes are saved into subfolders (audio_feature_extractor/, beatmap_parser/, ...);
664
+ - when self._auto_class is set (via register_for_auto_class), custom_object_save is used
665
+ so that auto_map and dynamic modules are written correctly.
666
+ """
667
+ os.makedirs(save_directory, exist_ok=True)
668
+
669
+ # Handle Hub integration (same as ProcessorMixin / your existing code)
670
+ if push_to_hub:
671
+ commit_message = kwargs.pop("commit_message", None)
672
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
673
+ repo_id = self._create_repo(repo_id, **kwargs)
674
+ files_timestamps = self._get_files_timestamps(save_directory)
675
+ else:
676
+ commit_message = None
677
+ repo_id = None
678
+ files_timestamps = None
679
+
680
+ # If we have a custom processor registered for an Auto class,
681
+ # save its code and dependencies as a dynamic module and
682
+ # populate the auto_map field in processor_config.json.
683
+ if self._auto_class is not None:
684
+ attrs = [getattr(self, attribute_name) for attribute_name in self.attributes]
685
+
686
+ # For tokenizers, we pass their init_kwargs; for other objects, we pass the object itself.
687
+ configs = []
688
+ for a in attrs:
689
+ if isinstance(a, PreTrainedTokenizerBase):
690
+ configs.append(a.init_kwargs)
691
+ else:
692
+ configs.append(a)
693
+
694
+ # Include the processor itself so its class is exported.
695
+ configs.append(self)
696
+
697
+ custom_object_save(self, save_directory, config=configs)
698
+
699
+ # Save each sub-component into its own subfolder
700
+ for attribute_name in self.attributes:
701
+ attribute = getattr(self, attribute_name)
702
+
703
+ # Include the processor class in the attribute config so this
704
+ # processor can then be reloaded with the AutoProcessor API.
705
+ if hasattr(attribute, "_set_processor_class"):
706
+ # noinspection PyProtectedMember
707
+ attribute._set_processor_class(self.__class__.__name__)
708
+
709
+ attribute.save_pretrained(os.path.join(save_directory, attribute_name))
710
+
711
+ # Clean up temporary auto_map injected into tokenizers, if any
712
+ if self._auto_class is not None:
713
+ for attribute_name in self.attributes:
714
+ attribute = getattr(self, attribute_name)
715
+ if isinstance(attribute, PreTrainedTokenizerBase) and "auto_map" in attribute.init_kwargs:
716
+ del attribute.init_kwargs["auto_map"]
717
+
718
+ # Write processor_config.json (or equivalent)
719
+ output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)
720
+ processor_dict = self.to_dict()
721
+
722
+ # If processor_dict only contains processor_class, we skip writing the file,
723
+ # matching the upstream behavior; otherwise we save it.
724
+ if set(processor_dict.keys()) != {"processor_class"}:
725
+ self.to_json_file(output_processor_file)
726
+ # noinspection PyUnresolvedReferences
727
+ logger.warning_once(f"processor saved in {output_processor_file}")
728
+
729
+ # If requested, upload the modified files to the Hub
730
+ if push_to_hub:
731
+ self._upload_modified_files(
732
+ save_directory,
733
+ repo_id,
734
+ files_timestamps,
735
+ commit_message=commit_message,
736
+ token=kwargs.get("token"),
737
+ create_pr=kwargs.get("create_pr", False),
738
+ revision=kwargs.get("revision"),
739
+ commit_description=kwargs.get("commit_description"),
740
+ )
741
+
742
+ if set(processor_dict.keys()) == {"processor_class"}:
743
+ return []
744
+ return [output_processor_file]
745
+
746
+ @classmethod
747
+ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
748
+ subfolder = kwargs.pop("subfolder", None)
749
+ args = []
750
+ for attribute_name in cls.attributes:
751
+ class_name = getattr(cls, f"{attribute_name}_class")
752
+ attribute_class = cls.get_possibly_dynamic_module(class_name)
753
+ attribute_subfolder = os.path.join(subfolder, attribute_name) if subfolder else attribute_name
754
+
755
+ args.append(attribute_class.from_pretrained(
756
+ pretrained_model_name_or_path,
757
+ subfolder=attribute_subfolder,
758
+ **kwargs
759
+ ))
760
+
761
+ return args
762
+
763
+ def _upload_modified_files(
764
+ self,
765
+ working_dir: Union[str, os.PathLike],
766
+ repo_id: str,
767
+ files_timestamps: dict[str, float],
768
+ commit_message: Optional[str] = None,
769
+ token: Optional[Union[bool, str]] = None,
770
+ create_pr: bool = False,
771
+ revision: Optional[str] = None,
772
+ commit_description: Optional[str] = None,
773
+ ):
774
+ """
775
+ Uploads all modified files in `working_dir` to `repo_id`, based on `files_timestamps`.
776
+ """
777
+ working_dir = Path(working_dir)
778
+
779
+ if commit_message is None:
780
+ commit_message = "Upload CM3P processor"
781
+ modified_files = [
782
+ f
783
+ for f in working_dir.iterdir()
784
+ if str(f) not in files_timestamps or f.stat().st_mtime > files_timestamps[str(f)]
785
+ ]
786
+
787
+ # filter for actual files + folders at the root level
788
+ modified_files = [
789
+ f
790
+ for f in modified_files
791
+ if f.is_file() or f.is_dir()
792
+ ]
793
+
794
+ operations = []
795
+ # upload standalone files
796
+ for file in modified_files:
797
+ if file.is_dir():
798
+ # go over individual files of folder
799
+ for f in file.iterdir():
800
+ operations.append(
801
+ CommitOperationAdd(
802
+ path_or_fileobj=f, path_in_repo=f.relative_to(working_dir).as_posix()
803
+ )
804
+ )
805
+ else:
806
+ operations.append(
807
+ CommitOperationAdd(path_or_fileobj=file, path_in_repo=file.relative_to(working_dir).as_posix())
808
+ )
809
+
810
+ if revision is not None and not revision.startswith("refs/pr"):
811
+ try:
812
+ create_branch(repo_id=repo_id, branch=revision, token=token, exist_ok=True)
813
+ except HfHubHTTPError as e:
814
+ if e.response.status_code == 403 and create_pr:
815
+ # If we are creating a PR on a repo we don't have access to, we can't create the branch.
816
+ # so let's assume the branch already exists. If it's not the case, an error will be raised when
817
+ # calling `create_commit` below.
818
+ pass
819
+ else:
820
+ raise
821
+
822
+ logger.info(f"Uploading the following files to {repo_id}: {','.join([f.relative_to(working_dir).as_posix() for f in modified_files])}")
823
+ return create_commit(
824
+ repo_id=repo_id,
825
+ operations=operations,
826
+ commit_message=commit_message,
827
+ commit_description=commit_description,
828
+ token=token,
829
+ create_pr=create_pr,
830
+ revision=revision,
831
+ )
832
+
833
+ AutoProcessor.register(CM3PConfig, CM3PProcessor)
834
+
835
+ __all__ = ["CM3PProcessor", "get_metadata"]
processor_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_cm3p.CM3PProcessor"
4
+ },
5
+ "default_kwargs": {
6
+ "audio_kwargs": {
7
+ "audio_length_per_tok": 8,
8
+ "hop_length": 160,
9
+ "max_source_positions": 1600,
10
+ "pad_to_multiple_of": 256000,
11
+ "padding": false,
12
+ "sampling_rate": 16000,
13
+ "truncation": false,
14
+ "window_size": 400
15
+ },
16
+ "beatmap_kwargs": {
17
+ "max_length": 2000,
18
+ "padding": "longest",
19
+ "truncation": "longest_first",
20
+ "window_length_sec": 16.0,
21
+ "window_stride_sec": 16.0
22
+ },
23
+ "common_kwargs": {
24
+ "return_tensors": "pt"
25
+ },
26
+ "metadata_kwargs": {
27
+ "max_length": 128,
28
+ "padding": "longest",
29
+ "truncation": "longest_first"
30
+ }
31
+ },
32
+ "processor_class": "CM3PProcessor"
33
+ }
tokenization_cm3p.py ADDED
@@ -0,0 +1,808 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ from typing import Optional, Union, TypedDict
4
+
5
+ import numpy as np
6
+ from transformers import PreTrainedTokenizer, BatchEncoding, AutoTokenizer
7
+ from transformers.tokenization_utils_base import TruncationStrategy
8
+ from transformers.utils import PaddingStrategy
9
+
10
+ from .configuration_cm3p import CM3PBeatmapConfig, CM3PMetadataConfig
11
+ from .parsing_cm3p import Group, EventType, EVENT_TYPES_WITH_NEW_COMBO
12
+
13
+
14
class CM3PBeatmapTokenizer(PreTrainedTokenizer):
    """Tokenizer that converts parsed beatmap event ``Group`` sequences into token ids.

    The vocabulary is generated procedurally from the quantization settings
    (time shifts, distances, positions, scroll speeds, hitsounds, ...) unless a
    ``vocab_file`` is given, in which case the saved vocabulary is loaded as-is.
    """

    model_input_names: list[str] = ["input_ids", "attention_mask"]
    vocab_files_names: dict[str, str] = {"vocab_file": "vocab.json"}

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        min_time: int = 0,
        max_time: int = 30000,
        time_step: int = 10,
        max_distance: int = 640,
        distance_step: int = 4,
        position_range: tuple[int, int, int, int] = (-256, 768, -256, 640),
        position_step: int = 4,
        position_split_axes: bool = True,
        add_cls_token: bool = False,
        separate_new_combo_token: bool = True,
        **kwargs,
    ):
        """
        Args:
            vocab_file: Optional path to a saved ``vocab.json``; when omitted the
                vocabulary is rebuilt from the quantization settings below.
            min_time / max_time / time_step: Range and resolution (ms) of the
                ``[TIME_SHIFT_*]`` tokens.
            max_distance / distance_step: Range and quantization of ``[DISTANCE_*]`` tokens.
            position_range: (min_x, max_x, min_y, max_y) bounds of position tokens.
            position_step: Quantization step of position tokens.
            position_split_axes: Emit separate ``[POS_X_*]``/``[POS_Y_*]`` tokens
                instead of one combined ``[POS_x_y]`` token.
            add_cls_token: Prepend ``[CLS]`` to every encoded sequence.
            separate_new_combo_token: Use a standalone ``[NEW_COMBO]`` token rather
                than merged ``[<EVENT>_NEW_COMBO]`` event tokens.
        """
        # Settings must be assigned before _build_vocab_from_config() runs.
        self.min_time = min_time
        self.max_time = max_time
        self.time_step = time_step
        self.max_distance = max_distance
        self.distance_step = distance_step
        self.position_range = position_range
        self.position_step = position_step
        self.position_split_axes = position_split_axes
        self.add_cls_token = add_cls_token
        self.separate_new_combo_token = separate_new_combo_token

        self.audio_bos_token = "[AUDIO_BOS]"
        self.audio_eos_token = "[AUDIO_EOS]"
        self.audio_token = "[AUDIO]"

        if vocab_file is None:
            self.vocab = self._build_vocab_from_config()
        else:
            with open(vocab_file, 'r', encoding='utf-8') as f:
                self.vocab = json.load(f)

        self.ids_to_tokens = {i: t for t, i in self.vocab.items()}
        # All settings are forwarded to the base class so save_pretrained()
        # persists them in tokenizer_config.json.
        super().__init__(
            bos_token=kwargs.pop("bos_token", "[BOS]"),
            eos_token=kwargs.pop("eos_token", "[EOS]"),
            unk_token=kwargs.pop("unk_token", "[UNK]"),
            sep_token=kwargs.pop("sep_token", "[SEP]"),
            pad_token=kwargs.pop("pad_token", "[PAD]"),
            cls_token=kwargs.pop("cls_token", "[CLS]"),
            mask_token=kwargs.pop("mask_token", "[MASK]"),
            additional_special_tokens=kwargs.pop("additional_special_tokens", [
                self.audio_bos_token,
                self.audio_eos_token,
                self.audio_token,
            ]),
            min_time=min_time,
            max_time=max_time,
            time_step=time_step,
            max_distance=max_distance,
            distance_step=distance_step,
            position_range=position_range,
            position_step=position_step,
            position_split_axes=position_split_axes,
            add_cls_token=add_cls_token,
            separate_new_combo_token=separate_new_combo_token,
            **kwargs
        )

    def _build_vocab_from_config(self):
        """Generate the full token list from the quantization settings."""
        vocab = []

        for event_type in EventType:
            vocab.append(f"[{event_type.value.upper()}]")

        if not self.separate_new_combo_token:
            # Merged event+new-combo variants, one per event type supporting them.
            for event_type in EVENT_TYPES_WITH_NEW_COMBO:
                vocab.append(f"[{event_type.value.upper()}_NEW_COMBO]")

        # The +1e-5 guards against float rounding excluding the inclusive upper bound.
        for time in np.arange(self.min_time, self.max_time + 1e-5, self.time_step):
            vocab.append(f"[TIME_SHIFT_{int(time)}]")

        for snapping in range(0, 17):
            vocab.append(f"[SNAPPING_{snapping}]")

        for distance in range(0, self.max_distance + 1):
            vocab.append(f"[DISTANCE_{distance}]")

        if self.position_split_axes:
            for x in np.arange(self.position_range[0], self.position_range[1] + 1e-5, self.position_step):
                vocab.append(f"[POS_X_{int(x)}]")
            for y in np.arange(self.position_range[2], self.position_range[3] + 1e-5, self.position_step):
                vocab.append(f"[POS_Y_{int(y)}]")
        else:
            # Cartesian product of quantized x/y positions.
            for x in np.arange(self.position_range[0], self.position_range[1] + 1e-5, self.position_step):
                for y in np.arange(self.position_range[2], self.position_range[3] + 1e-5, self.position_step):
                    vocab.append(f"[POS_{int(x)}_{int(y)}]")

        for mania_column in range(1, 19):
            vocab.append(f"[MANIA_COLUMN_{mania_column}]")

        for scroll_speed in np.arange(0.0, 10.0 + 1e-5, 0.01):
            vocab.append(f"[SCROLL_SPEED_{scroll_speed:.2f}]")

        if self.separate_new_combo_token:
            vocab.append("[NEW_COMBO]")

        # Hitsound tokens always carry an even bitmask (bit 0 is unused; see
        # _tokenize_hitsound which shifts it out).
        for hitsound in range(8):
            for sampleset in range(1, 4):
                for additions in range(1, 4):
                    vocab.append(f"[HITSOUND_{(hitsound << 1)}_{sampleset}_{additions}]")

        for volume in range(101):
            vocab.append(f"[VOLUME_{volume}]")

        return {token: idx for idx, token in enumerate(vocab)}

    def _tokenize_time_shift(self, time: int):
        """Clamp and quantize a time shift (ms) to its vocabulary token."""
        time = np.clip(time, self.min_time, self.max_time)
        time = round(time / self.time_step) * self.time_step
        return f"[TIME_SHIFT_{int(time)}]"

    def _tokenize_distance(self, distance: int):
        """Clamp and quantize a distance to its vocabulary token."""
        distance = np.clip(distance, 0, self.max_distance)
        distance = round(distance / self.distance_step) * self.distance_step
        return f"[DISTANCE_{distance}]"

    def _tokenize_position(self, pos_x: int, pos_y: int):
        """Yield one or two position tokens depending on ``position_split_axes``."""
        pos_x = np.clip(pos_x, self.position_range[0], self.position_range[1])
        pos_y = np.clip(pos_y, self.position_range[2], self.position_range[3])
        pos_x = round(pos_x / self.position_step) * self.position_step
        pos_y = round(pos_y / self.position_step) * self.position_step

        if self.position_split_axes:
            yield f"[POS_X_{int(pos_x)}]"
            yield f"[POS_Y_{int(pos_y)}]"
        else:
            yield f"[POS_{int(pos_x)}_{int(pos_y)}]"

    def _tokenize_mania_column(self, mania_column: int):
        """Clamp a mania column index to [1, 18] and return its token."""
        mania_column = np.clip(mania_column, 1, 18)
        return f"[MANIA_COLUMN_{mania_column}]"

    def _tokenize_scroll_speed(self, scroll_speed: float):
        """Clamp to [0, 10] and quantize to 0.01 steps."""
        scroll_speed = np.clip(scroll_speed, 0.0, 10.0)
        scroll_speed = round(scroll_speed / 0.01) * 0.01
        return f"[SCROLL_SPEED_{scroll_speed:.2f}]"

    def _tokenize_hitsound(self, hitsound: int, sampleset: int, addition: int):
        """Quantize one hitsound triple; bit 0 of the bitmask is discarded."""
        hitsound = np.clip(hitsound >> 1, 0, 7) << 1
        sampleset = np.clip(sampleset, 1, 3)
        addition = np.clip(addition, 1, 3)
        return f"[HITSOUND_{hitsound}_{sampleset}_{addition}]"

    def _tokenize_groups(
        self,
        groups: list[Group],
        window_start_ms: Optional[int] = None,
        **_
    ):
        """Convert one window of ``Group`` events into a flat list of token strings."""
        window_start_ms = window_start_ms or 0
        tokens = []
        if self.add_cls_token:
            tokens.append(self.cls_token)
        tokens.append(self.bos_token)

        for group in groups:
            if group.new_combo and not self.separate_new_combo_token and group.event_type in EVENT_TYPES_WITH_NEW_COMBO:
                tokens.append(f"[{group.event_type.value.upper()}_NEW_COMBO]")
            else:
                tokens.append(f"[{group.event_type.value.upper()}]")
            if group.has_time:
                # Event times are encoded relative to the window start.
                tokens.append(self._tokenize_time_shift(group.time - window_start_ms))
            if group.snapping is not None:
                tokens.append(f"[SNAPPING_{group.snapping}]")
            if group.distance is not None:
                tokens.append(self._tokenize_distance(group.distance))
            if group.x is not None and group.y is not None:
                tokens.extend(self._tokenize_position(group.x, group.y))
            if group.mania_column is not None:
                tokens.append(self._tokenize_mania_column(group.mania_column))
            if group.new_combo and self.separate_new_combo_token:
                tokens.append("[NEW_COMBO]")
            if group.scroll_speed is not None:
                tokens.append(self._tokenize_scroll_speed(group.scroll_speed))
            # Hitsound lists are parallel arrays; each entry gets a hitsound and a
            # volume token.
            for h, s, a, v in zip(
                group.hitsounds,
                group.samplesets,
                group.additions,
                group.volumes,
            ):
                tokens.append(self._tokenize_hitsound(h, s, a))
                tokens.append(f"[VOLUME_{v}]")

        tokens.append(self.eos_token)
        return tokens

    def _encode_single(
        self,
        groups: Optional[list[Group]] = None,
        window_start_ms: Optional[int] = None,
        num_audio_tokens: Optional[int] = None,
    ):
        """Encode one window, optionally prefixing audio placeholder tokens."""
        token_strings = self._tokenize_groups(groups, window_start_ms=window_start_ms)
        token_ids = self.convert_tokens_to_ids(token_strings)

        if num_audio_tokens is not None and num_audio_tokens > 0:
            # Audio placeholders go in front: [AUDIO_BOS] [AUDIO]*n [AUDIO_EOS].
            audio_tokens = [self.audio_bos_token] + [self.audio_token] * num_audio_tokens + [self.audio_eos_token]
            token_ids = self.convert_tokens_to_ids(audio_tokens) + token_ids

        return token_ids

    def __call__(
        self,
        groups: Optional[Union[list[Group], list[list[Group]]]] = None,
        window_start_ms: Optional[Union[int, list[int]]] = None,
        num_audio_tokens: Optional[Union[int, list[int]]] = None,
        padding: PaddingStrategy = PaddingStrategy.LONGEST,
        truncation: TruncationStrategy = TruncationStrategy.LONGEST_FIRST,
        **kwargs
    ) -> BatchEncoding:
        """Encode a single window (``list[Group]``) or a batch (``list[list[Group]]``).

        Raises:
            ValueError: If ``groups`` is None/empty or has an unsupported type.
        """
        # Fix: `groups` defaults to None, so guard before len() — previously
        # calling the tokenizer with no groups raised an opaque TypeError.
        if groups is None or len(groups) == 0:
            raise ValueError("Input groups list is empty.")

        if isinstance(groups, list) and all(isinstance(g, Group) for g in groups):
            # Single sequence.
            token_ids = self._encode_single(
                groups=groups,
                window_start_ms=window_start_ms,
                num_audio_tokens=num_audio_tokens,
            )
            encoding = self.prepare_for_model(
                token_ids,
                padding=padding,
                truncation=truncation,
                **kwargs,
            )
        elif isinstance(groups, list):
            # Batch of sequences; broadcast missing per-sequence arguments.
            if num_audio_tokens is None:
                num_audio_tokens = [None] * len(groups)

            if window_start_ms is None:
                window_start_ms = [None] * len(groups)

            if len(groups) != len(num_audio_tokens):
                raise ValueError("Number of num_audio_tokens inputs must match the number of sequences.")

            if len(window_start_ms) != len(groups):
                raise ValueError("Number of window start times must match the number of sequences.")

            all_token_ids = []
            for g, w, a in zip(groups, window_start_ms, num_audio_tokens):
                token_ids = self._encode_single(
                    groups=g,
                    window_start_ms=w,
                    num_audio_tokens=a,
                )
                # _batch_prepare_for_model expects (ids, pair_ids) tuples.
                all_token_ids.append((token_ids, None))

            encoding = self._batch_prepare_for_model(
                all_token_ids,
                padding_strategy=PaddingStrategy(padding),
                truncation_strategy=TruncationStrategy(truncation),
                **kwargs,
            )
        else:
            raise ValueError("Input must be a list of Group objects or a single Group object.")

        return encoding

    @property
    def vocab_size(self):
        # Base vocabulary plus tokens added at runtime (the special tokens above
        # are not part of the generated vocab and live in _added_tokens_encoder).
        return len(self.vocab) + len(self._added_tokens_encoder)

    def get_vocab(self):
        return self.vocab | self._added_tokens_encoder

    def _convert_token_to_id(self, token):
        # NOTE(review): "[UNK]" is registered as an added special token, not a
        # base-vocab entry, so the fallback lookup yields None for tokens that
        # are in neither map — confirm unknown tokens cannot reach this path.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        """Write the base vocabulary to ``<save_directory>/<prefix>vocab.json``."""
        if not save_directory:
            raise ValueError("The save_directory must be specified.")

        vocab_file = f"{save_directory}/{filename_prefix or ''}vocab.json"
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)
303
+
304
+
305
class CM3PMetadata(TypedDict, total=False):
    """
    Metadata fields for a beatmap. All fields are optional (``total=False``),
    so consumers must tolerate absent keys.

    difficulty: Star rating, unitless (osu! difficulty)
    year: Year of beatmap creation (YYYY)
    mode: Game mode ID or name (e.g., "osu", "mania")
    status: Beatmap status ID or name (e.g., "ranked", "approved", "loved", "pending", "graveyard")
    mapper: Beatmap creator's ID or username
    cs: Circle size (osu!std), unitless
    hitsounded: Whether the beatmap is hitsounded (True/False)
    song_length: Song length in seconds
    song_position: Relative position in song [0.0-1.0], unitless
    global_sv: Global slider velocity, multiplier
        (NOTE(review): the inline comment below says osu!standard/catch slider
        velocity while this docstring originally said osu!mania scroll
        velocity — confirm which game modes this field applies to.)
    mania_keycount: Number of keys in osu!mania [1-18]
    hold_note_ratio: Ratio of hold notes [0.0-1.0], unitless
    scroll_speed_ratio: Ratio of scroll speed changes [0.0-1.0], unitless
    tags: List of beatmap tag IDs or names
    """
    difficulty: float  # Star rating, unitless (osu! difficulty)
    year: int  # Year of beatmap creation (YYYY)
    mode: Union[int, str]  # Game mode ID or name (e.g., "osu", "mania")
    status: Union[int, str]  # Beatmap status (e.g., "ranked", "approved", "loved", "pending", "graveyard")
    mapper: Union[int, str]  # Beatmap creator's ID or username
    cs: float  # Circle size (osu!std), unitless
    hitsounded: bool  # Whether the beatmap is hitsounded (True/False)
    song_length: float  # Song length in seconds
    song_position: float  # Relative position in song [0.0-1.0], unitless
    global_sv: float  # Global slider velocity (osu!standard/catch), multiplier
    mania_keycount: int  # Number of keys in osu!mania [1-18]
    hold_note_ratio: float  # Ratio of hold notes [0.0-1.0], unitless
    scroll_speed_ratio: float  # Ratio of scroll speed changes [0.0-1.0], unitless
    tags: list[Union[int, str]]  # List of beatmap tag IDs or names
337
+
338
+
339
def merge_metadata_dicts(m1, m2):
    """Merge two metadata dicts, preferring non-None values from ``m1``.

    Returns the other dict unchanged when either argument is None. Only keys
    declared on ``CM3PMetadata`` are considered; anything else is dropped.
    """
    if m1 is None or m2 is None:
        return m1 if m2 is None else m2
    combined = {
        field: (m1.get(field) if m1.get(field) is not None else m2.get(field))
        for field in CM3PMetadata.__annotations__
    }
    return CM3PMetadata(**combined)
350
+
351
+
352
class CM3PMetadataTokenizer(PreTrainedTokenizer):
    """Tokenizer that encodes ``CM3PMetadata`` dicts as a fixed-field-order token
    sequence (difficulty, year, mode, status, mapper, cs, hitsounded, song
    length/position, global SV, keycount, ratios, then a variable tag list).

    Each field has a dedicated ``[<FIELD>_UNK]`` token used when the value is
    missing or not representable.
    """

    model_input_names: list[str] = ["input_ids", "attention_mask"]
    vocab_files_names: dict[str, str] = {"vocab_file": "vocab.json"}

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        modes: Optional[dict[int, str]] = None,
        statuses: Optional[dict[int, str]] = None,
        mappers: Optional[dict[int, str]] = None,
        tags: Optional[dict[int, dict]] = None,
        min_difficculty: float = 0.0,
        max_difficulty: float = 14.0,
        difficulty_step: float = 0.1,
        min_year: int = 2000,
        max_year: int = 2023,
        max_song_length: int = 600,
        song_length_step: int = 10,
        song_position_step: float = 0.01,
        global_sv_step: float = 0.01,
        hold_note_ratio_step: float = 0.1,
        scroll_speed_ratio_step: float = 0.1,
        add_cls_token: bool = False,
        **kwargs,
    ):
        """
        Args:
            vocab_file: Optional saved ``vocab.json``; otherwise the vocabulary
                is rebuilt from the settings and lookup tables below.
            modes / statuses / mappers: id -> name lookup tables.
            tags: id -> ``{"name": ...}`` lookup table.
            min_difficculty: (sic) lower bound of the difficulty tokens. The
                misspelling is kept deliberately: it is part of the public
                signature and of serialized tokenizer configs.
            Remaining *_step / min_* / max_* arguments define the quantization
            ranges used to generate the per-field tokens.
            add_cls_token: Prepend ``[CLS]`` to every encoded sequence.
        """
        self.min_difficulty = min_difficculty
        self.max_difficulty = max_difficulty
        self.difficulty_step = difficulty_step
        self.min_year = min_year
        self.max_year = max_year
        self.max_song_length = max_song_length
        self.song_length_step = song_length_step
        self.song_position_step = song_position_step
        self.global_sv_step = global_sv_step
        self.hold_note_ratio_step = hold_note_ratio_step
        self.scroll_speed_ratio_step = scroll_speed_ratio_step
        self.add_cls_token = add_cls_token

        # Per-field "unknown" fallback tokens.
        self.difficulty_unk_token = "[DIFFICULTY_UNK]"
        self.year_unk_token = "[YEAR_UNK]"
        self.mode_unk_token = "[MODE_UNK]"
        self.status_unk_token = "[STATUS_UNK]"
        self.mapper_unk_token = "[MAPPER_UNK]"
        self.cs_unk_token = "[CS_UNK]"
        self.hitsounded_unk_token = "[HITSOUNDED_UNK]"
        self.song_length_unk_token = "[SONG_LENGTH_UNK]"
        self.song_position_unk_token = "[SONG_POSITION_UNK]"
        self.global_sv_unk_token = "[GLOBAL_SV_UNK]"
        self.mania_keycount_unk_token = "[MANIA_KEYCOUNT_UNK]"
        self.hold_note_ratio_unk_token = "[HOLD_NOTE_RATIO_UNK]"
        self.scroll_speed_ratio_unk_token = "[SCROLL_SPEED_RATIO_UNK]"
        self.tag_unk_token = "[TAG_UNK]"

        self.modes = modes or {}
        self.statuses = statuses or {}
        self.mappers = mappers or {}
        self.tags = tags or {}
        self.mode_names_to_ids = {v: k for k, v in self.modes.items()}
        self.mode_ids_to_names = self.modes
        self.status_names_to_ids = {v: k for k, v in self.statuses.items()}
        self.status_ids_to_names = self.statuses
        self.mapper_names_to_ids = {v: k for k, v in self.mappers.items()}
        self.mapper_ids_to_names = self.mappers
        self.tag_names_to_ids = {v['name']: k for k, v in self.tags.items()}
        self.tag_ids_to_names = {k: v['name'] for k, v in self.tags.items()}

        if vocab_file is None:
            self.vocab = self._build_vocab_from_config()
        else:
            with open(vocab_file, 'r', encoding='utf-8') as f:
                self.vocab = json.load(f)

        self.ids_to_tokens = {i: t for t, i in self.vocab.items()}

        # Settings are forwarded so save_pretrained() persists them.
        super().__init__(
            bos_token=kwargs.pop("bos_token", "[BOS]"),
            eos_token=kwargs.pop("eos_token", "[EOS]"),
            pad_token=kwargs.pop("pad_token", "[PAD]"),
            cls_token=kwargs.pop("cls_token", "[CLS]"),
            additional_special_tokens=kwargs.pop("additional_special_tokens", [
                self.difficulty_unk_token,
                self.year_unk_token,
                self.mode_unk_token,
                self.status_unk_token,
                self.mapper_unk_token,
                self.cs_unk_token,
                self.hitsounded_unk_token,
                self.song_length_unk_token,
                self.song_position_unk_token,
                self.global_sv_unk_token,
                self.mania_keycount_unk_token,
                self.hold_note_ratio_unk_token,
                self.scroll_speed_ratio_unk_token,
                self.tag_unk_token,
            ]),
            modes=modes,
            statuses=statuses,
            mappers=mappers,
            tags=tags,
            min_difficculty=min_difficculty,
            max_difficulty=max_difficulty,
            difficulty_step=difficulty_step,
            min_year=min_year,
            max_year=max_year,
            max_song_length=max_song_length,
            song_length_step=song_length_step,
            song_position_step=song_position_step,
            global_sv_step=global_sv_step,
            hold_note_ratio_step=hold_note_ratio_step,
            scroll_speed_ratio_step=scroll_speed_ratio_step,
            add_cls_token=add_cls_token,
            **kwargs
        )

    def _build_vocab_from_config(self):
        """Generate all metadata tokens from the ranges and lookup tables.

        The +1e-5 in the np.arange bounds guards against float rounding
        excluding the inclusive upper bound.
        """
        vocab = []

        for difficulty in np.arange(self.min_difficulty, self.max_difficulty + 1e-5, self.difficulty_step):
            vocab.append(f"[DIFFICULTY_{difficulty:.1f}]")

        for year in range(self.min_year, self.max_year + 1):
            vocab.append(f"[YEAR_{year}]")

        for mode in self.mode_ids_to_names.values():
            vocab.append(f"[MODE_{str(mode)}]")

        for status in self.status_ids_to_names.values():
            vocab.append(f"[STATUS_{str(status)}]")

        # Mapper tokens use the numeric mapper id, not the display name.
        for mapper in self.mapper_ids_to_names.keys():
            vocab.append(f"[MAPPER_{str(mapper)}]")

        for cs in np.arange(0.0, 10.0 + 1e-5, 0.1):
            vocab.append(f"[CS_{cs:.1f}]")

        for hitsounded in [True, False]:
            vocab.append(f"[HITSOUNDED_{str(hitsounded).upper()}]")

        for song_length in np.arange(0, self.max_song_length + 1e-5, self.song_length_step):
            vocab.append(f"[SONG_LENGTH_{int(song_length)}]")

        for song_position in np.arange(0.0, 1.0 + 1e-5, self.song_position_step):
            vocab.append(f"[SONG_POSITION_{song_position:.2f}]")

        for global_sv in np.arange(0.4, 3.6 + 1e-5, self.global_sv_step):
            vocab.append(f"[GLOBAL_SV_{global_sv:.2f}]")

        for mania_keycount in range(1, 19):
            vocab.append(f"[MANIA_KEYCOUNT_{mania_keycount}]")

        for hold_note_ratio in np.arange(0.0, 1.0 + 1e-5, self.hold_note_ratio_step):
            vocab.append(f"[HOLD_NOTE_RATIO_{hold_note_ratio:.1f}]")

        for scroll_speed_ratio in np.arange(0.0, 1.0 + 1e-5, self.scroll_speed_ratio_step):
            vocab.append(f"[SCROLL_SPEED_RATIO_{scroll_speed_ratio:.1f}]")

        for tag in self.tag_ids_to_names.values():
            vocab.append(f"[TAG_{tag}]")

        return {token: idx for idx, token in enumerate(vocab)}

    def _tokenize_difficulty(self, metadata: CM3PMetadata):
        """Quantized difficulty token, or the UNK token when missing."""
        difficulty = metadata.get('difficulty', None)
        if difficulty is None:
            return self.difficulty_unk_token
        difficulty = np.clip(difficulty, self.min_difficulty, self.max_difficulty)
        difficulty = round(difficulty / self.difficulty_step) * self.difficulty_step
        return f"[DIFFICULTY_{difficulty:.1f}]"

    def _tokenize_year(self, metadata: CM3PMetadata):
        year = metadata.get('year', None)
        if year is None:
            return self.year_unk_token
        year = np.clip(year, self.min_year, self.max_year)
        return f"[YEAR_{year}]"

    def _tokenize_mode(self, metadata: CM3PMetadata):
        """Mode token from a name or id; UNK when not in the lookup table."""
        mode_str = metadata.get('mode', None)
        if isinstance(mode_str, int):
            mode_str = self.mode_ids_to_names.get(mode_str, None)
        if mode_str is None or mode_str not in self.mode_names_to_ids:
            return self.mode_unk_token
        return f"[MODE_{str(mode_str)}]"

    def _tokenize_status(self, metadata: CM3PMetadata):
        """Status token from a name or id; UNK when not in the lookup table."""
        status_str = metadata.get('status', None)
        if isinstance(status_str, int):
            status_str = self.status_ids_to_names.get(status_str, None)
        if status_str is None or status_str not in self.status_names_to_ids:
            return self.status_unk_token
        return f"[STATUS_{str(status_str)}]"

    def _tokenize_mapper(self, metadata: CM3PMetadata):
        """Mapper token from a username or id; tokens are keyed by id."""
        mapper_id = metadata.get('mapper', None)
        if isinstance(mapper_id, str):
            mapper_id = self.mapper_names_to_ids.get(mapper_id, None)
        if mapper_id is None or mapper_id not in self.mapper_ids_to_names:
            return self.mapper_unk_token
        return f"[MAPPER_{str(mapper_id)}]"

    def _tokenize_cs(self, metadata: CM3PMetadata):
        cs = metadata.get('cs', None)
        if cs is None:
            return self.cs_unk_token
        cs = np.clip(cs, 0.0, 10.0)
        cs = round(cs / 0.1) * 0.1
        return f"[CS_{cs:.1f}]"

    def _tokenize_hitsounded(self, metadata: CM3PMetadata):
        hitsounded = metadata.get('hitsounded', None)
        if hitsounded is None:
            return self.hitsounded_unk_token
        return f"[HITSOUNDED_{str(hitsounded).upper()}]"

    def _tokenize_song_length(self, metadata: CM3PMetadata):
        song_length = metadata.get('song_length', None)
        if song_length is None:
            return self.song_length_unk_token
        song_length = np.clip(song_length, 0, self.max_song_length)
        song_length = round(song_length / self.song_length_step) * self.song_length_step
        return f"[SONG_LENGTH_{int(song_length)}]"

    def _tokenize_song_position(self, metadata: CM3PMetadata):
        song_position = metadata.get('song_position', None)
        if song_position is None:
            return self.song_position_unk_token
        song_position = np.clip(song_position, 0.0, 1.0)
        song_position = round(song_position / self.song_position_step) * self.song_position_step
        return f"[SONG_POSITION_{song_position:.2f}]"

    def _tokenize_global_sv(self, metadata: CM3PMetadata):
        global_sv = metadata.get('global_sv', None)
        if global_sv is None:
            return self.global_sv_unk_token
        global_sv = np.clip(global_sv, 0.4, 3.6)
        global_sv = round(global_sv / self.global_sv_step) * self.global_sv_step
        return f"[GLOBAL_SV_{global_sv:.2f}]"

    def _tokenize_mania_keycount(self, metadata: CM3PMetadata):
        mania_keycount = metadata.get('mania_keycount', None)
        if mania_keycount is None:
            return self.mania_keycount_unk_token
        mania_keycount = int(mania_keycount)
        mania_keycount = np.clip(mania_keycount, 1, 18)
        return f"[MANIA_KEYCOUNT_{mania_keycount}]"

    def _tokenize_hold_note_ratio(self, metadata: CM3PMetadata):
        hold_note_ratio = metadata.get('hold_note_ratio', None)
        if hold_note_ratio is None:
            return self.hold_note_ratio_unk_token
        hold_note_ratio = np.clip(hold_note_ratio, 0.0, 1.0)
        hold_note_ratio = round(hold_note_ratio / self.hold_note_ratio_step) * self.hold_note_ratio_step
        return f"[HOLD_NOTE_RATIO_{hold_note_ratio:.1f}]"

    def _tokenize_scroll_speed_ratio(self, metadata: CM3PMetadata):
        scroll_speed_ratio = metadata.get('scroll_speed_ratio', None)
        if scroll_speed_ratio is None:
            return self.scroll_speed_ratio_unk_token
        scroll_speed_ratio = np.clip(scroll_speed_ratio, 0.0, 1.0)
        scroll_speed_ratio = round(scroll_speed_ratio / self.scroll_speed_ratio_step) * self.scroll_speed_ratio_step
        return f"[SCROLL_SPEED_RATIO_{scroll_speed_ratio:.1f}]"

    def _validate_tags(self, tags):
        """Normalize tag ids to names, dropping anything not in the lookup table."""
        if tags is None:
            return None
        new_tags = []
        for tag in tags:
            if isinstance(tag, str) and tag in self.tag_names_to_ids:
                new_tags.append(tag)
            elif tag in self.tag_ids_to_names:
                new_tags.append(self.tag_ids_to_names[tag])
        return new_tags

    def _tokenize_tags(self, metadata: CM3PMetadata):
        """Tag tokens for every recognized tag, or a single UNK token."""
        tags = metadata.get('tags', None)
        valid_tags = self._validate_tags(tags)
        if not valid_tags:
            return [self.tag_unk_token]
        return [f"[TAG_{tag}]" for tag in valid_tags]

    def _tokenize_metadata(self, metadata: CM3PMetadata):
        """Full token sequence: [CLS]? [BOS] <fixed fields> <tags...> [EOS]."""
        tokens = []
        if self.add_cls_token:
            tokens.append(self.cls_token)
        tokens.extend([
            self.bos_token,
            self._tokenize_difficulty(metadata),
            self._tokenize_year(metadata),
            self._tokenize_mode(metadata),
            self._tokenize_status(metadata),
            self._tokenize_mapper(metadata),
            self._tokenize_cs(metadata),
            self._tokenize_hitsounded(metadata),
            self._tokenize_song_length(metadata),
            self._tokenize_song_position(metadata),
            self._tokenize_global_sv(metadata),
            self._tokenize_mania_keycount(metadata),
            self._tokenize_hold_note_ratio(metadata),
            self._tokenize_scroll_speed_ratio(metadata),
        ])
        tokens.extend(self._tokenize_tags(metadata))
        tokens.append(self.eos_token)
        return tokens

    def __call__(
        self,
        metadata: Optional[Union[CM3PMetadata, list[CM3PMetadata]]] = None,
        padding: PaddingStrategy = PaddingStrategy.LONGEST,
        truncation: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs
    ) -> BatchEncoding:
        """Encode one metadata dict or a batch of them.

        Raises:
            ValueError: If ``metadata`` is None or neither a dict nor a list.
        """
        if isinstance(metadata, dict):
            token_strings = self._tokenize_metadata(metadata)
            token_ids = self.convert_tokens_to_ids(token_strings)
            return self.prepare_for_model(
                token_ids,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
        elif isinstance(metadata, list):
            all_token_ids = []
            for m in metadata:
                token_strings = self._tokenize_metadata(m)
                token_ids = self.convert_tokens_to_ids(token_strings)
                # _batch_prepare_for_model expects (ids, pair_ids) tuples.
                all_token_ids.append((token_ids, None))

            # Fix: forward **kwargs for consistency with the single-input path
            # and with CM3PBeatmapTokenizer (they were silently dropped before).
            return self._batch_prepare_for_model(
                all_token_ids,
                padding_strategy=PaddingStrategy(padding),
                truncation_strategy=TruncationStrategy(truncation),
                max_length=max_length,
                return_tensors=return_tensors,
                **kwargs,
            )
        # Fix: previously this fell through and implicitly returned None.
        raise ValueError("Input must be a CM3PMetadata dict or a list of CM3PMetadata dicts.")

    def metadata_variations(self, metadata: CM3PMetadata, num_variations: int = 1000) -> tuple[CM3PMetadata, int]:
        """Yield up to ``num_variations`` (metadata, category) pairs, each differing
        from ``metadata`` in exactly one field, interleaving the per-field
        generators round-robin. Categories: 1=year, 2=status, 3=tags, 4=mapper,
        -1=empty padding metadata emitted once the real generators run dry.
        """
        def year_variations():
            min_year = max(2007, self.min_year)
            # Fix: CM3PMetadata is total=False, so use .get() — direct indexing
            # raised KeyError for absent fields.
            year = metadata.get("year")
            if year is None or (min_year > year or year > self.max_year):
                return
            for candidate in range(min_year, self.max_year + 1):
                if candidate != year:
                    new_m = copy.deepcopy(metadata)
                    new_m["year"] = candidate
                    yield new_m, 1

        def status_variations():
            status_value = metadata.get("status")
            if status_value is None:
                return
            # Accept either an id or a name; only vary known statuses.
            current_status = self.status_ids_to_names.get(status_value, None) or status_value
            if current_status not in self.status_names_to_ids:
                return
            for status in self.status_ids_to_names.values():
                if status != current_status:
                    new_m = copy.deepcopy(metadata)
                    new_m["status"] = status
                    yield new_m, 2

        def tags_variations():
            # Replace/add/remove some tags.
            tags_value = metadata.get("tags")
            if not tags_value:
                return
            current_tags = self._validate_tags(tags_value)
            if len(current_tags) <= 0:
                return
            # Replace one random existing tag with an unused tag.
            for tag in self.tag_ids_to_names.values():
                if tag not in current_tags:
                    new_m = copy.deepcopy(metadata)
                    new_m["tags"][np.random.randint(0, len(new_m["tags"]))] = tag
                    yield new_m, 3
            # Insert an unused tag at a random position.
            for tag in self.tag_ids_to_names.values():
                if tag not in current_tags:
                    new_m = copy.deepcopy(metadata)
                    new_m["tags"].insert(np.random.randint(0, len(new_m["tags"]) + 1), tag)
                    yield new_m, 3
            if len(current_tags) <= 1:
                return
            # Remove one tag at a time (only when more than one would remain).
            for tag in current_tags:
                new_m = copy.deepcopy(metadata)
                new_tags = [t for t in current_tags if t != tag]
                new_m["tags"] = new_tags
                yield new_m, 3

        def mapper_variations():
            mapper_value = metadata.get("mapper")
            if mapper_value is None:
                return
            current_mapper = self.mapper_names_to_ids.get(mapper_value, None) or mapper_value
            mapper_candidates = list(self.mapper_ids_to_names.keys())
            if current_mapper in self.mapper_ids_to_names:
                mapper_candidates.remove(current_mapper)
            # Randomly sample mappers to avoid too many variations.
            np.random.shuffle(mapper_candidates)
            for mapper in mapper_candidates:
                new_m = copy.deepcopy(metadata)
                new_m["mapper"] = mapper
                yield new_m, 4

        def padding_variations():
            while True:
                yield CM3PMetadata(), -1

        # Round-robin over the per-field generators until they are exhausted or
        # enough variations have been produced.
        current_num_variations = 0
        workers = [
            year_variations(),
            status_variations(),
            tags_variations(),
            mapper_variations(),
        ]
        padding_iterable = padding_variations()

        index = 0
        while current_num_variations < num_variations and len(workers) > 0:
            try:
                index = index % len(workers)
                item = next(workers[index])
                index += 1
                current_num_variations += 1
                yield item
            except StopIteration:
                # `index` was not advanced before the exception, so it still
                # points at the exhausted generator.
                del workers[index]

        # Pad with empty metadata until the requested count is reached.
        while current_num_variations < num_variations:
            current_num_variations += 1
            yield next(padding_iterable)

    @property
    def vocab_size(self):
        # Base vocabulary plus runtime-added tokens (the UNK tokens above live
        # in _added_tokens_encoder, not in the generated vocab).
        return len(self.vocab) + len(self._added_tokens_encoder)

    def get_vocab(self):
        return self.vocab | self._added_tokens_encoder

    def _convert_token_to_id(self, token):
        # NOTE(review): no unk_token is configured for this tokenizer, so
        # self.unk_token is None and unknown tokens resolve to None — confirm
        # unknown tokens can never reach this lookup.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        """Write the base vocabulary to ``<save_directory>/<prefix>vocab.json``."""
        if not save_directory:
            raise ValueError("The save_directory must be specified.")

        vocab_file = f"{save_directory}/{filename_prefix or ''}vocab.json"
        with open(vocab_file, 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f, ensure_ascii=False)

        return (vocab_file,)
804
+
805
# Register both tokenizers so `AutoTokenizer.from_pretrained` can resolve them
# from their respective config classes.
AutoTokenizer.register(CM3PBeatmapConfig, CM3PBeatmapTokenizer)
AutoTokenizer.register(CM3PMetadataConfig, CM3PMetadataTokenizer)

__all__ = ["CM3PBeatmapTokenizer", "CM3PMetadataTokenizer", "CM3PMetadata", "merge_metadata_dicts"]