"""MIDI tokenizer for bach-gpt.
Vocabulary
----------
Structural: PAD, EOS, BAR_START, BAR_END, PHRASE_START, PHRASE_END,
REST, CHORD_START, CHORD_END (9)
Pitch: MIDI 21..108 (88)
Duration: 32 log-quantized bins over [0.03125, 4.0] seconds (32)
Time-shift: 32 log-quantized bins over [0.03125, 4.0] seconds (32)
Velocity: 16 uniform bins over [0, 127] (V0..V15) (16)
Voice/chan: 16 GM families + 1 drums (VC0..VC16) (17)
Tempo: 16 log-spaced bins over [40, 240] BPM (T0..T15) (16)
Position: 16 sub-beat positions per bar (POS0..POS15) (16)
Meter: 8 common time signatures + OTHER (METER_*) (9)
Voice role: ROLE_BASS, ROLE_INNER, ROLE_TOP (within chord brackets) (3)
Total vocab size: 238 tokens.
API
---
encode(pm: pretty_midi.PrettyMIDI) -> List[int]
decode(ids: List[int]) -> pretty_midi.PrettyMIDI
round_trip_test(pm) -> (passed: bool, details: dict)
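Example (a minimal usage sketch; "song.mid" is a placeholder path):

    >>> import pretty_midi
    >>> pm = pretty_midi.PrettyMIDI("song.mid")
    >>> ids = encode(pm)
    >>> pm_out = decode(ids)
    >>> ok, info = round_trip_test(pm)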
Note on fidelity: encode/decode preserve the pitch multiset and instrument
family per note, but timing is approximate due to log quantization. BAR_* and
PHRASE_* markers are emitted from PrettyMIDI downbeats when a time signature
is present. A tempo token (T*) is emitted at PHRASE_START and on tempo
changes; METER_* and KEY_* tokens follow the same pattern. A VC* token is
emitted whenever the active instrument family changes.
"""
from __future__ import annotations
import json
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
import pretty_midi
# --- Vocabulary construction --------------------------------------------------
STRUCTURAL = [
"PAD",
"EOS",
"BAR_START",
"BAR_END",
"PHRASE_START",
"PHRASE_END",
"REST",
"CHORD_START",
"CHORD_END",
]
ROLES = ["ROLE_BASS", "ROLE_INNER", "ROLE_TOP"]
# 24 keys: indices 0..11 = C..B major; 12..23 = C..B minor (PrettyMIDI's
# convention via key_number).
KEYS = [f"KEY_{i}" for i in range(24)]
# Bar-header axes: emitted right after each BAR_START as a coarse summary
# of the bar's harmonic, density, and register content.
ROOT_NAMES = [f"ROOT_{i}" for i in range(12)]
N_DENS_BINS = 4 # 0-3, 4-7, 8-15, 16+
N_REG_BINS = 4 # <48, 48-59, 60-71, 72+
DENS_NAMES = [f"DENS_{i}" for i in range(N_DENS_BINS)]
REG_NAMES = [f"REG_{i}" for i in range(N_REG_BINS)]
# Bar-repetition markers: emitted right after bar header when this bar's
# pitch multiset matches a bar K positions earlier (K in {1, 2, 4, 8}).
REF_DISTANCES = [1, 2, 4, 8]
REF_NAMES = [f"REF_BAR_{k}" for k in REF_DISTANCES]
# Caption-segment markers for cross-modal alignment with MidiCaps-style
# multi-sentence captions. Emit CAP_SEG_<i> at PHRASE_START to bind the
# next phrase to the i-th caption segment.
N_CAP_SEGS = 8
CAP_SEG_NAMES = [f"CAP_SEG_{i}" for i in range(N_CAP_SEGS)]
# Pedal tokens: GM CC#64 (sustain), CC#66 (sostenuto), CC#67 (soft).
PEDAL_CC_NUMBERS = {64: "SUS", 66: "SOS", 67: "SFT"}
PEDAL_NAMES = [
f"PEDAL_{p}_{state}"
for p in PEDAL_CC_NUMBERS.values()
for state in ("UP", "DOWN")
]
# Continuous-controller tokens for the most common expressive CCs, each
# quantized to 8 bins over [0, 128).
N_CC_BINS = 8
CC_TYPES = {1: "MOD", 7: "VOL", 10: "PAN", 11: "EXPR"}
CC_NAMES = [
f"CC_{name}_{i}"
for name in CC_TYPES.values()
for i in range(N_CC_BINS)
]
# Pitch-bend tokens. PrettyMIDI gives 14-bit values in [-8192, 8191];
# quantize to 16 uniform bins and emit as PB_<i>.
N_PB_BINS = 16
PB_NAMES = [f"PB_{i}" for i in range(N_PB_BINS)]
# Reverse maps for the decoder: short name -> CC number.
PEDAL_NAME_TO_CC = {v: k for k, v in PEDAL_CC_NUMBERS.items()}
CC_NAME_TO_NUMBER = {v: k for k, v in CC_TYPES.items()}
METERS = [
"METER_2_4",
"METER_3_4",
"METER_4_4",
"METER_5_4",
"METER_6_8",
"METER_7_8",
"METER_9_8",
"METER_12_8",
"METER_OTHER",
]
# Bar length in quarter-notes for each meter (used by decoder).
METER_QUARTERS: Dict[str, float] = {
"METER_2_4": 2.0,
"METER_3_4": 3.0,
"METER_4_4": 4.0,
"METER_5_4": 5.0,
"METER_6_8": 3.0,
"METER_7_8": 3.5,
"METER_9_8": 4.5,
"METER_12_8": 6.0,
"METER_OTHER": 4.0,
}
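# E.g. the decoder derives bar length as quarters * 60 / BPM: a 6/8 bar
# (3.0 quarters) at 120 BPM lasts 1.5 seconds.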
PITCH_MIN, PITCH_MAX = 21, 108 # 88 pitches (A0..C8)
N_PITCH = PITCH_MAX - PITCH_MIN + 1 # 88
N_DUR_BINS = 32
N_SHIFT_BINS = 32
N_VEL_BINS = 16
# 16 GM instrument families + 1 reserved drum voice (VC16).
N_VOICE_BINS = 17
DRUM_VOICE = 16
N_TEMPO_BINS = 16
# Sub-beat resolution per bar (sixteenth-note grid in 4/4).
N_POS_BINS = 16
# Log-quantization range: 2**-5 s (~31 ms) to 4 s.
TIME_MIN, TIME_MAX = 2 ** -5, 4.0
LOG_TIME_EDGES = np.linspace(math.log(TIME_MIN), math.log(TIME_MAX), N_DUR_BINS + 1)
# Tempo log-quantization range: 40..240 BPM.
TEMPO_MIN, TEMPO_MAX = 40.0, 240.0
LOG_TEMPO_EDGES = np.linspace(
math.log(TEMPO_MIN), math.log(TEMPO_MAX), N_TEMPO_BINS + 1
)
def _build_vocab() -> Tuple[List[str], Dict[str, int]]:
tokens: List[str] = list(STRUCTURAL)
tokens += [f"P{p}" for p in range(PITCH_MIN, PITCH_MAX + 1)]
tokens += [f"D{i}" for i in range(N_DUR_BINS)]
tokens += [f"TS{i}" for i in range(N_SHIFT_BINS)]
tokens += [f"V{i}" for i in range(N_VEL_BINS)]
tokens += [f"VC{i}" for i in range(N_VOICE_BINS)]
tokens += [f"T{i}" for i in range(N_TEMPO_BINS)]
tokens += [f"POS{i}" for i in range(N_POS_BINS)]
tokens += list(METERS)
tokens += list(ROLES)
tokens += list(KEYS)
tokens += list(ROOT_NAMES)
tokens += list(DENS_NAMES)
tokens += list(REG_NAMES)
tokens += list(REF_NAMES)
tokens += list(CAP_SEG_NAMES)
tokens += list(PEDAL_NAMES)
tokens += list(CC_NAMES)
tokens += list(PB_NAMES)
t2i = {t: i for i, t in enumerate(tokens)}
return tokens, t2i
TOKENS, TOKEN2ID = _build_vocab()
ID2TOKEN = {i: t for t, i in TOKEN2ID.items()}
VOCAB_SIZE = len(TOKENS)
# Default location for fitted velocity quantiles. Auto-loaded at import time
# by the try/except block near the end of this module.
DEFAULT_VEL_QUANTILES_PATH = (
Path(__file__).resolve().parent.parent / "data" / "tokenizer" / "velocity_quantiles.json"
)
PAD = TOKEN2ID["PAD"]
EOS = TOKEN2ID["EOS"]
BAR_START = TOKEN2ID["BAR_START"]
BAR_END = TOKEN2ID["BAR_END"]
PHRASE_START = TOKEN2ID["PHRASE_START"]
PHRASE_END = TOKEN2ID["PHRASE_END"]
REST = TOKEN2ID["REST"]
CHORD_START = TOKEN2ID["CHORD_START"]
CHORD_END = TOKEN2ID["CHORD_END"]
ROLE_BASS = TOKEN2ID["ROLE_BASS"]
ROLE_INNER = TOKEN2ID["ROLE_INNER"]
ROLE_TOP = TOKEN2ID["ROLE_TOP"]
# --- Quantization helpers -----------------------------------------------------
def _log_bin(x: float, edges=LOG_TIME_EDGES) -> int:
"""Map a positive time (s) to a log-bin index in [0, N-1]."""
x = max(x, TIME_MIN)
x = min(x, TIME_MAX)
logx = math.log(x)
    # For clamped x, digitize returns 1..len(edges) (the top edge maps to
    # len(edges)); shift to 0-based and clip into [0, N_DUR_BINS - 1].
    idx = int(np.digitize(logx, edges)) - 1
    return max(0, min(N_DUR_BINS - 1, idx))
def _bin_center(i: int, edges=LOG_TIME_EDGES) -> float:
lo, hi = edges[i], edges[i + 1]
return math.exp(0.5 * (lo + hi))
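# Round-trip sketch: _bin_center(_log_bin(x)) returns the geometric center of
# x's bin. The 32 bins span 7 octaves, so the half-bin reconstruction error
# is at most 2**(3.5/32) - 1, i.e. roughly +/-8% relative.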
# Optional corpus-fit quantile edges for velocity bins. When set, _vel_bin
# uses these instead of uniform binning. Loaded from a JSON file by
# load_velocity_quantiles(); fit_velocity_quantiles() trains them.
_VEL_EDGES: Optional[np.ndarray] = None
def _vel_bin(v: int) -> int:
v = max(0, min(127, int(v)))
if _VEL_EDGES is not None:
idx = int(np.searchsorted(_VEL_EDGES, v, side="right")) - 1
return max(0, min(N_VEL_BINS - 1, idx))
return min(N_VEL_BINS - 1, v * N_VEL_BINS // 128)
def _vel_center(i: int) -> int:
if _VEL_EDGES is not None:
lo = float(_VEL_EDGES[i])
hi = float(_VEL_EDGES[i + 1])
return int(round(0.5 * (lo + hi)))
return int((i + 0.5) * 128 / N_VEL_BINS)
def fit_velocity_quantiles(velocities: List[int], n_bins: int = N_VEL_BINS) -> List[float]:
"""Compute quantile bin edges for a corpus of velocity values."""
if not velocities:
return [i * 128 / n_bins for i in range(n_bins + 1)]
vs = np.asarray(velocities, dtype=np.float64)
qs = np.linspace(0.0, 1.0, n_bins + 1)
edges = np.quantile(vs, qs).tolist()
edges[0] = 0.0
edges[-1] = 128.0
# Force monotonic increasing in case of heavy ties.
for i in range(1, len(edges)):
if edges[i] <= edges[i - 1]:
edges[i] = edges[i - 1] + 1e-3
return edges
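# A minimal fit-and-install sketch (paths illustrative; the `fit-velocity`
# CLI subcommand at the bottom of this module wraps the same flow):
#
#   vels = [n.velocity for inst in pm.instruments for n in inst.notes]
#   edges = fit_velocity_quantiles(vels)
#   save_velocity_quantiles(edges, DEFAULT_VEL_QUANTILES_PATH)
#   load_velocity_quantiles(DEFAULT_VEL_QUANTILES_PATH)  # installs _VEL_EDGES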
def load_velocity_quantiles(path: Path) -> bool:
"""Install bin edges from a JSON file. Returns True if loaded."""
global _VEL_EDGES
p = Path(path)
if not p.exists():
return False
edges = json.loads(p.read_text())
if not isinstance(edges, list) or len(edges) != N_VEL_BINS + 1:
return False
_VEL_EDGES = np.asarray(edges, dtype=np.float64)
return True
def save_velocity_quantiles(edges: List[float], path: Path) -> None:
p = Path(path)
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text(json.dumps(list(edges)))
def _tempo_bin(bpm: float) -> int:
bpm = max(TEMPO_MIN, min(TEMPO_MAX, float(bpm)))
idx = int(np.digitize(math.log(bpm), LOG_TEMPO_EDGES)) - 1
return max(0, min(N_TEMPO_BINS - 1, idx))
def _tempo_center(i: int) -> float:
lo, hi = LOG_TEMPO_EDGES[i], LOG_TEMPO_EDGES[i + 1]
return math.exp(0.5 * (lo + hi))
def _program_family(program: int) -> int:
    """Map a General MIDI program (0..127) to its 16-family index (0..15)."""
    # Clamp to 15 rather than N_VOICE_BINS - 1: index 16 (DRUM_VOICE) is
    # reserved for drum tracks and must never be reached from a program.
    return max(0, min(15, int(program) // 8))
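# E.g. program 33 (Electric Bass) falls in family 4; drum tracks bypass this
# function and are assigned DRUM_VOICE directly in _extract_events.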
# --- Encode -------------------------------------------------------------------
@dataclass
class _Event:
onset: float
voice: int # GM family index 0..15 or DRUM_VOICE
kind: str = "note" # 'note' | 'pedal' | 'cc' | 'pb'
# Note fields
pitch: int = 0
duration: float = 0.0
velocity: int = 0
# Pedal fields
pedal_type: str = "SUS" # 'SUS' | 'SOS' | 'SFT'
pedal_state: str = "UP" # 'UP' | 'DOWN'
# CC fields (continuous controllers)
cc_type: str = "MOD" # name from CC_TYPES values
cc_bin: int = 0
# Pitch-bend fields
pb_bin: int = 0
def _cc_bin(value: int) -> int:
v = max(0, min(127, int(value)))
return min(N_CC_BINS - 1, v * N_CC_BINS // 128)
def _cc_center(i: int) -> int:
return int((i + 0.5) * 128 / N_CC_BINS)
def _pb_bin(value: int) -> int:
v = max(-8192, min(8191, int(value)))
return min(N_PB_BINS - 1, (v + 8192) * N_PB_BINS // 16384)
def _pb_center(i: int) -> int:
return int((i + 0.5) * 16384 / N_PB_BINS) - 8192
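# Illustrative round trips: _cc_center(_cc_bin(100)) == 104 (bin 6 of 8), and
# _pb_bin(0) == 8 with _pb_center(8) == 512 -- bend values snap to bin
# centers, so even a neutral bend decodes slightly off-zero.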
def _extract_events(pm: pretty_midi.PrettyMIDI) -> List[_Event]:
events: List[_Event] = []
for inst in pm.instruments:
voice = DRUM_VOICE if inst.is_drum else _program_family(inst.program)
for n in inst.notes:
if PITCH_MIN <= n.pitch <= PITCH_MAX:
events.append(
_Event(
onset=n.start, voice=voice, kind="note",
pitch=n.pitch,
duration=max(n.end - n.start, TIME_MIN),
velocity=n.velocity,
)
)
for cc in getattr(inst, "control_changes", []) or []:
num = int(cc.number)
if num in PEDAL_CC_NUMBERS:
events.append(
_Event(
onset=float(cc.time), voice=voice, kind="pedal",
pedal_type=PEDAL_CC_NUMBERS[num],
pedal_state="DOWN" if int(cc.value) >= 64 else "UP",
)
)
elif num in CC_TYPES:
events.append(
_Event(
onset=float(cc.time), voice=voice, kind="cc",
cc_type=CC_TYPES[num],
cc_bin=_cc_bin(cc.value),
)
)
for pb in getattr(inst, "pitch_bends", []) or []:
events.append(
_Event(
onset=float(pb.time), voice=voice, kind="pb",
pb_bin=_pb_bin(pb.pitch),
)
)
    # Sort by onset; at equal onsets notes come first, then by voice and
    # pitch. The stable sort preserves input order among non-note events.
events.sort(key=lambda e: (e.onset, e.kind != "note", e.voice, e.pitch))
return events
def _tempo_changes(pm: pretty_midi.PrettyMIDI) -> List[Tuple[float, float]]:
"""Return sorted (time_s, bpm) pairs from a PrettyMIDI's tempo map."""
try:
times, tempos = pm.get_tempo_changes()
except Exception:
return [(0.0, 120.0)]
pairs = list(zip(times.tolist(), tempos.tolist())) if len(times) else []
if not pairs or pairs[0][0] > 1e-6:
pairs.insert(0, (0.0, pairs[0][1] if pairs else 120.0))
return pairs
def _meter_token(num: int, den: int) -> str:
name = f"METER_{num}_{den}"
return name if name in METER_QUARTERS else "METER_OTHER"
def _key_changes(pm: pretty_midi.PrettyMIDI) -> List[Tuple[float, str]]:
"""Return sorted (time_s, key_name) pairs from key_signature_changes."""
out: List[Tuple[float, str]] = []
for ks in getattr(pm, "key_signature_changes", []) or []:
kn = int(getattr(ks, "key_number", 0)) % 24
out.append((float(ks.time), f"KEY_{kn}"))
if not out or out[0][0] > 1e-6:
out.insert(0, (0.0, out[0][1] if out else "KEY_0"))
return out
def _meter_changes(pm: pretty_midi.PrettyMIDI) -> List[Tuple[float, str]]:
"""Return sorted (time_s, meter_name) pairs from time-signature changes."""
out: List[Tuple[float, str]] = []
for ts in getattr(pm, "time_signature_changes", []) or []:
out.append((float(ts.time), _meter_token(ts.numerator, ts.denominator)))
if not out or out[0][0] > 1e-6:
out.insert(0, (0.0, out[0][1] if out else "METER_4_4"))
return out
def _bin_density(n: int) -> int:
if n < 4:
return 0
if n < 8:
return 1
if n < 16:
return 2
return 3
def _bin_register(mean_pitch: float) -> int:
if mean_pitch < 48:
return 0
if mean_pitch < 60:
return 1
if mean_pitch < 72:
return 2
return 3
def _bar_header_tokens(bar_events: List["_Event"]) -> List[int]:
"""Return [ROOT_<n>, DENS_<n>, REG_<n>] tokens summarizing a bar."""
if not bar_events:
return [
TOKEN2ID[ROOT_NAMES[0]],
TOKEN2ID[DENS_NAMES[0]],
TOKEN2ID[REG_NAMES[0]],
]
lowest = min(e.pitch for e in bar_events)
root_pc = lowest % 12
n = len(bar_events)
mean_pitch = sum(e.pitch for e in bar_events) / n
return [
TOKEN2ID[ROOT_NAMES[root_pc]],
TOKEN2ID[DENS_NAMES[_bin_density(n)]],
TOKEN2ID[REG_NAMES[_bin_register(mean_pitch)]],
]
def _group_by_onset(events: List["_Event"], eps: float = TIME_MIN) -> List[List["_Event"]]:
"""Group consecutive *note* events with coincident onsets into chord
groups. Non-note events (pedal/cc/pb) are emitted as size-1 groups so
they keep their place in the timeline but never bracket as chords.
"""
groups: List[List[_Event]] = []
cur: List[_Event] = []
cur_onset: Optional[float] = None
def _flush() -> None:
nonlocal cur, cur_onset
if cur:
groups.append(cur)
cur = []
cur_onset = None
for ev in events:
if ev.kind != "note":
_flush()
groups.append([ev])
continue
if cur_onset is None or abs(ev.onset - cur_onset) <= eps:
cur.append(ev)
if cur_onset is None:
cur_onset = ev.onset
else:
_flush()
cur = [ev]
cur_onset = ev.onset
_flush()
return groups
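# E.g. three notes at t=0.0, a CC event at t=0.0, and a note at t=0.5 yield
# the groups [[note, note, note], [cc], [note]]: notes sort ahead of the CC
# at equal onsets (see the sort key in _extract_events).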
def _downbeats(pm: pretty_midi.PrettyMIDI) -> np.ndarray:
try:
db = pm.get_downbeats()
return np.asarray(db) if db is not None else np.array([])
except Exception:
return np.array([])
def encode(pm: pretty_midi.PrettyMIDI) -> List[int]:
"""Encode a PrettyMIDI object to a list of vocabulary ids.
    Stream layout: PHRASE_START T<n> METER_X_Y KEY_<k> VC<v>
    [BAR_START ROOT_* DENS_* REG_* [REF_BAR_K]] ...
    For each onset group (chord = co-located notes):
        [tempo/meter/key/bar tokens if any cross this onset]
        [POS<p> if in a bar AND position changed; otherwise TS<n> as fallback]
        if size>1: CHORD_START <per-note: VC?, ROLE, V, P, D> CHORD_END
        else: <per-note: VC?, V, P, D>
Notes within a chord are sorted by pitch ascending; lowest gets ROLE_BASS,
highest ROLE_TOP, middle pitches ROLE_INNER.
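
    Illustrative stream for a single 4/4 bar holding one C-major triad (bin
    indices here are placeholders, not computed values):

        PHRASE_START T9 METER_4_4 KEY_0 VC0
        BAR_START ROOT_0 DENS_0 REG_2 POS0
        CHORD_START ROLE_BASS V10 P60 D18 ROLE_INNER V10 P64 D18
        ROLE_TOP V10 P67 D18 CHORD_END
        BAR_END PHRASE_END EOS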
"""
events = _extract_events(pm)
ids: List[int] = [PHRASE_START]
if not events:
ids.append(PHRASE_END)
ids.append(EOS)
return ids
tempo_map = _tempo_changes(pm)
tempo_iter = iter(tempo_map)
cur_tempo = next(tempo_iter, (0.0, 120.0))
next_tempo = next(tempo_iter, None)
ids.append(TOKEN2ID[f"T{_tempo_bin(cur_tempo[1])}"])
meter_map = _meter_changes(pm)
meter_iter = iter(meter_map)
cur_meter = next(meter_iter, (0.0, "METER_4_4"))
next_meter = next(meter_iter, None)
ids.append(TOKEN2ID[cur_meter[1]])
key_map = _key_changes(pm)
key_iter = iter(key_map)
cur_key = next(key_iter, (0.0, "KEY_0"))
next_key = next(key_iter, None)
ids.append(TOKEN2ID[cur_key[1]])
downbeats = list(_downbeats(pm))
# Precompute *note* events per bar for the header summary. Non-note
# events (pedals/CC/PB) are excluded so they don't skew ROOT/DENS/REG.
bar_events_by_idx: Dict[int, List[_Event]] = {}
if downbeats:
db_arr = np.asarray(downbeats)
for ev in events:
if ev.kind != "note":
continue
i = max(
0,
int(np.searchsorted(db_arr, ev.onset, side="right")) - 1,
)
bar_events_by_idx.setdefault(i, []).append(ev)
db_idx = 0
in_bar = False
bar_start_time: Optional[float] = None
bar_duration: Optional[float] = None
last_pos_in_bar: Optional[int] = None
# History of pitch multisets per emitted bar (for REF_BAR_K matching).
bar_pitch_history: List[Tuple[int, ...]] = []
def _bar_pitches(idx: int) -> Tuple[int, ...]:
return tuple(sorted(e.pitch for e in bar_events_by_idx.get(idx, [])))
def _emit_bar_start(bar_index: int) -> None:
ids.append(BAR_START)
ids.extend(_bar_header_tokens(bar_events_by_idx.get(bar_index, [])))
fp = _bar_pitches(bar_index)
for k in REF_DISTANCES:
if k <= len(bar_pitch_history) and fp and fp == bar_pitch_history[-k]:
ids.append(TOKEN2ID[f"REF_BAR_{k}"])
break
bar_pitch_history.append(fp)
groups = _group_by_onset(events)
current_voice = groups[0][0].voice
ids.append(TOKEN2ID[f"VC{current_voice}"])
prev_onset = groups[0][0].onset
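    # Emit bar markers for any downbeats at or before the first onset so the
    # opening group lands inside its proper bar.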
while db_idx < len(downbeats) and downbeats[db_idx] <= prev_onset + 1e-6:
if in_bar:
ids.append(BAR_END)
_emit_bar_start(db_idx)
in_bar = True
bar_start_time = float(downbeats[db_idx])
if db_idx + 1 < len(downbeats):
bar_duration = float(downbeats[db_idx + 1] - downbeats[db_idx])
last_pos_in_bar = None
db_idx += 1
for g_idx, group in enumerate(groups):
onset = group[0].onset
while next_tempo is not None and next_tempo[0] <= onset + 1e-6:
ids.append(TOKEN2ID[f"T{_tempo_bin(next_tempo[1])}"])
next_tempo = next(tempo_iter, None)
while next_meter is not None and next_meter[0] <= onset + 1e-6:
ids.append(TOKEN2ID[next_meter[1]])
next_meter = next(meter_iter, None)
while next_key is not None and next_key[0] <= onset + 1e-6:
ids.append(TOKEN2ID[next_key[1]])
next_key = next(key_iter, None)
while db_idx < len(downbeats) and downbeats[db_idx] <= onset + 1e-6:
if in_bar:
ids.append(BAR_END)
_emit_bar_start(db_idx)
in_bar = True
bar_start_time = float(downbeats[db_idx])
if db_idx + 1 < len(downbeats):
bar_duration = float(downbeats[db_idx + 1] - downbeats[db_idx])
last_pos_in_bar = None
db_idx += 1
if in_bar and bar_duration and bar_duration > 1e-6:
pos_bin = int(round((onset - bar_start_time) / bar_duration * N_POS_BINS))
pos_bin = max(0, min(N_POS_BINS - 1, pos_bin))
if pos_bin != last_pos_in_bar:
ids.append(TOKEN2ID[f"POS{pos_bin}"])
last_pos_in_bar = pos_bin
else:
shift = 0.0 if g_idx == 0 else onset - prev_onset
if shift > TIME_MIN:
ids.append(TOKEN2ID[f"TS{_log_bin(shift)}"])
if group[0].kind != "note":
ev = group[0]
if ev.voice != current_voice:
ids.append(TOKEN2ID[f"VC{ev.voice}"])
current_voice = ev.voice
if ev.kind == "pedal":
ids.append(TOKEN2ID[f"PEDAL_{ev.pedal_type}_{ev.pedal_state}"])
elif ev.kind == "cc":
ids.append(TOKEN2ID[f"CC_{ev.cc_type}_{ev.cc_bin}"])
elif ev.kind == "pb":
ids.append(TOKEN2ID[f"PB_{ev.pb_bin}"])
prev_onset = onset
continue
notes = sorted(group, key=lambda e: e.pitch)
is_chord = len(notes) > 1
if is_chord:
ids.append(CHORD_START)
for n_idx, ev in enumerate(notes):
if ev.voice != current_voice:
ids.append(TOKEN2ID[f"VC{ev.voice}"])
current_voice = ev.voice
if is_chord:
if n_idx == 0:
ids.append(ROLE_BASS)
elif n_idx == len(notes) - 1:
ids.append(ROLE_TOP)
else:
ids.append(ROLE_INNER)
ids.append(TOKEN2ID[f"V{_vel_bin(ev.velocity)}"])
ids.append(TOKEN2ID[f"P{ev.pitch}"])
ids.append(TOKEN2ID[f"D{_log_bin(ev.duration)}"])
if is_chord:
ids.append(CHORD_END)
prev_onset = onset
if in_bar:
ids.append(BAR_END)
ids.append(PHRASE_END)
ids.append(EOS)
return ids
# --- Decode -------------------------------------------------------------------
# GM family -> representative program number (one per family of 8).
FAMILY_PROGRAMS = {
0: 0, # Piano
1: 8, # Chromatic Percussion
2: 16, # Organ
3: 24, # Guitar
4: 32, # Bass
5: 40, # Strings
6: 48, # Ensemble
7: 56, # Brass
8: 64, # Reed
9: 72, # Pipe
10: 80, # Synth Lead
11: 88, # Synth Pad
12: 96, # Synth Effects
13: 104, # Ethnic
14: 112, # Percussive
15: 120, # Sound Effects
}
def _kind(t: str) -> Tuple[str, object]:
"""Classify a token name into a (kind, value) pair for the decoder."""
if t in STRUCTURAL:
return ("struct", t)
if t in ROLES:
return ("role", t)
if t in METERS:
return ("meter", t)
if t.startswith("KEY_") and t[4:].isdigit():
return ("key", int(t[4:]))
if t.startswith("ROOT_") and t[5:].isdigit():
return ("root", int(t[5:]))
if t.startswith("DENS_") and t[5:].isdigit():
return ("dens", int(t[5:]))
if t.startswith("REG_") and t[4:].isdigit():
return ("reg", int(t[4:]))
if t.startswith("REF_BAR_") and t[8:].isdigit():
return ("ref", int(t[8:]))
if t.startswith("CAP_SEG_") and t[8:].isdigit():
return ("capseg", int(t[8:]))
if t.startswith("PEDAL_"):
return ("pedal", t)
if t.startswith("CC_"):
return ("cc", t)
if t.startswith("PB_") and t[3:].isdigit():
return ("pb", int(t[3:]))
if t.startswith("TS") and t[2:].isdigit():
return ("ts", int(t[2:]))
if t.startswith("VC") and t[2:].isdigit():
return ("voice", int(t[2:]))
if t.startswith("POS") and t[3:].isdigit():
return ("pos", int(t[3:]))
if t.startswith("D") and t[1:].isdigit():
return ("dur", int(t[1:]))
if t.startswith("V") and t[1:].isdigit():
return ("vel", int(t[1:]))
if t.startswith("T") and t[1:].isdigit():
return ("tempo", int(t[1:]))
if t.startswith("P") and t[1:].isdigit():
return ("pitch", int(t[1:]))
return ("struct", t)
def decode(ids: List[int], default_tempo: float = 120.0) -> pretty_midi.PrettyMIDI:
"""Decode a token id list back to a PrettyMIDI. Timing is reconstructed
from POS within bars (using current tempo + meter) or TS deltas as a
fallback. Pitches and instrument families are preserved exactly.
"""
initial_tempo = default_tempo
for tid in ids:
t = ID2TOKEN.get(tid, "")
if t.startswith("T") and not t.startswith("TS") and t[1:].isdigit():
initial_tempo = _tempo_center(int(t[1:]))
break
pm = pretty_midi.PrettyMIDI(initial_tempo=initial_tempo)
instruments: Dict[int, pretty_midi.Instrument] = {}
current_voice = 0
def get_inst(v: int) -> pretty_midi.Instrument:
if v not in instruments:
if v == DRUM_VOICE:
instruments[v] = pretty_midi.Instrument(
program=0,
is_drum=True,
name="drums",
)
else:
prog = FAMILY_PROGRAMS.get(v, 0)
instruments[v] = pretty_midi.Instrument(
program=prog,
name=f"family_{v}",
)
return instruments[v]
current_tempo = initial_tempo
current_meter_quarters = METER_QUARTERS["METER_4_4"]
bar_duration = current_meter_quarters * 60.0 / current_tempo
bar_start_time = 0.0
n_bars_seen = 0
current_time = 0.0
pending_velocity = 64
i = 0
while i < len(ids):
kind, val = _kind(ID2TOKEN.get(ids[i], "PAD"))
if kind == "ts":
current_time += _bin_center(val)
elif kind == "pos":
current_time = bar_start_time + (int(val) / N_POS_BINS) * bar_duration
elif kind == "voice":
current_voice = int(val)
elif kind == "vel":
pending_velocity = _vel_center(int(val))
elif kind == "tempo":
current_tempo = _tempo_center(int(val))
bar_duration = current_meter_quarters * 60.0 / current_tempo
elif kind == "meter":
current_meter_quarters = METER_QUARTERS.get(val, 4.0)
bar_duration = current_meter_quarters * 60.0 / current_tempo
elif kind == "key":
try:
pm.key_signature_changes.append(
pretty_midi.KeySignature(int(val), float(current_time))
)
except Exception:
pass
elif kind == "pedal":
# PEDAL_<SUS|SOS|SFT>_<UP|DOWN>
parts = str(val).split("_")
if len(parts) == 3:
ptype, pstate = parts[1], parts[2]
cc_num = PEDAL_NAME_TO_CC.get(ptype)
if cc_num is not None:
inst = get_inst(current_voice)
inst.control_changes.append(
pretty_midi.ControlChange(
number=cc_num,
value=127 if pstate == "DOWN" else 0,
time=float(current_time),
)
)
elif kind == "cc":
# CC_<NAME>_<BIN>
parts = str(val).split("_")
if len(parts) == 3 and parts[2].isdigit():
cname, bidx = parts[1], int(parts[2])
cc_num = CC_NAME_TO_NUMBER.get(cname)
if cc_num is not None:
inst = get_inst(current_voice)
inst.control_changes.append(
pretty_midi.ControlChange(
number=cc_num,
value=_cc_center(bidx),
time=float(current_time),
)
)
elif kind == "pb":
inst = get_inst(current_voice)
inst.pitch_bends.append(
pretty_midi.PitchBend(
pitch=_pb_center(int(val)),
time=float(current_time),
)
)
elif kind == "struct" and val == "BAR_START":
if n_bars_seen > 0:
bar_start_time += bar_duration
current_time = bar_start_time
n_bars_seen += 1
elif kind == "struct" and val == "REST":
current_time += 0.25
elif kind == "pitch":
duration = 0.25
j = i + 1
while j < len(ids):
nt = ID2TOKEN.get(ids[j], "PAD")
nkind, nval = _kind(nt)
if nkind == "dur":
duration = _bin_center(int(nval))
break
if nkind in (
"pitch", "ts", "pos", "voice", "vel",
"tempo", "meter", "role", "key",
"root", "dens", "reg", "ref", "capseg",
"pedal", "cc", "pb",
):
break
if nkind == "struct" and nval not in ("CHORD_START", "CHORD_END"):
break
j += 1
note = pretty_midi.Note(
velocity=int(pending_velocity),
pitch=int(val),
start=current_time,
end=current_time + max(duration, 0.01),
)
get_inst(current_voice).notes.append(note)
# role / chord brackets / bar_end / phrase / pad / eos: no timing effect
i += 1
for voice in sorted(instruments):
inst = instruments[voice]
if inst.notes:
pm.instruments.append(inst)
return pm
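# A decode-to-disk sketch using pretty_midi's standard write method
# ("reconstruction.mid" is an illustrative filename):
#
#   decode(ids).write("reconstruction.mid")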
def inject_caption_segments(ids: List[int], n_segs: int = N_CAP_SEGS) -> List[int]:
"""Insert CAP_SEG_<i % n_segs> right after each PHRASE_START.
Use this when you have a multi-sentence caption split into ``n_segs``
parts and want to bind the i-th phrase of the encoded MIDI to the
i-th caption segment. Emission is opt-in because the segment count
only makes sense in the presence of an external caption.
"""
if n_segs <= 0 or n_segs > N_CAP_SEGS:
raise ValueError(f"n_segs must be in [1, {N_CAP_SEGS}]")
out: List[int] = []
seen_phrases = 0
for tid in ids:
out.append(tid)
if tid == PHRASE_START:
out.append(TOKEN2ID[CAP_SEG_NAMES[seen_phrases % n_segs]])
seen_phrases += 1
return out
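# Usage sketch (assumes a caption split into four segments upstream):
#
#   ids = inject_caption_segments(encode(pm), n_segs=4)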
# --- Round-trip test ----------------------------------------------------------
def _voice_label(inst: "pretty_midi.Instrument") -> int:
return DRUM_VOICE if inst.is_drum else _program_family(inst.program)
def round_trip_test(pm: pretty_midi.PrettyMIDI) -> Tuple[bool, Dict]:
"""Verify the (pitch, voice) multiset is preserved through encode+decode.
Timing is not checked because log quantization is lossy.
"""
original = sorted(
(n.pitch, _voice_label(inst))
for inst in pm.instruments
for n in inst.notes
if PITCH_MIN <= n.pitch <= PITCH_MAX
)
ids = encode(pm)
pm2 = decode(ids)
reconstructed = sorted(
(n.pitch, _voice_label(inst))
for inst in pm2.instruments
for n in inst.notes
)
passed = original == reconstructed
return passed, {
"n_orig": len(original),
"n_recon": len(reconstructed),
"n_tokens": len(ids),
"vocab_size": VOCAB_SIZE,
}
# --- Auto-load fitted velocity quantiles -------------------------------------
if DEFAULT_VEL_QUANTILES_PATH.exists():
try:
load_velocity_quantiles(DEFAULT_VEL_QUANTILES_PATH)
except Exception:
pass
# --- CLI ----------------------------------------------------------------------
def _cli_fit_velocity_quantiles(sample_dir: Path, out_path: Path) -> None:
paths = sorted(sample_dir.rglob("*.mid")) + sorted(sample_dir.rglob("*.midi"))
if not paths:
raise SystemExit(f"No MIDI files under {sample_dir}")
velocities: List[int] = []
n_failed = 0
for p in paths:
try:
pm = pretty_midi.PrettyMIDI(str(p))
except Exception:
n_failed += 1
continue
for inst in pm.instruments:
for n in inst.notes:
velocities.append(int(n.velocity))
edges = fit_velocity_quantiles(velocities, n_bins=N_VEL_BINS)
save_velocity_quantiles(edges, out_path)
print(
f"[velocity] files={len(paths)} failed={n_failed} "
f"velocities={len(velocities)} -> {out_path}"
)
print(f"[velocity] edges = {[round(e, 2) for e in edges]}")
if __name__ == "__main__":
import argparse as _argparse
import sys as _sys
if len(_sys.argv) > 1 and _sys.argv[1] == "fit-velocity":
p = _argparse.ArgumentParser()
p.add_argument(
"--sample-dir",
type=str,
default=str(Path(__file__).resolve().parent.parent / "data" / "gigamidi" / "sample"),
)
p.add_argument("--out", type=str, default=str(DEFAULT_VEL_QUANTILES_PATH))
args = p.parse_args(_sys.argv[2:])
_cli_fit_velocity_quantiles(Path(args.sample_dir), Path(args.out))
_sys.exit(0)
print(f"Vocab size: {VOCAB_SIZE}")
print(f" structural: {len(STRUCTURAL)}")
print(f" pitch: {N_PITCH}")
print(f" duration: {N_DUR_BINS}")
print(f" time-shift: {N_SHIFT_BINS}")
print(f" velocity: {N_VEL_BINS}")
print(f" voice/family: {N_VOICE_BINS}")
print(f" tempo: {N_TEMPO_BINS}")
# Smoke test: build a tiny C-major scale and round-trip it.
pm = pretty_midi.PrettyMIDI()
inst = pretty_midi.Instrument(program=0)
t = 0.0
for p in [60, 62, 64, 65, 67, 69, 71, 72]:
inst.notes.append(pretty_midi.Note(velocity=80, pitch=p, start=t, end=t + 0.5))
t += 0.5
pm.instruments.append(inst)
ok, info = round_trip_test(pm)
print(f"\nSmoke test round-trip: {'PASS' if ok else 'FAIL'} {info}")
# Multi-track / multi-velocity test.
pm2 = pretty_midi.PrettyMIDI(initial_tempo=92.0)
piano = pretty_midi.Instrument(program=0) # family 0 (piano)
bass = pretty_midi.Instrument(program=33) # family 4 (bass)
strings = pretty_midi.Instrument(program=48) # family 6 (ensemble)
t = 0.0
for p, v in [(60, 30), (64, 80), (67, 110), (72, 60)]:
piano.notes.append(pretty_midi.Note(velocity=v, pitch=p, start=t, end=t + 0.5))
bass.notes.append(pretty_midi.Note(velocity=70, pitch=p - 24, start=t, end=t + 0.5))
strings.notes.append(pretty_midi.Note(velocity=50, pitch=p + 12, start=t, end=t + 0.5))
t += 0.5
pm2.instruments += [piano, bass, strings]
ok2, info2 = round_trip_test(pm2)
print(f"Multi-track round-trip: {'PASS' if ok2 else 'FAIL'} {info2}")
ids = encode(pm2)
has_tempo = any(
ID2TOKEN[i].startswith("T") and not ID2TOKEN[i].startswith("TS")
and ID2TOKEN[i][1:].isdigit()
for i in ids
)
has_voice = any(ID2TOKEN[i].startswith("VC") for i in ids)
has_meter = any(ID2TOKEN[i] in METERS for i in ids)
has_pos = any(ID2TOKEN[i].startswith("POS") for i in ids)
has_chord = CHORD_START in ids
has_role = any(t in ids for t in (ROLE_BASS, ROLE_INNER, ROLE_TOP))
print(
f"Stream features: tempo={has_tempo} voice={has_voice} "
f"meter={has_meter} pos={has_pos} chord={has_chord} role={has_role}"
)