"""Lyrics-to-beat mapping: group beats into segments and assign lyrics.""" import json from pathlib import Path from typing import Optional def segment_lyrics( beats: list[dict], lyrics: list[dict], beats_per_segment: int = 4, ) -> list[dict]: """Map timestamped lyrics onto beat-grouped segments. Groups consecutive beats into segments (e.g. 4 beats = 1 bar in 4/4 time) and assigns words to the segment where they start. Args: beats: List of beat dicts with "beat" and "time" keys. lyrics: List of word dicts with "word", "start", "end" keys. beats_per_segment: Number of beats per segment. 4 = one bar in 4/4 time. Returns: List of segment dicts with keys: - segment: 1-indexed segment number - start: start time in seconds - end: end time in seconds - duration: segment duration in seconds - lyrics: raw lyrics text for this segment (may be empty) - words: list of word dicts that fall in this segment """ beat_times = [b["time"] for b in beats] # Build segment boundaries by grouping every N beats segments = [] seg_num = 1 for i in range(0, len(beat_times) - 1, beats_per_segment): start = beat_times[i] # End is either N beats later or the last beat end_idx = min(i + beats_per_segment, len(beat_times) - 1) end = beat_times[end_idx] # Store individual beat timestamps for this segment seg_beat_times = [ round(beat_times[j], 3) for j in range(i, min(i + beats_per_segment + 1, len(beat_times))) ] segments.append({ "segment": seg_num, "start": round(start, 3), "end": round(end, 3), "duration": round(end - start, 3), "beats": seg_beat_times, "lyrics": "", "words": [], }) seg_num += 1 # Assign words to segments based on where the word starts for word in lyrics: word_start = word["start"] for seg in segments: if seg["start"] <= word_start < seg["end"]: seg["words"].append(word) break else: # Word starts after last segment boundary — assign to last segment if segments and word_start >= segments[-1]["start"]: segments[-1]["words"].append(word) # Build lyrics text per segment for seg in segments: seg["lyrics"] = " ".join(w["word"] for w in seg["words"]) return segments def save_segments( segments: list[dict], output_path: str | Path, ) -> Path: """Save segments to a JSON file. Args: segments: List of segment dicts. output_path: Path to save the JSON file. Returns: Path to the saved JSON file. """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w") as f: json.dump(segments, f, indent=2) return output_path def run( data_dir: str | Path, beats_per_segment: int = 4, ) -> list[dict]: """Full segmentation pipeline: load beats + lyrics, segment, and save. Args: data_dir: Song data directory containing beats.json and lyrics.json (e.g. data/Gone/). beats_per_segment: Number of beats per segment (4 = one bar). Returns: List of segment dicts. """ data_dir = Path(data_dir) with open(data_dir / "beats.json") as f: beats = json.load(f) with open(data_dir / "lyrics.json") as f: lyrics = json.load(f) segments = segment_lyrics(beats, lyrics, beats_per_segment=beats_per_segment) save_segments(segments, data_dir / "segments.json") return segments if __name__ == "__main__": import sys if len(sys.argv) < 2: print("Usage: python -m src.segmenter ") print(" e.g. python -m src.segmenter data/Gone") sys.exit(1) segments = run(sys.argv[1]) print(f"Created {len(segments)} segments:\n") for seg in segments: lyrics_display = f'"{seg["lyrics"]}"' if seg["lyrics"] else "(instrumental)" print(f" Seg {seg['segment']}: {seg['start']:.3f}s - {seg['end']:.3f}s " f"({seg['duration']:.3f}s) {lyrics_display}")