File size: 2,632 Bytes
72f552e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""WhisperX wrapper for lyrics extraction with word-level timestamps."""

import json
from pathlib import Path
from typing import Optional

import whisperx


def extract_lyrics(
    vocal_path: str | Path,
    model_name: str = "large-v2",
    device: str = "cpu",
    language: str = "en",
    output_dir: Optional[str | Path] = None,
) -> list[dict]:
    """Extract timestamped lyrics from an isolated vocal stem.

    Runs WhisperX transcription followed by forced alignment to obtain
    word-level timestamps, then flattens the result and persists it as
    ``lyrics.json``.

    Args:
        vocal_path: Path to the isolated vocal audio file (data/<song>/stems/vocals.wav).
        model_name: Whisper model size. Default "large-v2" (best for lyrics).
        device: Device to run on ("cpu", "cuda").
        language: Language code for transcription.
        output_dir: Directory to save lyrics.json. Defaults to data/<song>/
            (the parent of the stems/ directory). Created if missing.

    Returns:
        List of word dicts with keys: "word", "start", "end".
        Example: [{"word": "hello", "start": 0.5, "end": 0.8}, ...]
    """
    import gc  # local import: only needed for memory reclamation below

    vocal_path = str(vocal_path)

    # Load audio once and reuse it for both transcription and alignment.
    audio = whisperx.load_audio(vocal_path)

    # Transcribe. int8 compute keeps memory low enough to hold the model on CPU.
    model = whisperx.load_model(model_name, device, compute_type="int8", language=language)
    result = model.transcribe(audio, batch_size=4)
    del model
    # Collect immediately so the Whisper model's memory is actually released
    # before the alignment model is loaded (del alone may leave cycles alive).
    gc.collect()

    # Forced alignment for word-level timestamps.
    model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device)
    del model_a, metadata
    gc.collect()

    # Flatten segments into a simple word list; alignment can omit timestamps
    # for some tokens, so keep only fully-timed words.
    words = [
        {
            "word": word["word"].strip(),
            "start": word["start"],
            "end": word["end"],
        }
        for segment in result["segments"]
        for word in segment.get("words", [])
        if "start" in word and "end" in word
    ]

    # Save to JSON in the song directory (stems/ parent = data/<song>/).
    if output_dir is None:
        output_dir = Path(vocal_path).parent.parent
    output_dir = Path(output_dir)
    # Tolerate a caller-supplied directory that does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "lyrics.json"
    # ensure_ascii=False keeps non-English lyrics human-readable in the file.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(words, f, indent=2, ensure_ascii=False)

    return words


if __name__ == "__main__":
    import sys

    # Require exactly one positional argument: the vocal stem to transcribe.
    if len(sys.argv) < 2:
        print("Usage: python -m src.lyrics_extractor <vocal_file>")
        sys.exit(1)

    vocal_file = sys.argv[1]
    timed_words = extract_lyrics(vocal_file)

    # Print each aligned word with its start/end time in seconds.
    for entry in timed_words:
        print(f"{entry['start']:6.2f} - {entry['end']:6.2f}: {entry['word']}")

    # Mirror extract_lyrics' default output location (stems/ parent = data/<song>/).
    saved_path = Path(vocal_file).parent.parent / "lyrics.json"
    print(f"\nSaved to {saved_path}")