| | """WhisperX wrapper for lyrics extraction with word-level timestamps.""" |
| |
|
| | import json |
| | from pathlib import Path |
| | from typing import Optional |
| |
|
| | import whisperx |
| |
|
| |
|
def extract_lyrics(
    vocal_path: str | Path,
    model_name: str = "large-v2",
    device: str = "cpu",
    language: str = "en",
    output_dir: Optional[str | Path] = None,
) -> list[dict]:
    """Extract timestamped lyrics from an isolated vocal stem.

    Transcribes the audio with a WhisperX model, aligns the transcript to get
    word-level timestamps, and saves the flattened word list as
    ``lyrics.json``.

    Args:
        vocal_path: Path to the isolated vocal audio file (data/<song>/stems/vocals.wav).
        model_name: Whisper model size. Default "large-v2" (best for lyrics).
        device: Device to run on ("cpu", "cuda").
        language: Language code for transcription; also used to select the
            alignment model.
        output_dir: Directory to save lyrics.json. Defaults to the directory
            two levels above ``vocal_path`` (data/<song>/ for the layout
            above). Created if it does not exist.

    Returns:
        List of word dicts with keys: "word", "start", "end".
        Example: [{"word": "hello", "start": 0.5, "end": 0.8}, ...]
        Words the aligner returns without both a "start" and an "end" are
        omitted.
    """
    import gc

    vocal_path = str(vocal_path)
    audio = whisperx.load_audio(vocal_path)

    # Stage 1: transcription.
    model = whisperx.load_model(model_name, device, compute_type="int8", language=language)
    result = model.transcribe(audio, batch_size=4)
    # Drop the transcription model and collect immediately so its memory can
    # be reclaimed before the alignment model is loaded.
    del model
    gc.collect()

    # Stage 2: alignment to obtain per-word timestamps.
    model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device)
    del model_a, metadata
    gc.collect()

    # Flatten the per-segment word lists into a single list.
    words = []
    for segment in result["segments"]:
        for word in segment.get("words", []):
            # Entries missing either timestamp cannot be placed on a timeline.
            if "start" not in word or "end" not in word:
                continue
            words.append({
                "word": word["word"].strip(),
                "start": word["start"],
                "end": word["end"],
            })

    if output_dir is None:
        output_dir = Path(vocal_path).parent.parent
    output_dir = Path(output_dir)
    # Make sure the target exists so the finished transcription is not lost
    # to a FileNotFoundError at write time.
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "lyrics.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(words, f, indent=2)

    return words
| |
|
| |
|
if __name__ == "__main__":
    import sys

    # Command-line entry point: transcribe one vocal stem and list its words.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python -m src.lyrics_extractor <vocal_file>")
        sys.exit(1)

    vocal_file = cli_args[0]
    for entry in extract_lyrics(vocal_file):
        print(f"{entry['start']:6.2f} - {entry['end']:6.2f}: {entry['word']}")

    # Same location extract_lyrics writes to when no output_dir is given.
    saved_to = Path(vocal_file).parent.parent / "lyrics.json"
    print(f"\nSaved to {saved_to}")
| |
|