Spaces:
Runtime error
Runtime error
File size: 8,765 Bytes
f974ed0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | """
AutoLyrics — Gradio Demo
Fine-tuned Whisper-small + LoRA for lyrics transcription.
Usage:
pip install gradio transformers peft torch torchaudio librosa pyloudnorm jiwer
python app.py
"""
import re
import torch
import torchaudio
import torchaudio.transforms as T
import librosa
import pyloudnorm as pyln
import numpy as np
import gradio as gr
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel
# ──────────────────────────────────────────────────────────────────
# CONFIGURATION — adjust paths if needed
# ──────────────────────────────────────────────────────────────────
# Hub identifier of the base checkpoint that the LoRA adapter is applied to.
MODEL_NAME = "openai/whisper-small"
LORA_DIR = "./checkpoints/lora_best" # path where you saved the LoRA adapter
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on CUDA, full precision on CPU; used both for the model
# weights and for the input features handed to generate().
MODEL_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
# Sample rate (Hz) that audio is resampled to and that the feature extractor
# is told to expect.
TARGET_SR = 16000
# Clip length in seconds: longer audio is truncated, shorter audio is padded.
MAX_DURATION = 30.0
# Language and task passed to the processor and to every generate() call.
LANGUAGE = "en"
TASK = "transcribe"
# Initial values of the "Beam size" and "Max new tokens" sliders in the UI.
BEAM_SIZE = 3
MAX_NEW_TOKENS = 200
# ──────────────────────────────────────────────────────────────────
# AUDIO PREPROCESSING (mirrors your notebook pipeline)
# ──────────────────────────────────────────────────────────────────
# Target integrated loudness handed to pyloudnorm's loudness normaliser.
LUFS_TARGET = -23.0
# Headroom in dB below full scale; converted to a linear limit
# (10 ** (-LUFS_HEADROOM / 20)) that the normalised signal is clipped to.
LUFS_HEADROOM = 1.0
# `top_db` threshold passed to librosa.effects.trim when removing
# leading/trailing silence.
SILENCE_TOP_DB = 30
def _remove_dc_offset(waveform: torch.Tensor) -> torch.Tensor:
return waveform - waveform.mean()
def _trim_silence(waveform: torch.Tensor) -> torch.Tensor:
    """Strip leading and trailing silence from *waveform*.

    The tensor is handed to ``librosa.effects.trim`` as a NumPy array with
    ``top_db=SILENCE_TOP_DB``; the trimmed samples are wrapped back into a
    tensor.
    """
    samples = waveform.numpy()
    # trim() returns (trimmed_signal, sample_interval); only the signal is used.
    kept = librosa.effects.trim(samples, top_db=SILENCE_TOP_DB)[0]
    return torch.from_numpy(kept)
def _loudness_normalize(waveform: torch.Tensor, sr: int) -> torch.Tensor:
arr = waveform.numpy().astype("float64")
meter = pyln.Meter(sr)
loudness = meter.integrated_loudness(arr)
if not (loudness > -70.0):
peak = arr.max() if arr.max() != 0 else 1.0
arr = arr / peak
else:
arr = pyln.normalize.loudness(arr, loudness, LUFS_TARGET)
limit = 10 ** (-LUFS_HEADROOM / 20.0)
arr = arr.clip(-limit, limit)
return torch.from_numpy(arr.astype("float32"))
def preprocess_audio(waveform: torch.Tensor, sr: int) -> torch.Tensor:
    """Full preprocessing chain: mono → resample → DC → trim → loudness.

    Args:
        waveform: Audio tensor, either 1-D (samples) or 2-D with channels on
            the first axis (as returned by ``torchaudio.load``).
        sr: Sample rate of *waveform* in Hz.

    Returns:
        A 1-D tensor at ``TARGET_SR``. It is empty when the input is empty or
        when silence trimming removes every sample; callers should check for
        that before using the result.
    """
    # Convert to mono by averaging the channels.
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # An empty signal has nothing to process; bail out before the DSP chain
    # (the mean of an empty tensor is NaN, and the later stages are not meant
    # for zero-length input).
    if waveform.numel() == 0:
        return waveform.reshape(-1)

    # Resample to the rate the model expects.
    if sr != TARGET_SR:
        waveform = T.Resample(sr, TARGET_SR)(waveform)
    waveform = waveform.squeeze(0)

    # Preprocessing chain
    waveform = _remove_dc_offset(waveform)
    waveform = _trim_silence(waveform)
    if waveform.numel() == 0:
        # Everything was trimmed away; loudness cannot be measured.
        return waveform
    return _loudness_normalize(waveform, TARGET_SR)
# ──────────────────────────────────────────────────────────────────
# MODEL LOADING
# ──────────────────────────────────────────────────────────────────
# Load the processor, base Whisper model and LoRA adapter once, at import
# time, so every transcription request reuses the same objects.
print(f"Loading model on {DEVICE}β¦")
# The processor is read from the adapter directory
# (assumes the tokenizer/feature-extractor files were saved there — confirm).
processor = WhisperProcessor.from_pretrained(LORA_DIR, language=LANGUAGE, task=TASK)
# Base checkpoint in the configured dtype, moved to the target device.
base_model = WhisperForConditionalGeneration.from_pretrained(
MODEL_NAME, torch_dtype=MODEL_DTYPE
).to(DEVICE)
# Clear any preset decoder prompt / suppressed-token list; language and task
# are supplied explicitly on each generate() call instead.
base_model.config.forced_decoder_ids = None
base_model.generation_config.forced_decoder_ids = None
base_model.generation_config.suppress_tokens = []
# Attach the LoRA adapter stored in LORA_DIR to the base model.
model = PeftModel.from_pretrained(base_model, LORA_DIR).to(DEVICE)
# generate() is called on this unwrapped object rather than on the PeftModel
# (presumably the underlying Whisper model with the LoRA layers injected —
# verify against the peft version in use).
inner_model = model.base_model.model
# Switch to inference mode (disables dropout etc.).
inner_model.eval()
print("Model loaded β")
# ──────────────────────────────────────────────────────────────────
# TRANSCRIPTION FUNCTION
# ──────────────────────────────────────────────────────────────────
def transcribe(audio_path: str, beam_size: int, max_new_tokens: int) -> str:
    """Load audio, preprocess, run Whisper LoRA, return transcript.

    This is the Gradio click handler, so every failure is reported as a
    message string instead of being raised.

    Args:
        audio_path: Path to the uploaded/recorded audio file, or ``None`` when
            nothing was provided.
        beam_size: Number of beams for generation (cast to ``int``).
        max_new_tokens: Upper bound on generated tokens (cast to ``int``).

    Returns:
        The transcript followed by a blank line and an italic info line with
        the processed duration and device, or a warning/error message.
    """
    if audio_path is None:
        return "β οΈ Please upload or record an audio file."

    try:
        waveform, sr = torchaudio.load(audio_path)
    except Exception as e:
        return f"β Error loading audio: {e}"

    # Preprocessing can fail on unusual input (for instance, loudness
    # metering may reject very short clips); report it like a load failure
    # rather than letting the exception escape the handler.
    try:
        waveform = preprocess_audio(waveform, sr)
    except Exception as e:
        return f"β Error preprocessing audio: {e}"

    if waveform.numel() == 0:
        return "β οΈ Audio appears to be silent after preprocessing."

    # Truncate to the maximum clip length, then pad up to it for the
    # feature extractor. The reported duration is that of the audio
    # actually used (before padding).
    max_samples = int(MAX_DURATION * TARGET_SR)
    duration = min(len(waveform) / TARGET_SR, MAX_DURATION)
    waveform = waveform[:max_samples]
    if len(waveform) < max_samples:
        waveform = torch.nn.functional.pad(waveform, (0, max_samples - len(waveform)))

    try:
        features = processor.feature_extractor(
            waveform.numpy(), sampling_rate=TARGET_SR, return_tensors="pt"
        ).input_features.to(DEVICE, dtype=MODEL_DTYPE)

        with torch.no_grad():
            generated_ids = inner_model.generate(
                input_features=features,
                num_beams=int(beam_size),
                max_new_tokens=int(max_new_tokens),
                language=LANGUAGE,
                task=TASK,
                suppress_tokens=[],
                forced_decoder_ids=None,
            )
    except Exception as e:
        # Covers e.g. invalid generation settings or out-of-memory errors.
        return f"β Error during transcription: {e}"

    transcript = processor.tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0].strip()
    info = f"_(duration: {duration:.1f}s, device: {DEVICE})_"
    return f"{transcript}\n\n{info}"
# ──────────────────────────────────────────────────────────────────
# GRADIO INTERFACE
# ──────────────────────────────────────────────────────────────────
# Build the demo page: a header, a two-column row (inputs on the left,
# transcript on the right), an examples widget and a footer.
with gr.Blocks(title="AutoLyrics β Whisper LoRA", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
# π΅ AutoLyrics β Whisper-small + LoRA
**Fine-tuned on `gmenon/slt-lyrics-audio` for music lyrics transcription.**
Upload a song clip (β€ 30 s) or record directly from your microphone, then hit **Transcribe**.
"""
    )
    with gr.Row():
        # Left column: audio input, generation settings and the trigger button.
        with gr.Column(scale=1):
            # type="filepath" makes Gradio hand transcribe() a path string.
            audio_input = gr.Audio(
                label="π€ Audio Input",
                sources=["upload", "microphone"],
                type="filepath",
            )
            # Generation knobs, collapsed by default; defaults come from the
            # BEAM_SIZE / MAX_NEW_TOKENS constants.
            with gr.Accordion("βοΈ Advanced settings", open=False):
                beam_slider = gr.Slider(
                    minimum=1, maximum=5, value=BEAM_SIZE, step=1,
                    label="Beam size (1 = greedy, higher = better but slower)"
                )
                tokens_slider = gr.Slider(
                    minimum=50, maximum=448, value=MAX_NEW_TOKENS, step=10,
                    label="Max new tokens"
                )
            transcribe_btn = gr.Button("πΆ Transcribe", variant="primary")
        # Right column: transcript output.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="π Transcription",
                lines=8,
                placeholder="Lyrics will appear hereβ¦",
                show_copy_button=True,
            )
    # Wire the button: the three inputs map positionally onto
    # transcribe(audio_path, beam_size, max_new_tokens).
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, beam_slider, tokens_slider],
        outputs=output_text,
    )
    gr.Examples(
        examples=[], # add example audio paths here if you have them
        inputs=audio_input,
    )
    # Footer with model / dataset / preprocessing summary.
    gr.Markdown(
        """
---
**Model:** `openai/whisper-small` + LoRA (r=8, Ξ±=16) |
**Dataset:** `gmenon/slt-lyrics-audio` |
**Preprocessing:** EBU R128 loudness normalisation, silence trimming
"""
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # share=True gives a public URL; remove it to serve locally only.
    demo.launch(
        share=True,
    )
|