"""AUTOLYRICS — side-by-side baseline vs fine-tuned Gradio demo.""" import os import time import subprocess import tempfile import wave from pathlib import Path import numpy as np import torch import torchaudio import gradio as gr from transformers import WhisperProcessor, WhisperForConditionalGeneration from peft import PeftModel BASE_MODEL = "openai/whisper-small" ADAPTER_REPO = os.environ.get( "ADAPTER_REPO", "Petercoder/autolyrics-whisper-small-lora") DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 # ---------- Lazy model loading ---------- print(f"Loading models on {DEVICE}…") processor = WhisperProcessor.from_pretrained(BASE_MODEL) baseline_model = WhisperForConditionalGeneration.from_pretrained( BASE_MODEL, torch_dtype=DTYPE).to(DEVICE).eval() # Configure generation_config only (not model.config) — avoids reconciliation # warnings in transformers 4.47+ when both configs hold conflicting values. baseline_model.generation_config.language = "de" baseline_model.generation_config.task = "transcribe" baseline_model.generation_config.forced_decoder_ids = None baseline_model.generation_config.no_repeat_ngram_size = 3 base_for_ft = WhisperForConditionalGeneration.from_pretrained( BASE_MODEL, torch_dtype=DTYPE) ft_model = PeftModel.from_pretrained(base_for_ft, ADAPTER_REPO).to(DEVICE).eval() ft_model.generation_config.language = "de" ft_model.generation_config.task = "transcribe" ft_model.generation_config.forced_decoder_ids = None ft_model.generation_config.no_repeat_ngram_size = 3 print("Models ready.") def load_audio(path: str) -> torch.Tensor: """Load any browser-uploaded audio format → 16 kHz mono float32 tensor. Strategy (two-stage, zero libsndfile dependency): 1. ffmpeg transcodes ANY browser format (webm/opus, ogg, mp3, m4a, wav) into a clean 16-bit PCM WAV at 16 kHz mono. ffmpeg handles every container/codec that browsers produce, including Gradio mic recordings. 2. Python's built-in `wave` module reads the raw PCM bytes directly. This **completely bypasses soundfile / libsndfile**, which cannot decode webm, ogg/opus, or partially-encoded containers and raises ``soundfile.LibsndfileError: Format not recognised`` on HF Spaces. ffmpeg is pre-installed on HF Spaces via packages.txt — no extra Python package is needed. `wave` and `numpy` are always available. Returns ------- torch.Tensor 1-D float32 waveform on CPU, normalised to [-1, 1], at 16 000 Hz. """ src = Path(path) if not src.exists() or src.stat().st_size == 0: raise ValueError(f"Audio file missing or empty: {path}") with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: tmp_path = tmp.name try: # ── Step 1: transcode to clean PCM WAV via ffmpeg ────────────────── result = subprocess.run( [ "ffmpeg", "-y", # overwrite without prompting "-i", str(src), # any browser-upload format "-ac", "1", # force mono "-ar", "16000", # resample to 16 kHz "-sample_fmt", "s16", # 16-bit signed PCM "-f", "wav", # output container: wav tmp_path, ], stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, timeout=60, ) if result.returncode != 0: err = result.stderr.decode(errors="replace").strip().splitlines() raise RuntimeError( f"ffmpeg failed (code {result.returncode}): " f"{err[-1] if err else 'unknown error'}" ) # ── Step 2: read PCM bytes with stdlib `wave` — no soundfile ─────── with wave.open(tmp_path, "rb") as wf: n_channels = wf.getnchannels() sampwidth = wf.getsampwidth() # bytes per sample: 2 for s16 framerate = wf.getframerate() n_frames = wf.getnframes() if n_frames == 0: raise ValueError("ffmpeg produced an empty audio file.") raw = wf.readframes(n_frames) # Parse raw bytes → float32 in [-1, 1] # ffmpeg guarantees s16, but use sampwidth defensively. if sampwidth == 2: arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 elif sampwidth == 4: arr = np.frombuffer(raw, dtype=np.int32).astype(np.float32) / 2147483648.0 else: arr = np.frombuffer(raw, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0 # Mix down multi-channel (guard: -ac 1 already handles this) if n_channels > 1: arr = arr.reshape(-1, n_channels).mean(axis=1) wav = torch.from_numpy(arr.copy()) # copy() avoids non-writable buffer warning # Resample if framerate drifted (guard: -ar 16000 already handles this) if framerate != 16000: wav = torchaudio.functional.resample( wav.unsqueeze(0), framerate, 16000 ).squeeze(0) return wav # 1-D float32 CPU tensor finally: try: os.unlink(tmp_path) except OSError: pass @torch.inference_mode() def transcribe_with(model, audio_tensor, num_beams: int): feats = processor(audio_tensor.numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(DEVICE, dtype=DTYPE) t0 = time.perf_counter() # ── KEY FIX: pass input_features as a KEYWORD argument. ──────────────── # PeftModelForSeq2SeqLM.generate() only accepts **kwargs (no positional # args beyond self). Passing `feats` positionally raises: # TypeError: generate() takes 1 positional argument but 2 were given # WhisperForConditionalGeneration also accepts it as a keyword, so this # call is correct for BOTH the bare baseline model and the PEFT wrapper. ids = model.generate( input_features=feats, num_beams=num_beams, max_new_tokens=225, return_dict_in_generate=True, output_scores=True, ) dt = time.perf_counter() - t0 # ids is GenerateBeamEncoderDecoderOutput when return_dict_in_generate=True. # .sequences holds the token-id tensor; decode it to text. text = processor.batch_decode(ids.sequences, skip_special_tokens=True)[0].strip() # Confidence proxy: exponentiate the beam score (sum of log-probs). # sequences_scores is None for greedy/num_beams=1 — guard accordingly. if hasattr(ids, "sequences_scores") and ids.sequences_scores is not None: conf = float(torch.exp(ids.sequences_scores[0]).clamp(0, 1)) else: conf = None return text, dt, conf def run(audio_path: str, num_beams: int, model_choice: str): if audio_path is None: return "—", "—", "—", "—", "Please upload audio." # Load and decode audio — raises ValueError/RuntimeError on bad input. try: audio = load_audio(audio_path) except (ValueError, RuntimeError, subprocess.TimeoutExpired) as exc: err_msg = f"⚠️ Audio error: {exc}" return err_msg, err_msg, "—", "—", "Audio could not be decoded — try a different file." except Exception as exc: # noqa: BLE001 err_msg = f"⚠️ Unexpected error loading audio: {exc}" return err_msg, err_msg, "—", "—", "Audio could not be decoded — try a different file." if audio.numel() == 0: return "—", "—", "—", "—", "⚠️ Audio file appears to be empty or silent." duration = audio.shape[-1] / 16000 if model_choice == "Baseline only": b_text, b_dt, b_conf = transcribe_with(baseline_model, audio, num_beams) return b_text, "—", f"{b_dt:.2f}s · RTF {b_dt/duration:.2f}", "—", \ f"Audio: {duration:.1f}s" if model_choice == "Fine-tuned only": f_text, f_dt, f_conf = transcribe_with(ft_model, audio, num_beams) return "—", f_text, "—", f"{f_dt:.2f}s · RTF {f_dt/duration:.2f}", \ f"Audio: {duration:.1f}s" # both b_text, b_dt, _ = transcribe_with(baseline_model, audio, num_beams) f_text, f_dt, _ = transcribe_with(ft_model, audio, num_beams) return b_text, f_text, \ f"{b_dt:.2f}s · RTF {b_dt/duration:.2f}", \ f"{f_dt:.2f}s · RTF {f_dt/duration:.2f}", \ f"Audio: {duration:.1f}s" # ---------- UI ---------- THEME = gr.themes.Monochrome( primary_hue="neutral", neutral_hue="slate", radius_size=gr.themes.sizes.radius_lg, font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"], ).set( body_background_fill="#000000", body_text_color="#fafafa", block_background_fill="#0a0a0a", block_border_color="#1a1a1a", button_primary_background_fill="#fafafa", button_primary_text_color="#000000", ) CSS = """ #title { letter-spacing: -0.02em; } .gradio-container { max-width: 1100px !important; } footer { display: none !important; } """ with gr.Blocks(theme=THEME, css=CSS, title="AUTOLYRICS") as demo: gr.HTML("""

AUTOLYRICS

Transcribing the voice inside music. Whisper-small fine-tuned with LoRA on singing.

""") with gr.Row(): with gr.Column(scale=1): audio = gr.Audio(type="filepath", label="Upload or record", sources=["upload", "microphone"]) with gr.Row(): beams = gr.Slider(1, 8, value=5, step=1, label="Beam search width") choice = gr.Radio( ["Both (compare)", "Baseline only", "Fine-tuned only"], value="Both (compare)", label="Mode") run_btn = gr.Button("Transcribe", variant="primary") meta = gr.Markdown("") with gr.Column(scale=1): with gr.Group(): gr.Markdown("### Baseline · Whisper-small") base_out = gr.Textbox(lines=4, show_label=False, placeholder="Baseline transcription will appear here…") base_meta = gr.Markdown("") with gr.Group(): gr.Markdown("### Fine-tuned · AUTOLYRICS (LoRA)") ft_out = gr.Textbox(lines=4, show_label=False, placeholder="Fine-tuned transcription will appear here…") ft_meta = gr.Markdown("") gr.Examples( examples=[ ["examples/pop_clip.wav", 5, "Both (compare)"], ["examples/ballad_clip.wav",5, "Both (compare)"], ["examples/rap_clip.wav", 5, "Both (compare)"], ], inputs=[audio, beams, choice], outputs=[base_out, ft_out, base_meta, ft_meta, meta], fn=run, cache_examples=False, ) run_btn.click( run, inputs=[audio, beams, choice], outputs=[base_out, ft_out, base_meta, ft_meta, meta], ) demo.queue(max_size=12).launch( server_name="0.0.0.0", # Required: HF Spaces proxy expects this binding server_port=7860, # HF Spaces standard port show_error=True, # Surface tracebacks in the UI during debugging )