import io
import time
from typing import Tuple

import numpy as np
import librosa
import torch
import streamlit as st
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# ─── Page Config ─────────────────────────────────────────────────────────────
# Streamlit requires set_page_config to be the first Streamlit call in the script.
st.set_page_config(
    page_title="RegionalCap · ASR",
    page_icon="🎙️",
    layout="centered",
    initial_sidebar_state="collapsed",
)

# ─── CSS ──────────────────────────────────────────────────────────────────────
# NOTE(review): the markup literal below holds only whitespace in this copy of
# the file — confirm whether a stylesheet is supposed to be injected here.
st.markdown(""" """, unsafe_allow_html=True)

# ─── Constants ────────────────────────────────────────────────────────────────
REPO_ID = "rocky250/RegionalCap"         # repo the fine-tuned checkpoints are loaded from
PROCESSOR_ID = "openai/whisper-small"    # repo the processor is loaded from
SAMPLE_RATE = 16000                      # Hz; used for librosa.load and the processor
CHECKPOINTS = [f"checkpoint-{n}" for n in range(1000, 11000, 1000)]  # checkpoint-1000 … checkpoint-10000
DEFAULT_CKPT = "checkpoint-10000"
AUDIO_FMTS = ["wav", "mp3", "flac", "ogg", "m4a"]


# ─── Model loader ─────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def load_model(
    checkpoint: str,
) -> Tuple[WhisperProcessor, WhisperForConditionalGeneration, str]:
    """Load the processor and one fine-tuned checkpoint, ready for inference.

    The result is cached by ``st.cache_resource``, keyed on ``checkpoint``.

    Args:
        checkpoint: Subfolder of ``REPO_ID`` to load, e.g. ``"checkpoint-10000"``.

    Returns:
        ``(processor, model, device)`` where ``device`` is ``"cuda"`` when
        CUDA is available and ``"cpu"`` otherwise. The model is moved to that
        device, uses float16 on CUDA / float32 on CPU, and is in eval mode.
    """
    # NOTE(review): every distinct checkpoint stays cached (no max_entries is
    # set) — confirm this is acceptable for the available memory.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = WhisperProcessor.from_pretrained(PROCESSOR_ID)
    model = WhisperForConditionalGeneration.from_pretrained(
        REPO_ID,
        subfolder=checkpoint,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        low_cpu_mem_usage=True,
    ).to(device)

    # Wipe any pre-baked forced_decoder_ids (in both configs) so the language
    # and task passed to generate() at runtime are the ones that apply.
    model.generation_config.forced_decoder_ids = None
    model.config.forced_decoder_ids = None

    # Per the original author's note: an empty suppress_tokens list avoids a
    # duplicate logits processor during generation.
    model.generation_config.suppress_tokens = []

    model.eval()
    return processor, model, device


# ─── Transcription ────────────────────────────────────────────────────────────
def _transcribe_chunk(
    chunk: np.ndarray,
    processor,
    model,
    device: str,
    language: str,
    task: str,
) -> Tuple[str, float]:
    """Transcribe one window of audio; return (text, seconds spent in generate)."""
    inputs = processor(
        chunk,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        return_attention_mask=True,
    )
    # Cast to the model's own dtype instead of inferring fp16 from the device
    # string, so the features always match however the model was loaded.
    input_features = inputs.input_features.to(device=device, dtype=model.dtype)
    # Passed through to generate(); the original author added it to avoid a
    # padding-ambiguity warning.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # language= / task= kwargs are used rather than forced_decoder_ids
    # (original author's note: supported from transformers ≥ 4.27).
    t0 = time.perf_counter()
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features,
            attention_mask=attention_mask,
            language=language,
            task=task,
        )
    elapsed = time.perf_counter() - t0

    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return text.strip(), elapsed


def run_transcription(
    audio_bytes: bytes,
    processor,
    model,
    device: str,
    language: str,
    task: str,
) -> Tuple[str, float, float]:
    """Transcribe an uploaded audio file held in memory.

    The audio is decoded straight from ``audio_bytes`` (no temp file),
    resampled to ``SAMPLE_RATE`` mono, and transcribed window by window.
    Whisper's feature extractor truncates each call to a single fixed-length
    window (30 s by default), so feeding a longer clip in one call silently
    drops everything after that window. Splitting into consecutive windows
    keeps the whole recording.

    Args:
        audio_bytes: Raw bytes of the audio file.
        processor: Whisper processor as returned by ``load_model``.
        model: Whisper model as returned by ``load_model``.
        device: Device string the model lives on (``"cuda"`` or ``"cpu"``).
        language: Language passed to ``model.generate``.
        task: Task passed to ``model.generate``.

    Returns:
        ``(text, duration, elapsed)``: the stripped transcript, the clip
        length in seconds, and the seconds spent inside ``model.generate``
        (summed over all windows).
    """
    audio_np, _ = librosa.load(io.BytesIO(audio_bytes), sr=SAMPLE_RATE, mono=True)
    duration = len(audio_np) / SAMPLE_RATE

    # Window length in samples; read from the feature extractor when it
    # exposes it, otherwise fall back to Whisper's standard 30 s.
    window = getattr(
        getattr(processor, "feature_extractor", None),
        "n_samples",
        30 * SAMPLE_RATE,
    )

    pieces = []
    elapsed = 0.0
    # max(..., 1) yields exactly one (empty) window for zero-length audio, so
    # that case still makes a single processor/generate call as before.
    # Windows are cut back-to-back without overlap; a word straddling a
    # boundary may be split, which is still better than dropping the tail.
    for start in range(0, max(len(audio_np), 1), window):
        text, seconds = _transcribe_chunk(
            audio_np[start:start + window],
            processor,
            model,
            device,
            language,
            task,
        )
        elapsed += seconds
        if text:
            pieces.append(text)

    return " ".join(pieces), duration, elapsed
# ══════════════════════════════════════════════════════════════════════════════ # UI # ══════════════════════════════════════════════════════════════════════════════ # ─── Hero ───────────────────────────────────────────────────────────────────── st.markdown("""
Bengali Dialect ASR · rocky250/RegionalCap · Whisper fine-tune
' 'Supported: WAV · MP3 · FLAC · OGG · M4A
', unsafe_allow_html=True, ) st.stop() # ─── Preview ────────────────────────────────────────────────────────────────── audio_bytes = uploaded.read() ext = uploaded.name.rsplit(".", 1)[-1].lower() st.audio(audio_bytes, format=f"audio/{ext}") st.markdown( f'{uploaded.name}' f'{len(audio_bytes)/1024:.1f} kB', unsafe_allow_html=True, ) st.markdown('