"""Streamlit app: speech recognition with the RegionalCap Whisper checkpoints."""

import io
import time
from typing import List, Tuple

import librosa
import numpy as np
import streamlit as st
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# ─── Page Config ─────────────────────────────────────────────────────────────
st.set_page_config(
    page_title="RegionalCap · ASR",
    page_icon="🎙️",
    layout="centered",
    initial_sidebar_state="collapsed",
)

# ─── CSS ──────────────────────────────────────────────────────────────────────
st.markdown(""" """, unsafe_allow_html=True)

# ─── Constants ────────────────────────────────────────────────────────────────
REPO_ID = "rocky250/RegionalCap"
PROCESSOR_ID = "openai/whisper-small"
SAMPLE_RATE = 16000
CHECKPOINTS = [f"checkpoint-{n}" for n in range(1000, 11000, 1000)]
DEFAULT_CKPT = "checkpoint-10000"
AUDIO_FMTS = ["wav", "mp3", "flac", "ogg", "m4a"]

# The Whisper feature extractor pads/truncates each input to one 30 s window,
# so longer recordings have to be fed to the model window by window.
CHUNK_SECONDS = 30
# A trailing remainder shorter than this is dropped: it would be padded with
# ~30 s of silence, which tends to produce spurious text rather than speech.
MIN_TAIL_SECONDS = 0.5
# Upper bound on checkpoints kept in memory by the resource cache.
MAX_CACHED_MODELS = 2


# ─── Model loader ─────────────────────────────────────────────────────────────
@st.cache_resource(show_spinner=False, max_entries=MAX_CACHED_MODELS)
def load_model(
    checkpoint: str,
) -> Tuple[WhisperProcessor, WhisperForConditionalGeneration, str]:
    """Load the processor and one fine-tuned checkpoint; cached across reruns.

    The cache is bounded by ``MAX_CACHED_MODELS`` so that browsing through
    the checkpoint list does not keep every model resident at once.

    Args:
        checkpoint: Sub-folder of ``REPO_ID`` holding the weights,
            e.g. ``"checkpoint-10000"``.

    Returns:
        ``(processor, model, device)`` where ``device`` is ``"cuda"`` or
        ``"cpu"`` and the model is already moved to it and in eval mode.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32

    processor = WhisperProcessor.from_pretrained(PROCESSOR_ID)
    model = WhisperForConditionalGeneration.from_pretrained(
        REPO_ID,
        subfolder=checkpoint,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    ).to(device)

    # Wipe pre-baked forced_decoder_ids (in both configs) so the language/task
    # chosen at runtime is what drives generation.
    model.generation_config.forced_decoder_ids = None
    model.config.forced_decoder_ids = None

    # Empty suppress_tokens to avoid a duplicate logits processor.
    model.generation_config.suppress_tokens = []

    model.eval()
    return processor, model, device


# ─── Transcription ────────────────────────────────────────────────────────────
def _split_into_windows(audio_np: np.ndarray) -> List[np.ndarray]:
    """Cut a mono waveform into consecutive windows of at most CHUNK_SECONDS."""
    window = CHUNK_SECONDS * SAMPLE_RATE
    segments = [
        audio_np[start:start + window]
        for start in range(0, len(audio_np), window)
    ]
    # Never drop the only segment; only a too-short *trailing* remainder.
    if len(segments) > 1 and len(segments[-1]) < MIN_TAIL_SECONDS * SAMPLE_RATE:
        segments.pop()
    return segments


def run_transcription(
    audio_bytes: bytes,
    processor,
    model,
    device: str,
    language: str,
    task: str,
) -> Tuple[str, float, float]:
    """Decode an uploaded audio file and transcribe (or translate) it.

    The audio is decoded from memory (no temp file), resampled to
    ``SAMPLE_RATE`` mono and processed in consecutive ``CHUNK_SECONDS``
    windows, so recordings longer than one window are transcribed in full.
    Windows do not overlap, so a word spanning a window boundary may be
    split. Language and task are passed to ``generate()`` as keyword
    arguments rather than as ``forced_decoder_ids``.

    Args:
        audio_bytes: Raw bytes of the uploaded file.
        processor: Processor returned by ``load_model``.
        model: Model returned by ``load_model``.
        device: Device the model lives on (``"cuda"`` or ``"cpu"``).
        language: Language code handed to ``model.generate``.
        task: ``"transcribe"`` or ``"translate"``.

    Returns:
        ``(text, duration, elapsed)``: the stripped transcript, the audio
        length in seconds, and the seconds spent inside ``model.generate``.

    Raises:
        ValueError: If the decoded audio contains no samples.
    """
    # Load audio from memory (no disk write).
    audio_np, _ = librosa.load(io.BytesIO(audio_bytes), sr=SAMPLE_RATE, mono=True)
    if audio_np.size == 0:
        raise ValueError("The uploaded file contains no audio samples.")
    duration = len(audio_np) / SAMPLE_RATE

    pieces: List[str] = []
    elapsed = 0.0
    with torch.no_grad():
        for segment in _split_into_windows(audio_np):
            # Feature extraction
            inputs = processor(
                segment,
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt",
                return_attention_mask=True,
            )
            # Match the model's dtype (fp16 on GPU, fp32 on CPU) instead of
            # inferring it from the device string.
            input_features = inputs.input_features.to(
                device=device, dtype=model.dtype
            )
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            # Generate; only this call is counted as inference time.
            t0 = time.perf_counter()
            predicted_ids = model.generate(
                input_features,
                attention_mask=attention_mask,
                language=language,
                task=task,
            )
            elapsed += time.perf_counter() - t0

            decoded = processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )[0]
            pieces.append(decoded.strip())

    text = " ".join(piece for piece in pieces if piece)
    return text, duration, elapsed
# ══════════════════════════════════════════════════════════════════════════════
# UI
# ══════════════════════════════════════════════════════════════════════════════
import html  # local to the UI section: escapes untrusted text before rendering

# st.audio expects a MIME type; these extensions do not map to "audio/<ext>".
_MIME_BY_EXT = {"mp3": "audio/mpeg", "m4a": "audio/mp4"}

# ─── Hero ─────────────────────────────────────────────────────────────────────
st.markdown("""

🎙️ RegionalCap

Bengali Dialect ASR · rocky250/RegionalCap · Whisper fine-tune

""", unsafe_allow_html=True)

# ─── Sidebar ──────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("### ⚙️ Config")
    checkpoint = st.selectbox(
        "Checkpoint", CHECKPOINTS, index=CHECKPOINTS.index(DEFAULT_CKPT)
    )
    language = st.selectbox("Language", ["bn", "en"], index=0)
    task = st.selectbox("Task", ["transcribe", "translate"], index=0)
    st.markdown("---")
    st.caption(f"**Repo** `{REPO_ID}`")
    st.caption(f"**Processor** `{PROCESSOR_ID}`")

# ─── Load model ───────────────────────────────────────────────────────────────
_status = st.empty()
with st.spinner("Loading model… (first run downloads weights, subsequent runs are instant)"):
    try:
        processor, model, device = load_model(checkpoint)
    except Exception as exc:
        # Top-level UI boundary: show the failure and halt this script run.
        _status.markdown('✗ LOAD FAILED', unsafe_allow_html=True)
        st.error(str(exc))
        st.stop()
    _status.markdown(
        f'✓ READY'
        f'{checkpoint}'
        f'{device.upper()}',
        unsafe_allow_html=True,
    )

st.markdown('\n\n', unsafe_allow_html=True)

# ─── Audio upload ─────────────────────────────────────────────────────────────
st.markdown('\n\n01 · Upload Audio\n\n', unsafe_allow_html=True)

uploaded = st.file_uploader(
    "audio",
    type=AUDIO_FMTS,
    label_visibility="collapsed",
)

if uploaded is None:
    st.markdown(
        '\n\n'
        'Supported: WAV · MP3 · FLAC · OGG · M4A\n\n',
        unsafe_allow_html=True,
    )
    st.stop()

# ─── Preview ──────────────────────────────────────────────────────────────────
audio_bytes = uploaded.read()
ext = uploaded.name.rsplit(".", 1)[-1].lower()
st.audio(audio_bytes, format=_MIME_BY_EXT.get(ext, f"audio/{ext}"))

# The file name is user-controlled and rendered with unsafe_allow_html=True,
# so it must be escaped before interpolation.
st.markdown(
    f'{html.escape(uploaded.name)}'
    f'{len(audio_bytes)/1024:.1f} kB',
    unsafe_allow_html=True,
)

st.markdown('\n\n', unsafe_allow_html=True)

# ─── Transcribe ───────────────────────────────────────────────────────────────
st.markdown('\n\n02 · Transcribe\n\n', unsafe_allow_html=True)

# NOTE(review): clicking the download button below triggers a rerun in which
# st.button() is presumably False again, so the rendered transcript would
# disappear — consider keeping the result in st.session_state. TODO confirm.
if st.button("▶ Run Transcription"):
    result_slot = st.empty()
    result_slot.info("Processing audio…")

    try:
        text, duration, elapsed = run_transcription(
            audio_bytes, processor, model, device, language, task
        )
    except Exception as exc:
        # Top-level UI boundary: report decoding/inference failures to the user.
        result_slot.empty()
        st.markdown('✗ ERROR', unsafe_allow_html=True)
        st.error(str(exc))
    else:
        rtf = elapsed / duration if duration > 0 else 0.0
        result_slot.empty()  # clear the "processing" message

        # The transcript is model output rendered with unsafe_allow_html=True;
        # escape it so stray markup is shown as text instead of interpreted.
        st.markdown(
            f'\n\n'
            f'\n\n{html.escape(text)}\n\n'
            f'\n\n'
            f'audio {duration:.1f}s · '
            f'inference {elapsed:.2f}s · '
            f'RTF {rtf:.3f} · '
            f'{device}\n\n'
            f'\n\n',
            unsafe_allow_html=True,
        )

        # The download carries the raw (unescaped) transcript.
        st.download_button(
            label="⬇ Download .txt",
            data=text,
            file_name=f"{uploaded.name.rsplit('.',1)[0]}_transcription.txt",
            mime="text/plain",
        )