import os
import html
import re
import time
import urllib.request
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional, Tuple

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from openai import OpenAI

DEMO_URL = "https://github.com/dmatekenya/AI-seminars-malawi/releases/download/v1.1/test_15_secs.wav"
DEMO_AUDIO_PATH = Path("/tmp/demo.wav")


def ensure_demo_audio() -> str:
    """
    Ensure demo audio exists on disk and return the path as a string.
    Raises RuntimeError with a useful message if download fails.
    """
    DEMO_AUDIO_PATH.parent.mkdir(parents=True, exist_ok=True)

    # If already downloaded, reuse it
    if DEMO_AUDIO_PATH.exists() and DEMO_AUDIO_PATH.stat().st_size > 0:
        print(
            f"[demo] Using cached audio: {DEMO_AUDIO_PATH} "
            f"({DEMO_AUDIO_PATH.stat().st_size} bytes)",
            flush=True,
        )
        return str(DEMO_AUDIO_PATH)

    print(f"[demo] Downloading demo audio from: {DEMO_URL}", flush=True)
    try:
        # Download to a temp file first, then rename (avoids partial files)
        tmp_path = DEMO_AUDIO_PATH.with_suffix(".wav.tmp")
        urllib.request.urlretrieve(DEMO_URL, tmp_path)
        os.replace(tmp_path, DEMO_AUDIO_PATH)
    except Exception as e:
        raise RuntimeError(
            f"[demo] Failed to download demo audio from {DEMO_URL}. Error: {e}"
        )

    if not DEMO_AUDIO_PATH.exists() or DEMO_AUDIO_PATH.stat().st_size == 0:
        raise RuntimeError(
            f"[demo] Download completed but file is missing/empty at {DEMO_AUDIO_PATH}"
        )

    print(
        f"[demo] Downloaded demo audio: {DEMO_AUDIO_PATH} "
        f"({DEMO_AUDIO_PATH.stat().st_size} bytes)",
        flush=True,
    )
    return str(DEMO_AUDIO_PATH)


# -----------------------------
# Models / Config
# -----------------------------
BASE_REPO = "openai/whisper-large-v3"
FINETUNED_REPO = "dmatekenya/whisper-large-v3-chichewa"
FINETUNED_REVISION = "bff60fb08ba9f294e05bfcab4306f30b6a0cfc0a"  # pinned commit hash

# Local WhisperProcessor language hint (keep consistent with how you evaluated)
LOCAL_LANGUAGE = "shona"

# Audio constraints
TARGET_SR = 16000
MAX_SECONDS = 30.0

# OpenAI transcription model (commercial)
OPENAI_MODEL = "gpt-4o-transcribe"  # simple + stable

# -----------------------------
# UI Text / Styling
# -----------------------------
LOGO_HTML = """
<div style="text-align: center;"></div>
"""

HEADER_HTML = """
<div style="text-align: center;">
    <h1>Chichewa Speech2Text: How Custom Data Improves Transcription Performance</h1>
    <p><strong>Observe how the fine-tuned model provides better transcription quality.</strong></p>
    <p>Upload or record a short Chichewa voice note (≤30 seconds).</p>
</div>
"""
""" DIVIDER = """

""" # TITLE_HTML = """ #

# Chichewa Speech2Text: How Custom Data Improves Performance #

# """ # HIGHLIGHT_TEXT = """ #

# Observe how the fine-tuned model provides better transcription quality. #

# """ # DESCRIPTION_HTML = """ #

# Upload or record a short Chichewa voice note (≤30 seconds). The same audio will be transcribed by three systems. #

# """ ARTICLE_HTML = """

Read more about the ChichewaSpeech2Text project

""" # ----------------------------- # Load local models once # ----------------------------- DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 print(f"Using device: {DEVICE}", flush=True) PROCESSOR = WhisperProcessor.from_pretrained( BASE_REPO, language=LOCAL_LANGUAGE, task="transcribe", ) MODEL_BASE = WhisperForConditionalGeneration.from_pretrained(BASE_REPO).to(DEVICE).eval() MODEL_FT = WhisperForConditionalGeneration.from_pretrained( FINETUNED_REPO, revision=FINETUNED_REVISION, ).to(DEVICE).eval() if DEVICE == "cuda": MODEL_BASE = MODEL_BASE.to(dtype=DTYPE) MODEL_FT = MODEL_FT.to(dtype=DTYPE) OPENAI_CLIENT = OpenAI() # ----------------------------- # Helpers # ----------------------------- def _tokenize_words(s: str): # words + punctuation as separate tokens return re.findall(r"\w+|[^\w\s]", s, flags=re.UNICODE) def diff_highlight_html(ref: str, hyp: str, title_ref="Reference", title_hyp="Hypothesis") -> str: """ Returns HTML showing a word-level diff between ref and hyp. - deletions (in ref not in hyp): red + strikethrough - insertions (in hyp not in ref): green - replacements: red struck old + green new """ ref_toks = _tokenize_words(ref or "") hyp_toks = _tokenize_words(hyp or "") sm = SequenceMatcher(a=ref_toks, b=hyp_toks) ref_out, hyp_out = [], [] for tag, i1, i2, j1, j2 in sm.get_opcodes(): a = ref_toks[i1:i2] b = hyp_toks[j1:j2] if tag == "equal": ref_out += [html.escape(t) for t in a] hyp_out += [html.escape(t) for t in b] elif tag == "delete": ref_out += [f"{html.escape(t)}" for t in a] elif tag == "insert": hyp_out += [f"{html.escape(t)}" for t in b] elif tag == "replace": ref_out += [f"{html.escape(t)}" for t in a] hyp_out += [f"{html.escape(t)}" for t in b] # tidy spacing: join with spaces, then remove spaces before punctuation def _join(tokens): s = " ".join(tokens) s = re.sub(r"\s+([,.;:!?])", r"\1", s) s = re.sub(r"\(\s+", "(", s) s = re.sub(r"\s+\)", ")", s) return s ref_html = _join(ref_out) hyp_html = _join(hyp_out) return f"""
def make_diffs(base_text: str, ft_text: str, openai_text: str, ref_choice: str):
    if ref_choice == "Fine-tuned":
        ref = ft_text
        ref_name = "Fine-tuned (Reference)"
    elif ref_choice == "OpenAI":
        ref = openai_text
        ref_name = "OpenAI (Reference)"
    else:
        ref = base_text
        ref_name = "Base (Reference)"

    base_diff = diff_highlight_html(ref, base_text, title_ref=ref_name, title_hyp="Base")
    openai_diff = diff_highlight_html(ref, openai_text, title_ref=ref_name, title_hyp="OpenAI")
    return base_diff, openai_diff


def load_audio(audio_path: str) -> Tuple[np.ndarray, int, float]:
    y, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
    dur = float(len(y) / sr) if sr else 0.0
    return y, sr, dur


@torch.inference_mode()
def transcribe_local(model: WhisperForConditionalGeneration, audio_16k: np.ndarray) -> str:
    feats = PROCESSOR(
        audio_16k,
        return_tensors="pt",
        sampling_rate=TARGET_SR,
    ).input_features

    model_device = next(model.parameters()).device
    model_dtype = next(model.parameters()).dtype  # key line: match fp16 on GPU, fp32 on CPU
    feats = feats.to(device=model_device, dtype=model_dtype)

    generated_ids = model.generate(input_features=feats)
    text = PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text.strip()


def transcribe_openai(audio_path: str) -> str:
    if OPENAI_CLIENT is None:
        return "OpenAI ASR disabled: OPENAI_API_KEY not set in Space Secrets."
    prompt = (
        "Chichewa transcription. Malawi names like Lilongwe, Blantyre, Zomba. "
        "Keep local names as spoken."
    )
    with open(audio_path, "rb") as f:
        resp = OPENAI_CLIENT.audio.transcriptions.create(
            file=f,
            model=OPENAI_MODEL,
            prompt=prompt,
            temperature=0.0,
            response_format="json",
        )
    return (resp.text or "").strip()


def transcribe_all(audio_path: Optional[str]) -> Tuple[str, str, str, str]:
    """
    Returns: status, base_text, finetuned_text, openai_text
    """
    if not audio_path:
        return "Please record or upload an audio clip.", "", "", ""

    # Load audio once
    try:
        y, sr, dur = load_audio(audio_path)
    except Exception as e:
        return f"❌ Failed to load audio: {e}", "", "", ""

    status = []

    # Base (local)
    t0 = time.time()
    try:
        base_text = transcribe_local(MODEL_BASE, y)
        status.append(f"1. Open Source (base) {time.time()-t0:.2f}s")
    except Exception as e:
        base_text = f"[ERROR] Base failed: {e}"
        status.append("❌ Base failed")

    # Fine-tuned (local)
    t1 = time.time()
    try:
        ft_text = transcribe_local(MODEL_FT, y)
        status.append(f"2. Fine-tuned {time.time()-t1:.2f}s")
    except Exception as e:
        ft_text = f"[ERROR] Fine-tuned failed: {e}"
        status.append("❌ Fine-tuned failed")

    # OpenAI (commercial)
    t2 = time.time()
    try:
        openai_text = transcribe_openai(audio_path)
        status.append(f"3. OpenAI ({OPENAI_MODEL}) {time.time()-t2:.2f}s")
    except Exception as e:
        openai_text = f"[ERROR] OpenAI failed: {e}"
        status.append("❌ OpenAI failed")

    return "\n".join(status), base_text, ft_text, openai_text
def init_demo():
    audio_path = ensure_demo_audio()
    status, base_text, ft_text, openai_text = transcribe_all(audio_path)
    return audio_path, status, base_text, ft_text, openai_text


# -----------------------------
# Warm-up (local models only)
# -----------------------------
# def warmup():
#     try:
#         dummy = np.zeros(int(TARGET_SR * 1.0), dtype=np.float32)
#         _ = transcribe_local(MODEL_BASE, dummy)
#         _ = transcribe_local(MODEL_FT, dummy)
#         print("Warm-up complete.", flush=True)
#     except Exception as e:
#         print(f"Warm-up skipped/failed: {e}", flush=True)
# warmup()

# -----------------------------
# UI
# -----------------------------
with gr.Blocks(theme="grass", title="Chichewa Speech2Text") as demo:
    gr.HTML(LOGO_HTML)
    gr.HTML(DIVIDER)
    gr.HTML(HEADER_HTML)

    audio_in = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input (Record or Upload)",
        value=None,  # populated on page load by init_demo below
    )
    run_btn = gr.Button("Transcribe & Compare", variant="primary")

    status_out = gr.Textbox(label="Status / timing", lines=3)
    with gr.Row(equal_height=True):
        base_out = gr.Textbox(label="Open Source ASR Model", lines=12)
        ft_out = gr.Textbox(
            label="Open Source Model Fine-Tuned with Custom Chichewa Speech",
            lines=12,
        )
        commercial_out = gr.Textbox(label="Frontier Commercial ASR Model (OpenAI)", lines=12)

    run_btn.click(
        fn=transcribe_all,
        inputs=[audio_in],
        outputs=[status_out, base_out, ft_out, commercial_out],
    )

    # Preload audio + transcripts immediately on page load
    demo.load(
        fn=init_demo,
        inputs=None,
        outputs=[audio_in, status_out, base_out, ft_out, commercial_out],
    )

    gr.Markdown(ARTICLE_HTML)

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=2).launch()
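# make_diffs is defined above but never wired into the UI. A sketch of how it
# could be hooked up, assuming the lines below are placed inside the Blocks
# context above (the diff-panel component names are illustrative, not from the
# original):
#
# ref_choice = gr.Radio(
#     ["Base", "Fine-tuned", "OpenAI"], value="Fine-tuned", label="Diff reference"
# )
# base_diff_out = gr.HTML()
# openai_diff_out = gr.HTML()
# ref_choice.change(
#     fn=make_diffs,
#     inputs=[base_out, ft_out, commercial_out, ref_choice],
#     outputs=[base_diff_out, openai_diff_out],
# )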