| | """ |
| | Chichewa Speech2Text — Diff Viewer (Single-model inference + Ground-truth diff) |
| | |
| | What this app does: |
| | - Lets the user upload/record an audio clip (≤30s recommended). |
| | - Lets the user paste a human-verified "ground truth" reference transcript. |
| | - Lets the user choose ONE system to run (Base / Fine-tuned / OpenAI). |
| | - Produces a 2-column, word-level highlighted diff: Reference vs Hypothesis. |
| | |
| | Notes: |
| | - This keeps inference lightweight (runs only one model at a time). |
| | - The demo audio is preloaded on page load, but inference is NOT auto-run to reduce break risk. |
| | """ |
| |
|
| | import os |
| | import html |
| | import re |
| | import time |
| | import urllib.request |
| | from difflib import SequenceMatcher |
| | from pathlib import Path |
| | from typing import Optional, Tuple |
| |
|
| | import gradio as gr |
| | import librosa |
| | import numpy as np |
| | import torch |
| | from transformers import WhisperForConditionalGeneration, WhisperProcessor |
| | from openai import OpenAI |
| |
|
| |
|
| | |
| | |
| | |
# Public demo clip (Chichewa voice note) fetched on first use.
DEMO_URL = "https://github.com/dmatekenya/AI-seminars-malawi/releases/download/v1.0/WAU12.wav"
# Cached download location; /tmp is writable on Hugging Face Spaces.
DEMO_AUDIO_PATH = Path("/tmp/demo.wav")
| |
|
| |
|
def ensure_demo_audio() -> str:
    """
    Ensure the demo audio exists on disk and return its path as a string.

    Downloads to a temporary file first and atomically renames it into place,
    so a crashed download never leaves a truncated file that the cache check
    would later treat as valid.

    Returns:
        The demo audio path as a string.

    Raises:
        RuntimeError: If the download fails or the resulting file is empty.
    """
    DEMO_AUDIO_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Reuse a previously downloaded, non-empty copy.
    if DEMO_AUDIO_PATH.exists() and DEMO_AUDIO_PATH.stat().st_size > 0:
        print(f"[demo] Using cached audio: {DEMO_AUDIO_PATH} ({DEMO_AUDIO_PATH.stat().st_size} bytes)", flush=True)
        return str(DEMO_AUDIO_PATH)

    print(f"[demo] Downloading demo audio from: {DEMO_URL}", flush=True)
    tmp_path = DEMO_AUDIO_PATH.with_suffix(".wav.tmp")
    try:
        urllib.request.urlretrieve(DEMO_URL, tmp_path)
        os.replace(tmp_path, DEMO_AUDIO_PATH)
    except Exception as e:
        # Fix: clean up any partial temp file so it does not linger in /tmp.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        # Chain the original exception explicitly for easier debugging.
        raise RuntimeError(f"[demo] Failed to download demo audio from {DEMO_URL}. Error: {e}") from e

    if not DEMO_AUDIO_PATH.exists() or DEMO_AUDIO_PATH.stat().st_size == 0:
        raise RuntimeError(f"[demo] Download completed but file is missing/empty at {DEMO_AUDIO_PATH}")

    print(f"[demo] Downloaded demo audio: {DEMO_AUDIO_PATH} ({DEMO_AUDIO_PATH.stat().st_size} bytes)", flush=True)
    return str(DEMO_AUDIO_PATH)
| |
|
| |
|
| | |
| | |
| | |
# Hugging Face model repositories.
BASE_REPO = "openai/whisper-large-v3"
FINETUNED_REPO = "dmatekenya/whisper-large-v3-chichewa"
# Pin the fine-tuned checkpoint to an exact commit for reproducibility.
FINETUNED_REVISION = "bff60fb08ba9f294e05bfcab4306f30b6a0cfc0a"

# NOTE(review): Whisper exposes no Chichewa language token; "shona" is
# presumably used as the closest supported Bantu language — TODO confirm.
LOCAL_LANGUAGE = "shona"

TARGET_SR = 16000   # Whisper expects 16 kHz mono input.
MAX_SECONDS = 30.0  # Recommended clip length cap (appears unenforced in this file).

OPENAI_MODEL = "gpt-4o-transcribe"  # Hosted ASR model used by transcribe_openai().
| |
|
| |
|
| | |
| | |
| | |
# --- Static HTML fragments rendered by the Gradio page ---

# Centered project logo.
LOGO_HTML = """
<div style="text-align:center; margin-bottom: 25px;">
<img src="https://i.ibb.co/5nQdGSs/logo.png"
style="max-width: 100%; height: auto; border-radius: 12px;">
</div>
"""

# Page title and usage instructions.
HEADER_HTML = """
<div style="text-align:center; max-width:900px; margin:0 auto;">
<h1 style="font-size:36px; margin-bottom:12px;">
Chichewa Speech2Text: Ground Truth Diff Viewer
</h1>
<p style="font-size:20px; font-weight:700; color:#1F3A5F; margin-bottom:12px;">
Paste a human-verified reference transcript and compare it to one ASR system at a time.
</p>
<p style="font-size:18px; color:#444; margin-bottom:25px;">
Record or upload a short voice note (≤30 seconds recommended).
</p>
</div>
"""

# Thin horizontal rule between sections.
DIVIDER = """
<div style="max-width:900px; margin:10px auto;">
<hr style="border:0; border-top:1px solid #ddd;">
</div>
"""

# Footer with project links.
ARTICLE_HTML = """
<p style="text-align:center; margin-top: 10px;">
Read more about the <a href="https://dmatekenya.github.io/Chichewa-Speech2Text/README.html" target="_blank">ChichewaSpeech2Text</a> project
and sign up for our voice note donation event:
<a href="https://forms.gle/fHLESutofVvb2YFM9" target="_blank">Google Form</a>.
</p>
"""
| |
|
| |
|
| | |
| | |
| | |
# --- Device, processor, and model setup (runs once at import time) ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
print(f"Using device: {DEVICE}", flush=True)

# One processor serves both local models; it is loaded from the base repo,
# configured for transcription in LOCAL_LANGUAGE.
PROCESSOR = WhisperProcessor.from_pretrained(
    BASE_REPO,
    language=LOCAL_LANGUAGE,
    task="transcribe",
)

MODEL_BASE = WhisperForConditionalGeneration.from_pretrained(BASE_REPO).to(DEVICE).eval()
MODEL_FT = WhisperForConditionalGeneration.from_pretrained(
    FINETUNED_REPO,
    revision=FINETUNED_REVISION,
).to(DEVICE).eval()

if DEVICE == "cuda":
    # Half precision on GPU roughly halves memory for whisper-large-v3.
    MODEL_BASE = MODEL_BASE.to(dtype=DTYPE)
    MODEL_FT = MODEL_FT.to(dtype=DTYPE)

# Bug fix: OpenAI() raises at construction when OPENAI_API_KEY is unset, which
# crashed the whole app at import time and made the graceful "disabled" message
# in transcribe_openai() unreachable. Construct the client only when a key is
# configured; transcribe_openai() already guards on the env var before use.
OPENAI_CLIENT = OpenAI() if os.getenv("OPENAI_API_KEY") else None
| |
|
| |
|
| | |
| | |
| | |
def load_audio(audio_path: str) -> Tuple[np.ndarray, int, float]:
    """Decode *audio_path* as 16 kHz mono; return (samples, sample_rate, duration_s)."""
    samples, rate = librosa.load(audio_path, sr=TARGET_SR, mono=True)
    duration = float(len(samples) / rate) if rate else 0.0
    return samples, rate, duration
| |
|
| |
|
@torch.inference_mode()
def transcribe_local(model: WhisperForConditionalGeneration, audio_16k: np.ndarray) -> str:
    """Run one local Whisper model over a 16 kHz mono waveform and return the text."""
    features = PROCESSOR(
        audio_16k,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    ).input_features

    features = features.to(DEVICE)
    if DEVICE == "cuda":
        # Match the half-precision weights on GPU.
        features = features.to(dtype=DTYPE)

    token_ids = model.generate(input_features=features)
    decoded = PROCESSOR.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
| |
|
| |
|
def transcribe_openai(audio_path: str) -> str:
    """Transcribe *audio_path* with the hosted OpenAI ASR model.

    Returns a human-readable notice instead of raising when no API key is set.
    """
    if not os.getenv("OPENAI_API_KEY"):
        return "OpenAI ASR disabled: OPENAI_API_KEY not set in Space Secrets."

    # Biasing prompt to keep Malawian place names intact.
    hint = "Chichewa transcription. Malawi names like Lilongwe, Blantyre, Zomba. Keep local names as spoken."

    with open(audio_path, "rb") as audio_file:
        response = OPENAI_CLIENT.audio.transcriptions.create(
            model=OPENAI_MODEL,
            file=audio_file,
            prompt=hint,
            temperature=0.0,
            response_format="json",
        )

    text = response.text or ""
    return text.strip()
| |
|
| |
|
def transcribe_selected(audio_path: Optional[str], which: str) -> Tuple[str, str]:
    """
    Run exactly one ASR system over the given clip.

    Parameters:
        audio_path: filepath from the Gradio audio component (None/"" if empty)
        which: "Base" | "Fine-tuned" | "OpenAI"

    Returns:
        (status_message, hypothesis_text)
    """
    if not audio_path:
        return "Please record or upload an audio clip.", ""

    # Only the local Whisper models need a decoded waveform; the OpenAI
    # endpoint consumes the raw file directly.
    waveform = None
    if which in ("Base", "Fine-tuned"):
        try:
            waveform, _rate, _duration = load_audio(audio_path)
        except Exception as e:
            return f"❌ Failed to load audio: {e}", ""

    started = time.time()
    try:
        if which == "Base":
            hypothesis = transcribe_local(MODEL_BASE, waveform)
        elif which == "Fine-tuned":
            hypothesis = transcribe_local(MODEL_FT, waveform)
        elif which == "OpenAI":
            hypothesis = transcribe_openai(audio_path)
        else:
            return f"Unknown model selection: {which}", ""
    except Exception as e:
        return f"❌ {which} failed: {e}", ""

    elapsed = time.time() - started
    return f"✅ {which} done in {elapsed:.2f}s", (hypothesis or "").strip()
| |
|
| |
|
| | |
| | |
| | |
| | def _tokenize_words(s: str): |
| | return re.findall(r"\w+|[^\w\s]", s, flags=re.UNICODE) |
| |
|
| |
|
| | def diff_highlight_html(ref: str, hyp: str, title_ref="Reference", title_hyp="Hypothesis") -> str: |
| | """ |
| | Returns HTML showing a word-level diff between ref and hyp. |
| | - deletions (in ref not in hyp): red + strikethrough (shown on reference side) |
| | - insertions (in hyp not in ref): green (shown on hypothesis side) |
| | - replacements: red struck old (ref) + green new (hyp) |
| | """ |
| | ref_toks = _tokenize_words(ref or "") |
| | hyp_toks = _tokenize_words(hyp or "") |
| |
|
| | sm = SequenceMatcher(a=ref_toks, b=hyp_toks) |
| | ref_out, hyp_out = [], [] |
| |
|
| | for tag, i1, i2, j1, j2 in sm.get_opcodes(): |
| | a = ref_toks[i1:i2] |
| | b = hyp_toks[j1:j2] |
| |
|
| | if tag == "equal": |
| | ref_out += [html.escape(t) for t in a] |
| | hyp_out += [html.escape(t) for t in b] |
| |
|
| | elif tag == "delete": |
| | ref_out += [ |
| | f"<span style='color:#b00020;text-decoration:line-through;background:#ffe6e6;padding:1px 3px;border-radius:4px;'>{html.escape(t)}</span>" |
| | for t in a |
| | ] |
| |
|
| | elif tag == "insert": |
| | hyp_out += [ |
| | f"<span style='color:#0a7a0a;background:#e6ffe6;padding:1px 3px;border-radius:4px;'>{html.escape(t)}</span>" |
| | for t in b |
| | ] |
| |
|
| | elif tag == "replace": |
| | ref_out += [ |
| | f"<span style='color:#b00020;text-decoration:line-through;background:#ffe6e6;padding:1px 3px;border-radius:4px;'>{html.escape(t)}</span>" |
| | for t in a |
| | ] |
| | hyp_out += [ |
| | f"<span style='color:#0a7a0a;background:#e6ffe6;padding:1px 3px;border-radius:4px;'>{html.escape(t)}</span>" |
| | for t in b |
| | ] |
| |
|
| | def _join(tokens): |
| | s = " ".join(tokens) |
| | s = re.sub(r"\s+([,.;:!?])", r"\1", s) |
| | s = re.sub(r"\(\s+", "(", s) |
| | s = re.sub(r"\s+\)", ")", s) |
| | return s |
| |
|
| | ref_html = _join(ref_out) |
| | hyp_html = _join(hyp_out) |
| |
|
| | return f""" |
| | <div style="display:grid;grid-template-columns:1fr 1fr;gap:14px;"> |
| | <div style="padding:12px;border:1px solid #ddd;border-radius:10px;"> |
| | <div style="font-weight:700;margin-bottom:8px;">{html.escape(title_ref)}</div> |
| | <div style="line-height:1.6;">{ref_html}</div> |
| | </div> |
| | <div style="padding:12px;border:1px solid #ddd;border-radius:10px;"> |
| | <div style="font-weight:700;margin-bottom:8px;">{html.escape(title_hyp)}</div> |
| | <div style="line-height:1.6;">{hyp_html}</div> |
| | </div> |
| | </div> |
| | """ |
| |
|
| |
|
def make_diff_from_gt(reference_text: str, hypothesis_text: str, which: str) -> str:
    """Render the ground-truth diff panel, or a placeholder when input is missing."""
    reference = (reference_text or "").strip()
    hypothesis = (hypothesis_text or "").strip()

    # Guard clauses: show a friendly prompt until both texts are available.
    if not reference:
        return (
            "<div style='padding:12px;border:1px solid #ddd;border-radius:10px;'>"
            "❗ Paste a ground-truth reference transcript to see the diff."
            "</div>"
        )
    if not hypothesis:
        return (
            "<div style='padding:12px;border:1px solid #ddd;border-radius:10px;'>"
            "❗ No hypothesis transcript yet. Click “Transcribe + Show Diff”."
            "</div>"
        )

    return diff_highlight_html(
        reference,
        hypothesis,
        title_ref="Ground truth (Reference)",
        title_hyp=f"{which} (Hypothesis)",
    )
| |
|
| |
|
| | |
| | |
| | |
with gr.Blocks(theme="grass", title="Chichewa Speech2Text — Diff Viewer") as demo:
    gr.HTML(LOGO_HTML)
    gr.HTML(DIVIDER)
    gr.HTML(HEADER_HTML)

    # Fix: the Audio component previously received a Path object pointing at a
    # file that may not exist yet (the download only ran later, in demo.load),
    # which can fail at UI construction time. Preload eagerly and fall back to
    # an empty component if the download fails, so building the UI never crashes.
    try:
        _initial_audio: Optional[str] = ensure_demo_audio()
    except Exception as e:
        print(f"[demo] Could not preload demo audio: {e}", flush=True)
        _initial_audio = None

    audio_in = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input (Record or Upload)",
        value=_initial_audio,
    )

    reference_in = gr.Textbox(
        label="Ground truth reference transcript (paste here)",
        lines=5,
        placeholder="Paste the correct (human-verified) transcript here...",
    )

    with gr.Row():
        which_model = gr.Radio(
            choices=["Fine-tuned", "Base", "OpenAI"],
            value="Fine-tuned",
            label="Choose one system to transcribe (runs only one)",
        )
        run_btn = gr.Button("Transcribe + Show Diff", variant="primary")

    status_out = gr.Textbox(label="Status / timing", lines=2)
    hyp_out = gr.Textbox(label="Hypothesis transcript", lines=10)
    diff_out = gr.HTML()

    # Run the selected system, then chain into the diff renderer.
    run_btn.click(
        fn=transcribe_selected,
        inputs=[audio_in, which_model],
        outputs=[status_out, hyp_out],
    ).then(
        fn=make_diff_from_gt,
        inputs=[reference_in, hyp_out, which_model],
        outputs=[diff_out],
    )

    # Live-refresh the diff while the user edits the reference text.
    reference_in.change(
        fn=make_diff_from_gt,
        inputs=[reference_in, hyp_out, which_model],
        outputs=[diff_out],
    )

    # Re-ensure the demo clip on each page load (cheap cache hit after the
    # first download) so the component recovers if /tmp was wiped.
    demo.load(
        fn=ensure_demo_audio,
        inputs=None,
        outputs=[audio_in],
    )

    # NOTE(review): ARTICLE_HTML is raw HTML passed to gr.Markdown; it renders,
    # but gr.HTML would be more consistent with the other fragments.
    gr.Markdown(ARTICLE_HTML)
| |
|
| |
|
if __name__ == "__main__":
    # Cap concurrent event handlers at 2 — presumably to bound model memory
    # use on a single GPU/CPU; confirm against deployment resources.
    demo.queue(default_concurrency_limit=2).launch()