dmatekenya committed on
Commit
256ade0
·
1 Parent(s): ad594d3

Added app code

Browse files
Files changed (2) hide show
  1. app.py +244 -0
  2. requirements.txt +84 -0
app.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ from typing import Optional, Tuple
4
+
5
+ import gradio as gr
6
+ import librosa
7
+ import numpy as np
8
+ import torch
9
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
10
+ from openai import OpenAI
11
+
12
# -----------------------------
# Models / Config
# -----------------------------
# Base (off-the-shelf) checkpoint used for the "Open Source" column.
BASE_REPO = "openai/whisper-large-v3"

# Fine-tuned checkpoint used for the middle column, pinned to an exact commit
# so a later push to the repo cannot silently change demo behavior.
FINETUNED_REPO = "dmatekenya/whisper-large-v3-chichewa"
FINETUNED_REVISION = "bff60fb08ba9f294e05bfcab4306f30b6a0cfc0a" # pinned commit hash

# Local WhisperProcessor language hint (keep consistent with how you evaluated)
# NOTE(review): "shona" in a Chichewa app looks surprising — presumably it
# matches the language token used during fine-tuning/evaluation; confirm
# before changing.
LOCAL_LANGUAGE = "shona"

# OpenAI language hint (ISO-639-1). Chichewa/Nyanja often "ny". Set to None to auto-detect.
OPENAI_LANGUAGE = "ny"

# Audio constraints
TARGET_SR = 16000   # sample rate the audio is resampled to before inference
MAX_SECONDS = 30.0  # clips longer than this are rejected in transcribe_all

# OpenAI transcription model (commercial)
OPENAI_MODEL = "whisper-1" # simple + stable
32
+
33
# -----------------------------
# UI Text / Styling
# -----------------------------
# Static HTML fragments rendered through gr.Markdown in the Blocks layout
# below. These are user-visible strings; edit text with care.
LOGO_HTML = """
<div style="text-align:center; margin-bottom: 25px;">
<img src="https://i.ibb.co/5nQdGSs/logo.png"
style="max-width: 100%; height: auto; border-radius: 12px;">
</div>
"""

TITLE_HTML = """
<h1 style="text-align:center; font-size:34px; margin-bottom:10px;">
Chichewa Speech2Text: How Custom Data Improves Performance
</h1>
"""

HIGHLIGHT_TEXT = """
<p style="text-align:center; font-size:20px; font-weight:600; color:#1F3A5F; margin-bottom:20px;">
Observe how the fine-tuned model provides better transcription quality.
</p>
"""

DESCRIPTION_HTML = """
<p style="text-align:center; font-size:18px; margin-bottom: 18px;">
Upload or record a short Chichewa voice note (≤30 seconds). The same audio will be transcribed by three systems.
</p>
"""

# Footer with project write-up and the voice-note donation sign-up form.
ARTICLE_HTML = """
<p style="text-align:center; margin-top: 10px;">
Read more about the <a href="https://dmatekenya.github.io/Chichewa-Speech2Text/README.html" target="_blank">ChichewaSpeech2Text</a> project
and sign up for our voice note donation event:
<a href="https://forms.gle/fHLESutofVvb2YFM9" target="_blank">Google Form</a>.
</p>
"""
68
+
69
# -----------------------------
# Load local models once
# -----------------------------
# Module-level so both Whisper checkpoints are loaded a single time at app
# startup and then shared by every Gradio request.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

print(f"Using device: {DEVICE}", flush=True)

# One shared processor (feature extractor + tokenizer) for both local models;
# the language/task hints set the default decoder prompt for generation.
PROCESSOR = WhisperProcessor.from_pretrained(
BASE_REPO,
language=LOCAL_LANGUAGE,
task="transcribe",
)

MODEL_BASE = WhisperForConditionalGeneration.from_pretrained(BASE_REPO).to(DEVICE).eval()
MODEL_FT = WhisperForConditionalGeneration.from_pretrained(
FINETUNED_REPO,
revision=FINETUNED_REVISION,  # pin to a known-good commit
).to(DEVICE).eval()

# Cast weights to half precision on GPU (matches the fp16 features built in
# transcribe_local).
if DEVICE == "cuda":
MODEL_BASE = MODEL_BASE.to(dtype=DTYPE)
MODEL_FT = MODEL_FT.to(dtype=DTYPE)

# Reads OPENAI_API_KEY from the environment; transcribe_openai checks for the
# key itself before any request is made.
OPENAI_CLIENT = OpenAI()
94
+
95
+
96
# -----------------------------
# Helpers
# -----------------------------
def load_audio(audio_path: str) -> Tuple[np.ndarray, int, float]:
    """Decode *audio_path* as 16 kHz mono and return (samples, sample_rate, seconds)."""
    samples, rate = librosa.load(audio_path, sr=TARGET_SR, mono=True)
    if rate:
        seconds = len(samples) / float(rate)
    else:
        # Defensive: avoid division by zero if the decoder reports no rate.
        seconds = 0.0
    return samples, rate, seconds
103
+
104
+
105
@torch.inference_mode()
def transcribe_local(model: WhisperForConditionalGeneration, audio_16k: np.ndarray) -> str:
    """Run one local Whisper checkpoint on a 16 kHz mono waveform and return the text."""
    features = PROCESSOR(audio_16k, sampling_rate=TARGET_SR, return_tensors="pt").input_features
    features = features.to(DEVICE)
    if DEVICE == "cuda":
        # Match the half-precision weights set up at load time.
        features = features.to(dtype=DTYPE)

    token_ids = model.generate(inputs=features)
    decoded = PROCESSOR.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
115
+
116
+
117
def transcribe_openai(audio_path: str) -> str:
    """Transcribe *audio_path* with OpenAI's hosted Whisper.

    Returns the transcript text, or a human-readable notice when the
    OPENAI_API_KEY secret is not configured. API/network errors propagate
    to the caller (transcribe_all reports them per-engine).
    """
    if not os.getenv("OPENAI_API_KEY"):
        return "OpenAI ASR disabled: OPENAI_API_KEY not set in Space Secrets."

    # Bias decoding toward Chichewa and Malawian place-name spellings.
    prompt = "Chichewa transcription. Malawi names like Lilongwe, Blantyre, Zomba. Keep local names as spoken."

    kwargs = {
        "model": OPENAI_MODEL,
        "prompt": prompt,
        "temperature": 0.0,
        "response_format": "json",
    }
    # Fix: omit the language hint entirely when unset so the API auto-detects.
    # The original passed language=None explicitly, which is not a valid
    # ISO-639-1 hint for the transcriptions endpoint; the parameter must be
    # left out to get auto-detection.
    if OPENAI_LANGUAGE:
        kwargs["language"] = OPENAI_LANGUAGE

    with open(audio_path, "rb") as f:
        resp = OPENAI_CLIENT.audio.transcriptions.create(file=f, **kwargs)

    # Be defensive: treat a missing/None text field as an empty transcript.
    return (resp.text or "").strip()
134
+
135
+
136
def transcribe_all(audio_path: Optional[str]) -> Tuple[str, str, str, str]:
    """Transcribe one clip with all three engines.

    Returns:
        status, base_text, finetuned_text, openai_text
    """
    if not audio_path:
        return "Please record or upload an audio clip.", "", "", ""

    # Decode the file a single time; both local models share the waveform.
    try:
        waveform, _, seconds = load_audio(audio_path)
    except Exception as e:
        return f"❌ Failed to load audio: {e}", "", "", ""

    if seconds > MAX_SECONDS:
        return f"⚠️ Audio is {seconds:.1f}s. Please keep clips ≤ {MAX_SECONDS:.0f}s.", "", "", ""

    status_lines = []

    def run(ok_label, fail_label, fn):
        # Time one engine; log success/failure in the status box and surface
        # the error text in that engine's transcript slot on failure.
        started = time.time()
        try:
            text = fn()
            status_lines.append(f"✅ {ok_label} {time.time()-started:.2f}s")
        except Exception as e:
            text = f"[ERROR] {fail_label}: {e}"
            status_lines.append(f"❌ {fail_label}")
        return text

    base_text = run("Open Source (base)", "Base failed",
                    lambda: transcribe_local(MODEL_BASE, waveform))
    ft_text = run("Fine-tuned", "Fine-tuned failed",
                  lambda: transcribe_local(MODEL_FT, waveform))
    openai_text = run(f"OpenAI ({OPENAI_MODEL})", "OpenAI failed",
                      lambda: transcribe_openai(audio_path))

    return "\n".join(status_lines), base_text, ft_text, openai_text
183
+
184
+
185
# -----------------------------
# Warm-up (local models only)
# -----------------------------
def warmup():
    """Prime both local models with one second of silence so the first real
    request does not pay first-inference initialization costs. Best-effort:
    any failure is logged and ignored so startup never blocks on it."""
    try:
        silence = np.zeros(int(TARGET_SR * 1.0), dtype=np.float32)
        for model in (MODEL_BASE, MODEL_FT):
            transcribe_local(model, silence)
        print("Warm-up complete.", flush=True)
    except Exception as e:
        print(f"Warm-up skipped/failed: {e}", flush=True)


warmup()
199
+
200
+
201
# -----------------------------
# UI
# -----------------------------
# Statement order defines render order: logo, title, blurb, audio input,
# button, status box, then the three transcript panes side by side.
with gr.Blocks(theme="grass", title="Chichewa Speech2Text") as demo:
    gr.Markdown(LOGO_HTML)
    gr.Markdown(TITLE_HTML)
    gr.Markdown(HIGHLIGHT_TEXT)
    gr.Markdown(DESCRIPTION_HTML)

    # type="filepath" hands transcribe_all a path on disk, which the OpenAI
    # branch re-opens directly as a file.
    audio_in = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input (Record or Upload)",
    )

    run_btn = gr.Button("Transcribe & Compare", variant="primary")

    # Per-engine success/failure lines and timings from transcribe_all.
    status_out = gr.Textbox(label="Status / timing", lines=3)

    with gr.Row():
        base_out = gr.Textbox(
            label="Open Source ASR Model",
            lines=12,
        )
        ft_out = gr.Textbox(
            label="Open Source Model Fine-Tuned with Custom Chichewa Speech",
            lines=12,
        )
        commercial_out = gr.Textbox(
            label="Frontier Commercial ASR Model (OpenAI)",
            lines=12,
        )

    # One click fans the same clip out to all three engines.
    run_btn.click(
        fn=transcribe_all,
        inputs=[audio_in],
        outputs=[status_out, base_out, ft_out, commercial_out],
    )

    gr.Markdown(ARTICLE_HTML)
241
+
242
if __name__ == "__main__":
    # Queue helps when multiple people test at once during your seminar
    # (caps concurrent jobs at 2; extra requests wait in line).
    demo.queue(default_concurrency_limit=2).launch()
requirements.txt ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ annotated-doc==0.0.4
3
+ annotated-types==0.7.0
4
+ anyio==4.12.1
5
+ audioread==3.1.0
6
+ brotli==1.2.0
7
+ certifi==2026.1.4
8
+ cffi==2.0.0
9
+ charset-normalizer==3.4.4
10
+ click==8.3.1
11
+ decorator==5.2.1
12
+ distro==1.9.0
13
+ fastapi==0.129.2
14
+ ffmpy==1.0.0
15
+ filelock==3.24.3
16
+ fsspec==2026.2.0
17
+ gradio==6.6.0
18
+ gradio-client==2.1.0
19
+ groovy==0.1.2
20
+ groq==1.0.0
21
+ h11==0.16.0
22
+ hf-xet==1.2.0
23
+ httpcore==1.0.9
24
+ httpx==0.28.1
25
+ huggingface-hub==1.4.1
26
+ idna==3.11
27
+ jinja2==3.1.6
28
+ jiter==0.13.0
29
+ joblib==1.5.3
30
+ lazy-loader==0.4
31
+ librosa==0.11.0
32
+ llvmlite==0.46.0
33
+ markdown-it-py==4.0.0
34
+ markupsafe==3.0.3
35
+ mdurl==0.1.2
36
+ mpmath==1.3.0
37
+ msgpack==1.1.2
38
+ networkx==3.6.1
39
+ numba==0.64.0
40
+ numpy==2.4.2
41
+ openai==2.21.0
42
+ orjson==3.11.7
43
+ packaging==26.0
44
+ pandas==3.0.1
45
+ pillow==12.1.1
46
+ platformdirs==4.9.2
47
+ pooch==1.9.0
48
+ pycparser==3.0
49
+ pydantic==2.12.5
50
+ pydantic-core==2.41.5
51
+ pydub==0.25.1
52
+ pygments==2.19.2
53
+ python-dateutil==2.9.0.post0
54
+ python-multipart==0.0.22
55
+ pytz==2025.2
56
+ pyyaml==6.0.3
57
+ regex==2026.2.19
58
+ requests==2.32.5
59
+ rich==14.3.3
60
+ safehttpx==0.1.7
61
+ safetensors==0.7.0
62
+ scikit-learn==1.8.0
63
+ scipy==1.17.0
64
+ semantic-version==2.10.0
65
+ setuptools==82.0.0
66
+ shellingham==1.5.4
67
+ six==1.17.0
68
+ sniffio==1.3.1
69
+ soundfile==0.13.1
70
+ soxr==1.0.0
71
+ starlette==0.52.1
72
+ sympy==1.14.0
73
+ threadpoolctl==3.6.0
74
+ tokenizers==0.22.2
75
+ tomlkit==0.13.3
76
+ torch==2.10.0
77
+ tqdm==4.67.3
78
+ transformers==5.2.0
79
+ typer==0.24.1
80
+ typer-slim==0.24.0
81
+ typing-extensions==4.15.0
82
+ typing-inspection==0.4.2
83
+ urllib3==2.6.3
84
+ uvicorn==0.41.0