# NOTE: Hugging Face commit-header text ("dmatekenya — Add application code, dc7727e")
# was accidentally captured above the module; kept as a comment so the file parses.
"""
Chichewa Speech2Text — Diff Viewer (Single-model inference + Ground-truth diff)
What this app does:
- Lets the user upload/record an audio clip (≤30s recommended).
- Lets the user paste a human-verified "ground truth" reference transcript.
- Lets the user choose ONE system to run (Base / Fine-tuned / OpenAI).
- Produces a 2-column, word-level highlighted diff: Reference vs Hypothesis.
Notes:
- This keeps inference lightweight (runs only one model at a time).
- The demo audio is preloaded on page load, but inference is NOT auto-run to reduce break risk.
"""
import os
import html
import re
import time
import urllib.request
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional, Tuple
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from openai import OpenAI
# -----------------------------
# Demo audio
# -----------------------------
# Public GitHub release asset used to pre-populate the audio widget on page load.
DEMO_URL = "https://github.com/dmatekenya/AI-seminars-malawi/releases/download/v1.0/WAU12.wav"
# Local cache location; /tmp is writable in Hugging Face Spaces containers.
DEMO_AUDIO_PATH = Path("/tmp/demo.wav")
def ensure_demo_audio() -> str:
    """
    Ensure the demo audio clip exists on disk and return its path as a string.

    Uses the cached copy at DEMO_AUDIO_PATH when present and non-empty;
    otherwise downloads DEMO_URL to a temp file and atomically moves it into
    place so a partial download can never masquerade as a valid cache.

    Returns:
        str: Filesystem path to the demo audio file.

    Raises:
        RuntimeError: If the download fails or the resulting file is empty.
    """
    DEMO_AUDIO_PATH.parent.mkdir(parents=True, exist_ok=True)
    if DEMO_AUDIO_PATH.exists() and DEMO_AUDIO_PATH.stat().st_size > 0:
        print(f"[demo] Using cached audio: {DEMO_AUDIO_PATH} ({DEMO_AUDIO_PATH.stat().st_size} bytes)", flush=True)
        return str(DEMO_AUDIO_PATH)
    print(f"[demo] Downloading demo audio from: {DEMO_URL}", flush=True)
    tmp_path = DEMO_AUDIO_PATH.with_suffix(".wav.tmp")
    try:
        urllib.request.urlretrieve(DEMO_URL, tmp_path)
        os.replace(tmp_path, DEMO_AUDIO_PATH)
    except Exception as e:
        # Don't leave a stale partial file behind for the next attempt.
        tmp_path.unlink(missing_ok=True)
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(f"[demo] Failed to download demo audio from {DEMO_URL}. Error: {e}") from e
    if not DEMO_AUDIO_PATH.exists() or DEMO_AUDIO_PATH.stat().st_size == 0:
        raise RuntimeError(f"[demo] Download completed but file is missing/empty at {DEMO_AUDIO_PATH}")
    print(f"[demo] Downloaded demo audio: {DEMO_AUDIO_PATH} ({DEMO_AUDIO_PATH.stat().st_size} bytes)", flush=True)
    return str(DEMO_AUDIO_PATH)
# -----------------------------
# Models / Config
# -----------------------------
BASE_REPO = "openai/whisper-large-v3"  # off-the-shelf Whisper baseline
FINETUNED_REPO = "dmatekenya/whisper-large-v3-chichewa"
FINETUNED_REVISION = "bff60fb08ba9f294e05bfcab4306f30b6a0cfc0a"  # pinned commit hash for reproducibility
# Keep this consistent with how you evaluated to avoid surprises.
# (You can change later; for tomorrow, stability > perfection.)
# NOTE(review): "shona" is used even though the app targets Chichewa — presumably
# the language token used during fine-tuning/evaluation; confirm before changing.
LOCAL_LANGUAGE = "shona"
TARGET_SR = 16000  # Whisper expects 16 kHz mono input
MAX_SECONDS = 30.0  # recommended, not enforced here
OPENAI_MODEL = "gpt-4o-transcribe"  # hosted ASR used for the "OpenAI" option
# -----------------------------
# UI Text / Styling
# -----------------------------
# Static HTML fragments injected via gr.HTML / gr.Markdown below.
# Banner image shown at the top of the page.
LOGO_HTML = """
<div style="text-align:center; margin-bottom: 25px;">
<img src="https://i.ibb.co/5nQdGSs/logo.png"
style="max-width: 100%; height: auto; border-radius: 12px;">
</div>
"""
# Page title and short usage instructions.
HEADER_HTML = """
<div style="text-align:center; max-width:900px; margin:0 auto;">
<h1 style="font-size:36px; margin-bottom:12px;">
Chichewa Speech2Text: Ground Truth Diff Viewer
</h1>
<p style="font-size:20px; font-weight:700; color:#1F3A5F; margin-bottom:12px;">
Paste a human-verified reference transcript and compare it to one ASR system at a time.
</p>
<p style="font-size:18px; color:#444; margin-bottom:25px;">
Record or upload a short voice note (≤30 seconds recommended).
</p>
</div>
"""
# Thin horizontal rule between header sections.
DIVIDER = """
<div style="max-width:900px; margin:10px auto;">
<hr style="border:0; border-top:1px solid #ddd;">
</div>
"""
# Footer with project links and the voice-note donation sign-up form.
ARTICLE_HTML = """
<p style="text-align:center; margin-top: 10px;">
Read more about the <a href="https://dmatekenya.github.io/Chichewa-Speech2Text/README.html" target="_blank">ChichewaSpeech2Text</a> project
and sign up for our voice note donation event:
<a href="https://forms.gle/fHLESutofVvb2YFM9" target="_blank">Google Form</a>.
</p>
"""
# -----------------------------
# Load models once
# -----------------------------
# Both local checkpoints are loaded at import time so each request only runs
# generate(); half precision on GPU, full precision on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
print(f"Using device: {DEVICE}", flush=True)
# A single processor (feature extractor + tokenizer) is shared by both local
# models, configured for LOCAL_LANGUAGE transcription.
PROCESSOR = WhisperProcessor.from_pretrained(
    BASE_REPO,
    language=LOCAL_LANGUAGE,
    task="transcribe",
)
MODEL_BASE = WhisperForConditionalGeneration.from_pretrained(BASE_REPO).to(DEVICE).eval()
MODEL_FT = WhisperForConditionalGeneration.from_pretrained(
    FINETUNED_REPO,
    revision=FINETUNED_REVISION,  # pinned so Space restarts don't pick up new pushes
).to(DEVICE).eval()
if DEVICE == "cuda":
    # Cast weights to fp16 only on GPU; CPU stays fp32.
    MODEL_BASE = MODEL_BASE.to(dtype=DTYPE)
    MODEL_FT = MODEL_FT.to(dtype=DTYPE)
# Reads OPENAI_API_KEY from the environment; key presence is checked at call time.
OPENAI_CLIENT = OpenAI()
# -----------------------------
# Helpers: audio + transcription
# -----------------------------
def load_audio(audio_path: str) -> Tuple[np.ndarray, int, float]:
    """Load *audio_path* as 16 kHz mono and return (samples, sample_rate, duration_s)."""
    samples, rate = librosa.load(audio_path, sr=TARGET_SR, mono=True)
    duration = len(samples) / rate if rate else 0.0
    return samples, rate, float(duration)
@torch.inference_mode()
def transcribe_local(model: WhisperForConditionalGeneration, audio_16k: np.ndarray) -> str:
    """Run one of the local Whisper models on 16 kHz mono samples and return the stripped text."""
    features = PROCESSOR(
        audio_16k,
        return_tensors="pt",
        sampling_rate=TARGET_SR,
    ).input_features.to(DEVICE)
    if DEVICE == "cuda":
        # Match the half-precision weights used on GPU.
        features = features.to(dtype=DTYPE)
    token_ids = model.generate(input_features=features)
    text = PROCESSOR.batch_decode(token_ids, skip_special_tokens=True)[0]
    return text.strip()
def transcribe_openai(audio_path: str) -> str:
    """Transcribe the file at *audio_path* with the hosted OpenAI ASR model.

    Returns a notice string (instead of raising) when no API key is configured.
    """
    if not os.getenv("OPENAI_API_KEY"):
        return "OpenAI ASR disabled: OPENAI_API_KEY not set in Space Secrets."
    # Bias the decoder toward Malawian place names and Chichewa spelling.
    bias_prompt = "Chichewa transcription. Malawi names like Lilongwe, Blantyre, Zomba. Keep local names as spoken."
    with open(audio_path, "rb") as audio_file:
        result = OPENAI_CLIENT.audio.transcriptions.create(
            model=OPENAI_MODEL,
            file=audio_file,
            prompt=bias_prompt,
            temperature=0.0,
            response_format="json",
        )
    return (result.text or "").strip()
def transcribe_selected(audio_path: Optional[str], which: str) -> Tuple[str, str]:
    """
    Run exactly one ASR system on the given clip.

    Parameters:
        audio_path: filepath from the Gradio audio component (None/"" if empty)
        which: "Base" | "Fine-tuned" | "OpenAI"

    Returns:
        (status_message, hypothesis_text); hypothesis is "" on any failure.
    """
    if not audio_path:
        return "Please record or upload an audio clip.", ""
    # Only the local Whisper models need decoded samples; OpenAI takes the raw file.
    waveform = None
    if which in ("Base", "Fine-tuned"):
        try:
            waveform, _rate, _duration = load_audio(audio_path)
        except Exception as e:
            return f"❌ Failed to load audio: {e}", ""
    started = time.time()
    try:
        if which == "Base":
            hypothesis = transcribe_local(MODEL_BASE, waveform)
        elif which == "Fine-tuned":
            hypothesis = transcribe_local(MODEL_FT, waveform)
        elif which == "OpenAI":
            hypothesis = transcribe_openai(audio_path)
        else:
            return f"Unknown model selection: {which}", ""
    except Exception as e:
        return f"❌ {which} failed: {e}", ""
    elapsed = time.time() - started
    return f"✅ {which} done in {elapsed:.2f}s", (hypothesis or "").strip()
# -----------------------------
# Helpers: diff visualization
# -----------------------------
def _tokenize_words(s: str):
    """Split text into word tokens and single punctuation tokens (Unicode-aware)."""
    pattern = r"\w+|[^\w\s]"
    return re.findall(pattern, s, flags=re.UNICODE)
def diff_highlight_html(ref: str, hyp: str, title_ref="Reference", title_hyp="Hypothesis") -> str:
    """
    Build a two-column HTML view of the word-level diff between *ref* and *hyp*.

    Markup conventions:
    - tokens only in the reference: red, struck through (left column)
    - tokens only in the hypothesis: green (right column)
    - replacements: struck-out old token on the left, green new token on the right
    """
    DEL_SPAN = "<span style='color:#b00020;text-decoration:line-through;background:#ffe6e6;padding:1px 3px;border-radius:4px;'>{}</span>"
    INS_SPAN = "<span style='color:#0a7a0a;background:#e6ffe6;padding:1px 3px;border-radius:4px;'>{}</span>"

    ref_tokens = _tokenize_words(ref or "")
    hyp_tokens = _tokenize_words(hyp or "")
    matcher = SequenceMatcher(a=ref_tokens, b=hyp_tokens)

    left, right = [], []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        ref_slice = ref_tokens[i1:i2]
        hyp_slice = hyp_tokens[j1:j2]
        if tag == "equal":
            left.extend(html.escape(t) for t in ref_slice)
            right.extend(html.escape(t) for t in hyp_slice)
            continue
        # "replace" contributes to both sides; "delete"/"insert" to one.
        if tag in ("delete", "replace"):
            left.extend(DEL_SPAN.format(html.escape(t)) for t in ref_slice)
        if tag in ("insert", "replace"):
            right.extend(INS_SPAN.format(html.escape(t)) for t in hyp_slice)

    def _render(tokens):
        # Re-attach punctuation that the tokenizer split off as separate tokens.
        text = " ".join(tokens)
        text = re.sub(r"\s+([,.;:!?])", r"\1", text)
        text = re.sub(r"\(\s+", "(", text)
        return re.sub(r"\s+\)", ")", text)

    return f"""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:14px;">
  <div style="padding:12px;border:1px solid #ddd;border-radius:10px;">
    <div style="font-weight:700;margin-bottom:8px;">{html.escape(title_ref)}</div>
    <div style="line-height:1.6;">{_render(left)}</div>
  </div>
  <div style="padding:12px;border:1px solid #ddd;border-radius:10px;">
    <div style="font-weight:700;margin-bottom:8px;">{html.escape(title_hyp)}</div>
    <div style="line-height:1.6;">{_render(right)}</div>
  </div>
</div>
"""
def make_diff_from_gt(reference_text: str, hypothesis_text: str, which: str) -> str:
    """Render the reference-vs-hypothesis diff, or a placeholder card when either side is missing."""
    card_open = "<div style='padding:12px;border:1px solid #ddd;border-radius:10px;'>"
    reference = (reference_text or "").strip()
    hypothesis = (hypothesis_text or "").strip()
    if not reference:
        return card_open + "❗ Paste a ground-truth reference transcript to see the diff." + "</div>"
    if not hypothesis:
        return card_open + "❗ No hypothesis transcript yet. Click “Transcribe + Show Diff”." + "</div>"
    return diff_highlight_html(
        reference,
        hypothesis,
        title_ref="Ground truth (Reference)",
        title_hyp=f"{which} (Hypothesis)",
    )
# -----------------------------
# UI
# -----------------------------
# Layout: logo/header, audio input + reference textbox, model picker, run
# button, then status / hypothesis / diff outputs.
with gr.Blocks(theme="grass", title="Chichewa Speech2Text — Diff Viewer") as demo:
    gr.HTML(LOGO_HTML)
    gr.HTML(DIVIDER)
    gr.HTML(HEADER_HTML)
    # NOTE(review): value is a Path object to a file that may not exist until
    # demo.load below runs ensure_demo_audio() — confirm Gradio tolerates a
    # missing initial file path here.
    audio_in = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input (Record or Upload)",
        value=DEMO_AUDIO_PATH,
    )
    reference_in = gr.Textbox(
        label="Ground truth reference transcript (paste here)",
        lines=5,
        placeholder="Paste the correct (human-verified) transcript here...",
    )
    with gr.Row():
        which_model = gr.Radio(
            choices=["Fine-tuned", "Base", "OpenAI"],
            value="Fine-tuned",
            label="Choose one system to transcribe (runs only one)",
        )
    run_btn = gr.Button("Transcribe + Show Diff", variant="primary")
    status_out = gr.Textbox(label="Status / timing", lines=2)
    hyp_out = gr.Textbox(label="Hypothesis transcript", lines=10)
    diff_out = gr.HTML()
    # Run one transcription, then render the diff from its output.
    run_btn.click(
        fn=transcribe_selected,
        inputs=[audio_in, which_model],
        outputs=[status_out, hyp_out],
    ).then(
        fn=make_diff_from_gt,
        inputs=[reference_in, hyp_out, which_model],
        outputs=[diff_out],
    )
    # Re-render diff if the reference text changes (no need to re-run ASR)
    reference_in.change(
        fn=make_diff_from_gt,
        inputs=[reference_in, hyp_out, which_model],
        outputs=[diff_out],
    )
    # Preload demo audio path on page load (no auto inference)
    demo.load(
        fn=lambda: ensure_demo_audio(),
        inputs=None,
        outputs=[audio_in],
    )
    gr.Markdown(ARTICLE_HTML)
# Limit concurrent inference jobs to keep GPU/CPU memory bounded.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=2).launch()