# app.py from __future__ import annotations import os import numpy as np import librosa import gradio as gr from transformers import pipeline import kenlm_asr_pipeline HF_TOKEN = os.environ.get("HF_TOKEN") MODEL_ID = "Elormiden/wav2vec2-cypriot-dialect" KENLM_FILE = "cypriot.klm" ASR = pipeline( "automatic-speech-recognition-kenlm", model=MODEL_ID, kenlm_filename=KENLM_FILE, alpha=0.4, beta=0.9, token=HF_TOKEN, device=0, model_id_or_path=MODEL_ID, ) def transcribe(audio: tuple[int, np.ndarray] | None): def wrap(msg: str) -> str: msg = (msg or "").strip() return f"Output: {msg if msg else '(empty)'}" if audio is None: return wrap("No audio.") sr, data = audio if isinstance(data, np.ndarray) and data.ndim == 2: data = data.mean(axis=1) target_sr = 16000 if sr != target_sr: data = librosa.resample(data.astype(np.float32), orig_sr=sr, target_sr=target_sr) sr = target_sr if data.size == 0 or not np.isfinite(data).all(): return wrap("No valid audio.") try: out = ASR(data, sampling_rate=sr) if isinstance(out, dict): text = out.get("text", "") elif isinstance(out, list) and out and isinstance(out[0], dict): text = out[0].get("text", "") elif isinstance(out, str): text = out else: text = str(out) return wrap(text) except Exception as e: return wrap(f"Error: {e}") with gr.Blocks(title="KenLM Wav2Vec2 ASR") as demo: gr.Markdown("# KenLM Wav2Vec2 ASR\nUpload or record audio; decoding uses KenLM for better accuracy.") audio = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Audio (16kHz preferred)") btn = gr.Button("Transcribe") txt = gr.Textbox(label="Transcription") btn.click(fn=transcribe, inputs=audio, outputs=txt) audio.change(fn=transcribe, inputs=audio, outputs=txt) if __name__ == "__main__": demo.launch()