# Hugging Face Space header (scrape residue): Elormiden — version 1.0, commit f1687db
# app.py
from __future__ import annotations
import os
import numpy as np
import librosa
import gradio as gr
from transformers import pipeline
import kenlm_asr_pipeline
# Hugging Face access token for downloading gated/private models (None if unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Fine-tuned Wav2Vec2 checkpoint for the Cypriot dialect.
MODEL_ID = "Elormiden/wav2vec2-cypriot-dialect"
# KenLM binary language-model file used during beam-search decoding.
KENLM_FILE = "cypriot.klm"
# Build the ASR pipeline once at import time. The custom task name
# "automatic-speech-recognition-kenlm" is presumably registered by the
# `kenlm_asr_pipeline` import above — TODO confirm.
ASR = pipeline(
    "automatic-speech-recognition-kenlm",
    model=MODEL_ID,
    kenlm_filename=KENLM_FILE,
    alpha=0.4,  # presumably the LM weight in beam-search decoding — confirm against kenlm_asr_pipeline
    beta=0.9,  # presumably the word-insertion bonus — confirm against kenlm_asr_pipeline
    token=HF_TOKEN,
    device=0,  # NOTE(review): assumes a CUDA device is available — confirm for CPU Spaces
    model_id_or_path=MODEL_ID,  # NOTE(review): looks redundant with `model=` — verify the custom pipeline requires it
)
def transcribe(audio: tuple[int, np.ndarray] | None):
def wrap(msg: str) -> str:
msg = (msg or "").strip()
return f"Output: {msg if msg else '(empty)'}"
if audio is None:
return wrap("No audio.")
sr, data = audio
if isinstance(data, np.ndarray) and data.ndim == 2:
data = data.mean(axis=1)
target_sr = 16000
if sr != target_sr:
data = librosa.resample(data.astype(np.float32), orig_sr=sr, target_sr=target_sr)
sr = target_sr
if data.size == 0 or not np.isfinite(data).all():
return wrap("No valid audio.")
try:
out = ASR(data, sampling_rate=sr)
if isinstance(out, dict):
text = out.get("text", "")
elif isinstance(out, list) and out and isinstance(out[0], dict):
text = out[0].get("text", "")
elif isinstance(out, str):
text = out
else:
text = str(out)
return wrap(text)
except Exception as e:
return wrap(f"Error: {e}")
# Gradio UI: one audio input wired to the transcriber, triggered either by
# the button or automatically whenever the audio component changes.
with gr.Blocks(title="KenLM Wav2Vec2 ASR") as demo:
    gr.Markdown("# KenLM Wav2Vec2 ASR\nUpload or record audio; decoding uses KenLM for better accuracy.")
    audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Audio (16kHz preferred)")
    run_button = gr.Button("Transcribe")
    result_box = gr.Textbox(label="Transcription")
    # Manual and automatic triggers share the same handler and output box.
    run_button.click(fn=transcribe, inputs=audio_input, outputs=result_box)
    audio_input.change(fn=transcribe, inputs=audio_input, outputs=result_box)

if __name__ == "__main__":
    demo.launch()