File size: 2,025 Bytes
f1687db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# app.py

from __future__ import annotations
import os
import numpy as np
import librosa
import gradio as gr
from transformers import pipeline
import kenlm_asr_pipeline

# Hugging Face access token (optional; only needed for gated/private models).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Fine-tuned wav2vec2 checkpoint and the KenLM n-gram language model file
# used for beam-search decoding.
MODEL_ID = "Elormiden/wav2vec2-cypriot-dialect"
KENLM_FILE = "cypriot.klm"

# Build the ASR pipeline. The "automatic-speech-recognition-kenlm" task is
# presumably registered as a custom task by the `kenlm_asr_pipeline` import
# above — TODO confirm.
# alpha/beta look like KenLM shallow-fusion weights (LM weight / word-insertion
# bonus) — verify against the custom pipeline's signature.
# NOTE(review): both `model=` and `model_id_or_path=` receive the same value;
# confirm the custom pipeline really needs both.
# device=0 assumes a CUDA device is available — confirm for CPU-only deploys.
ASR = pipeline(
    "automatic-speech-recognition-kenlm",
    model=MODEL_ID,        
    kenlm_filename=KENLM_FILE, 
    alpha=0.4,
    beta=0.9,
    token=HF_TOKEN,
    device=0,
    model_id_or_path=MODEL_ID,
)

def transcribe(audio: tuple[int, np.ndarray] | None) -> str:
    """Transcribe a Gradio audio input with the KenLM-decoded ASR pipeline.

    Parameters
    ----------
    audio:
        ``(sample_rate, samples)`` tuple as produced by
        ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.

    Returns
    -------
    str
        ``"Output: <text>"`` on success, or an ``"Output: ..."`` status
        message when the input is missing/invalid or decoding fails.
        Never raises.
    """
    def wrap(msg: str) -> str:
        # Uniform prefix so the UI textbox always shows something meaningful.
        msg = (msg or "").strip()
        return f"Output: {msg if msg else '(empty)'}"

    if audio is None:
        return wrap("No audio.")
    sr, data = audio

    # Downmix stereo to mono; the model expects a 1-D waveform.
    if isinstance(data, np.ndarray) and data.ndim == 2:
        data = data.mean(axis=1)

    # BUGFIX: validate BEFORE resampling — librosa.resample can fail on empty
    # input, and the old code only checked afterwards.
    if data.size == 0 or not np.isfinite(data).all():
        return wrap("No valid audio.")

    # BUGFIX: always convert to float32. Gradio typically delivers int16 PCM;
    # previously the conversion happened only inside the resample branch, so
    # audio already at 16 kHz reached the model as raw integers.
    if np.issubdtype(data.dtype, np.integer):
        # Normalize integer PCM into [-1, 1].
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)

    target_sr = 16000
    if sr != target_sr:
        data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    try:
        out = ASR(data, sampling_rate=sr)
        # HF pipelines may return a dict, a list of dicts, or a bare string
        # depending on the task implementation — handle all of them.
        if isinstance(out, dict):
            text = out.get("text", "")
        elif isinstance(out, list) and out and isinstance(out[0], dict):
            text = out[0].get("text", "")
        elif isinstance(out, str):
            text = out
        else:
            text = str(out)

        return wrap(text)
    except Exception as e:  # surface the error in the UI instead of crashing
        return wrap(f"Error: {e}")


# --- Gradio UI -------------------------------------------------------------
# A minimal Blocks layout: one audio input, one button, one output textbox.
with gr.Blocks(title="KenLM Wav2Vec2 ASR") as demo:
    gr.Markdown("# KenLM Wav2Vec2 ASR\nUpload or record audio; decoding uses KenLM for better accuracy.")

    audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Audio (16kHz preferred)")
    transcribe_btn = gr.Button("Transcribe")
    output_box = gr.Textbox(label="Transcription")

    # Run transcription both when new audio arrives and on explicit request.
    audio_input.change(fn=transcribe, inputs=audio_input, outputs=output_box)
    transcribe_btn.click(fn=transcribe, inputs=audio_input, outputs=output_box)

if __name__ == "__main__":
    demo.launch()