| |
|
|
| from __future__ import annotations |
| import os |
| import numpy as np |
| import librosa |
| import gradio as gr |
| from transformers import pipeline |
| import kenlm_asr_pipeline |
|
|
# Optional Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Model identifier and KenLM language-model file name handed to the ASR pipeline.
MODEL_ID = "Elormiden/wav2vec2-cypriot-dialect"
KENLM_FILE = "cypriot.klm"
|
|
# Build the speech recogniser once at import time so every request reuses it.
# The "automatic-speech-recognition-kenlm" task is not a built-in transformers
# task; presumably importing kenlm_asr_pipeline registers it — TODO confirm.
# NOTE(review): alpha/beta look like the usual LM-weight / word-bonus decoder
# knobs — confirm against kenlm_asr_pipeline.
# NOTE(review): device=0 pins the first CUDA device; on a CPU-only host this
# presumably fails at start-up — confirm the deployment target.
# NOTE(review): MODEL_ID is passed both as `model` and as `model_id_or_path`;
# verify the custom pipeline really needs both.
ASR = pipeline(
    "automatic-speech-recognition-kenlm",
    model=MODEL_ID,
    model_id_or_path=MODEL_ID,
    kenlm_filename=KENLM_FILE,
    alpha=0.4,
    beta=0.9,
    token=HF_TOKEN,
    device=0,
)
|
|
|
|
# Sampling rate (Hz) the audio is resampled to before it is sent to the recogniser.
_TARGET_SAMPLE_RATE = 16000

_MSG_NO_AUDIO = "Upload or record audio to get a transcription."
_MSG_UNREADABLE = "Could not read this audio. Try another file or recording."
_MSG_NO_SPEECH = "No speech detected."


def _to_mono_float32(data) -> np.ndarray:
    """Return *data* as a float32 array, down-mixing 2-D input to one channel.

    Integer input is rescaled by the dtype's maximum so it lands in roughly
    [-1.0, 1.0]; floating-point input is only cast to float32.

    Raises:
        TypeError, ValueError: if *data* cannot be converted to float32.
    """
    samples = np.asarray(data)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: bring it to the float range instead of raw counts.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim == 2:
        # Average across axis 1 (assumed to be the channel axis, i.e. data is
        # laid out as (samples, channels) — TODO confirm against the caller).
        samples = samples.mean(axis=1)
    return samples


def _extract_text(out) -> str:
    """Pull the transcription string out of the recogniser's return value.

    Accepts a dict with a "text" key, a non-empty list whose first item is such
    a dict, or a plain string; anything else is passed through ``str()``.
    """
    if isinstance(out, dict):
        return out.get("text", "")
    if isinstance(out, list) and out and isinstance(out[0], dict):
        return out[0].get("text", "")
    if isinstance(out, str):
        return out
    return str(out)


def transcribe(audio: tuple[int, np.ndarray] | None):
    """Transcribe an audio clip and return the text (or a user-facing message).

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when no audio has
            been provided.

    Returns:
        The stripped transcription, or one of the fixed messages for missing
        audio, unreadable audio, silence, or a recogniser failure.
    """
    if audio is None:
        return _MSG_NO_AUDIO

    sr, data = audio

    try:
        samples = _to_mono_float32(data)
    except (TypeError, ValueError):
        return _MSG_UNREADABLE

    # Validate BEFORE resampling so unreadable input yields the friendly
    # message rather than an unhandled error from the resampler.
    if samples.ndim != 1 or samples.size == 0 or not np.isfinite(samples).all():
        return _MSG_UNREADABLE

    # UI boundary: any failure in resampling or recognition is reported to the
    # user as a message instead of propagating.
    try:
        if sr != _TARGET_SAMPLE_RATE:
            samples = librosa.resample(
                samples, orig_sr=sr, target_sr=_TARGET_SAMPLE_RATE
            )
        out = ASR(samples, sampling_rate=_TARGET_SAMPLE_RATE)
        text = _extract_text(out)
        return (text or "").strip() or _MSG_NO_SPEECH
    except Exception as e:
        return f"Something went wrong. Please try again. ({e})"
|
|
|
|
# Markdown shown at the top of the page (English intro plus a Greek tagline).
DESCRIPTION = """
# Simona AI

**Speech-to-text for Cypriot Greek (Κυπριακά).**

Generic recognizers often miss how people actually speak in Cyprus. Simona turns your recording into text tuned for the **Cypriot dialect** — upload a file or use the microphone.

*Μιλήστε ή ανεβάστε ήχο· λάβετε κείμενο στα Κυπριακά.*
"""
|
|
# Page layout: intro text, an audio input, a "Transcribe" button and the output box.
with gr.Blocks(title="Simona AI — Cypriot Greek ASR") as demo:
    gr.Markdown(DESCRIPTION)

    # type="numpy" selects the array-based audio format that transcribe() unpacks.
    audio = gr.Audio(
        label="Audio",
        type="numpy",
        sources=["microphone", "upload"],
    )
    btn = gr.Button("Transcribe", variant="primary")
    txt = gr.Textbox(
        label="Transcription",
        placeholder="Your text will appear here…",
        lines=4,
    )

    # Both events run the same handler: an explicit click on the button and
    # any change of the audio component's value.
    _handler = {"fn": transcribe, "inputs": audio, "outputs": txt}
    btn.click(**_handler)
    audio.change(**_handler)
|
|
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()