ZidanePMSE committed on
Commit
31266e7
·
verified ·
1 Parent(s): 17d198f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +94 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import torch
3
+ import torchaudio
4
+ import numpy as np
5
+ import gradio as gr
6
+ import soundfile as sf
7
+
8
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
9
+
10
# ===== CONFIG =====
# Hugging Face model id: a Whisper checkpoint fine-tuned for Vietnamese ASR.
MODEL_ID = "vinai/PhoWhisper-small"
# Prefer GPU when available; all tensors/model weights are moved here.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TARGET_SR = 16000 # Whisper expects 16kHz

# ===== LOAD MODEL =====
# Loaded once at import time so every request reuses the same weights.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
# Inference only: disables dropout/batch-norm training behavior.
model.eval()

# prepare forced decoder ids for Vietnamese transcription
# (forces language="vi", task="transcribe" tokens at decode time; some
# processor versions lack this helper, hence the fallback to None)
try:
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="vi", task="transcribe")
except Exception:
    forced_decoder_ids = None
25
+
26
+ # ===== HELPERS =====
27
+ def _read_audio_tuple(audio):
28
+ """
29
+ audio: (sr, np.ndarray) coming from gr.Audio(type="numpy")
30
+ returns mono float32 numpy array and original sr
31
+ """
32
+ if audio is None:
33
+ return None, None
34
+ sr, data = audio
35
+ # ensure numpy
36
+ data = np.asarray(data)
37
+ # stereo -> mono
38
+ if data.ndim > 1:
39
+ data = data.mean(axis=1)
40
+ # convert to float32 in range [-1, 1] if needed
41
+ if data.dtype.kind == "i":
42
+ # integer PCM -> normalize
43
+ maxv = float(np.iinfo(data.dtype).max)
44
+ data = data.astype("float32") / maxv
45
+ else:
46
+ data = data.astype("float32")
47
+ return data, sr
48
+
49
# ===== INFERENCE =====
def s2t(audio):
    """Transcribe a Gradio audio input to Vietnamese text.

    Args:
        audio: ``(sr, numpy array)`` tuple from the Gradio Audio component.

    Returns:
        The decoded transcription string, or a placeholder message when no
        audio was supplied.
    """
    samples, rate = _read_audio_tuple(audio)
    if samples is None:
        return "No audio provided"

    # Whisper models expect 16 kHz input; resample anything else.
    if rate != TARGET_SR:
        resampled = torchaudio.functional.resample(
            torch.from_numpy(samples), orig_freq=rate, new_freq=TARGET_SR
        )
        samples = resampled.numpy()

    # Feature extraction -> log-mel input features on the model's device.
    features = processor(samples, sampling_rate=TARGET_SR, return_tensors="pt")
    features = features.input_features.to(DEVICE)

    # Pass the Vietnamese/transcribe prompt only when it was available.
    generate_kwargs = {}
    if forced_decoder_ids is not None:
        generate_kwargs["forced_decoder_ids"] = forced_decoder_ids

    with torch.no_grad():
        pred_ids = model.generate(features, **generate_kwargs)

    # decode token ids back to text
    text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    return text.strip()
78
+
79
# ===== GRADIO APP =====
title = "Vietnamese Speech-to-Text — PhoWhisper-small"
desc = "Upload or record audio (wav/mp3). Model: vinai/PhoWhisper-small. Resamples to 16 kHz."

app = gr.Interface(
    fn=s2t,
    # BUG FIX: Gradio 4.x renamed the 3.x ``source="upload"`` argument to
    # ``sources=[...]``; with an unpinned ``gradio`` requirement the old
    # keyword raises TypeError at startup. Both upload and microphone are
    # enabled to match the "Upload or record" label.
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="numpy",
        label="Upload or record audio (.wav/.mp3)",
    ),
    outputs=gr.Textbox(label="Transcription"),
    title=title,
    description=desc,
    # ``allow_flagging`` was deprecated in Gradio 4 and removed in 5;
    # ``flagging_mode`` is the supported equivalent.
    flagging_mode="never",
    examples=[],
)

if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers
4
+ sentencepiece
5
+ gradio
6
+ soundfile
7
+ numpy