Spaces:

mkfallah
/

pasr

Sleeping

App Files Community

mkfallah committed on Sep 4, 2025

Commit

153e956

verified ·

1 Parent(s): ddf950c

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -11

app.py CHANGED Viewed

@@ -5,11 +5,11 @@ import tempfile
 import soundfile as sf
 import numpy as np
-# --- ASR pipeline ---
 asr = pipeline(
     task="automatic-speech-recognition",
     model="vhdm/whisper-large-fa-v1",
-    device=-1  # CPU
 )
 # --- Custom vocabulary with multiple forms for accuracy ---
@@ -21,6 +21,10 @@ custom_vocab_map = {
 }
 def replace_fuzzy(text, vocab_map, threshold=85):
     for target, alternatives in vocab_map.items():
         match, score = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
         if score >= threshold:
@@ -28,23 +32,27 @@ def replace_fuzzy(text, vocab_map, threshold=85):
     return text
 def transcribe(audio):
-    # check if audio is None
     if audio is None:
         return "No audio input detected."
-    # Gradio may pass a tuple or just a file path depending on version
     if isinstance(audio, tuple):
         data, sr = audio
-        # convert to 2D if mono
         if isinstance(data, int):
             return "Invalid audio data."
         if data.ndim == 1:
             data = np.expand_dims(data, axis=1)
         with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
             sf.write(tmp.name, data, samplerate=sr)
             result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
     else:
-        # assume audio is file path
         result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
     text = result["text"]
@@ -54,10 +62,7 @@ def transcribe(audio):
 # --- Gradio interface ---
 iface = gr.Interface(
     fn=transcribe,
-    inputs=gr.Audio(source="microphone", type="numpy"),  # live mic recording
     outputs="text",
     title="Persian ASR with High Accuracy Vocabulary",
-    description="Speak in Persian; recognized words are corrected using a custom high-accuracy vocabulary."
-)
-iface.launch()

 import soundfile as sf
 import numpy as np
+# --- Initialize ASR pipeline ---
 asr = pipeline(
     task="automatic-speech-recognition",
     model="vhdm/whisper-large-fa-v1",
+    device=-1  # CPU; for GPU set device=0
 )
 # --- Custom vocabulary with multiple forms for accuracy ---
 }
 def replace_fuzzy(text, vocab_map, threshold=85):
+    """
+    Replace words/phrases in text using fuzzy matching with high threshold.
+    Supports multiple alternatives per word/phrase.
+    """
     for target, alternatives in vocab_map.items():
         match, score = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
         if score >= threshold:
     return text
 def transcribe(audio):
+    """
+    audio: tuple(numpy array, sample_rate) from Gradio
+    """
     if audio is None:
         return "No audio input detected."
+    # Handle audio input
     if isinstance(audio, tuple):
         data, sr = audio
+        # Convert mono to 2D array for soundfile
         if isinstance(data, int):
             return "Invalid audio data."
         if data.ndim == 1:
             data = np.expand_dims(data, axis=1)
+        # Write temporary WAV file
         with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
             sf.write(tmp.name, data, samplerate=sr)
+            # Run ASR with chunking
             result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
     else:
+        # If audio is a file path
         result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
     text = result["text"]
 # --- Gradio interface ---
 iface = gr.Interface(
     fn=transcribe,
+    inputs=gr.Audio(type="numpy", label="Record or upload audio"),
     outputs="text",
     title="Persian ASR with High Accuracy Vocabulary",
+    description="Speak in Persian or upload an audio file; recognized words are corrected