Spaces:

Rahul2020
/

EAG_s9_sample

Sleeping

App Files Files Community

Rahuluni committed on Oct 9, 2025

Commit

86fcf07

1 Parent(s): 1da33c5

error fixed

Browse files

Files changed (2) hide show

app.py +133 -15
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import tempfile
 import uuid
 import soundfile as sf
 from pathlib import Path
 import gradio as gr
 from transformers import pipeline
@@ -14,12 +16,44 @@ from transformers import pipeline
 ASR_MODEL = "openai/whisper-small"
 asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, chunk_length_s=30, ignore_warning=True)
 def save_audio_to_wav(audio, sr):
     """
     audio: numpy array (samples,) or path string
     sr: sample rate
     Returns path to saved wav
     """
     tmpdir = tempfile.gettempdir()
     fname = Path(tmpdir) / f"hf_audio_{uuid.uuid4().hex}.wav"
     sf.write(str(fname), audio, sr, format="WAV")
@@ -33,28 +67,112 @@ def transcribe(audio):
     if audio is None:
         return "No audio provided."
-    # If Gradio gives a filepath (str)
     if isinstance(audio, str):
-        audio_path = audio
     else:
-        # Gradio sometimes returns (np_array, sr) or (sr, np_array) depending on version.
         try:
-            # try (array, sr)
-            samples, sr = audio
-        except Exception:
-            # try (sr, array)
-            sr, samples = audio
-        audio_path = save_audio_to_wav(samples, sr)
-    # Use the pipeline to transcribe
-    # pipeline returns a dict with "text"
-    result = asr(audio_path)
     text = result.get("text", "").strip()
     # cleanup temporary file
     try:
-        if not isinstance(audio, str) and os.path.exists(audio_path):
-            os.remove(audio_path)
     except Exception:
         pass

 import uuid
 import soundfile as sf
 from pathlib import Path
+import numpy as np
+import logging
 import gradio as gr
 from transformers import pipeline
 ASR_MODEL = "openai/whisper-small"
 asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, chunk_length_s=30, ignore_warning=True)
+# Debug flag: set True to print audio shapes/dtypes and save resampled temp WAVs
+DEBUG = False
+logger = logging.getLogger(__name__)
+if DEBUG:
+    logging.basicConfig(level=logging.DEBUG)
 def save_audio_to_wav(audio, sr):
     """
     audio: numpy array (samples,) or path string
     sr: sample rate
     Returns path to saved wav
     """
+    # unwrap common tuple forms (array, sr) or (sr, array)
+    if isinstance(audio, (list, tuple)):
+        # prefer numpy array element
+        arr = next((x for x in audio if isinstance(x, (list, tuple, np.ndarray))), None)
+        if isinstance(arr, (list, tuple)):
+            audio = np.asarray(arr)
+        elif isinstance(arr, np.ndarray):
+            audio = arr
+        else:
+            # fallback to first element
+            audio = np.asarray(audio[0])
+    # ensure numpy array
+    audio = np.asarray(audio)
+    # If shape is (channels, frames) transpose to (frames, channels)
+    if audio.ndim == 2 and audio.shape[0] <= 2 and audio.shape[1] > audio.shape[0]:
+        audio = audio.T
+    # Convert integer audio to float32 in [-1, 1] or ensure float32
+    if np.issubdtype(audio.dtype, np.integer):
+        maxv = np.iinfo(audio.dtype).max
+        audio = audio.astype("float32") / float(maxv)
+    else:
+        audio = audio.astype("float32")
     tmpdir = tempfile.gettempdir()
     fname = Path(tmpdir) / f"hf_audio_{uuid.uuid4().hex}.wav"
     sf.write(str(fname), audio, sr, format="WAV")
     if audio is None:
         return "No audio provided."
+    # If Gradio gives a filepath (str), read it with soundfile to avoid ffmpeg requirement
+    audio_array = None
+    sampling_rate = None
     if isinstance(audio, str):
+        try:
+            audio_array, sampling_rate = sf.read(audio)
+        except Exception as e:
+            return f"Could not read audio file: {e}"
+    else:
+        # Normalize audio to (samples, sr)
+        samples = None
+        sr = None
+        if isinstance(audio, (list, tuple)):
+            # common forms: (samples, sr) or (sr, samples)
+            if len(audio) >= 2:
+                a0, a1 = audio[0], audio[1]
+                if isinstance(a0, (list, tuple, np.ndarray)):
+                    samples, sr = a0, a1
+                elif isinstance(a1, (list, tuple, np.ndarray)):
+                    samples, sr = a1, a0
+            # fallback: try to find array and int within the tuple
+            if samples is None:
+                samples = next((x for x in audio if isinstance(x, (list, tuple, np.ndarray))), None)
+                sr = next((x for x in audio if isinstance(x, int)), None)
+        else:
+            samples = audio
+        if samples is None:
+            return "Unsupported audio format."
+        # default sr if missing
+        if sr is None:
+            sr = 16000
+        audio_array = np.asarray(samples)
+        sampling_rate = sr
+    # Ensure numpy array and float32
+    try:
+        audio_array = np.asarray(audio_array)
+    except Exception:
+        return "Unsupported audio data - cannot convert to numpy array."
+    # If 2D (frames, channels) or (channels, frames), make mono by averaging channels
+    if audio_array.ndim == 2:
+        # If shape looks like (channels, frames), transpose first
+        if audio_array.shape[0] <= 2 and audio_array.shape[1] > audio_array.shape[0]:
+            audio_array = audio_array.T
+        # average channels to mono
+        audio_array = np.mean(audio_array, axis=1)
+    # Convert integer audio to float32 in [-1, 1] or ensure float32
+    if np.issubdtype(audio_array.dtype, np.integer):
+        maxv = np.iinfo(audio_array.dtype).max
+        audio_array = audio_array.astype("float32") / float(maxv)
     else:
+        audio_array = audio_array.astype("float32")
+    # Resample to the model's expected sampling rate if needed (avoid passing sampling_rate kwarg)
+    try:
+        model_sr = getattr(getattr(asr, "feature_extractor", None), "sampling_rate", None)
+    except Exception:
+        model_sr = None
+    if model_sr is None:
+        model_sr = 16000
+    # if incoming sampling_rate is missing, assume model rate
+    if sampling_rate is None:
+        sampling_rate = model_sr
+    if sampling_rate != model_sr:
+        # simple linear resampling via numpy.interp
+        try:
+            orig_len = audio_array.shape[0]
+            new_len = int(round(orig_len * float(model_sr) / float(sampling_rate)))
+            if new_len <= 0:
+                return "Transcription failed: invalid resample length"
+            new_indices = np.linspace(0, orig_len - 1, new_len)
+            old_indices = np.arange(orig_len)
+            audio_array = np.interp(new_indices, old_indices, audio_array).astype("float32")
+            sampling_rate = model_sr
+        except Exception as e:
+            return f"Transcription failed during resampling: {e}"
+    # Debug: log and optionally save the resampled audio
+    if DEBUG:
         try:
+            logger.debug(f"Calling ASR with audio_array.shape={audio_array.shape}, dtype={audio_array.dtype}, sampling_rate={sampling_rate}")
+            tmpdir = tempfile.gettempdir()
+            dbg_fname = Path(tmpdir) / f"hf_debug_audio_{uuid.uuid4().hex}.wav"
+            sf.write(str(dbg_fname), audio_array, sampling_rate, format="WAV")
+            logger.debug(f"Wrote debug WAV to {dbg_fname}")
+        except Exception as e:
+            logger.debug(f"Debug save failed: {e}")
+    # Use the pipeline to transcribe by passing just the numpy array (model expects array at its sampling rate)
+    try:
+        result = asr(audio_array)
+    except Exception as e:
+        return f"Transcription failed: {e}"
     text = result.get("text", "").strip()
     # cleanup temporary file
     try:
+            pass  # Removed cleanup code referencing undefined audio_path
     except Exception:
         pass

requirements.txt CHANGED Viewed

@@ -2,3 +2,4 @@ gradio>=3.34
 transformers>=4.30.0
 torch  # CPU will be used by default on free Spaces
 soundfile

 transformers>=4.30.0
 torch  # CPU will be used by default on free Spaces
 soundfile
+numpy