Rahuluni committed on
Commit
86fcf07
·
1 Parent(s): 1da33c5

error fixed

Browse files
Files changed (2) hide show
  1. app.py +133 -15
  2. requirements.txt +1 -0
app.py CHANGED
@@ -4,6 +4,8 @@ import tempfile
4
  import uuid
5
  import soundfile as sf
6
  from pathlib import Path
 
 
7
 
8
  import gradio as gr
9
  from transformers import pipeline
@@ -14,12 +16,44 @@ from transformers import pipeline
14
  ASR_MODEL = "openai/whisper-small"
15
  asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, chunk_length_s=30, ignore_warning=True)
16
 
 
 
 
 
 
 
17
  def save_audio_to_wav(audio, sr):
18
  """
19
  audio: numpy array (samples,) or path string
20
  sr: sample rate
21
  Returns path to saved wav
22
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  tmpdir = tempfile.gettempdir()
24
  fname = Path(tmpdir) / f"hf_audio_{uuid.uuid4().hex}.wav"
25
  sf.write(str(fname), audio, sr, format="WAV")
@@ -33,28 +67,112 @@ def transcribe(audio):
33
  if audio is None:
34
  return "No audio provided."
35
 
36
- # If Gradio gives a filepath (str)
 
 
 
37
  if isinstance(audio, str):
38
- audio_path = audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  else:
40
- # Gradio sometimes returns (np_array, sr) or (sr, np_array) depending on version.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  try:
42
- # try (array, sr)
43
- samples, sr = audio
44
- except Exception:
45
- # try (sr, array)
46
- sr, samples = audio
47
- audio_path = save_audio_to_wav(samples, sr)
48
-
49
- # Use the pipeline to transcribe
50
- # pipeline returns a dict with "text"
51
- result = asr(audio_path)
 
 
 
52
  text = result.get("text", "").strip()
53
 
54
  # cleanup temporary file
55
  try:
56
- if not isinstance(audio, str) and os.path.exists(audio_path):
57
- os.remove(audio_path)
58
  except Exception:
59
  pass
60
 
 
4
  import uuid
5
  import soundfile as sf
6
  from pathlib import Path
7
+ import numpy as np
8
+ import logging
9
 
10
  import gradio as gr
11
  from transformers import pipeline
 
16
  ASR_MODEL = "openai/whisper-small"
17
  asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, chunk_length_s=30, ignore_warning=True)
18
 
19
+ # Debug flag: set True to print audio shapes/dtypes and save resampled temp WAVs
20
+ DEBUG = False
21
+ logger = logging.getLogger(__name__)
22
+ if DEBUG:
23
+ logging.basicConfig(level=logging.DEBUG)
24
+
25
  def save_audio_to_wav(audio, sr):
26
  """
27
  audio: numpy array (samples,) or path string
28
  sr: sample rate
29
  Returns path to saved wav
30
  """
31
+ # unwrap common tuple forms (array, sr) or (sr, array)
32
+ if isinstance(audio, (list, tuple)):
33
+ # prefer numpy array element
34
+ arr = next((x for x in audio if isinstance(x, (list, tuple, np.ndarray))), None)
35
+ if isinstance(arr, (list, tuple)):
36
+ audio = np.asarray(arr)
37
+ elif isinstance(arr, np.ndarray):
38
+ audio = arr
39
+ else:
40
+ # fallback to first element
41
+ audio = np.asarray(audio[0])
42
+
43
+ # ensure numpy array
44
+ audio = np.asarray(audio)
45
+
46
+ # If shape is (channels, frames) transpose to (frames, channels)
47
+ if audio.ndim == 2 and audio.shape[0] <= 2 and audio.shape[1] > audio.shape[0]:
48
+ audio = audio.T
49
+
50
+ # Convert integer audio to float32 in [-1, 1] or ensure float32
51
+ if np.issubdtype(audio.dtype, np.integer):
52
+ maxv = np.iinfo(audio.dtype).max
53
+ audio = audio.astype("float32") / float(maxv)
54
+ else:
55
+ audio = audio.astype("float32")
56
+
57
  tmpdir = tempfile.gettempdir()
58
  fname = Path(tmpdir) / f"hf_audio_{uuid.uuid4().hex}.wav"
59
  sf.write(str(fname), audio, sr, format="WAV")
 
67
  if audio is None:
68
  return "No audio provided."
69
 
70
+ # If Gradio gives a filepath (str), read it with soundfile to avoid ffmpeg requirement
71
+ audio_array = None
72
+ sampling_rate = None
73
+
74
  if isinstance(audio, str):
75
+ try:
76
+ audio_array, sampling_rate = sf.read(audio)
77
+ except Exception as e:
78
+ return f"Could not read audio file: {e}"
79
+ else:
80
+ # Normalize audio to (samples, sr)
81
+ samples = None
82
+ sr = None
83
+ if isinstance(audio, (list, tuple)):
84
+ # common forms: (samples, sr) or (sr, samples)
85
+ if len(audio) >= 2:
86
+ a0, a1 = audio[0], audio[1]
87
+ if isinstance(a0, (list, tuple, np.ndarray)):
88
+ samples, sr = a0, a1
89
+ elif isinstance(a1, (list, tuple, np.ndarray)):
90
+ samples, sr = a1, a0
91
+ # fallback: try to find array and int within the tuple
92
+ if samples is None:
93
+ samples = next((x for x in audio if isinstance(x, (list, tuple, np.ndarray))), None)
94
+ sr = next((x for x in audio if isinstance(x, int)), None)
95
+ else:
96
+ samples = audio
97
+
98
+ if samples is None:
99
+ return "Unsupported audio format."
100
+
101
+ # default sr if missing
102
+ if sr is None:
103
+ sr = 16000
104
+
105
+ audio_array = np.asarray(samples)
106
+ sampling_rate = sr
107
+
108
+ # Ensure numpy array and float32
109
+ try:
110
+ audio_array = np.asarray(audio_array)
111
+ except Exception:
112
+ return "Unsupported audio data - cannot convert to numpy array."
113
+
114
+ # If 2D (frames, channels) or (channels, frames), make mono by averaging channels
115
+ if audio_array.ndim == 2:
116
+ # If shape looks like (channels, frames), transpose first
117
+ if audio_array.shape[0] <= 2 and audio_array.shape[1] > audio_array.shape[0]:
118
+ audio_array = audio_array.T
119
+ # average channels to mono
120
+ audio_array = np.mean(audio_array, axis=1)
121
+
122
+ # Convert integer audio to float32 in [-1, 1] or ensure float32
123
+ if np.issubdtype(audio_array.dtype, np.integer):
124
+ maxv = np.iinfo(audio_array.dtype).max
125
+ audio_array = audio_array.astype("float32") / float(maxv)
126
  else:
127
+ audio_array = audio_array.astype("float32")
128
+ # Resample to the model's expected sampling rate if needed (avoid passing sampling_rate kwarg)
129
+ try:
130
+ model_sr = getattr(getattr(asr, "feature_extractor", None), "sampling_rate", None)
131
+ except Exception:
132
+ model_sr = None
133
+
134
+ if model_sr is None:
135
+ model_sr = 16000
136
+
137
+ # if incoming sampling_rate is missing, assume model rate
138
+ if sampling_rate is None:
139
+ sampling_rate = model_sr
140
+
141
+ if sampling_rate != model_sr:
142
+ # simple linear resampling via numpy.interp
143
+ try:
144
+ orig_len = audio_array.shape[0]
145
+ new_len = int(round(orig_len * float(model_sr) / float(sampling_rate)))
146
+ if new_len <= 0:
147
+ return "Transcription failed: invalid resample length"
148
+ new_indices = np.linspace(0, orig_len - 1, new_len)
149
+ old_indices = np.arange(orig_len)
150
+ audio_array = np.interp(new_indices, old_indices, audio_array).astype("float32")
151
+ sampling_rate = model_sr
152
+ except Exception as e:
153
+ return f"Transcription failed during resampling: {e}"
154
+
155
+ # Debug: log and optionally save the resampled audio
156
+ if DEBUG:
157
  try:
158
+ logger.debug(f"Calling ASR with audio_array.shape={audio_array.shape}, dtype={audio_array.dtype}, sampling_rate={sampling_rate}")
159
+ tmpdir = tempfile.gettempdir()
160
+ dbg_fname = Path(tmpdir) / f"hf_debug_audio_{uuid.uuid4().hex}.wav"
161
+ sf.write(str(dbg_fname), audio_array, sampling_rate, format="WAV")
162
+ logger.debug(f"Wrote debug WAV to {dbg_fname}")
163
+ except Exception as e:
164
+ logger.debug(f"Debug save failed: {e}")
165
+
166
+ # Use the pipeline to transcribe by passing just the numpy array (model expects array at its sampling rate)
167
+ try:
168
+ result = asr(audio_array)
169
+ except Exception as e:
170
+ return f"Transcription failed: {e}"
171
  text = result.get("text", "").strip()
172
 
173
  # cleanup temporary file
174
  try:
175
+ pass # Removed cleanup code referencing undefined audio_path
 
176
  except Exception:
177
  pass
178
 
requirements.txt CHANGED
@@ -2,3 +2,4 @@ gradio>=3.34
2
  transformers>=4.30.0
3
  torch # CPU will be used by default on free Spaces
4
  soundfile
 
 
2
  transformers>=4.30.0
3
  torch # CPU will be used by default on free Spaces
4
  soundfile
5
+ numpy