Chia Woon Yap committed on
Commit
f2bf26d
·
verified ·
1 Parent(s): 539cc55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -37
app.py CHANGED
@@ -275,42 +275,7 @@ def process_document(file):
275
  # y /= np.max(np.abs(y))
276
  # return transcriber({"sampling_rate": sr, "raw": y})["text"]
277
 
278
-
279
- #Quick Fixes You Can Try First:
280
-
281
- #def transcribe_audio(audio):
282
- # """Real-time optimized transcription"""
283
- # if audio is None:
284
- # return ""
285
-
286
- # sr, y = audio
287
-
288
- # # Quick preprocessing
289
- # if y.ndim > 1:
290
- # y = y.mean(axis=1)
291
-
292
- # y = y.astype(np.float32)
293
- # max_val = np.max(np.abs(y))
294
- # if max_val > 0:
295
- # y = y / max_val
296
-
297
- # # Use tiny model for real-time speed
298
- # realtime_transcriber = pipeline(
299
- # "automatic-speech-recognition",
300
- # model="openai/whisper-tiny.en", # Fastest model
301
- # device="cuda" if torch.cuda.is_available() else "cpu",
302
- # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
303
- # generate_kwargs={
304
- # "language": "english",
305
- # "task": "transcribe",
306
- # "temperature": 0.0, # More deterministic
307
- # "no_repeat_ngram_size": 2
308
- # }
309
- # )
310
- #
311
- # return realtime_transcriber({"sampling_rate": sr, "raw": y})["text"]
312
- #end
313
-
314
  # Real-time Whisper setup - cache the model
315
  #@gr.cache_resource
316
  def load_realtime_whisper():
@@ -363,9 +328,45 @@ def transcribe_audio(audio):
363
  print(f"Transcription error: {e}")
364
  return "Could not transcribe audio. Please try again."
365
 
 
366
 
367
 
368
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
 
371
 
 
275
  # y /= np.max(np.abs(y))
276
  # return transcriber({"sampling_rate": sr, "raw": y})["text"]
277
 
278
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  # Real-time Whisper setup - cache the model
280
  #@gr.cache_resource
281
  def load_realtime_whisper():
 
328
  print(f"Transcription error: {e}")
329
  return "Could not transcribe audio. Please try again."
330
 
331
+ """
332
 
333
 
334
+ #Common Issue 1: Audio Format Problems
335
# Lazily-built ASR pipeline shared across calls, so the Whisper weights are
# loaded once instead of on every transcription request.
_ASR_PIPELINE_CACHE = {}


def _get_transcriber():
    """Return the shared Whisper ASR pipeline, constructing it on first use."""
    if "asr" not in _ASR_PIPELINE_CACHE:
        _ASR_PIPELINE_CACHE["asr"] = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base.en",
        )
    return _ASR_PIPELINE_CACHE["asr"]


def transcribe_audio(audio):
    """Transcribe a recorded audio clip to text.

    Args:
        audio: ``None`` when nothing was recorded, otherwise a
            ``(sample_rate, samples)`` pair. ``samples`` must be a NumPy
            array; arrays with more than one dimension are averaged over
            axis 1 (assumes a ``(n_samples, n_channels)`` layout -- confirm
            against the audio component that feeds this function).

    Returns:
        The stripped transcription, or a user-facing message when there is
        no audio, the format is unsupported, no speech was recognised, or
        an error occurred. This function never raises.
    """
    if audio is None:
        return "Please record audio first"

    try:
        sr, y = audio

        # Only NumPy sample buffers are supported.
        if not isinstance(y, np.ndarray):
            return "Unsupported audio format"

        if y.ndim > 1:
            y = y.mean(axis=1)  # Stereo to mono
        y = y.astype(np.float32)

        # An empty buffer would make np.max raise and leak a low-level
        # NumPy error to the user; treat it as "nothing recorded".
        if y.size == 0:
            return "Please record audio first"

        # Peak-normalise; skip all-zero (silent) input to avoid dividing by 0.
        peak = np.max(np.abs(y))
        if peak > 0:
            y = y / peak

        # Model construction stays inside the try block so a load failure
        # is reported through the same user-facing message as other errors.
        result = _get_transcriber()({"sampling_rate": sr, "raw": y})
        text = result["text"].strip()

        return text if text else "I heard audio but no clear speech. Try speaking louder."

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Please try again - {str(e)}"
370
 
371
 
372