Chia Woon Yap committed on
Commit
f97b708
·
verified ·
1 Parent(s): 36e9420

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -16
app.py CHANGED
@@ -13,6 +13,10 @@ import time
13
  import groq
14
  import uuid # For generating unique filenames
15
 
 
 
 
 
16
 
17
  # NEW IMPORTS (current):
18
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
@@ -274,6 +278,59 @@ def process_document(file):
274
 
275
  #Quick Fixes You Can Try First:
276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  def transcribe_audio(audio):
278
  """Real-time optimized transcription"""
279
  if audio is None:
@@ -283,28 +340,35 @@ def transcribe_audio(audio):
283
 
284
  # Quick preprocessing
285
  if y.ndim > 1:
286
- y = y.mean(axis=1)
287
 
288
  y = y.astype(np.float32)
289
  max_val = np.max(np.abs(y))
290
  if max_val > 0:
291
  y = y / max_val
292
 
293
- # Use tiny model for real-time speed
294
- realtime_transcriber = pipeline(
295
- "automatic-speech-recognition",
296
- model="openai/whisper-tiny.en", # Fastest model
297
- device="cuda" if torch.cuda.is_available() else "cpu",
298
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
299
- generate_kwargs={
300
- "language": "english",
301
- "task": "transcribe",
302
- "temperature": 0.0, # More deterministic
303
- "no_repeat_ngram_size": 2
304
- }
305
- )
306
-
307
- return realtime_transcriber({"sampling_rate": sr, "raw": y})["text"]
 
 
 
 
 
 
 
308
 
309
  # the remaining is the same
310
 
 
13
  import groq
14
  import uuid # For generating unique filenames
15
 
16
+ # Add torch imports at the top
17
+ import torch
18
+ import torchaudio
19
+
20
 
21
  # NEW IMPORTS (current):
22
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
 
278
 
279
  #Quick Fixes You Can Try First:
280
 
281
+ #def transcribe_audio(audio):
282
+ # """Real-time optimized transcription"""
283
+ # if audio is None:
284
+ # return ""
285
+
286
+ # sr, y = audio
287
+
288
+ # # Quick preprocessing
289
+ # if y.ndim > 1:
290
+ # y = y.mean(axis=1)
291
+
292
+ # y = y.astype(np.float32)
293
+ # max_val = np.max(np.abs(y))
294
+ # if max_val > 0:
295
+ # y = y / max_val
296
+
297
+ # # Use tiny model for real-time speed
298
+ # realtime_transcriber = pipeline(
299
+ # "automatic-speech-recognition",
300
+ # model="openai/whisper-tiny.en", # Fastest model
301
+ # device="cuda" if torch.cuda.is_available() else "cpu",
302
+ # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
303
+ # generate_kwargs={
304
+ # "language": "english",
305
+ # "task": "transcribe",
306
+ # "temperature": 0.0, # More deterministic
307
+ # "no_repeat_ngram_size": 2
308
+ # }
309
+ # )
310
+ #
311
+ # return realtime_transcriber({"sampling_rate": sr, "raw": y})["text"]
312
+ #end
313
+
314
+ # Real-time Whisper setup - cache the model
315
+ @gr.cache_resource
316
+ def load_realtime_whisper():
317
+ """Load optimized Whisper model for real-time transcription"""
318
+ device = "cuda" if torch.cuda.is_available() else "cpu"
319
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
320
+
321
+ # Use tiny model for real-time speed
322
+ realtime_transcriber = pipeline(
323
+ "automatic-speech-recognition",
324
+ model="openai/whisper-tiny.en",
325
+ device=device,
326
+ torch_dtype=torch_dtype,
327
+ )
328
+
329
+ return realtime_transcriber
330
+
331
+ # Load model at startup
332
+ realtime_transcriber = load_realtime_whisper()
333
+
334
  def transcribe_audio(audio):
335
  """Real-time optimized transcription"""
336
  if audio is None:
 
340
 
341
  # Quick preprocessing
342
  if y.ndim > 1:
343
+ y = y.mean(axis=1) # Convert to mono
344
 
345
  y = y.astype(np.float32)
346
  max_val = np.max(np.abs(y))
347
  if max_val > 0:
348
  y = y / max_val
349
 
350
+ try:
351
+ # Use real-time transcriber with optimized settings
352
+ result = realtime_transcriber(
353
+ {"sampling_rate": sr, "raw": y},
354
+ generate_kwargs={
355
+ "language": "english",
356
+ "task": "transcribe",
357
+ "temperature": 0.0, # More deterministic
358
+ "no_repeat_ngram_size": 2, # Reduce repetitions
359
+ }
360
+ )
361
+ return result["text"]
362
+ except Exception as e:
363
+ print(f"Transcription error: {e}")
364
+ return "Could not transcribe audio. Please try again."
365
+
366
+
367
+
368
+
369
+
370
+
371
+
372
 
373
  # the remaining is the same
374