Chia Woon Yap committed on
Commit
dafdf4f
·
verified ·
1 Parent(s): d6f71f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -9
app.py CHANGED
@@ -29,6 +29,10 @@ import gtts # Google Text-to-Speech library
29
  from pptx import Presentation # python-pptx for PowerPoint files
30
  import re
31
 
 
 
 
 
32
  # Set API Key
33
  groq.api_key = os.getenv("GROQ_API_KEY")
34
 
@@ -246,37 +250,91 @@ def process_document(file):
246
  except Exception as e:
247
  return f"Error processing document: {str(e)}"
248
 
 
 
249
  # Function to handle speech-to-text conversion
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
# Lazily-created speech-recognition pipeline, shared by every call so the
# Whisper weights are loaded once instead of on each request.
_transcriber = None


def _get_transcriber():
    """Return the shared Whisper pipeline, building it on first use."""
    global _transcriber
    if _transcriber is None:
        _transcriber = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base.en",
        )
    return _transcriber


def transcribe_audio(audio):
    """Transcribe a recorded clip to text with Whisper.

    Args:
        audio: ``None`` when nothing was recorded, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array (1-D mono, or 2-D with channels on axis 1).

    Returns:
        str: The transcript, or a user-facing message when there is no
        recording, no recognisable speech, or processing failed.
    """
    if audio is None:
        return "Please record audio first"

    try:
        sr, y = audio
        y = np.asarray(y)

        # np.max() raises on a zero-size array; report "no speech" instead
        # of leaking a NumPy reduction error to the user.
        if y.size == 0:
            return "No clear speech detected. Try speaking louder."

        if y.ndim > 1:
            y = y.mean(axis=1)  # Convert to mono

        y = y.astype(np.float32)

        # Peak-normalise; skip when the clip is all zeros to avoid 0/0.
        max_val = np.max(np.abs(y))
        if max_val > 0:
            y = y / max_val

        result = _get_transcriber()({"sampling_rate": sr, "raw": y})
        text = result["text"].strip()

        return text if text else "No clear speech detected. Try speaking louder."

    except Exception as e:
        return f"Recording error: {str(e)}"
 
 
280
 
281
  # Clear chat history function
282
  def clear_chat_history():
 
29
  from pptx import Presentation # python-pptx for PowerPoint files
30
  import re
31
 
32
+ import torch
33
+ import torchaudio
34
+ from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
35
+
36
  # Set API Key
37
  groq.api_key = os.getenv("GROQ_API_KEY")
38
 
 
250
  except Exception as e:
251
  return f"Error processing document: {str(e)}"
252
 
253
+
254
+
255
  # Function to handle speech-to-text conversion
256
+
257
+ # Initialize Whisper model globally to avoid reloading
258
def initialize_whisper_model():
    """Build the speech-recognition pipeline used for transcription.

    Tries the ``openai/whisper-small.en`` checkpoint first, placed on CUDA
    in half precision when a GPU is available and on CPU in single
    precision otherwise. If that construction raises, the error is printed
    and a default-configured ``openai/whisper-base.en`` pipeline is
    returned instead.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    # "openai/whisper-medium.en" is a drop-in alternative for more accuracy.
    primary_checkpoint = "openai/whisper-small.en"
    try:
        if torch.cuda.is_available():
            target_device, precision = "cuda", torch.float16
        else:
            target_device, precision = "cpu", torch.float32
        return pipeline(
            "automatic-speech-recognition",
            model=primary_checkpoint,
            torch_dtype=precision,
            device=target_device,
        )
    except Exception as e:
        print(f"Error initializing Whisper model: {e}")
        # Fall back to the smaller base checkpoint with default settings.
        return pipeline("automatic-speech-recognition", model="openai/whisper-base.en")


# Built once at import time so requests do not reload the model.
whisper_model = initialize_whisper_model()
277
+
278
def transcribe_audio(audio):
    """Transcribe a recorded clip to English text with the shared Whisper model.

    The samples are down-mixed to mono, peak-normalised, trimmed of leading
    and trailing silence, and then passed to the module-level
    ``whisper_model`` pipeline.

    Args:
        audio: ``None`` when nothing was recorded, otherwise a
            ``(sampling_rate, samples)`` pair where ``samples`` is a NumPy
            array (1-D mono, or 2-D with channels on axis 1).

    Returns:
        str: The transcript, or a user-facing message when there is no
        recording, no detectable speech, the clip is shorter than 0.5 s
        after trimming, or processing failed.
    """
    if audio is None:
        return "Please record audio first"

    try:
        sr, y = audio
        y = np.asarray(y)

        # np.max() raises on a zero-size array; treat an empty buffer the
        # same as a silent one rather than surfacing a NumPy error.
        if y.size == 0:
            return "No speech detected. Please speak louder or check your microphone."

        if y.ndim > 1:
            y = y.mean(axis=1)  # Convert to mono

        y = y.astype(np.float32)

        # Peak-normalise; skip when the clip is all zeros to avoid 0/0.
        max_val = np.max(np.abs(y))
        if max_val > 0:
            y = y / max_val

        # Simple threshold-based silence detection on the normalised signal.
        silence_threshold = 0.01
        non_silent_indices = np.flatnonzero(np.abs(y) > silence_threshold)
        if non_silent_indices.size == 0:
            return "No speech detected. Please speak louder or check your microphone."

        # Keep everything between the first and last non-silent sample.
        y_trimmed = y[non_silent_indices[0]:non_silent_indices[-1] + 1]

        if len(y_trimmed) / sr < 0.5:  # Less than 0.5 seconds
            return "Audio too short. Please speak for at least 1-2 seconds."

        # No `task`/`language` generate_kwargs here: both checkpoints loaded
        # by this file are English-only (".en"), and Whisper generation
        # rejects those arguments for English-only models.
        result = whisper_model(
            {"sampling_rate": sr, "raw": y_trimmed},
            return_timestamps=False,
        )

        text = result["text"].strip()

        # Ignore case and surrounding punctuation so that filler outputs
        # such as "Thank you." or "You" are filtered as well.
        if text.lower().strip(" .,!?") in {"", "you", "thank you"}:
            return "No clear speech detected. Try speaking more clearly or in a quieter environment."

        return text

    except Exception as e:
        error_msg = f"Transcription error: {str(e)}"
        print(error_msg)
        return f"Sorry, I couldn't process the audio. Please try again or type your message instead. Error: {str(e)}"
338
 
339
  # Clear chat history function
340
  def clear_chat_history():