Chia Woon Yap committed on
Commit
7242b45
·
verified ·
1 Parent(s): fb80bae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -89
app.py CHANGED
@@ -7,12 +7,12 @@ Original file is located at
7
 
8
  import gradio as gr
9
  import numpy as np
10
- from transformers import pipeline
11
  import os
12
  import time
13
  import groq
14
  import uuid
15
  import re
 
16
 
17
  # LangChain imports
18
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
@@ -29,8 +29,6 @@ import docx # python-docx for Word files
29
  import gtts # Google Text-to-Speech library
30
  from pptx import Presentation # python-pptx for PowerPoint files
31
 
32
- import torch
33
-
34
  # Set API Key
35
  groq.api_key = os.getenv("GROQ_API_KEY")
36
 
@@ -85,41 +83,14 @@ Answer: d) 0.4
85
  Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
86
  """
87
 
88
- # Fixed Whisper Implementation
89
- class FixedWhisperTranscriber:
90
  def __init__(self):
91
- self.device = 0 if torch.cuda.is_available() else "cpu"
92
- print(f"Using device: {self.device}")
93
-
94
- # Try multiple models in order
95
- self.model = self._load_model()
96
-
97
- def _load_model(self):
98
- """Try loading different models until one works"""
99
- models_to_try = [
100
- "openai/whisper-base",
101
- "openai/whisper-tiny",
102
- "openai/whisper-small",
103
- ]
104
-
105
- for model_name in models_to_try:
106
- try:
107
- print(f"Trying to load: {model_name}")
108
- pipe = pipeline(
109
- "automatic-speech-recognition",
110
- model=model_name,
111
- device=self.device,
112
- )
113
- print(f"✅ Successfully loaded: {model_name}")
114
- return pipe
115
- except Exception as e:
116
- print(f"❌ Failed to load {model_name}: {e}")
117
- continue
118
-
119
- raise Exception("All models failed to load")
120
 
121
  def transcribe_audio(self, audio):
122
- """Robust transcription with proper error handling"""
123
  if audio is None:
124
  return "Please record audio first"
125
 
@@ -136,74 +107,74 @@ class FixedWhisperTranscriber:
136
  if y.ndim > 1:
137
  y = np.mean(y, axis=1)
138
 
139
- # Convert to float32 and normalize
140
  y = y.astype(np.float32)
141
- if np.max(np.abs(y)) > 0:
142
- y = y / np.max(np.abs(y))
143
 
144
- # Check audio quality
 
 
 
 
 
145
  audio_duration = len(y) / sr
146
  print(f"Audio duration: {audio_duration:.2f} seconds")
147
 
148
  if audio_duration < 0.5:
149
  return "Audio too short. Speak for at least 1 second."
150
 
151
- if audio_duration > 30:
152
- return "Audio too long. Keep it under 30 seconds."
 
 
 
 
 
 
153
 
154
- # Prepare audio for Whisper
155
- audio_dict = {"array": y, "sampling_rate": sr}
156
 
157
- print("Starting transcription...")
 
158
 
159
- # Simple transcription call
160
- result = self.model(audio_dict)
161
- transcription = result["text"].strip()
162
 
163
- print(f"Raw transcription: '{transcription}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # Filter out garbage outputs
166
- if self._is_garbage_transcription(transcription):
167
- return "No clear speech detected. Please try again with clearer audio."
168
 
169
- return transcription
170
 
171
  except Exception as e:
172
- print(f"Transcription error: {str(e)}")
 
 
 
 
 
 
173
  return f"Transcription failed: {str(e)}"
174
-
175
- def _is_garbage_transcription(self, text):
176
- """Check if transcription is garbage"""
177
- if not text:
178
- return True
179
-
180
- # Common garbage patterns
181
- garbage_patterns = [
182
- r"^(oh,\s*)+oh$",
183
- r"^(ah,\s*)+ah$",
184
- r"^(\w+,\s*)+\w+$", # Repeated single words
185
- ]
186
-
187
- text_lower = text.lower().strip()
188
-
189
- for pattern in garbage_patterns:
190
- if re.match(pattern, text_lower):
191
- return True
192
-
193
- # Check if it's just repetitive nonsense
194
- words = text_lower.split()
195
- if len(words) > 10:
196
- unique_words = len(set(words))
197
- if unique_words / len(words) < 0.3: # Too repetitive
198
- return True
199
-
200
- return False
201
 
202
  # Initialize transcriber
203
  try:
204
- transcriber = FixedWhisperTranscriber()
 
205
  except Exception as e:
206
- print(f"Failed to initialize transcriber: {e}")
207
  transcriber = None
208
 
209
  def transcribe_audio(audio):
@@ -227,7 +198,7 @@ def get_transcription_status(audio):
227
  elif duration > 10:
228
  return "Processing longer audio..."
229
  else:
230
- return "Processing audio..."
231
  except:
232
  return "Ready to record"
233
 
@@ -445,11 +416,11 @@ def tutor_ai_chatbot():
445
  - πŸ—£οΈ Speak at normal volume and pace
446
  - πŸ“± Use a good quality microphone
447
 
448
- **If you see 'oh oh oh' errors:**
449
- - Your audio might be too noisy
450
- - Try recording in a quieter place
451
- - Speak more clearly and slowly
452
- - Use headphones with microphone
453
  """)
454
 
455
  # Clear chat history button
@@ -499,7 +470,7 @@ def tutor_ai_chatbot():
499
  inputs=audio_input,
500
  outputs=msg
501
  ).then(
502
- fn=lambda x: "Transcription completed!" if x and "failed" not in x.lower() and "error" not in x.lower() and "sorry" not in x.lower() else "Ready for new recording",
503
  inputs=msg,
504
  outputs=transcription_status
505
  )
 
7
 
8
  import gradio as gr
9
  import numpy as np
 
10
  import os
11
  import time
12
  import groq
13
  import uuid
14
  import re
15
+ import tempfile
16
 
17
  # LangChain imports
18
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
 
29
  import gtts # Google Text-to-Speech library
30
  from pptx import Presentation # python-pptx for PowerPoint files
31
 
 
 
32
  # Set API Key
33
  groq.api_key = os.getenv("GROQ_API_KEY")
34
 
 
83
  Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
84
  """
85
 
86
+ # Groq Whisper Transcriber - RELIABLE SOLUTION
87
+ class GroqWhisperTranscriber:
88
  def __init__(self):
89
+ self.client = groq.Client(api_key=groq.api_key)
90
+ print("✅ Groq Whisper transcriber initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  def transcribe_audio(self, audio):
93
+ """Transcribe audio using Groq's reliable Whisper API"""
94
  if audio is None:
95
  return "Please record audio first"
96
 
 
107
  if y.ndim > 1:
108
  y = np.mean(y, axis=1)
109
 
110
+ # Convert to proper format
111
  y = y.astype(np.float32)
 
 
112
 
113
+ # Normalize audio
114
+ max_val = np.max(np.abs(y))
115
+ if max_val > 0:
116
+ y = y / max_val
117
+
118
+ # Check audio duration
119
  audio_duration = len(y) / sr
120
  print(f"Audio duration: {audio_duration:.2f} seconds")
121
 
122
  if audio_duration < 0.5:
123
  return "Audio too short. Speak for at least 1 second."
124
 
125
+ if audio_duration > 60:
126
+ return "Audio too long. Keep it under 60 seconds."
127
+
128
+ # Convert to 16-bit PCM for WAV file
129
+ y_int16 = (y * 32767).astype(np.int16)
130
+
131
+ # Create temporary WAV file
132
+ import scipy.io.wavfile
133
 
134
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
135
+ temp_path = f.name
136
 
137
+ # Save as WAV file
138
+ scipy.io.wavfile.write(temp_path, sr, y_int16)
139
 
140
+ print("Sending to Groq Whisper API...")
 
 
141
 
142
+ # Transcribe with Groq API - USE TURBO VERSION
143
+ with open(temp_path, "rb") as audio_file:
144
+ transcription = self.client.audio.transcriptions.create(
145
+ file=(temp_path, audio_file.read(), "audio/wav"),
146
+ model="whisper-large-v3-turbo", # Use the best model
147
+ response_format="text",
148
+ language="en" # Optional: specify English for better accuracy
149
+ )
150
+
151
+ # Clean up temporary file
152
+ os.unlink(temp_path)
153
+
154
+ text = transcription.strip()
155
+ print(f"Groq transcription: '{text}'")
156
 
157
+ if not text:
158
+ return "No speech detected. Please try again."
 
159
 
160
+ return text
161
 
162
  except Exception as e:
163
+ print(f"Groq transcription error: {str(e)}")
164
+ # Clean up temp file if it exists
165
+ try:
166
+ if 'temp_path' in locals():
167
+ os.unlink(temp_path)
168
+ except:
169
+ pass
170
  return f"Transcription failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  # Initialize transcriber
173
  try:
174
+ transcriber = GroqWhisperTranscriber()
175
+ print("✅ Transcriber initialized successfully with Groq API")
176
  except Exception as e:
177
+ print(f"❌ Failed to initialize transcriber: {e}")
178
  transcriber = None
179
 
180
  def transcribe_audio(audio):
 
198
  elif duration > 10:
199
  return "Processing longer audio..."
200
  else:
201
+ return "Processing audio with Groq API..."
202
  except:
203
  return "Ready to record"
204
 
 
416
  - πŸ—£οΈ Speak at normal volume and pace
417
  - πŸ“± Use a good quality microphone
418
 
419
+ **Using Groq Whisper API:**
420
+ - ✅ High accuracy transcription
421
+ - ✅ No more "B-B-B" or "oh-oh-oh" errors
422
+ - ✅ Fast and reliable
423
+ - ✅ Professional grade speech recognition
424
  """)
425
 
426
  # Clear chat history button
 
470
  inputs=audio_input,
471
  outputs=msg
472
  ).then(
473
+ fn=lambda x: "✅ Transcription completed!" if x and "failed" not in x.lower() and "error" not in x.lower() and "sorry" not in x.lower() else "Ready for new recording",
474
  inputs=msg,
475
  outputs=transcription_status
476
  )