Chia Woon Yap committed on
Commit
fb80bae
·
verified ·
1 Parent(s): f8542fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -116
app.py CHANGED
@@ -12,6 +12,7 @@ import os
12
  import time
13
  import groq
14
  import uuid
 
15
 
16
  # LangChain imports
17
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
@@ -27,10 +28,8 @@ import fitz # PyMuPDF for PDFs
27
  import docx # python-docx for Word files
28
  import gtts # Google Text-to-Speech library
29
  from pptx import Presentation # python-pptx for PowerPoint files
30
- import re
31
 
32
  import torch
33
- import torchaudio
34
 
35
  # Set API Key
36
  groq.api_key = os.getenv("GROQ_API_KEY")
@@ -86,143 +85,151 @@ Answer: d) 0.4
86
  Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
87
  """
88
 
89
- # Simplified and Robust Whisper Transcriber
90
- class SimpleWhisperTranscriber:
91
- def __init__(self, model_name="openai/whisper-base.en"):
92
  self.device = 0 if torch.cuda.is_available() else "cpu"
93
- self.model_name = model_name
94
 
95
- print(f"Initializing Whisper model: {model_name} on {self.device}")
 
 
 
 
 
 
 
 
 
96
 
97
- try:
98
- # Simplified pipeline with minimal parameters
99
- self.pipe = pipeline(
100
- task="automatic-speech-recognition",
101
- model=model_name,
102
- device=self.device,
103
- )
104
- print("✅ Whisper model loaded successfully")
105
- except Exception as e:
106
- print(f"❌ Error loading Whisper model: {e}")
107
- # Fallback to tiny model if base fails
108
- self.pipe = pipeline(
109
- task="automatic-speech-recognition",
110
- model="openai/whisper-tiny.en",
111
- device=self.device,
112
- )
113
 
114
- def transcribe_numpy(self, sr, y):
115
- """Simplified and robust transcription"""
 
 
 
116
  try:
117
- print(f"Audio shape: {y.shape}, Sample rate: {sr}")
118
 
119
- # Basic preprocessing - keep it simple
120
- if y.ndim > 1:
121
- y = y.mean(axis=1) # Convert to mono
122
 
123
- # Convert to proper data type
124
- y = y.astype(np.float32)
 
125
 
126
- # Simple normalization
127
- max_val = np.max(np.abs(y))
128
- if max_val > 0:
129
- y = y / max_val
130
 
131
- print(f"After preprocessing - shape: {y.shape}, max: {np.max(y)}, min: {np.min(y)}")
 
 
 
132
 
133
- # Check audio length
134
  audio_duration = len(y) / sr
135
  print(f"Audio duration: {audio_duration:.2f} seconds")
136
 
137
- if audio_duration < 0.3:
138
- return "Audio too short. Please speak for at least 1 second."
139
 
140
- # Create audio input - SIMPLIFIED
141
- audio_input = {
142
- "array": y,
143
- "sampling_rate": sr
144
- }
145
 
146
- # Simple transcription call
147
  print("Starting transcription...")
148
- result = self.pipe(audio_input)
149
- print("Transcription completed")
150
 
151
- text = result["text"].strip()
152
- print(f"Raw transcription: '{text}'")
 
153
 
154
- if not text:
155
- return "No speech detected. Please try speaking more clearly."
156
 
157
- # Check for common false positives
158
- false_positives = ["", "you", "thank you", "thanks for watching", "hello", "hi"]
159
- if text.lower() in false_positives:
160
- return "No meaningful speech detected. Please try again with clearer audio."
161
 
162
- return text
163
 
164
  except Exception as e:
165
- error_msg = f"Transcription error: {str(e)}"
166
- print(f" {error_msg}")
167
- # Return more specific error message
168
- return f"Audio processing failed: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
- # Initialize the transcriber
171
  try:
172
- transcriber = SimpleWhisperTranscriber()
173
- print("✅ Transcriber initialized successfully")
174
  except Exception as e:
175
- print(f"Failed to initialize transcriber: {e}")
176
  transcriber = None
177
 
 
 
 
 
 
 
 
178
  def get_transcription_status(audio):
179
- """Provide status feedback for transcription"""
180
  if audio is None:
181
- return "Ready to record audio"
182
 
183
  try:
184
  sr, y = audio
185
  duration = len(y) / sr if sr > 0 else 0
186
 
187
  if duration < 0.5:
188
- return "Audio too short - please record at least 1 second"
189
- elif duration > 60 and not torch.cuda.is_available():
190
- return "Long audio detected on CPU - this may take a while..."
191
  else:
192
- device = "GPU" if torch.cuda.is_available() else "CPU"
193
- return f"Processing {duration:.1f}s audio on {device}..."
194
- except Exception as e:
195
- return f"Error analyzing audio: {str(e)}"
196
-
197
- def transcribe_audio(audio):
198
- """Main transcription function with better error handling"""
199
- if audio is None:
200
- return "Please record audio first"
201
-
202
- if transcriber is None:
203
- return "Transcription service not available. Please type your message."
204
-
205
- try:
206
- sr, y = audio
207
-
208
- # Basic validation
209
- if sr is None or y is None or len(y) == 0:
210
- return "Invalid audio data. Please try recording again."
211
-
212
- print(f"=== Starting Transcription ===")
213
- print(f"Sample rate: {sr}, Audio length: {len(y)}")
214
-
215
- result = transcriber.transcribe_numpy(sr, y)
216
-
217
- print(f"=== Transcription Result ===")
218
- print(f"Result: '{result}'")
219
-
220
- return result
221
-
222
- except Exception as e:
223
- error_msg = f"Unexpected error: {str(e)}"
224
- print(f"❌ {error_msg}")
225
- return "Failed to process audio. Please try typing your message instead."
226
 
227
  # Function to clean AI response by removing unwanted formatting
228
  def clean_response(response):
@@ -424,25 +431,25 @@ def tutor_ai_chatbot():
424
  transcription_status = gr.Textbox(
425
  label="Transcription Status",
426
  interactive=False,
427
- value="Record audio to see status here",
428
  max_lines=2
429
  )
430
 
431
  # Voice recording tips - ONLY in AI Chatbot tab
432
  with gr.Accordion("Voice Recording Tips", open=False):
433
  gr.Markdown("""
434
- **For better speech recognition accuracy:**
435
- - Speak clearly and at a moderate pace
436
- - Record in a quiet environment
437
- - Keep the microphone close to your mouth (10-15 cm)
438
- - Use a good quality microphone if possible
439
- - Review the transcribed text before sending
440
- - If transcription is poor, try recording again or type manually
441
 
442
- **Performance Info:**
443
- - GPU: Fast transcription (2-5 seconds)
444
- - CPU: Slower but functional (10-30 seconds for longer audio)
445
- - Using model: whisper-base.en (optimized for accuracy/speed balance)
 
446
  """)
447
 
448
  # Clear chat history button
@@ -492,7 +499,7 @@ def tutor_ai_chatbot():
492
  inputs=audio_input,
493
  outputs=msg
494
  ).then(
495
- fn=lambda x: "Transcription completed!" if x and x != "Please record audio first" else "Ready for new recording",
496
  inputs=msg,
497
  outputs=transcription_status
498
  )
 
12
  import time
13
  import groq
14
  import uuid
15
+ import re
16
 
17
  # LangChain imports
18
  from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
 
28
  import docx # python-docx for Word files
29
  import gtts # Google Text-to-Speech library
30
  from pptx import Presentation # python-pptx for PowerPoint files
 
31
 
32
  import torch
 
33
 
34
  # Set API Key
35
  groq.api_key = os.getenv("GROQ_API_KEY")
 
85
  Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
86
  """
87
 
88
+ # Fixed Whisper Implementation
89
class FixedWhisperTranscriber:
    """Speech-to-text helper built on an automatic-speech-recognition pipeline.

    On construction the class tries a short list of Whisper checkpoints and
    keeps the first one that loads.  ``transcribe_audio`` then validates and
    normalises a recording, runs the model, and filters out the repetitive
    filler text the model tends to emit for noisy or silent audio.

    Every user-facing outcome of ``transcribe_audio`` (including failures) is
    returned as a string rather than raised.
    """

    # Checkpoints are tried in this order; the first that loads is used.
    MODEL_CANDIDATES = (
        "openai/whisper-base",
        "openai/whisper-tiny",
        "openai/whisper-small",
    )

    # Accepted recording length, in seconds.
    MIN_DURATION_S = 0.5
    MAX_DURATION_S = 30

    # "oh, oh" / "ah ah ah." - filler syllables repeated at least twice.
    _FILLER_RE = re.compile(r"^(oh|ah)(?:[\s,.!?]+\1)+\W*$")
    # The SAME word three or more times in a row ("the, the, the").  The
    # backreference is essential: without it any comma-separated list of
    # words ("red, green, blue") would be rejected as garbage.
    _REPEATED_WORD_RE = re.compile(r"^(\w+)(?:[\s,.!?]+\1){2,}\W*$")
    # Word tokens with punctuation stripped, so "oh," and "oh" count as one.
    _WORD_RE = re.compile(r"[\w']+")

    def __init__(self):
        # Device index 0 when CUDA is available, otherwise the CPU.
        self.device = 0 if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Try multiple models in order
        self.model = self._load_model()

    def _load_model(self):
        """Return the first ASR pipeline from ``MODEL_CANDIDATES`` that loads.

        Raises:
            RuntimeError: if every candidate fails; the last load error is
                attached as the cause.
        """
        last_error = None
        for model_name in self.MODEL_CANDIDATES:
            try:
                print(f"Trying to load: {model_name}")
                pipe = pipeline(
                    "automatic-speech-recognition",
                    model=model_name,
                    device=self.device,
                )
            except Exception as e:
                # Loading can fail for many reasons (download, memory, ...);
                # report it and fall through to the next candidate.
                print(f"❌ Failed to load {model_name}: {e}")
                last_error = e
            else:
                print(f"✅ Successfully loaded: {model_name}")
                return pipe

        raise RuntimeError("All models failed to load") from last_error

    def transcribe_audio(self, audio):
        """Transcribe one recording and return text or a status message.

        Args:
            audio: ``None`` or a ``(sample_rate, samples)`` pair.  ``samples``
                may be one-dimensional, or two-dimensional with channels on
                axis 1 (averaged down to mono).

        Returns:
            The stripped transcription, or a human-readable message when the
            input is missing, empty, too short, too long, unintelligible, or
            when processing fails.
        """
        if audio is None:
            return "Please record audio first"

        try:
            sr, y = audio

            print(f"Audio received - Sample rate: {sr}, Length: {len(y)}")

            # Basic validation
            if len(y) == 0:
                return "Empty audio detected"
            if not sr or sr <= 0:
                # Would otherwise surface as an opaque "division by zero".
                return "Transcription failed: invalid sample rate"

            # Reject out-of-range recordings before doing any array work.
            audio_duration = len(y) / sr
            print(f"Audio duration: {audio_duration:.2f} seconds")

            if audio_duration < self.MIN_DURATION_S:
                return "Audio too short. Speak for at least 1 second."

            if audio_duration > self.MAX_DURATION_S:
                return "Audio too long. Keep it under 30 seconds."

            # Convert to mono if stereo
            y = np.asarray(y)
            if y.ndim > 1:
                y = y.mean(axis=1)

            # Convert to float32 and peak-normalise (skipped for pure silence
            # to avoid dividing by zero).
            y = y.astype(np.float32)
            peak = float(np.max(np.abs(y)))
            if peak > 0:
                y = y / peak

            # Prepare audio for Whisper
            audio_dict = {"array": y, "sampling_rate": sr}

            print("Starting transcription...")
            result = self.model(audio_dict)
            transcription = result["text"].strip()

            print(f"Raw transcription: '{transcription}'")

            # Filter out garbage outputs
            if self._is_garbage_transcription(transcription):
                return "No clear speech detected. Please try again with clearer audio."

            return transcription

        except Exception as e:
            # Boundary handler: the caller displays the returned string, so a
            # failure is reported as text instead of propagating.
            print(f"Transcription error: {str(e)}")
            return f"Transcription failed: {str(e)}"

    def _is_garbage_transcription(self, text):
        """Return True when ``text`` looks like model noise rather than speech.

        Text is treated as garbage when it is empty, is a repeated "oh"/"ah"
        filler, is one word repeated three or more times, or is longer than
        ten words with fewer than 30% of them distinct.
        """
        if not text:
            return True

        text_lower = text.lower().strip()

        if self._FILLER_RE.match(text_lower) or self._REPEATED_WORD_RE.match(text_lower):
            return True

        # Check if it's just repetitive nonsense
        words = self._WORD_RE.findall(text_lower)
        if len(words) > 10 and len(set(words)) / len(words) < 0.3:
            return True

        return False
201
 
202
# Build the shared transcriber once at import time.  It stays None when no
# model can be loaded, which callers treat as "speech recognition unavailable".
transcriber = None
try:
    transcriber = FixedWhisperTranscriber()
except Exception as init_error:
    print(f"Failed to initialize transcriber: {init_error}")
208
 
209
def transcribe_audio(audio):
    """Transcribe ``audio`` with the module-level transcriber.

    Returns the transcriber's result, or a fixed notice when the transcriber
    could not be initialised.
    """
    if transcriber is not None:
        return transcriber.transcribe_audio(audio)
    return "Speech recognition not available"
215
+
216
def get_transcription_status(audio):
    """Return a short status message for the current recording.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` pair.

    Returns:
        A prompt to start recording when ``audio`` is ``None``; otherwise a
        message chosen by the recording's duration in seconds (under 0.5,
        over 10, or in between).  Input that cannot be measured yields
        "Ready to record".
    """
    if audio is None:
        return "Click record to start"

    try:
        sr, y = audio
        # A non-positive sample rate is treated as zero-length audio.
        duration = len(y) / sr if sr > 0 else 0
    except Exception:
        # Best-effort status only: malformed input falls back to the idle
        # message.  Unlike a bare ``except:``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return "Ready to record"

    if duration < 0.5:
        return "Recording... (keep speaking)"
    if duration > 10:
        return "Processing longer audio..."
    return "Processing audio..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  # Function to clean AI response by removing unwanted formatting
235
  def clean_response(response):
 
431
  transcription_status = gr.Textbox(
432
  label="Transcription Status",
433
  interactive=False,
434
+ value="Click record to start",
435
  max_lines=2
436
  )
437
 
438
  # Voice recording tips - ONLY in AI Chatbot tab
439
  with gr.Accordion("Voice Recording Tips", open=False):
440
  gr.Markdown("""
441
+ **For perfect transcription:**
442
+ - 🎤 Speak clearly and directly into microphone
443
+ - 🔇 Record in QUIET environment (no background noise)
444
+ - 📏 Keep recording between 2-10 seconds
445
+ - 🗣️ Speak at normal volume and pace
446
+ - 📱 Use a good quality microphone
 
447
 
448
+ **If you see 'oh oh oh' errors:**
449
+ - Your audio might be too noisy
450
+ - Try recording in a quieter place
451
+ - Speak more clearly and slowly
452
+ - Use headphones with microphone
453
  """)
454
 
455
  # Clear chat history button
 
499
  inputs=audio_input,
500
  outputs=msg
501
  ).then(
502
+ fn=lambda x: "Transcription completed!" if x and "failed" not in x.lower() and "error" not in x.lower() and "sorry" not in x.lower() else "Ready for new recording",
503
  inputs=msg,
504
  outputs=transcription_status
505
  )