Chia Woon Yap committed on
Commit
93ff155
·
verified ·
1 Parent(s): 1376d34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -87
app.py CHANGED
@@ -31,7 +31,6 @@ import re
31
 
32
  import torch
33
  import torchaudio
34
- from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
35
 
36
  # Set API Key
37
  groq.api_key = os.getenv("GROQ_API_KEY")
@@ -87,133 +86,143 @@ Answer: d) 0.4
87
  Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
88
  """
89
 
90
- # Enhanced Whisper Transcriber with Chunked Processing
91
- class EnhancedWhisperTranscriber:
92
- def __init__(self, model_name=None):
93
- # Auto-select optimal model based on hardware
94
- if model_name is None:
95
- model_name = self.get_optimal_model()
96
-
97
  self.device = 0 if torch.cuda.is_available() else "cpu"
98
  self.model_name = model_name
99
 
100
  print(f"Initializing Whisper model: {model_name} on {self.device}")
101
 
102
- self.pipe = pipeline(
103
- task="automatic-speech-recognition",
104
- model=model_name,
105
- chunk_length_s=30, # Process in 30-second chunks
106
- stride_length_s=5, # 5-second overlap between chunks
107
- device=self.device,
108
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
109
- )
110
-
111
- def get_optimal_model(self):
112
- """Automatically select the best model for available hardware"""
113
- if torch.cuda.is_available():
114
- gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
115
- if gpu_memory > 8: # 8GB+ VRAM
116
- return "openai/whisper-small.en"
117
- else: # Limited VRAM
118
- return "openai/whisper-base.en"
119
- else: # CPU only
120
- return "openai/whisper-base.en" # Balanced choice for CPU
121
 
122
- def transcribe_numpy(self, sr, y, return_timestamps=False):
123
- """Transcribe numpy array audio with chunked processing"""
124
  try:
125
- # Enhanced audio preprocessing
 
 
126
  if y.ndim > 1:
127
  y = y.mean(axis=1) # Convert to mono
128
 
 
129
  y = y.astype(np.float32)
130
 
131
- # Normalize audio
132
  max_val = np.max(np.abs(y))
133
  if max_val > 0:
134
  y = y / max_val
135
 
136
- # Remove silence (simple threshold-based)
137
- silence_threshold = 0.01
138
- non_silent_indices = np.where(np.abs(y) > silence_threshold)[0]
139
 
140
- if len(non_silent_indices) == 0:
141
- return "No speech detected. Please speak louder or check your microphone."
 
142
 
143
- # Trim silence from beginning and end
144
- start_idx = non_silent_indices[0]
145
- end_idx = non_silent_indices[-1]
146
- y_trimmed = y[start_idx:end_idx+1]
147
 
148
- # Check if audio is too short
149
- if len(y_trimmed) / sr < 0.5: # Less than 0.5 seconds
150
- return "Audio too short. Please speak for at least 1-2 seconds."
 
 
151
 
152
- # Create audio dict for pipeline
153
- inputs = {"array": y_trimmed, "sampling_rate": sr}
154
-
155
- # Enhanced transcription with chunked processing
156
- result = self.pipe(
157
- inputs,
158
- batch_size=4, # Optimal batch size for chunked processing
159
- generate_kwargs={"task": "transcribe"},
160
- return_timestamps=return_timestamps
161
- )
162
 
163
  text = result["text"].strip()
 
164
 
165
  if not text:
166
- return "No clear speech detected. Try speaking more clearly or in a quieter environment."
 
 
 
 
 
167
 
168
  return text
169
 
170
  except Exception as e:
171
  error_msg = f"Transcription error: {str(e)}"
172
- print(error_msg)
173
- return f"Sorry, I couldn't process the audio. Please try again or type your message instead."
174
-
175
- # Initialize the enhanced transcriber
176
- transcriber = EnhancedWhisperTranscriber()
 
 
 
 
 
 
177
 
178
  def get_transcription_status(audio):
179
  """Provide status feedback for transcription"""
180
  if audio is None:
181
  return "Ready to record audio"
182
 
183
- sr, y = audio
184
- duration = len(y) / sr if sr > 0 else 0
185
-
186
- if duration < 0.5:
187
- return "Audio too short - please record at least 1 second"
188
- elif duration > 60 and not torch.cuda.is_available():
189
- return "Long audio detected on CPU - this may take a while..."
190
- else:
191
- device = "GPU" if torch.cuda.is_available() else "CPU"
192
- return f"Processing {duration:.1f}s audio on {device}..."
 
 
 
193
 
194
  def transcribe_audio(audio):
195
- """Main transcription function with progress feedback"""
196
  if audio is None:
197
  return "Please record audio first"
198
 
199
- # Show device info for debugging
200
- device_type = "GPU" if torch.cuda.is_available() else "CPU"
201
- print(f"Transcribing on {device_type} using {transcriber.model_name}")
202
-
203
- sr, y = audio
204
-
205
- # For CPU users, we might want to show a warning for long audio
206
- audio_duration = len(y) / sr if sr > 0 else 0
207
- if not torch.cuda.is_available() and audio_duration > 30: # Longer than 30 seconds on CPU
208
- print("Warning: Long audio on CPU - transcription may take a while...")
209
 
210
- # Use the enhanced transcriber
211
- result = transcriber.transcribe_numpy(sr, y)
212
-
213
- # Log transcription result for debugging
214
- print(f"Transcription result: {result[:100]}...")
215
-
216
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  # Function to clean AI response by removing unwanted formatting
219
  def clean_response(response):
 
31
 
32
  import torch
33
  import torchaudio
 
34
 
35
  # Set API Key
36
  groq.api_key = os.getenv("GROQ_API_KEY")
 
86
  Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
87
  """
88
 
89
+ # Simplified and Robust Whisper Transcriber
90
class SimpleWhisperTranscriber:
    """Wrap a Whisper speech-recognition pipeline behind a single method.

    The constructor loads ``model_name`` and, if that fails, retries with
    ``FALLBACK_MODEL``. ``transcribe_numpy`` never raises: every outcome,
    including failures, is returned as a user-facing string.
    """

    # Model tried when the requested one cannot be loaded.
    FALLBACK_MODEL = "openai/whisper-tiny.en"

    # Clips shorter than this many seconds are rejected without transcription.
    MIN_DURATION_S = 0.3

    # Transcripts that are treated as noise rather than real speech. Compared
    # after lower-casing and stripping edge punctuation, so "" covers
    # transcripts that consist of punctuation only.
    FALSE_POSITIVES = frozenset(
        {"", "you", "thank you", "thanks for watching", "hello", "hi"}
    )

    # Characters removed from both ends of a transcript before the
    # false-positive comparison.
    _EDGE_CHARS = " \t\n.,!?"

    def __init__(self, model_name="openai/whisper-base.en"):
        """Load the speech-recognition pipeline.

        Args:
            model_name: Model identifier passed to ``transformers.pipeline``.

        Raises:
            Exception: whatever ``pipeline`` raises if the fallback model
                cannot be loaded either, or ImportError if ``transformers``
                is not installed.
        """
        # Local import: the module-level `from transformers import pipeline, ...`
        # line is not present in this revision of app.py.
        # NOTE(review): confirm no other import in app.py provides `pipeline`.
        from transformers import pipeline

        self.device = 0 if torch.cuda.is_available() else "cpu"
        self.model_name = model_name

        print(f"Initializing Whisper model: {model_name} on {self.device}")

        try:
            # Minimal parameters only: task, model and device.
            self.pipe = pipeline(
                task="automatic-speech-recognition",
                model=model_name,
                device=self.device,
            )
            print("✅ Whisper model loaded successfully")
        except Exception as e:
            print(f"❌ Error loading Whisper model: {e}")
            # Keep model_name truthful about what is actually loaded.
            self.model_name = self.FALLBACK_MODEL
            self.pipe = pipeline(
                task="automatic-speech-recognition",
                model=self.FALLBACK_MODEL,
                device=self.device,
            )

    def transcribe_numpy(self, sr, y):
        """Transcribe an in-memory audio clip.

        Args:
            sr: Sample rate of ``y`` in Hz.
            y: numpy array of samples; arrays with more than one dimension
                are averaged over axis 1 to produce mono audio.

        Returns:
            The stripped transcript, or a human-readable message when the
            input is invalid, too short, contains no (meaningful) speech,
            or processing fails.
        """
        try:
            # Reject unusable input up front instead of surfacing a raw
            # "division by zero" / empty-array numpy error to the user.
            if sr is None or sr <= 0 or y is None or np.size(y) == 0:
                return "Invalid audio data. Please try recording again."

            print(f"Audio shape: {y.shape}, Sample rate: {sr}")

            if y.ndim > 1:
                y = y.mean(axis=1)  # Convert to mono

            # Convert before taking abs(): abs() on a signed integer array
            # can overflow at the most negative value.
            y = y.astype(np.float32)

            # Peak-normalise; an all-zero clip is left untouched.
            max_val = np.max(np.abs(y))
            if max_val > 0:
                y = y / max_val

            print(f"After preprocessing - shape: {y.shape}, max: {np.max(y)}, min: {np.min(y)}")

            audio_duration = len(y) / sr
            print(f"Audio duration: {audio_duration:.2f} seconds")

            if audio_duration < self.MIN_DURATION_S:
                return "Audio too short. Please speak for at least 1 second."

            audio_input = {
                "array": y,
                "sampling_rate": sr,
            }

            print("Starting transcription...")
            result = self.pipe(audio_input)
            print("Transcription completed")

            text = result["text"].strip()
            print(f"Raw transcription: '{text}'")

            if not text:
                return "No speech detected. Please try speaking more clearly."

            # Strip edge punctuation so that e.g. "Thank you." is caught as
            # well as "thank you"; an exact match on the raw text misses it.
            normalized = text.lower().strip(self._EDGE_CHARS)
            if normalized in self.FALSE_POSITIVES:
                return "No meaningful speech detected. Please try again with clearer audio."

            return text

        except Exception as e:
            # Best-effort boundary: report the failure as text, never raise.
            error_msg = f"Transcription error: {e}"
            print(f"❌ {error_msg}")
            return f"Audio processing failed: {e}"
169
+
170
+ # Initialize the transcriber
171
# Build the module-wide transcriber once at import time. Any failure is
# reported on stdout and leaves `transcriber` as None.
try:
    transcriber = SimpleWhisperTranscriber()
    print("✅ Transcriber initialized successfully")
except Exception as init_error:
    print(f"❌ Failed to initialize transcriber: {init_error}")
    transcriber = None
177
 
178
def get_transcription_status(audio):
    """Return a short status line describing a recorded clip.

    Args:
        audio: ``None`` when nothing has been recorded, otherwise a
            ``(sample_rate, samples)`` pair.

    Returns:
        A human-readable status string. Any error raised while inspecting
        ``audio`` is reported in the returned text instead of being raised.
    """
    if audio is None:
        return "Ready to record audio"

    try:
        sample_rate, samples = audio
        # A non-positive sample rate is treated as a zero-length clip.
        seconds = len(samples) / sample_rate if sample_rate > 0 else 0

        if seconds < 0.5:
            return "Audio too short - please record at least 1 second"

        has_gpu = torch.cuda.is_available()
        if seconds > 60 and not has_gpu:
            return "Long audio detected on CPU - this may take a while..."

        device = "GPU" if has_gpu else "CPU"
        return f"Processing {seconds:.1f}s audio on {device}..."
    except Exception as err:
        return f"Error analyzing audio: {str(err)}"
196
 
197
def transcribe_audio(audio):
    """Transcribe a recorded clip using the module-level ``transcriber``.

    Args:
        audio: ``None`` when nothing has been recorded, otherwise a
            ``(sample_rate, samples)`` pair.

    Returns:
        The string produced by ``transcriber.transcribe_numpy``, or a
        user-facing message when there is no recording, no transcriber,
        the data is invalid, or an unexpected error occurs.
    """
    if audio is None:
        return "Please record audio first"

    if transcriber is None:
        return "Transcription service not available. Please type your message."

    try:
        rate, samples = audio

        # Basic validation before handing the clip to the transcriber.
        if rate is None or samples is None or not len(samples):
            return "Invalid audio data. Please try recording again."

        print("=== Starting Transcription ===")
        print(f"Sample rate: {rate}, Audio length: {len(samples)}")

        outcome = transcriber.transcribe_numpy(rate, samples)

        print("=== Transcription Result ===")
        print(f"Result: '{outcome}'")

        return outcome

    except Exception as err:
        # Log the detail on stdout; the user gets a generic message.
        print(f"❌ Unexpected error: {str(err)}")
        return "Failed to process audio. Please try typing your message instead."
226
 
227
  # Function to clean AI response by removing unwanted formatting
228
  def clean_response(response):