eduard76 committed on
Commit
faef6ba
·
verified ·
1 Parent(s): 7728870

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -0
app.py CHANGED
@@ -210,19 +210,36 @@ class ProfessionalVoiceAgent:
210
  logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
211
 
212
  # Convert to float32 if needed
 
213
  if audio_data.dtype == np.int16:
 
214
  audio_data = audio_data.astype(np.float32) / 32768.0
215
  elif audio_data.dtype == np.int32:
 
216
  audio_data = audio_data.astype(np.float32) / 2147483648.0
 
 
 
 
217
 
218
  # Handle stereo to mono conversion
219
  if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
220
  audio_data = np.mean(audio_data, axis=1)
 
 
 
 
221
 
222
  # Resample to 16kHz if needed (Whisper requirement)
223
  if sample_rate != 16000:
224
  import librosa
225
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
 
 
 
 
 
 
226
 
227
  # Trim silence and limit audio length for speed (max 30 seconds)
228
  max_samples = 16000 * 30 # 30 seconds at 16kHz
@@ -238,6 +255,8 @@ class ProfessionalVoiceAgent:
238
  return_tensors="pt"
239
  ).input_features.to(self.device)
240
 
 
 
241
  # Generate token ids - optimized for speed
242
  with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
243
  with torch.no_grad():
@@ -246,6 +265,8 @@ class ProfessionalVoiceAgent:
246
  language="en",
247
  task="transcribe"
248
  )
 
 
249
  predicted_ids = self.whisper_model.generate(
250
  input_features,
251
  forced_decoder_ids=forced_decoder_ids,
@@ -254,6 +275,8 @@ class ProfessionalVoiceAgent:
254
  do_sample=False # Deterministic
255
  )
256
 
 
 
257
  # Decode token ids to text
258
  transcription = self.whisper_processor.batch_decode(
259
  predicted_ids,
 
210
  logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
211
 
212
  # Convert to float32 if needed
213
+ logger.info(f"Audio dtype before conversion: {audio_data.dtype}")
214
  if audio_data.dtype == np.int16:
215
+ logger.info("Converting from int16 to float32")
216
  audio_data = audio_data.astype(np.float32) / 32768.0
217
  elif audio_data.dtype == np.int32:
218
+ logger.info("Converting from int32 to float32")
219
  audio_data = audio_data.astype(np.float32) / 2147483648.0
220
+ elif audio_data.dtype == np.float64:
221
+ logger.info("Converting from float64 to float32")
222
+ audio_data = audio_data.astype(np.float32)
223
+ logger.info(f"Audio dtype after conversion: {audio_data.dtype}")
224
 
225
  # Handle stereo to mono conversion
226
  if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
227
  audio_data = np.mean(audio_data, axis=1)
228
+ logger.info(f"Converted stereo to mono, new shape: {audio_data.shape}")
229
+
230
+ # Check audio statistics before resampling
231
+ logger.info(f"Audio stats - min: {audio_data.min():.4f}, max: {audio_data.max():.4f}, mean: {audio_data.mean():.4f}")
232
 
233
  # Resample to 16kHz if needed (Whisper requirement)
234
  if sample_rate != 16000:
235
  import librosa
236
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
237
+ logger.info(f"Resampled to 16kHz, new length: {len(audio_data)} samples ({len(audio_data)/16000:.2f}s)")
238
+
239
+ # Check if audio is too quiet or silent
240
+ audio_abs_mean = np.abs(audio_data).mean()
241
+ if audio_abs_mean < 0.001:
242
+ logger.warning(f"Audio might be too quiet! Abs mean: {audio_abs_mean}")
243
 
244
  # Trim silence and limit audio length for speed (max 30 seconds)
245
  max_samples = 16000 * 30 # 30 seconds at 16kHz
 
255
  return_tensors="pt"
256
  ).input_features.to(self.device)
257
 
258
+ logger.info(f"Whisper input_features shape: {input_features.shape}, device: {input_features.device}")
259
+
260
  # Generate token ids - optimized for speed
261
  with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
262
  with torch.no_grad():
 
265
  language="en",
266
  task="transcribe"
267
  )
268
+ logger.info(f"Forced decoder IDs: {forced_decoder_ids}")
269
+
270
  predicted_ids = self.whisper_model.generate(
271
  input_features,
272
  forced_decoder_ids=forced_decoder_ids,
 
275
  do_sample=False # Deterministic
276
  )
277
 
278
+ logger.info(f"Predicted token IDs shape: {predicted_ids.shape}, first 10 IDs: {predicted_ids[0][:10].tolist()}")
279
+
280
  # Decode token ids to text
281
  transcription = self.whisper_processor.batch_decode(
282
  predicted_ids,