eduard76 committed on
Commit
7728870
·
verified ·
1 Parent(s): 4e1229a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -2
app.py CHANGED
@@ -180,6 +180,7 @@ class ProfessionalVoiceAgent:
180
  def transcribe_audio(self, audio) -> str:
181
  """Convert speech to text using Whisper"""
182
  if audio is None:
 
183
  return ""
184
 
185
  try:
@@ -187,19 +188,27 @@ class ProfessionalVoiceAgent:
187
  if isinstance(audio, dict):
188
  sample_rate = audio.get("sample_rate", 16000)
189
  audio_data = audio.get("array", audio.get("data", None))
 
190
  if audio_data is None:
191
  logger.error("Audio dict missing 'array' or 'data' key")
192
  return "Could not process audio format."
193
  elif isinstance(audio, tuple):
194
  sample_rate, audio_data = audio
 
195
  else:
196
  audio_data = audio
197
  sample_rate = 16000
 
198
 
199
  # Ensure we have audio data
200
  if audio_data is None or len(audio_data) == 0:
 
201
  return "No audio data received."
202
 
 
 
 
 
203
  # Convert to float32 if needed
204
  if audio_data.dtype == np.int16:
205
  audio_data = audio_data.astype(np.float32) / 32768.0
@@ -232,8 +241,14 @@ class ProfessionalVoiceAgent:
232
  # Generate token ids - optimized for speed
233
  with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
234
  with torch.no_grad():
 
 
 
 
 
235
  predicted_ids = self.whisper_model.generate(
236
  input_features,
 
237
  max_new_tokens=64, # Reduced for faster processing
238
  num_beams=1, # Greedy decoding for speed
239
  do_sample=False # Deterministic
@@ -272,8 +287,12 @@ class ProfessionalVoiceAgent:
272
  for user_msg, bot_msg in conversation_history[-3:]: # Last 3 exchanges
273
  context += f"User: {user_msg}\nAssistant: {bot_msg}\n"
274
  context += f"User: {text}\nAssistant:"
 
275
  else:
276
  context = f"User: {text}\nAssistant:"
 
 
 
277
 
278
  if self.chat_tokenizer and hasattr(self.chat_model, 'generate'):
279
  # Tokenize input
@@ -299,10 +318,12 @@ class ProfessionalVoiceAgent:
299
  )
300
 
301
  # Decode response
302
- response = self.chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
303
 
304
  # Clean response
305
- response = response.replace(context, "").strip()
 
306
 
307
  else:
308
  # Use pipeline
@@ -334,6 +355,8 @@ class ProfessionalVoiceAgent:
334
  return None
335
 
336
  try:
 
 
337
  # Truncate if too long and warn
338
  max_chars = 600
339
  if len(text) > max_chars:
 
180
  def transcribe_audio(self, audio) -> str:
181
  """Convert speech to text using Whisper"""
182
  if audio is None:
183
+ logger.warning("No audio input received")
184
  return ""
185
 
186
  try:
 
188
  if isinstance(audio, dict):
189
  sample_rate = audio.get("sample_rate", 16000)
190
  audio_data = audio.get("array", audio.get("data", None))
191
+ logger.info(f"Audio format: dict, sample_rate={sample_rate}, data shape={audio_data.shape if audio_data is not None else 'None'}")
192
  if audio_data is None:
193
  logger.error("Audio dict missing 'array' or 'data' key")
194
  return "Could not process audio format."
195
  elif isinstance(audio, tuple):
196
  sample_rate, audio_data = audio
197
+ logger.info(f"Audio format: tuple, sample_rate={sample_rate}, data shape={audio_data.shape}")
198
  else:
199
  audio_data = audio
200
  sample_rate = 16000
201
+ logger.info(f"Audio format: raw array, shape={audio_data.shape}")
202
 
203
  # Ensure we have audio data
204
  if audio_data is None or len(audio_data) == 0:
205
+ logger.warning("Empty audio data")
206
  return "No audio data received."
207
 
208
+ # Log audio stats
209
+ duration_seconds = len(audio_data) / sample_rate
210
+ logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
211
+
212
  # Convert to float32 if needed
213
  if audio_data.dtype == np.int16:
214
  audio_data = audio_data.astype(np.float32) / 32768.0
 
241
  # Generate token ids - optimized for speed
242
  with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
243
  with torch.no_grad():
244
+ # Force English language to avoid language detection overhead
245
+ forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
246
+ language="en",
247
+ task="transcribe"
248
+ )
249
  predicted_ids = self.whisper_model.generate(
250
  input_features,
251
+ forced_decoder_ids=forced_decoder_ids,
252
  max_new_tokens=64, # Reduced for faster processing
253
  num_beams=1, # Greedy decoding for speed
254
  do_sample=False # Deterministic
 
287
  for user_msg, bot_msg in conversation_history[-3:]: # Last 3 exchanges
288
  context += f"User: {user_msg}\nAssistant: {bot_msg}\n"
289
  context += f"User: {text}\nAssistant:"
290
+ logger.info(f"Input text: '{text}' | History entries: {len(conversation_history)}")
291
  else:
292
  context = f"User: {text}\nAssistant:"
293
+ logger.info(f"Input text: '{text}' | No history")
294
+
295
+ logger.debug(f"Full context sent to model:\n{context}")
296
 
297
  if self.chat_tokenizer and hasattr(self.chat_model, 'generate'):
298
  # Tokenize input
 
318
  )
319
 
320
  # Decode response
321
+ full_response = self.chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
322
+ logger.debug(f"Raw model output: '{full_response}'")
323
 
324
  # Clean response
325
+ response = full_response.replace(context, "").strip()
326
+ logger.info(f"Generated response: '{response}'")
327
 
328
  else:
329
  # Use pipeline
 
355
  return None
356
 
357
  try:
358
+ logger.info(f"Synthesizing speech for text: '{text}'")
359
+
360
  # Truncate if too long and warn
361
  max_chars = 600
362
  if len(text) > max_chars: