Update app.py
app.py CHANGED
@@ -180,6 +180,7 @@ class ProfessionalVoiceAgent:
     def transcribe_audio(self, audio) -> str:
         """Convert speech to text using Whisper"""
         if audio is None:
+            logger.warning("No audio input received")
             return ""
 
         try:
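Every line this commit adds routes through a module-level `logger`. The setup itself sits outside the diff, so the following is only a minimal sketch of a configuration consistent with that usage:

import logging

# One-time setup near the top of app.py (assumed; not shown in this diff).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

With the level at INFO, the logger.debug() calls added below stay silent unless the level is lowered to logging.DEBUG.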
@@ -187,19 +188,27 @@ class ProfessionalVoiceAgent:
             if isinstance(audio, dict):
                 sample_rate = audio.get("sample_rate", 16000)
                 audio_data = audio.get("array", audio.get("data", None))
+                logger.info(f"Audio format: dict, sample_rate={sample_rate}, data shape={audio_data.shape if audio_data is not None else 'None'}")
                 if audio_data is None:
                     logger.error("Audio dict missing 'array' or 'data' key")
                     return "Could not process audio format."
             elif isinstance(audio, tuple):
                 sample_rate, audio_data = audio
+                logger.info(f"Audio format: tuple, sample_rate={sample_rate}, data shape={audio_data.shape}")
             else:
                 audio_data = audio
                 sample_rate = 16000
+                logger.info(f"Audio format: raw array, shape={audio_data.shape}")
 
             # Ensure we have audio data
             if audio_data is None or len(audio_data) == 0:
+                logger.warning("Empty audio data")
                 return "No audio data received."
 
+            # Log audio stats
+            duration_seconds = len(audio_data) / sample_rate
+            logger.info(f"Audio duration: {duration_seconds:.2f}s, sample_rate: {sample_rate}Hz")
+
             # Convert to float32 if needed
             if audio_data.dtype == np.int16:
                 audio_data = audio_data.astype(np.float32) / 32768.0
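This branch normalizes three possible input shapes (dict, tuple, raw array) before the int16-to-float32 conversion. As a standalone sketch of the same normalization, assuming Gradio-style input where a microphone component yields a (sample_rate, np.ndarray) tuple of int16 samples (function name hypothetical):

import numpy as np

def normalize_audio(audio, default_sr=16000):
    """Coerce dict/tuple/raw-array audio into (sample_rate, float32 in [-1, 1])."""
    if isinstance(audio, dict):
        sr = audio.get("sample_rate", default_sr)
        data = audio.get("array", audio.get("data"))
    elif isinstance(audio, tuple):
        sr, data = audio
    else:
        sr, data = default_sr, audio
    if data is None or len(data) == 0:
        raise ValueError("empty audio input")
    data = np.asarray(data)
    if data.dtype == np.int16:
        # int16 PCM spans [-32768, 32767]; dividing maps it into [-1, 1]
        data = data.astype(np.float32) / 32768.0
    if data.ndim > 1:
        data = data.mean(axis=1)  # downmix multi-channel to mono
    return sr, data.astype(np.float32)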
@@ -232,8 +241,14 @@ class ProfessionalVoiceAgent:
             # Generate token ids - optimized for speed
             with torch.cuda.amp.autocast(enabled=self.device.type == "cuda"):
                 with torch.no_grad():
+                    # Force English language to avoid language detection overhead
+                    forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
+                        language="en",
+                        task="transcribe"
+                    )
                     predicted_ids = self.whisper_model.generate(
                         input_features,
+                        forced_decoder_ids=forced_decoder_ids,
                         max_new_tokens=64,  # Reduced for faster processing
                         num_beams=1,  # Greedy decoding for speed
                         do_sample=False  # Deterministic
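The new forced_decoder_ids block pins Whisper's <|en|> and <|transcribe|> prompt tokens so generate() skips per-call language detection. A minimal end-to-end sketch of the same call pattern with the Hugging Face transformers Whisper classes (the checkpoint name is illustrative; the Space's actual model may differ):

import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

# Fix language and task up front instead of letting Whisper detect them.
forced = processor.get_decoder_prompt_ids(language="en", task="transcribe")
with torch.no_grad():
    ids = model.generate(
        inputs.input_features,
        forced_decoder_ids=forced,
        max_new_tokens=64,
        num_beams=1,
        do_sample=False,
    )
print(processor.batch_decode(ids, skip_special_tokens=True)[0])

Recent transformers releases also accept language="en" and task="transcribe" directly as generate() keyword arguments, which supersedes forced_decoder_ids.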
@@ -272,8 +287,12 @@
             for user_msg, bot_msg in conversation_history[-3:]:  # Last 3 exchanges
                 context += f"User: {user_msg}\nAssistant: {bot_msg}\n"
             context += f"User: {text}\nAssistant:"
+            logger.info(f"Input text: '{text}' | History entries: {len(conversation_history)}")
         else:
             context = f"User: {text}\nAssistant:"
+            logger.info(f"Input text: '{text}' | No history")
+
+        logger.debug(f"Full context sent to model:\n{context}")
 
         if self.chat_tokenizer and hasattr(self.chat_model, 'generate'):
             # Tokenize input
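The prompt the new logging reports is plain "User:/Assistant:" turns with a three-exchange window. Sketched in isolation (helper name hypothetical):

def build_context(text, history, max_turns=3):
    """Render the last few (user, bot) exchanges plus the new user message."""
    context = "".join(f"User: {u}\nAssistant: {b}\n" for u, b in history[-max_turns:])
    return context + f"User: {text}\nAssistant:"

# e.g. build_context("What time is it?", [("Hi", "Hello! How can I help?")])

Keeping only the last three exchanges bounds the prompt length so the chat model's context window is not exhausted as the conversation grows.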
@@ -299,10 +318,12 @@
             )
 
             # Decode response
-
+            full_response = self.chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            logger.debug(f"Raw model output: '{full_response}'")
 
             # Clean response
-            response =
+            response = full_response.replace(context, "").strip()
+            logger.info(f"Generated response: '{response}'")
 
         else:
             # Use pipeline
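This hunk also fixes previously incomplete code: the old side had a bare "response =". The new cleanup strips the echoed prompt with full_response.replace(context, ""), which works but silently fails if the tokenizer re-spaces the text on round-trip. The usual alternative is to decode only the generated ids by slicing off the prompt length; a sketch, assuming inputs holds the tokenized context passed to generate() and outputs is its return value:

prompt_len = inputs["input_ids"].shape[-1]
new_tokens = outputs[0][prompt_len:]  # drop the echoed prompt ids
response = self.chat_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

This applies to decoder-only chat models, where generate() returns the prompt ids followed by the newly generated ones.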
@@ -334,6 +355,8 @@
             return None
 
         try:
+            logger.info(f"Synthesizing speech for text: '{text}'")
+
             # Truncate if too long and warn
             max_chars = 600
             if len(text) > max_chars:
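The last hunk shows only the start of the truncation guard; the diff cuts off after the length check. A plausible completion that clips at the last sentence boundary under the 600-character cap (helper name and exact behavior are assumptions, not the Space's code):

def truncate_for_tts(text, max_chars=600):
    """Clip overlong text at the last sentence end (or word break) before max_chars."""
    if len(text) <= max_chars:
        return text
    clipped = text[:max_chars]
    cut = max(clipped.rfind(". "), clipped.rfind("! "), clipped.rfind("? "))
    return clipped[:cut + 1] if cut > 0 else clipped.rsplit(" ", 1)[0]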