Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro commited on Nov 6, 2025

Commit

1067054

verified ·

1 Parent(s): 27cf157

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +193 -250

services/streaming_voice_service.py CHANGED Viewed

@@ -990,77 +990,106 @@ from core.silero_vad import SileroVAD
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
-        """Khởi tạo VOSK ASR streaming"""
         self.model = None
         self.recognizer = None
         self.sample_rate = 16000
         self.is_streaming = False
-        self.partial_results = []
         # Tự động tải model nếu không có đường dẫn
         if model_path is None:
             model_path = self._download_vosk_model()
-        if os.path.exists(model_path):
-            print("🔄 Đang tải VOSK model...")
-            self.model = Model(model_path)
-            self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
-            print("✅ Đã tải VOSK model thành công")
         else:
             print(f"❌ Không tìm thấy VOSK model tại: {model_path}")
     def _download_vosk_model(self):
         """Tải VOSK model tiếng Việt tự động"""
-        import urllib.request
-        import zipfile
-        model_url = "https://alphacephei.com/vosk/models/vosk-model-small-vn-0.4.zip"
-        model_dir = "models/vosk-model-small-vn-0.4"
-        zip_path = "models/vosk-model-small-vn-0.4.zip"
-        # Tạo thư mục nếu chưa có
-        os.makedirs("models", exist_ok=True)
-        if not os.path.exists(model_dir):
-            print("📥 Đang tải VOSK Vietnamese model...")
-            try:
                 urllib.request.urlretrieve(model_url, zip_path)
                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                     zip_ref.extractall("models/")
-                os.rename("models/vosk-model-small-vn-0.4", model_dir)
-                os.remove(zip_path)
                 print("✅ Đã tải VOSK model thành công")
-            except Exception as e:
-                print(f"❌ Lỗi tải VOSK model: {e}")
-                return None
-        return model_dir
     def start_stream(self):
         """Bắt đầu stream mới"""
         if self.model is None:
             return False
-        self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
-        self.recognizer.SetWords(True)
-        self.is_streaming = True
-        self.partial_results = []
-        return True
     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
-        """Xử lý audio chunk và trả về kết quả"""
         if self.recognizer is None or not self.is_streaming:
             return {"text": "", "partial": "", "is_final": False}
         try:
-            # Chuẩn hóa audio
             if sample_rate and sample_rate != self.sample_rate:
                 audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
             if audio_chunk.dtype != np.int16:
-                audio_chunk = (audio_chunk * 32767).astype(np.int16)
             # Chuyển đổi sang bytes
             audio_bytes = audio_chunk.tobytes()
@@ -1068,41 +1097,53 @@ class VoskStreamingASR:
             # Xử lý với VOSK
             if self.recognizer.AcceptWaveform(audio_bytes):
                 # Kết quả cuối cùng
-                result = json.loads(self.recognizer.Result())
                 text = result.get('text', '').strip()
                 if text:
                     return {"text": text, "partial": "", "is_final": True}
             else:
                 # Kết quả tạm thời
-                partial_result = json.loads(self.recognizer.PartialResult())
                 partial_text = partial_result.get('partial', '').strip()
-                return {"text": "", "partial": partial_text, "is_final": False}
         except Exception as e:
             print(f"❌ Lỗi VOSK processing: {e}")
         return {"text": "", "partial": "", "is_final": False}
     def stop_stream(self) -> str:
         """Kết thúc stream và lấy kết quả cuối"""
         if self.recognizer:
-            result = json.loads(self.recognizer.FinalResult())
-            text = result.get('text', '').strip()
-            self.is_streaming = False
-            return text
         return ""
     def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
-        """Resample audio"""
         if orig_sr == target_sr:
             return audio
         try:
             from scipy import signal
-            duration = len(audio) / orig_sr
-            new_length = int(duration * target_sr)
-            resampled_audio = signal.resample(audio, new_length)
             return resampled_audio.astype(np.int16)
-        except Exception:
             return audio
 class StreamingVoiceService:
@@ -1111,7 +1152,8 @@ class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
-        # Khởi tạo VOSK ASR thay vì Whisper
         self.vosk_asr = VoskStreamingASR()
         # Khởi tạo VAD
@@ -1119,7 +1161,6 @@ class StreamingVoiceService:
         self.is_listening = False
         self.speech_callback = None
         self.is_processing = False
-        self.processing_lock = threading.Lock()
         # Conversation context
         self.conversation_history = []
@@ -1131,37 +1172,50 @@ class StreamingVoiceService:
         self.processing_threads = []
         self.max_workers = 2
-        # VOSK streaming state
         self.vosk_stream_active = False
         self.last_voice_time = 0
-        self.silence_timeout = 2.0  # 2 giây im lặng thì kết thúc câu
         # Latency tracking
         self.latency_metrics = {
-            'asr': [],
-            'rag': [],
-            'llm': [],
-            'tts': [],
-            'total': [],
-            'vad_detection': [],
-            'queue_waiting': [],
-            'vosk_processing': []
         }
         self.current_callback = None
     def start_listening(self, speech_callback: Callable) -> bool:
-        """Bắt đầu lắng nghe với VOSK streaming"""
         if self.is_listening:
             return False
         self.current_callback = speech_callback
         # Khởi động VOSK stream
         if not self.vosk_asr.start_stream():
-            print("❌ Không thể khởi động VOSK ASR")
             return False
         success = self.vad_processor.start_stream(self._on_speech_detected)
         if success:
@@ -1169,6 +1223,7 @@ class StreamingVoiceService:
             self.is_processing = False
             self.vosk_stream_active = True
             self.last_voice_time = time.time()
             # Khởi động worker threads
             if not self.processing_threads:
@@ -1185,74 +1240,76 @@ class StreamingVoiceService:
             threading.Thread(target=self._vosk_streaming_monitor, daemon=True).start()
             print("🎙️ Đã bắt đầu lắng nghe với VOSK ASR streaming")
             return True
         return False
-    def stop_listening(self):
-        """Dừng lắng nghe"""
-        self.vad_processor.stop_stream()
-        self.is_listening = False
-        self.is_processing = False
-        self.vosk_stream_active = False
-        self.current_callback = None
-        # Dừng VOSK stream
-        final_text = self.vosk_asr.stop_stream()
-        if final_text and len(final_text) > 1:
-            self._process_final_transcription(final_text)
-        # Clear queue
-        while not self.response_queue.empty():
-            try:
-                self.response_queue.get_nowait()
-                self.response_queue.task_done()
-            except queue.Empty:
-                break
-        print("🛑 Đã dừng lắng nghe")
     def _vosk_streaming_monitor(self):
         """Theo dõi VOSK streaming và xử lý kết quả real-time"""
         while self.is_listening and self.vosk_stream_active:
             try:
-                # Kiểm tra timeout im lặng
                 current_time = time.time()
                 silence_duration = current_time - self.last_voice_time
                 if silence_duration > self.silence_timeout and self.partial_transcription:
-                    # Im lặng quá lâu, xử lý transcription hiện tại
-                    print(f"⏰ Silence timeout, xử lý: {self.partial_transcription}")
-                    self._process_final_transcription(self.partial_transcription)
                     self.partial_transcription = ""
-                    self.vosk_asr.start_stream()  # Bắt đầu stream mới
-                time.sleep(0.1)  # Giảm CPU usage
             except Exception as e:
                 print(f"❌ Lỗi VOSK monitor: {e}")
                 break
     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
-        """Callback khi VAD phát hiện speech - sử dụng VOSK real-time"""
-        vad_start = time.time()
-        if not self.vosk_stream_active:
             return
         # Cập nhật thời gian có giọng nói
         self.last_voice_time = time.time()
-        # Xử lý với VOSK
-        vosk_start = time.time()
-        result = self.vosk_asr.process_audio_chunk(speech_audio, sample_rate)
-        vosk_latency = time.time() - vosk_start
-        self._add_latency_sample('vosk_processing', vosk_latency)
         # Xử lý kết quả partial
         if result['partial'] and len(result['partial']) > 1:
             self.partial_transcription = result['partial']
-            print(f"🎯 VOSK Partial: {result['partial']}")
             # Gửi partial result real-time
             if self.current_callback:
@@ -1265,184 +1322,61 @@ class StreamingVoiceService:
         # Xử lý kết quả final
         if result['is_final'] and result['text'] and len(result['text']) > 1:
-            print(f"✅ VOSK Final: {result['text']}")
             self._process_final_transcription(result['text'])
             self.partial_transcription = ""
             self.vosk_asr.start_stream()  # Bắt đầu stream mới
-        vad_latency = time.time() - vad_start
-        self._add_latency_sample('vad_detection', vad_latency)
     def _process_final_transcription(self, transcription: str):
         """Xử lý transcription cuối cùng"""
         if not transcription or len(transcription.strip()) < 2:
             return
-        print(f"📝 Final Transcription: {transcription}")
         self.current_transcription = transcription
         # Đưa vào queue để xử lý
         try:
             self.response_queue.put(transcription, timeout=0.5)
         except queue.Full:
             print("⚠️ Queue đầy, bỏ qua transcription")
-    def _process_response_worker(self):
-        """Worker thread xử lý transcriptions"""
-        while self.is_listening:
-            try:
-                queue_start = time.time()
-                transcription = self.response_queue.get(timeout=1.0)
-                queue_latency = time.time() - queue_start
-                self._add_latency_sample('queue_waiting', queue_latency)
-                # Xử lý AI response
-                self._process_ai_response(transcription)
-                self.response_queue.task_done()
-            except queue.Empty:
-                continue
-            except Exception as e:
-                print(f"❌ Lỗi trong worker thread: {e}")
-                continue
-    def _process_ai_response(self, transcription: str):
-        """Xử lý phản hồi AI cho transcription"""
-        total_start_time = time.time()
-        try:
-            # 1. AI Response Generation
-            rag_start = time.time()
-            response = self._generate_ai_response_optimized(transcription)
-            rag_latency = time.time() - rag_start
-            # 2. TTS Conversion
-            tts_start = time.time()
-            tts_audio_path = self._text_to_speech_optimized(response)
-            tts_latency = time.time() - tts_start
-            total_latency = time.time() - total_start_time
-            # Log latency metrics
-            self._log_latency_metrics({
-                'rag': rag_latency,
-                'tts': tts_latency,
-                'total': total_latency
-            })
-            # Gọi callback
-            if self.current_callback:
-                result = {
-                    'transcription': transcription,
-                    'response': response,
-                    'tts_audio': tts_audio_path,
-                    'status': 'completed',
-                    'latency': total_latency
-                }
-                self.current_callback(result)
-        except Exception as e:
-            print(f"❌ Lỗi xử lý AI response: {e}")
-            traceback.print_exc()
-            if self.current_callback:
-                self.current_callback({
-                    'transcription': transcription,
-                    'response': f"Xin lỗi, có lỗi xảy ra: {str(e)}",
-                    'tts_audio': None,
-                    'status': 'error'
-                })
-    def _generate_ai_response_optimized(self, user_input: str) -> str:
-        """Sinh phản hồi AI với tối ưu hiệu suất"""
-        llm_start = time.time()
-        try:
-            # Thêm vào lịch sử
-            self.conversation_history.append({"role": "user", "content": user_input})
-            # Tìm kiếm RAG
-            rag_start = time.time()
-            rag_results = self.rag_system.semantic_search(user_input, top_k=2) if self.rag_system else []
-            rag_latency = time.time() - rag_start
-            context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
-            system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
-Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
-Thông tin tham khảo:
-{context_text}
-"""
-            messages = [{"role": "system", "content": system_prompt}]
-            messages.extend(self.conversation_history[-6:])
-            llm_inference_start = time.time()
-            completion = self.client.chat.completions.create(
-                model=settings.MULTILINGUAL_LLM_MODEL,
-                messages=messages,
-                max_tokens=300,
-                temperature=0.7
-            )
-            ttft = time.time() - llm_inference_start
-            response = completion.choices[0].message.content
-            self.conversation_history.append({"role": "assistant", "content": response})
-            total_llm_latency = time.time() - llm_start
-            # Giới hạn lịch sử
-            if len(self.conversation_history) > 12:
-                self.conversation_history = self.conversation_history[-12:]
-            self._add_latency_sample('llm', total_llm_latency)
-            self._add_latency_sample('rag', rag_latency)
-            print(f"✅ RAG Latency: {rag_latency:.2f}s")
-            print(f"✅ LLM TTFT: {ttft:.2f}s")
-            print(f"✅ Total LLM Latency: {total_llm_latency:.2f}s")
-            return response
-        except Exception as e:
-            print(f"❌ Lỗi tạo AI response: {e}")
-            return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
-    def _text_to_speech_optimized(self, text: str) -> Optional[str]:
-        """Chuyển văn bản thành giọng nói với tối ưu"""
-        tts_start = time.time()
-        try:
-            if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
-                return None
-            tts_bytes = self.tts_service.text_to_speech(text, 'vi')
-            tts_latency = time.time() - tts_start
-            if tts_bytes:
-                audio_path = self.tts_service.save_audio_to_file(tts_bytes)
-                self._add_latency_sample('tts', tts_latency)
-                return audio_path
-        except Exception as e:
-            print(f"❌ Lỗi TTS: {e}")
-        return None
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming manual mode với VOSK"""
         if not audio_data:
             return self._create_error_response("❌ Không có dữ liệu âm thanh")
         try:
             sample_rate, audio_array = audio_data
             # Khởi động VOSK stream tạm thời
-            self.vosk_asr.start_stream()
             # Xử lý audio với VOSK
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
-            if result['is_final'] and result['text']:
                 transcription = result['text']
-                print(f"📝 Manual Transcription: {transcription}")
                 # Tạo phản hồi AI
                 response = self._generate_ai_response_optimized(transcription)
@@ -1454,9 +1388,16 @@ Thông tin tham khảo:
                     'tts_audio': tts_audio_path,
                     'status': 'completed'
                 }
             else:
                 return {
-                    'transcription': result['partial'] or "Đang nghe...",
                     'response': "",
                     'tts_audio': None,
                     'status': 'listening'
@@ -1464,6 +1405,7 @@ Thông tin tham khảo:
         except Exception as e:
             print(f"❌ Lỗi xử lý streaming audio: {e}")
             return self._create_error_response(f"❌ Lỗi: {str(e)}")
     def _create_error_response(self, message: str) -> Dict[str, Any]:
@@ -1524,9 +1466,10 @@ Thông tin tham khảo:
             'queue_size': self.response_queue.qsize(),
             'worker_threads': len(self.processing_threads),
             'vosk_active': self.vosk_stream_active,
             'last_update': time.strftime("%H:%M:%S")
         }
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         self.conversation_history = []

 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
+        """Khởi tạo VOSK ASR streaming với debug"""
         self.model = None
         self.recognizer = None
         self.sample_rate = 16000
         self.is_streaming = False
         # Tự động tải model nếu không có đường dẫn
         if model_path is None:
             model_path = self._download_vosk_model()
+        if model_path and os.path.exists(model_path):
+            print(f"🔄 Đang tải VOSK model từ: {model_path}")
+            try:
+                self.model = Model(model_path)
+                self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
+                self.recognizer.SetWords(True)
+                print("✅ Đã tải VOSK model thành công")
+            except Exception as e:
+                print(f"❌ Lỗi khởi tạo VOSK model: {e}")
         else:
             print(f"❌ Không tìm thấy VOSK model tại: {model_path}")
     def _download_vosk_model(self):
         """Tải VOSK model tiếng Việt tự động"""
+        try:
+            import urllib.request
+            import zipfile
+            model_url = "https://alphacephei.com/vosk/models/vosk-model-small-vn-0.4.zip"
+            model_dir = "models/vosk-model-small-vn-0.4"
+            zip_path = "models/vosk-model-small-vn-0.4.zip"
+            # Tạo thư mục nếu chưa có
+            os.makedirs("models", exist_ok=True)
+            if not os.path.exists(model_dir):
+                print("📥 Đang tải VOSK Vietnamese model...")
                 urllib.request.urlretrieve(model_url, zip_path)
                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                     zip_ref.extractall("models/")
+                # Đảm bảo thư mục tồn tại
+                if os.path.exists("models/vosk-model-small-vn-0.4"):
+                    os.rename("models/vosk-model-small-vn-0.4", model_dir)
+                if os.path.exists(zip_path):
+                    os.remove(zip_path)
                 print("✅ Đã tải VOSK model thành công")
+            return model_dir if os.path.exists(model_dir) else None
+        except Exception as e:
+            print(f"❌ Lỗi tải VOSK model: {e}")
+            return None
     def start_stream(self):
         """Bắt đầu stream mới"""
         if self.model is None:
+            print("❌ VOSK model chưa được khởi tạo")
             return False
+        try:
+            self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
+            self.recognizer.SetWords(True)
+            self.is_streaming = True
+            print("🎤 Đã khởi động VOSK stream")
+            return True
+        except Exception as e:
+            print(f"❌ Lỗi khởi động VOSK stream: {e}")
+            return False
     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
+        """Xử lý audio chunk và trả về kết quả - FIXED VERSION"""
         if self.recognizer is None or not self.is_streaming:
             return {"text": "", "partial": "", "is_final": False}
         try:
+            # DEBUG: Thông tin audio chunk
+            print(f"🔊 Audio chunk: {len(audio_chunk)} samples, dtype: {audio_chunk.dtype}, max: {np.max(audio_chunk):.4f}")
+            # Chuẩn hóa audio - QUAN TRỌNG: VOSK cần audio ở dạng int16
             if sample_rate and sample_rate != self.sample_rate:
                 audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
+            # Đảm bảo là int16 với giá trị phù hợp
             if audio_chunk.dtype != np.int16:
+                if audio_chunk.dtype in [np.float32, np.float64]:
+                    # Audio float cần được scale về [-32768, 32767]
+                    audio_chunk = (audio_chunk * 32767).astype(np.int16)
+                else:
+                    audio_chunk = audio_chunk.astype(np.int16)
+            # Kiểm tra âm lượng
+            audio_rms = np.sqrt(np.mean(audio_chunk.astype(np.float32)**2)) / 32767.0
+            print(f"📊 Audio RMS: {audio_rms:.4f}")
+            if audio_rms < 0.01:  # Âm lượng quá thấp
+                print("⚠️ Âm lượng quá thấp, bỏ qua")
+                return {"text": "", "partial": "", "is_final": False}
             # Chuyển đổi sang bytes
             audio_bytes = audio_chunk.tobytes()
             # Xử lý với VOSK
             if self.recognizer.AcceptWaveform(audio_bytes):
                 # Kết quả cuối cùng
+                result_json = self.recognizer.Result()
+                result = json.loads(result_json)
                 text = result.get('text', '').strip()
+                print(f"✅ VOSK Final Result: '{text}'")
                 if text:
                     return {"text": text, "partial": "", "is_final": True}
             else:
                 # Kết quả tạm thời
+                partial_json = self.recognizer.PartialResult()
+                partial_result = json.loads(partial_json)
                 partial_text = partial_result.get('partial', '').strip()
+                if partial_text:
+                    print(f"🎯 VOSK Partial: '{partial_text}'")
+                    return {"text": "", "partial": partial_text, "is_final": False}
         except Exception as e:
             print(f"❌ Lỗi VOSK processing: {e}")
+            traceback.print_exc()
         return {"text": "", "partial": "", "is_final": False}
     def stop_stream(self) -> str:
         """Kết thúc stream và lấy kết quả cuối"""
         if self.recognizer:
+            try:
+                result_json = self.recognizer.FinalResult()
+                result = json.loads(result_json)
+                text = result.get('text', '').strip()
+                self.is_streaming = False
+                print(f"🛑 VOSK Final: '{text}'")
+                return text
+            except Exception as e:
+                print(f"❌ Lỗi khi dừng VOSK stream: {e}")
         return ""
     def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+        """Resample audio với chất lượng tốt hơn"""
         if orig_sr == target_sr:
             return audio
         try:
             from scipy import signal
+            # Tính số sample mới
+            num_samples = int(len(audio) * target_sr / orig_sr)
+            resampled_audio = signal.resample(audio, num_samples)
             return resampled_audio.astype(np.int16)
+        except Exception as e:
+            print(f"❌ Lỗi resample audio: {e}")
             return audio
 class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
+        # Khởi tạo VOSK ASR - FIXED: Thêm timeout và retry
+        print("🔄 Đang khởi tạo VOSK ASR...")
         self.vosk_asr = VoskStreamingASR()
         # Khởi tạo VAD
         self.is_listening = False
         self.speech_callback = None
         self.is_processing = False
         # Conversation context
         self.conversation_history = []
         self.processing_threads = []
         self.max_workers = 2
+        # Streaming state
         self.vosk_stream_active = False
         self.last_voice_time = 0
+        self.silence_timeout = 3.0  # Tăng timeout lên 3 giây
+        # Audio buffer để cải thiện nhận diện
+        self.audio_buffer = []
+        self.buffer_duration = 1.0  # Buffer 1 giây
+        self.max_buffer_samples = 16000  # 1 giây ở 16kHz
         # Latency tracking
         self.latency_metrics = {
+            'asr': [], 'rag': [], 'llm': [], 'tts': [], 'total': [],
+            'vad_detection': [], 'queue_waiting': [], 'vosk_processing': []
         }
         self.current_callback = None
     def start_listening(self, speech_callback: Callable) -> bool:
+        """Bắt đầu lắng nghe với VOSK streaming - FIXED VERSION"""
         if self.is_listening:
+            print("⚠️ Đã đang lắng nghe")
             return False
         self.current_callback = speech_callback
+        # Kiểm tra VOSK model
+        if self.vosk_asr.model is None:
+            print("❌ VOSK model không khả dụng")
+            if self.current_callback:
+                self.current_callback({
+                    'transcription': "Lỗi: VOSK model không khả dụng",
+                    'response': "Không thể khởi động nhận diện giọng nói",
+                    'tts_audio': None,
+                    'status': 'error'
+                })
+            return False
         # Khởi động VOSK stream
         if not self.vosk_asr.start_stream():
+            print("❌ Không thể khởi động VOSK stream")
             return False
+        # Khởi động VAD
         success = self.vad_processor.start_stream(self._on_speech_detected)
         if success:
             self.is_processing = False
             self.vosk_stream_active = True
             self.last_voice_time = time.time()
+            self.audio_buffer = []
             # Khởi động worker threads
             if not self.processing_threads:
             threading.Thread(target=self._vosk_streaming_monitor, daemon=True).start()
             print("🎙️ Đã bắt đầu lắng nghe với VOSK ASR streaming")
+            # Thông báo trạng thái
+            if self.current_callback:
+                self.current_callback({
+                    'transcription': "Đã bắt đầu lắng nghe... Hãy nói gì đó",
+                    'response': "",
+                    'tts_audio': None,
+                    'status': 'listening'
+                })
             return True
         return False
     def _vosk_streaming_monitor(self):
         """Theo dõi VOSK streaming và xử lý kết quả real-time"""
         while self.is_listening and self.vosk_stream_active:
             try:
                 current_time = time.time()
                 silence_duration = current_time - self.last_voice_time
+                # Xử lý audio buffer nếu có dữ liệu
+                if self.audio_buffer and silence_duration > 0.5:  # 0.5 giây im lặng
+                    combined_audio = np.concatenate(self.audio_buffer)
+                    if len(combined_audio) > 1600:  # Ít nhất 0.1 giây audio
+                        result = self.vosk_asr.process_audio_chunk(combined_audio, 16000)
+                        self._handle_vosk_result(result)
+                    self.audio_buffer = []
+                # Timeout im lặng
                 if silence_duration > self.silence_timeout and self.partial_transcription:
+                    print(f"⏰ Silence timeout, xử lý: '{self.partial_transcription}'")
+                    if len(self.partial_transcription) > 2:  # Chỉ xử lý nếu có nội dung
+                        self._process_final_transcription(self.partial_transcription)
                     self.partial_transcription = ""
+                    self.vosk_asr.start_stream()
+                time.sleep(0.1)
             except Exception as e:
                 print(f"❌ Lỗi VOSK monitor: {e}")
                 break
     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+        """Callback khi VAD phát hiện speech - FIXED VERSION"""
+        if not self.vosk_stream_active or not self.is_listening:
             return
         # Cập nhật thời gian có giọng nói
         self.last_voice_time = time.time()
+        # Thêm vào audio buffer để cải thiện nhận diện
+        self.audio_buffer.append(speech_audio)
+        # Giới hạn buffer size
+        total_samples = sum(len(chunk) for chunk in self.audio_buffer)
+        if total_samples > self.max_buffer_samples:
+            # Giữ lại các chunk gần nhất
+            while total_samples > self.max_buffer_samples and len(self.audio_buffer) > 1:
+                removed = self.audio_buffer.pop(0)
+                total_samples -= len(removed)
+        print(f"🎯 VAD detected: {len(speech_audio)} samples, Buffer: {len(self.audio_buffer)} chunks")
+    def _handle_vosk_result(self, result: Dict[str, Any]):
+        """Xử lý kết quả từ VOSK"""
         # Xử lý kết quả partial
         if result['partial'] and len(result['partial']) > 1:
             self.partial_transcription = result['partial']
+            print(f"🎯 VOSK Partial: '{result['partial']}'")
             # Gửi partial result real-time
             if self.current_callback:
         # Xử lý kết quả final
         if result['is_final'] and result['text'] and len(result['text']) > 1:
+            print(f"✅ VOSK Final: '{result['text']}'")
             self._process_final_transcription(result['text'])
             self.partial_transcription = ""
+            self.audio_buffer = []  # Clear buffer
             self.vosk_asr.start_stream()  # Bắt đầu stream mới
     def _process_final_transcription(self, transcription: str):
         """Xử lý transcription cuối cùng"""
         if not transcription or len(transcription.strip()) < 2:
             return
+        print(f"📝 Final Transcription: '{transcription}'")
         self.current_transcription = transcription
         # Đưa vào queue để xử lý
         try:
             self.response_queue.put(transcription, timeout=0.5)
+            print(f"📦 Đã đưa vào queue: '{transcription}'")
         except queue.Full:
             print("⚠️ Queue đầy, bỏ qua transcription")
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming manual mode với VOSK - FIXED VERSION"""
         if not audio_data:
             return self._create_error_response("❌ Không có dữ liệu âm thanh")
         try:
             sample_rate, audio_array = audio_data
+            print(f"🎤 Manual audio: {len(audio_array)} samples, {sample_rate}Hz")
+            # Kiểm tra âm lượng
+            if isinstance(audio_array, np.ndarray):
+                if audio_array.dtype in [np.float32, np.float64]:
+                    audio_rms = np.sqrt(np.mean(audio_array**2))
+                    print(f"📊 Manual audio RMS: {audio_rms:.4f}")
+                    if audio_rms < 0.01:
+                        return {
+                            'transcription': "Âm thanh quá nhỏ, hãy nói to hơn",
+                            'response': "",
+                            'tts_audio': None,
+                            'status': 'listening'
+                        }
             # Khởi động VOSK stream tạm thời
+            if not self.vosk_asr.start_stream():
+                return self._create_error_response("❌ Không thể khởi động VOSK")
             # Xử lý audio với VOSK
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
+            if result['is_final'] and result['text'] and len(result['text']) > 1:
                 transcription = result['text']
+                print(f"📝 Manual Transcription: '{transcription}'")
                 # Tạo phản hồi AI
                 response = self._generate_ai_response_optimized(transcription)
                     'tts_audio': tts_audio_path,
                     'status': 'completed'
                 }
+            elif result['partial']:
+                return {
+                    'transcription': result['partial'],
+                    'response': "",
+                    'tts_audio': None,
+                    'status': 'listening'
+                }
             else:
                 return {
+                    'transcription': "Đang nghe... Hãy nói rõ hơn",
                     'response': "",
                     'tts_audio': None,
                     'status': 'listening'
         except Exception as e:
             print(f"❌ Lỗi xử lý streaming audio: {e}")
+            traceback.print_exc()
             return self._create_error_response(f"❌ Lỗi: {str(e)}")
     def _create_error_response(self, message: str) -> Dict[str, Any]:
             'queue_size': self.response_queue.qsize(),
             'worker_threads': len(self.processing_threads),
             'vosk_active': self.vosk_stream_active,
+            'audio_buffer_chunks': len(self.audio_buffer),
+            'last_voice_time': time.strftime("%H:%M:%S", time.localtime(self.last_voice_time)),
             'last_update': time.strftime("%H:%M:%S")
         }
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         self.conversation_history = []