Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro commited on Nov 13, 2025

Commit

361b4a9

verified ·

1 Parent(s): cc18c18

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +508 -798

services/streaming_voice_service.py CHANGED Viewed

@@ -1,4 +1,219 @@
 # import io
 # import numpy as np
 # import soundfile as sf
@@ -91,7 +306,7 @@
 #             return False
 #     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
-#         """Xử lý audio chunk và trả về kết quả - FIXED VERSION"""
 #         if self.recognizer is None or not self.is_streaming:
 #             return {"text": "", "partial": "", "is_final": False}
@@ -111,12 +326,16 @@
 #                 else:
 #                     audio_chunk = audio_chunk.astype(np.int16)
-#             # Kiểm tra âm lượng
 #             audio_rms = np.sqrt(np.mean(audio_chunk.astype(np.float32)**2)) / 32767.0
-#             print(f"📊 Audio RMS: {audio_rms:.4f}")
-#             if audio_rms < 0.01:  # Âm lượng quá thấp
-#                 print("⚠️ Âm lượng quá thấp, bỏ qua")
 #                 return {"text": "", "partial": "", "is_final": False}
 #             # Chuyển đổi sang bytes
@@ -146,6 +365,27 @@
 #         return {"text": "", "partial": "", "is_final": False}
 #     def stop_stream(self) -> str:
 #         """Kết thúc stream và lấy kết quả cuối"""
 #         if self.recognizer:
@@ -180,7 +420,7 @@
 #         self.rag_system = rag_system
 #         self.tts_service = tts_service
-#         # Khởi tạo VOSK ASR - FIXED: Thêm timeout và retry
 #         print("🔄 Đang khởi tạo VOSK ASR...")
 #         self.vosk_asr = VoskStreamingASR()
@@ -200,15 +440,19 @@
 #         self.processing_threads = []
 #         self.max_workers = 2
-#         # Streaming state
 #         self.vosk_stream_active = False
 #         self.last_voice_time = 0
-#         self.silence_timeout = 3.0  # Tăng timeout lên 3 giây
 #         # Audio buffer để cải thiện nhận diện
 #         self.audio_buffer = []
-#         self.buffer_duration = 1.0  # Buffer 1 giây
-#         self.max_buffer_samples = 16000  # 1 giây ở 16kHz
 #         # Latency tracking
 #         self.latency_metrics = {
@@ -252,6 +496,7 @@
 #             self.vosk_stream_active = True
 #             self.last_voice_time = time.time()
 #             self.audio_buffer = []
 #             # Khởi động worker threads
 #             if not self.processing_threads:
@@ -264,8 +509,8 @@
 #                     thread.start()
 #                     self.processing_threads.append(thread)
-#             # Bắt đầu thread theo dõi VOSK streaming
-#             threading.Thread(target=self._vosk_streaming_monitor, daemon=True).start()
 #             print("🎙️ Đã bắt đầu lắng nghe với VOSK ASR streaming")
@@ -282,118 +527,184 @@
 #         return False
-#     def _vosk_streaming_monitor(self):
-#         """Theo dõi VOSK streaming và xử lý kết quả real-time"""
-#         while self.is_listening and self.vosk_stream_active:
 #             try:
 #                 current_time = time.time()
 #                 silence_duration = current_time - self.last_voice_time
-#                 # Xử lý audio buffer nếu có dữ liệu
-#                 if self.audio_buffer and silence_duration > 0.5:  # 0.5 giây im lặng
-#                     combined_audio = np.concatenate(self.audio_buffer)
-#                     if len(combined_audio) > 1600:  # Ít nhất 0.1 giây audio
-#                         result = self.vosk_asr.process_audio_chunk(combined_audio, 16000)
-#                         self._handle_vosk_result(result)
-#                     self.audio_buffer = []
-#                 # Timeout im lặng
-#                 if silence_duration > self.silence_timeout and self.partial_transcription:
 #                     print(f"⏰ Silence timeout, xử lý: '{self.partial_transcription}'")
-#                     if len(self.partial_transcription) > 2:  # Chỉ xử lý nếu có nội dung
-#                         self._process_final_transcription(self.partial_transcription)
 #                     self.partial_transcription = ""
 #                     self.vosk_asr.start_stream()
-#                 time.sleep(0.1)
 #             except Exception as e:
-#                 print(f"❌ Lỗi VOSK monitor: {e}")
-#                 break
-#     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
-#         """Callback khi VAD phát hiện speech - FIXED VERSION"""
-#         if not self.vosk_stream_active or not self.is_listening:
-#             return
-#         # Cập nhật thời gian có giọng nói
-#         self.last_voice_time = time.time()
-#         # Thêm vào audio buffer để cải thiện nhận diện
-#         self.audio_buffer.append(speech_audio)
-#         # Giới hạn buffer size
-#         total_samples = sum(len(chunk) for chunk in self.audio_buffer)
-#         if total_samples > self.max_buffer_samples:
-#             # Giữ lại các chunk gần nhất
-#             while total_samples > self.max_buffer_samples and len(self.audio_buffer) > 1:
-#                 removed = self.audio_buffer.pop(0)
-#                 total_samples -= len(removed)
-#         print(f"🎯 VAD detected: {len(speech_audio)} samples, Buffer: {len(self.audio_buffer)} chunks")
-#     def _handle_vosk_result(self, result: Dict[str, Any]):
-#         """Xử lý kết quả từ VOSK"""
-#         # Xử lý kết quả partial
-#         if result['partial'] and len(result['partial']) > 1:
-#             self.partial_transcription = result['partial']
-#             print(f"🎯 VOSK Partial: '{result['partial']}'")
-#             # Gửi partial result real-time
-#             if self.current_callback:
-#                 self.current_callback({
-#                     'transcription': result['partial'],
-#                     'response': "",
-#                     'tts_audio': None,
-#                     'status': 'partial'
-#                 })
-#         # Xử lý kết quả final
-#         if result['is_final'] and result['text'] and len(result['text']) > 1:
-#             print(f"✅ VOSK Final: '{result['text']}'")
-#             self._process_final_transcription(result['text'])
-#             self.partial_transcription = ""
-#             self.audio_buffer = []  # Clear buffer
-#             self.vosk_asr.start_stream()  # Bắt đầu stream mới
-#     def _process_final_transcription(self, transcription: str):
-#         """Xử lý transcription cuối cùng"""
-#         if not transcription or len(transcription.strip()) < 2:
-#             return
-#         print(f"📝 Final Transcription: '{transcription}'")
-#         self.current_transcription = transcription
-#         # Đưa vào queue để xử lý
-#         try:
-#             self.response_queue.put(transcription, timeout=0.5)
-#             print(f"📦 Đã đưa vào queue: '{transcription}'")
-#         except queue.Full:
-#             print("⚠️ Queue đầy, bỏ qua transcription")
 #     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-#         """Xử lý audio streaming manual mode với VOSK - FIXED VERSION"""
 #         if not audio_data:
 #             return self._create_error_response("❌ Không có dữ liệu âm thanh")
 #         try:
 #             sample_rate, audio_array = audio_data
-#             print(f"🎤 Manual audio: {len(audio_array)} samples, {sample_rate}Hz")
-#             # Kiểm tra âm lượng
 #             if isinstance(audio_array, np.ndarray):
 #                 if audio_array.dtype in [np.float32, np.float64]:
 #                     audio_rms = np.sqrt(np.mean(audio_array**2))
-#                     print(f"📊 Manual audio RMS: {audio_rms:.4f}")
-#                     if audio_rms < 0.01:
-#                         return {
-#                             'transcription': "Âm thanh quá nhỏ, hãy nói to hơn",
-#                             'response': "",
-#                             'tts_audio': None,
-#                             'status': 'listening'
-#                         }
 #             # Khởi động VOSK stream tạm thời
 #             if not self.vosk_asr.start_stream():
@@ -425,7 +736,7 @@
 #                 }
 #             else:
 #                 return {
-#                     'transcription': "Đang nghe... Hãy nói rõ hơn",
 #                     'response': "",
 #                     'tts_audio': None,
 #                     'status': 'listening'
@@ -435,6 +746,80 @@
 #             print(f"❌ Lỗi xử lý streaming audio: {e}")
 #             traceback.print_exc()
 #             return self._create_error_response(f"❌ Lỗi: {str(e)}")
 #     def _create_error_response(self, message: str) -> Dict[str, Any]:
 #         """Tạo response lỗi chuẩn"""
@@ -445,21 +830,27 @@
 #             'status': 'error'
 #         }
-#     def _add_latency_sample(self, component: str, latency: float):
-#         """Thêm mẫu latency"""
-#         if component in self.latency_metrics:
-#             self.latency_metrics[component].append(latency)
-#             if len(self.latency_metrics[component]) > 100:
-#                 self.latency_metrics[component] = self.latency_metrics[component][-100:]
-#     def _log_latency_metrics(self, latencies: dict):
-#         """Log và theo dõi latency metrics"""
-#         for key, value in latencies.items():
-#             self._add_latency_sample(key, value)
-#         print("📊 LATENCY REPORT:")
-#         for component, latency in latencies.items():
-#             print(f"   {component.upper()}: {latency:.2f}s")
 #     def get_latency_stats(self) -> dict:
 #         """Lấy thống kê latency"""
@@ -481,685 +872,4 @@
 #                     'avg': 0, 'min': 0, 'max': 0, 'count': 0,
 #                     'recent_avg': 0, 'recent_min': 0, 'recent_max': 0
 #                 }
-#         return stats
-#     def get_conversation_state(self) -> dict:
-#         """Lấy trạng thái hội thoại"""
-#         return {
-#             'is_listening': self.is_listening,
-#             'is_processing': self.is_processing,
-#             'history_length': len(self.conversation_history),
-#             'current_transcription': self.current_transcription,
-#             'partial_transcription': self.partial_transcription,
-#             'queue_size': self.response_queue.qsize(),
-#             'worker_threads': len(self.processing_threads),
-#             'vosk_active': self.vosk_stream_active,
-#             'audio_buffer_chunks': len(self.audio_buffer),
-#             'last_voice_time': time.strftime("%H:%M:%S", time.localtime(self.last_voice_time)),
-#             'last_update': time.strftime("%H:%M:%S")
-#         }
-#     def clear_conversation(self):
-#         """Xóa lịch sử hội thoại"""
-#         self.conversation_history = []
-#         self.current_transcription = ""
-#         self.partial_transcription = ""
-#         print("🗑️ Đã xóa lịch sử hội thoại")
-import io
-import numpy as np
-import soundfile as sf
-import time
-import traceback
-import threading
-import queue
-import json
-import os
-from vosk import Model, KaldiRecognizer
-from groq import Groq
-from typing import Optional, Dict, Any, Callable
-from config.settings import settings
-from core.rag_system import EnhancedRAGSystem
-from core.tts_service import EnhancedTTSService
-from core.silero_vad import SileroVAD
-class VoskStreamingASR:
-    def __init__(self, model_path: str = None):
-        """Khởi tạo VOSK ASR streaming với debug"""
-        self.model = None
-        self.recognizer = None
-        self.sample_rate = 16000
-        self.is_streaming = False
-        # Tự động tải model nếu không có đường dẫn
-        if model_path is None:
-            model_path = self._download_vosk_model()
-        if model_path and os.path.exists(model_path):
-            print(f"🔄 Đang tải VOSK model từ: {model_path}")
-            try:
-                self.model = Model(model_path)
-                self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
-                self.recognizer.SetWords(True)
-                print("✅ Đã tải VOSK model thành công")
-            except Exception as e:
-                print(f"❌ Lỗi khởi tạo VOSK model: {e}")
-        else:
-            print(f"❌ Không tìm thấy VOSK model tại: {model_path}")
-    def _download_vosk_model(self):
-        """Tải VOSK model tiếng Việt tự động"""
-        try:
-            import urllib.request
-            import zipfile
-            model_url = "https://alphacephei.com/vosk/models/vosk-model-small-vn-0.4.zip"
-            model_dir = "models/vosk-model-small-vn-0.4"
-            zip_path = "models/vosk-model-small-vn-0.4.zip"
-            # Tạo thư mục nếu chưa có
-            os.makedirs("models", exist_ok=True)
-            if not os.path.exists(model_dir):
-                print("📥 Đang tải VOSK Vietnamese model...")
-                urllib.request.urlretrieve(model_url, zip_path)
-                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                    zip_ref.extractall("models/")
-                # Đảm bảo thư mục tồn tại
-                if os.path.exists("models/vosk-model-small-vn-0.4"):
-                    os.rename("models/vosk-model-small-vn-0.4", model_dir)
-                if os.path.exists(zip_path):
-                    os.remove(zip_path)
-                print("✅ Đã tải VOSK model thành công")
-            return model_dir if os.path.exists(model_dir) else None
-        except Exception as e:
-            print(f"❌ Lỗi tải VOSK model: {e}")
-            return None
-    def start_stream(self):
-        """Bắt đầu stream mới"""
-        if self.model is None:
-            print("❌ VOSK model chưa được khởi tạo")
-            return False
-        try:
-            self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
-            self.recognizer.SetWords(True)
-            self.is_streaming = True
-            print("🎤 Đã khởi động VOSK stream")
-            return True
-        except Exception as e:
-            print(f"❌ Lỗi khởi động VOSK stream: {e}")
-            return False
-    def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
-        """Xử lý audio chunk và trả về kết quả - FIXED VOLUME VERSION"""
-        if self.recognizer is None or not self.is_streaming:
-            return {"text": "", "partial": "", "is_final": False}
-        try:
-            # DEBUG: Thông tin audio chunk
-            print(f"🔊 Audio chunk: {len(audio_chunk)} samples, dtype: {audio_chunk.dtype}, max: {np.max(audio_chunk):.4f}")
-            # Chuẩn hóa audio - QUAN TRỌNG: VOSK cần audio ở dạng int16
-            if sample_rate and sample_rate != self.sample_rate:
-                audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
-            # Đảm bảo là int16 với giá trị phù hợp
-            if audio_chunk.dtype != np.int16:
-                if audio_chunk.dtype in [np.float32, np.float64]:
-                    # Audio float cần được scale về [-32768, 32767]
-                    audio_chunk = (audio_chunk * 32767).astype(np.int16)
-                else:
-                    audio_chunk = audio_chunk.astype(np.int16)
-            # FIXED: Tăng cường âm lượng trước khi kiểm tra
-            audio_chunk = self._boost_audio_volume(audio_chunk)
-            # Kiểm tra âm lượng - GIẢM ngưỡng xuống
-            audio_rms = np.sqrt(np.mean(audio_chunk.astype(np.float32)**2)) / 32767.0
-            print(f"📊 Audio RMS: {audio_rms:.4f}, Max: {np.max(audio_chunk)}")
-            # FIXED: Giảm ngưỡng âm lượng từ 0.01 xuống 0.001
-            if audio_rms < 0.001:  # Giảm ngưỡng 10 lần
-                print(f"⚠️ Âm lượng quá thấp (RMS: {audio_rms:.6f}), bỏ qua")
-                return {"text": "", "partial": "", "is_final": False}
-            # Chuyển đổi sang bytes
-            audio_bytes = audio_chunk.tobytes()
-            # Xử lý với VOSK
-            if self.recognizer.AcceptWaveform(audio_bytes):
-                # Kết quả cuối cùng
-                result_json = self.recognizer.Result()
-                result = json.loads(result_json)
-                text = result.get('text', '').strip()
-                print(f"✅ VOSK Final Result: '{text}'")
-                if text:
-                    return {"text": text, "partial": "", "is_final": True}
-            else:
-                # Kết quả tạm thời
-                partial_json = self.recognizer.PartialResult()
-                partial_result = json.loads(partial_json)
-                partial_text = partial_result.get('partial', '').strip()
-                if partial_text:
-                    print(f"🎯 VOSK Partial: '{partial_text}'")
-                    return {"text": "", "partial": partial_text, "is_final": False}
-        except Exception as e:
-            print(f"❌ Lỗi VOSK processing: {e}")
-            traceback.print_exc()
-        return {"text": "", "partial": "", "is_final": False}
-    def _boost_audio_volume(self, audio_chunk: np.ndarray, boost_factor: float = 5.0) -> np.ndarray:
-        """Tăng cường âm lượng audio"""
-        try:
-            # Chuyển sang float để xử lý
-            audio_float = audio_chunk.astype(np.float32) / 32768.0
-            # Tăng âm lượng
-            boosted_audio = audio_float * boost_factor
-            # Ngăn chặn clipping
-            boosted_audio = np.clip(boosted_audio, -1.0, 1.0)
-            # Chuyển lại sang int16
-            boosted_audio_int16 = (boosted_audio * 32767).astype(np.int16)
-            print(f"🔊 Volume boosted: {boost_factor}x, New max: {np.max(boosted_audio_int16)}")
-            return boosted_audio_int16
-        except Exception as e:
-            print(f"⚠️ Lỗi boost volume: {e}")
-            return audio_chunk
-    def stop_stream(self) -> str:
-        """Kết thúc stream và lấy kết quả cuối"""
-        if self.recognizer:
-            try:
-                result_json = self.recognizer.FinalResult()
-                result = json.loads(result_json)
-                text = result.get('text', '').strip()
-                self.is_streaming = False
-                print(f"🛑 VOSK Final: '{text}'")
-                return text
-            except Exception as e:
-                print(f"❌ Lỗi khi dừng VOSK stream: {e}")
-        return ""
-    def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
-        """Resample audio với chất lượng tốt hơn"""
-        if orig_sr == target_sr:
-            return audio
-        try:
-            from scipy import signal
-            # Tính số sample mới
-            num_samples = int(len(audio) * target_sr / orig_sr)
-            resampled_audio = signal.resample(audio, num_samples)
-            return resampled_audio.astype(np.int16)
-        except Exception as e:
-            print(f"❌ Lỗi resample audio: {e}")
-            return audio
-class StreamingVoiceService:
-    def __init__(self, groq_client: Groq, rag_system, tts_service):
-        self.client = groq_client
-        self.rag_system = rag_system
-        self.tts_service = tts_service
-        # Khởi tạo VOSK ASR
-        print("🔄 Đang khởi tạo VOSK ASR...")
-        self.vosk_asr = VoskStreamingASR()
-        # Khởi tạo VAD
-        self.vad_processor = SileroVAD()
-        self.is_listening = False
-        self.speech_callback = None
-        self.is_processing = False
-        # Conversation context
-        self.conversation_history = []
-        self.current_transcription = ""
-        self.partial_transcription = ""
-        # Multi-thread processing
-        self.response_queue = queue.Queue()
-        self.processing_threads = []
-        self.max_workers = 2
-        # Streaming state - FIXED: Thêm các biến state mới
-        self.vosk_stream_active = False
-        self.last_voice_time = 0
-        self.silence_timeout = 3.0
-        # Audio buffer để cải thiện nhận diện
-        self.audio_buffer = []
-        self.buffer_duration = 1.0
-        self.max_buffer_samples = 16000
-        # Real-time processing
-        self.realtime_buffer = queue.Queue()
-        self.processing_active = False
-        # Latency tracking
-        self.latency_metrics = {
-            'asr': [], 'rag': [], 'llm': [], 'tts': [], 'total': [],
-            'vad_detection': [], 'queue_waiting': [], 'vosk_processing': []
-        }
-        self.current_callback = None
-    def start_listening(self, speech_callback: Callable) -> bool:
-        """Bắt đầu lắng nghe với VOSK streaming - FIXED VERSION"""
-        if self.is_listening:
-            print("⚠️ Đã đang lắng nghe")
-            return False
-        self.current_callback = speech_callback
-        # Kiểm tra VOSK model
-        if self.vosk_asr.model is None:
-            print("❌ VOSK model không khả dụng")
-            if self.current_callback:
-                self.current_callback({
-                    'transcription': "Lỗi: VOSK model không khả dụng",
-                    'response': "Không thể khởi động nhận diện giọng nói",
-                    'tts_audio': None,
-                    'status': 'error'
-                })
-            return False
-        # Khởi động VOSK stream
-        if not self.vosk_asr.start_stream():
-            print("❌ Không thể khởi động VOSK stream")
-            return False
-        # Khởi động VAD
-        success = self.vad_processor.start_stream(self._on_speech_detected)
-        if success:
-            self.is_listening = True
-            self.is_processing = False
-            self.vosk_stream_active = True
-            self.last_voice_time = time.time()
-            self.audio_buffer = []
-            self.processing_active = True
-            # Khởi động worker threads
-            if not self.processing_threads:
-                for i in range(self.max_workers):
-                    thread = threading.Thread(
-                        target=self._process_response_worker,
-                        daemon=True,
-                        name=f"ASR-Worker-{i}"
-                    )
-                    thread.start()
-                    self.processing_threads.append(thread)
-            # Bắt đầu real-time processing thread
-            threading.Thread(target=self._realtime_processing_worker, daemon=True).start()
-            print("🎙️ Đã bắt đầu lắng nghe với VOSK ASR streaming")
-            # Thông báo trạng thái
-            if self.current_callback:
-                self.current_callback({
-                    'transcription': "Đã bắt đầu lắng nghe... Hãy nói gì đó",
-                    'response': "",
-                    'tts_audio': None,
-                    'status': 'listening'
-                })
-            return True
-        return False
-    def stop_listening(self):
-        """Dừng lắng nghe"""
-        self.is_listening = False
-        self.vosk_stream_active = False
-        self.processing_active = False
-        self.vad_processor.stop_stream()
-        print("🛑 Đã dừng lắng nghe")
-    def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
-        """Callback khi VAD phát hiện speech - FIXED VERSION"""
-        if not self.vosk_stream_active or not self.is_listening:
-            return
-        try:
-            # Cập nhật thời gian có giọng nói
-            self.last_voice_time = time.time()
-            print(f"🎯 VAD detected: {len(speech_audio)} samples, {sample_rate}Hz")
-            # Xử lý real-time với VOSK
-            result = self.vosk_asr.process_audio_chunk(speech_audio, sample_rate)
-            self._handle_vosk_result(result)
-        except Exception as e:
-            print(f"❌ Lỗi trong speech detection: {e}")
-    def _handle_vosk_result(self, result: Dict[str, Any]):
-        """Xử lý kết quả từ VOSK - FIXED VERSION"""
-        try:
-            # Xử lý kết quả partial (real-time)
-            if result['partial'] and len(result['partial']) > 1:
-                self.partial_transcription = result['partial']
-                print(f"🎯 VOSK Partial: '{result['partial']}'")
-                # Gửi partial result real-time
-                if self.current_callback:
-                    self.current_callback({
-                        'transcription': result['partial'],
-                        'response': "",
-                        'tts_audio': None,
-                        'status': 'partial'
-                    })
-            # Xử lý kết quả final
-            if result['is_final'] and result['text'] and len(result['text']) > 1:
-                print(f"✅ VOSK Final: '{result['text']}'")
-                # Đưa vào queue để xử lý phản hồi AI
-                try:
-                    self.response_queue.put({
-                        'transcription': result['text'],
-                        'timestamp': time.time()
-                    }, timeout=0.5)
-                    print(f"📦 Đã đưa vào queue: '{result['text']}'")
-                    # Cập nhật UI ngay lập tức
-                    if self.current_callback:
-                        self.current_callback({
-                            'transcription': result['text'],
-                            'response': "Đang xử lý...",
-                            'tts_audio': None,
-                            'status': 'processing'
-                        })
-                except queue.Full:
-                    print("⚠️ Queue đầy, bỏ qua transcription")
-                # Reset VOSK stream cho lần tiếp theo
-                self.vosk_asr.start_stream()
-        except Exception as e:
-            print(f"❌ Lỗi xử lý VOSK result: {e}")
-    def _process_response_worker(self):
-        """Worker xử lý phản hồi AI từ queue"""
-        while self.processing_active:
-            try:
-                # Lấy item từ queue với timeout
-                item = self.response_queue.get(timeout=1.0)
-                if item is None:  # Tín hiệu dừng
-                    break
-                transcription = item['transcription']
-                start_time = item['timestamp']
-                print(f"🤖 Processing AI response for: '{transcription}'")
-                # Tạo phản hồi AI
-                response = self._generate_ai_response_optimized(transcription)
-                tts_audio_path = self._text_to_speech_optimized(response)
-                # Gửi kết quả về callback
-                if self.current_callback:
-                    self.current_callback({
-                        'transcription': transcription,
-                        'response': response,
-                        'tts_audio': tts_audio_path,
-                        'status': 'completed'
-                    })
-                # Đánh dấu task hoàn thành
-                self.response_queue.task_done()
-            except queue.Empty:
-                continue
-            except Exception as e:
-                print(f"❌ Lỗi trong response worker: {e}")
-                if self.current_callback:
-                    self.current_callback({
-                        'transcription': "Lỗi xử lý",
-                        'response': f"Xin lỗi, có lỗi xảy ra: {str(e)}",
-                        'tts_audio': None,
-                        'status': 'error'
-                    })
-    def _realtime_processing_worker(self):
-        """Worker xử lý real-time để theo dõi timeout"""
-        while self.processing_active:
-            try:
-                current_time = time.time()
-                silence_duration = current_time - self.last_voice_time
-                # Xử lý timeout nếu im lặng quá lâu và có partial text
-                if (silence_duration > self.silence_timeout and
-                    self.partial_transcription and
-                    len(self.partial_transcription) > 2):
-                    print(f"⏰ Silence timeout, xử lý: '{self.partial_transcription}'")
-                    # Xử lý partial text như final
-                    try:
-                        self.response_queue.put({
-                            'transcription': self.partial_transcription,
-                            'timestamp': time.time()
-                        }, timeout=0.5)
-                    except queue.Full:
-                        print("⚠️ Queue đầy, bỏ qua timeout transcription")
-                    # Reset
-                    self.partial_transcription = ""
-                    self.vosk_asr.start_stream()
-                time.sleep(0.1)  # Giảm CPU usage
-            except Exception as e:
-                print(f"❌ Lỗi real-time worker: {e}")
-                time.sleep(0.5)
-    def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming manual mode với VOSK - FIXED VOLUME VERSION"""
-        if not audio_data:
-            return self._create_error_response("❌ Không có dữ liệu âm thanh")
-        try:
-            sample_rate, audio_array = audio_data
-            print(f"🎤 Manual audio: {len(audio_array)} samples, {sample_rate}Hz, Max: {np.max(audio_array)}")
-            # FIXED: Tăng cường âm lượng trước khi xử lý
-            audio_array = self._boost_input_volume(audio_array)
-            # Kiểm tra âm lượng với ngưỡng thấp hơn
-            if isinstance(audio_array, np.ndarray):
-                if audio_array.dtype in [np.float32, np.float64]:
-                    audio_rms = np.sqrt(np.mean(audio_array**2))
-                else:
-                    audio_rms = np.sqrt(np.mean(audio_array.astype(np.float32)**2)) / 32768.0
-                print(f"📊 Manual audio RMS: {audio_rms:.6f}, Max: {np.max(audio_array)}")
-                # FIXED: Giảm ngưỡng âm lượng
-                if audio_rms < 0.001:  # Giảm từ 0.01 xuống 0.001
-                    return {
-                        'transcription': f"Âm thanh quá nhỏ (RMS: {audio_rms:.6f}), hãy nói to hơn hoặc điều chỉnh microphone",
-                        'response': "",
-                        'tts_audio': None,
-                        'status': 'listening'
-                    }
-            # Khởi động VOSK stream tạm thời
-            if not self.vosk_asr.start_stream():
-                return self._create_error_response("❌ Không thể khởi động VOSK")
-            # Xử lý audio với VOSK
-            result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
-            if result['is_final'] and result['text'] and len(result['text']) > 1:
-                transcription = result['text']
-                print(f"📝 Manual Transcription: '{transcription}'")
-                # Tạo phản hồi AI
-                response = self._generate_ai_response_optimized(transcription)
-                tts_audio_path = self._text_to_speech_optimized(response)
-                return {
-                    'transcription': transcription,
-                    'response': response,
-                    'tts_audio': tts_audio_path,
-                    'status': 'completed'
-                }
-            elif result['partial']:
-                return {
-                    'transcription': result['partial'],
-                    'response': "",
-                    'tts_audio': None,
-                    'status': 'listening'
-                }
-            else:
-                return {
-                    'transcription': "Đang nghe... Hãy nói rõ hơn và gần microphone",
-                    'response': "",
-                    'tts_audio': None,
-                    'status': 'listening'
-                }
-        except Exception as e:
-            print(f"❌ Lỗi xử lý streaming audio: {e}")
-            traceback.print_exc()
-            return self._create_error_response(f"❌ Lỗi: {str(e)}")
-    def _boost_input_volume(self, audio_array: np.ndarray, boost_factor: float = 10.0) -> np.ndarray:
-        """Tăng cường âm lượng input audio"""
-        try:
-            if audio_array.dtype in [np.float32, np.float64]:
-                # Audio đã ở dạng float
-                boosted = audio_array * boost_factor
-                boosted = np.clip(boosted, -1.0, 1.0)
-            else:
-                # Audio ở dạng int
-                boosted = audio_array.astype(np.float32) * boost_factor
-                max_val = np.iinfo(audio_array.dtype).max
-                boosted = np.clip(boosted, -max_val, max_val).astype(audio_array.dtype)
-            print(f"🔊 Input volume boosted: {boost_factor}x")
-            return boosted
-        except Exception as e:
-            print(f"⚠️ Lỗi boost input volume: {e}")
-            return audio_array
-    def _generate_ai_response_optimized(self, transcription: str) -> str:
-        """Tạo phản hồi AI tối ưu hóa"""
-        try:
-            # Thêm vào lịch sử hội thoại
-            self.conversation_history.append({"role": "user", "content": transcription})
-            # Giới hạn lịch sử hội thoại
-            if len(self.conversation_history) > 10:
-                self.conversation_history = self.conversation_history[-10:]
-            # Tạo prompt
-            messages = [
-                {"role": "system", "content": "Bạn là trợ lý AI hữu ích. Hãy trả lời ngắn gọn, tự nhiên bằng tiếng Việt."},
-                *self.conversation_history
-            ]
-            # Gọi Groq API
-            response = self.client.chat.completions.create(
-                model="llama-3.1-8b-instant",
-                messages=messages,
-                max_tokens=150,
-                temperature=0.7
-            )
-            ai_response = response.choices[0].message.content.strip()
-            # Thêm vào lịch sử
-            self.conversation_history.append({"role": "assistant", "content": ai_response})
-            return ai_response
-        except Exception as e:
-            print(f"❌ Lỗi tạo phản hồi AI: {e}")
-            return "Xin lỗi, tôi không thể xử lý yêu cầu ngay lúc này."
-    def _text_to_speech_optimized(self, text: str) -> Optional[str]:
-        """Chuyển văn bản thành giọng nói tối ưu hóa"""
-        try:
-            if not text or len(text.strip()) == 0:
-                return None
-            # Sử dụng TTS service
-            audio_path = self.tts_service.text_to_speech(
-                text=text,
-                language='vi',
-                speed=1.0
-            )
-            return audio_path
-        except Exception as e:
-            print(f"❌ Lỗi TTS: {e}")
-            return None
-    def _create_error_response(self, message: str) -> Dict[str, Any]:
-        """Tạo response lỗi chuẩn"""
-        return {
-            'transcription': message,
-            'response': "Vui lòng thử lại",
-            'tts_audio': None,
-            'status': 'error'
-        }
-    def get_conversation_state(self) -> dict:
-        """Lấy trạng thái hội thoại"""
-        return {
-            'is_listening': self.is_listening,
-            'is_processing': self.is_processing,
-            'history_length': len(self.conversation_history),
-            'current_transcription': self.current_transcription,
-            'partial_transcription': self.partial_transcription,
-            'queue_size': self.response_queue.qsize(),
-            'worker_threads': len([t for t in self.processing_threads if t.is_alive()]),
-            'vosk_active': self.vosk_stream_active,
-            'last_voice_time': time.strftime("%H:%M:%S", time.localtime(self.last_voice_time)),
-            'last_update': time.strftime("%H:%M:%S")
-        }
-    def clear_conversation(self):
-        """Xóa lịch sử hội thoại"""
-        self.conversation_history = []
-        self.current_transcription = ""
-        self.partial_transcription = ""
-        print("🗑️ Đã xóa lịch sử hội thoại")
-    def get_latency_stats(self) -> dict:
-        """Lấy thống kê latency"""
-        stats = {}
-        for component, latencies in self.latency_metrics.items():
-            if latencies:
-                recent_latencies = latencies[-10:]
-                stats[component] = {
-                    'avg': sum(latencies) / len(latencies),
-                    'min': min(latencies),
-                    'max': max(latencies),
-                    'count': len(latencies),
-                    'recent_avg': sum(recent_latencies) / len(recent_latencies),
-                    'recent_min': min(recent_latencies),
-                    'recent_max': max(recent_latencies)
-                }
-            else:
-                stats[component] = {
-                    'avg': 0, 'min': 0, 'max': 0, 'count': 0,
-                    'recent_avg': 0, 'recent_min': 0, 'recent_max': 0
-                }
-        return stats

+import io
+import numpy as np
+import soundfile as sf
+import time
+import traceback
+import threading
+import queue
+import json
+import os
+from vosk import Model, KaldiRecognizer
+from groq import Groq
+from typing import Optional, Dict, Any, Callable
+from config.settings import settings
+from core.rag_system import EnhancedRAGSystem
+from core.tts_service import EnhancedTTSService
+from core.silero_vad import SileroVAD
+class VoskStreamingASR:
+    def __init__(self, model_path: str = None):
+        self.model = None
+        self.recognizer = None
+        self.sample_rate = 16000
+        self.is_streaming = False
+        # Buffer để tích luỹ audio - QUAN TRỌNG
+        self.audio_buffer = []
+        self.buffer_duration = 2.0  # tích luỹ 2 giây audio
+        self.min_samples_for_recognition = 32000  # ít nhất 2 giây audio
+        if model_path is None:
+            model_path = self._download_vosk_model()
+        if model_path and os.path.exists(model_path):
+            print(f"🔄 Đang tải VOSK model từ: {model_path}")
+            try:
+                self.model = Model(model_path)
+                self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
+                self.recognizer.SetWords(True)
+                print("✅ Đã tải VOSK model thành công")
+            except Exception as e:
+                print(f"❌ Lỗi khởi tạo VOSK model: {e}")
+        else:
+            print(f"❌ Không tìm thấy VOSK model tại: {model_path}")
+    def start_stream(self):
+        """Bắt đầu stream mới"""
+        if self.model is None:
+            return False
+        try:
+            self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
+            self.recognizer.SetWords(True)
+            self.is_streaming = True
+            self.audio_buffer = []  # reset buffer
+            print("🎤 Đã khởi động VOSK stream")
+            return True
+        except Exception as e:
+            print(f"❌ Lỗi khởi động VOSK stream: {e}")
+            return False
+    def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
+        """Xử lý audio chunk - SIMPLIFIED VERSION"""
+        if self.recognizer is None or not self.is_streaming:
+            return {"text": "", "partial": "", "is_final": False}
+        try:
+            # Đơn giản hoá: luôn xử lý, không check âm lượng
+            if sample_rate and sample_rate != self.sample_rate:
+                audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
+            # Đảm bảo là int16
+            if audio_chunk.dtype != np.int16:
+                audio_chunk = audio_chunk.astype(np.int16)
+            # THÊM VÀO BUFFER - QUAN TRỌNG
+            self.audio_buffer.extend(audio_chunk)
+            # Chỉ xử lý khi có đủ audio
+            if len(self.audio_buffer) < 16000:  # ít nhất 1 giây
+                return {"text": "", "partial": "Đang nghe...", "is_final": False}
+            # Lấy audio từ buffer để xử lý
+            process_audio = np.array(self.audio_buffer[-32000:], dtype=np.int16)  # lấy 2 giây gần nhất
+            # Chuyển sang bytes
+            audio_bytes = process_audio.tobytes()
+            # Xử lý với VOSK - GỬI NHIỀU LẦN
+            for i in range(0, len(audio_bytes), 8000):  # gửi từng chunk nhỏ
+                chunk = audio_bytes[i:i+8000]
+                if len(chunk) > 0:
+                    if self.recognizer.AcceptWaveform(chunk):
+                        result_json = self.recognizer.Result()
+                        result = json.loads(result_json)
+                        text = result.get('text', '').strip()
+                        if text:
+                            print(f"✅ VOSK Final: '{text}'")
+                            # Reset buffer sau khi có kết quả
+                            self.audio_buffer = []
+                            return {"text": text, "partial": "", "is_final": True}
+            # Kiểm tra partial result
+            partial_json = self.recognizer.PartialResult()
+            partial_result = json.loads(partial_json)
+            partial_text = partial_result.get('partial', '').strip()
+            if partial_text:
+                print(f"🎯 VOSK Partial: '{partial_text}'")
+                return {"text": "", "partial": partial_text, "is_final": False}
+            else:
+                return {"text": "", "partial": "Đang xử lý âm thanh...", "is_final": False}
+        except Exception as e:
+            print(f"❌ Lỗi VOSK processing: {e}")
+        return {"text": "", "partial": "", "is_final": False}
+class StreamingVoiceService:
+    def __init__(self, groq_client: Groq, rag_system, tts_service):
+        self.client = groq_client
+        self.rag_system = rag_system
+        self.tts_service = tts_service
+        # Khởi tạo VOSK ASR - ĐƠN GIẢN
+        print("🔄 Đang khởi tạo VOSK ASR...")
+        self.vosk_asr = VoskStreamingASR()
+        self.is_listening = False
+        self.current_callback = None
+    def start_listening(self, speech_callback: Callable) -> bool:
+        """Bắt đầu lắng nghe - ĐƠN GIẢN"""
+        if self.is_listening:
+            return False
+        self.current_callback = speech_callback
+        if self.vosk_asr.model is None:
+            return False
+        if not self.vosk_asr.start_stream():
+            return False
+        self.is_listening = True
+        print("🎙️ Đã bắt đầu lắng nghe")
+        return True
+    def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming - ĐƠN GIẢN VÀ HIỆU QUẢ"""
+        if not audio_data:
+            return {"transcription": "Không có âm thanh", "response": "", "tts_audio": None, "status": "error"}
+        try:
+            sample_rate, audio_array = audio_data
+            # Đảm bảo VOSK stream đang chạy
+            if not self.vosk_asr.is_streaming:
+                self.vosk_asr.start_stream()
+            # Xử lý với VOSK
+            result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
+            # LUÔN trả về kết quả partial để hiển thị real-time
+            if result['partial']:
+                return {
+                    'transcription': result['partial'],
+                    'response': "",
+                    'tts_audio': None,
+                    'status': 'listening'
+                }
+            elif result['is_final'] and result['text']:
+                # Có kết quả cuối - tạo phản hồi AI
+                response = self._generate_ai_response(result['text'])
+                return {
+                    'transcription': result['text'],
+                    'response': response,
+                    'tts_audio': None,  # có thể thêm TTS sau
+                    'status': 'completed'
+                }
+            else:
+                return {
+                    'transcription': "🎤 Đang nghe... nói tiếp đi",
+                    'response': "",
+                    'tts_audio': None,
+                    'status': 'listening'
+                }
+        except Exception as e:
+            print(f"❌ Lỗi: {e}")
+            return {"transcription": f"Lỗi: {e}", "response": "", "tts_audio": None, "status": "error"}
+    def _generate_ai_response(self, transcription: str) -> str:
+        """Tạo phản hồi AI đơn giản"""
+        try:
+            messages = [
+                {"role": "system", "content": "Bạn là trợ lý AI. Trả lời ngắn gọn bằng tiếng Việt."},
+                {"role": "user", "content": transcription}
+            ]
+            response = self.client.chat.completions.create(
+                model="llama-3.1-8b-instant",
+                messages=messages,
+                max_tokens=100,
+                temperature=0.7
+            )
+            return response.choices[0].message.content.strip()
+        except Exception as e:
+            return "Xin lỗi, tôi không thể trả lời ngay lúc này."
+    def stop_listening(self):
+        """Dừng lắng nghe"""
+        self.is_listening = False
+        if self.vosk_asr:
+            self.vosk_asr.stop_stream()
+        print("🛑 Đã dừng lắng nghe")
 # import io
 # import numpy as np
 # import soundfile as sf
 #             return False
 #     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
+#         """Xử lý audio chunk và trả về kết quả - FIXED VOLUME VERSION"""
 #         if self.recognizer is None or not self.is_streaming:
 #             return {"text": "", "partial": "", "is_final": False}
 #                 else:
 #                     audio_chunk = audio_chunk.astype(np.int16)
+#             # FIXED: Tăng cường âm lượng trước khi kiểm tra
+#             audio_chunk = self._boost_audio_volume(audio_chunk)
+#             # Kiểm tra âm lượng - GIẢM ngưỡng xuống
 #             audio_rms = np.sqrt(np.mean(audio_chunk.astype(np.float32)**2)) / 32767.0
+#             print(f"📊 Audio RMS: {audio_rms:.4f}, Max: {np.max(audio_chunk)}")
+#             # FIXED: Giảm ngưỡng âm lượng t�� 0.01 xuống 0.001
+#             if audio_rms < 0.001:  # Giảm ngưỡng 10 lần
+#                 print(f"⚠️ Âm lượng quá thấp (RMS: {audio_rms:.6f}), bỏ qua")
 #                 return {"text": "", "partial": "", "is_final": False}
 #             # Chuyển đổi sang bytes
 #         return {"text": "", "partial": "", "is_final": False}
+#     def _boost_audio_volume(self, audio_chunk: np.ndarray, boost_factor: float = 5.0) -> np.ndarray:
+#         """Tăng cường âm lượng audio"""
+#         try:
+#             # Chuyển sang float để xử lý
+#             audio_float = audio_chunk.astype(np.float32) / 32768.0
+#             # Tăng âm lượng
+#             boosted_audio = audio_float * boost_factor
+#             # Ngăn chặn clipping
+#             boosted_audio = np.clip(boosted_audio, -1.0, 1.0)
+#             # Chuyển lại sang int16
+#             boosted_audio_int16 = (boosted_audio * 32767).astype(np.int16)
+#             print(f"🔊 Volume boosted: {boost_factor}x, New max: {np.max(boosted_audio_int16)}")
+#             return boosted_audio_int16
+#         except Exception as e:
+#             print(f"⚠️ Lỗi boost volume: {e}")
+#             return audio_chunk
 #     def stop_stream(self) -> str:
 #         """Kết thúc stream và lấy kết quả cuối"""
 #         if self.recognizer:
 #         self.rag_system = rag_system
 #         self.tts_service = tts_service
+#         # Khởi tạo VOSK ASR
 #         print("🔄 Đang khởi tạo VOSK ASR...")
 #         self.vosk_asr = VoskStreamingASR()
 #         self.processing_threads = []
 #         self.max_workers = 2
+#         # Streaming state - FIXED: Thêm các biến state mới
 #         self.vosk_stream_active = False
 #         self.last_voice_time = 0
+#         self.silence_timeout = 3.0
 #         # Audio buffer để cải thiện nhận diện
 #         self.audio_buffer = []
+#         self.buffer_duration = 1.0
+#         self.max_buffer_samples = 16000
+#         # Real-time processing
+#         self.realtime_buffer = queue.Queue()
+#         self.processing_active = False
 #         # Latency tracking
 #         self.latency_metrics = {
 #             self.vosk_stream_active = True
 #             self.last_voice_time = time.time()
 #             self.audio_buffer = []
+#             self.processing_active = True
 #             # Khởi động worker threads
 #             if not self.processing_threads:
 #                     thread.start()
 #                     self.processing_threads.append(thread)
+#             # Bắt đầu real-time processing thread
+#             threading.Thread(target=self._realtime_processing_worker, daemon=True).start()
 #             print("🎙️ Đã bắt đầu lắng nghe với VOSK ASR streaming")
 #         return False
+#     def stop_listening(self):
+#         """Dừng lắng nghe"""
+#         self.is_listening = False
+#         self.vosk_stream_active = False
+#         self.processing_active = False
+#         self.vad_processor.stop_stream()
+#         print("🛑 Đã dừng lắng nghe")
+#     def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+#         """Callback khi VAD phát hiện speech - FIXED VERSION"""
+#         if not self.vosk_stream_active or not self.is_listening:
+#             return
+#         try:
+#             # Cập nhật thời gian có giọng nói
+#             self.last_voice_time = time.time()
+#             print(f"🎯 VAD detected: {len(speech_audio)} samples, {sample_rate}Hz")
+#             # Xử lý real-time với VOSK
+#             result = self.vosk_asr.process_audio_chunk(speech_audio, sample_rate)
+#             self._handle_vosk_result(result)
+#         except Exception as e:
+#             print(f"❌ Lỗi trong speech detection: {e}")
+#     def _handle_vosk_result(self, result: Dict[str, Any]):
+#         """Xử lý kết quả từ VOSK - FIXED VERSION"""
+#         try:
+#             # Xử lý kết quả partial (real-time)
+#             if result['partial'] and len(result['partial']) > 1:
+#                 self.partial_transcription = result['partial']
+#                 print(f"🎯 VOSK Partial: '{result['partial']}'")
+#                 # Gửi partial result real-time
+#                 if self.current_callback:
+#                     self.current_callback({
+#                         'transcription': result['partial'],
+#                         'response': "",
+#                         'tts_audio': None,
+#                         'status': 'partial'
+#                     })
+#             # Xử lý kết quả final
+#             if result['is_final'] and result['text'] and len(result['text']) > 1:
+#                 print(f"✅ VOSK Final: '{result['text']}'")
+#                 # Đưa vào queue để xử lý phản hồi AI
+#                 try:
+#                     self.response_queue.put({
+#                         'transcription': result['text'],
+#                         'timestamp': time.time()
+#                     }, timeout=0.5)
+#                     print(f"📦 Đã đưa vào queue: '{result['text']}'")
+#                     # Cập nhật UI ngay lập tức
+#                     if self.current_callback:
+#                         self.current_callback({
+#                             'transcription': result['text'],
+#                             'response': "Đang xử lý...",
+#                             'tts_audio': None,
+#                             'status': 'processing'
+#                         })
+#                 except queue.Full:
+#                     print("⚠️ Queue đầy, bỏ qua transcription")
+#                 # Reset VOSK stream cho lần tiếp theo
+#                 self.vosk_asr.start_stream()
+#         except Exception as e:
+#             print(f"❌ Lỗi xử lý VOSK result: {e}")
+#     def _process_response_worker(self):
+#         """Worker xử lý phản hồi AI từ queue"""
+#         while self.processing_active:
+#             try:
+#                 # Lấy item từ queue với timeout
+#                 item = self.response_queue.get(timeout=1.0)
+#                 if item is None:  # Tín hiệu dừng
+#                     break
+#                 transcription = item['transcription']
+#                 start_time = item['timestamp']
+#                 print(f"🤖 Processing AI response for: '{transcription}'")
+#                 # Tạo phản hồi AI
+#                 response = self._generate_ai_response_optimized(transcription)
+#                 tts_audio_path = self._text_to_speech_optimized(response)
+#                 # Gửi kết quả về callback
+#                 if self.current_callback:
+#                     self.current_callback({
+#                         'transcription': transcription,
+#                         'response': response,
+#                         'tts_audio': tts_audio_path,
+#                         'status': 'completed'
+#                     })
+#                 # Đánh dấu task hoàn thành
+#                 self.response_queue.task_done()
+#             except queue.Empty:
+#                 continue
+#             except Exception as e:
+#                 print(f"❌ Lỗi trong response worker: {e}")
+#                 if self.current_callback:
+#                     self.current_callback({
+#                         'transcription': "Lỗi xử lý",
+#                         'response': f"Xin lỗi, có lỗi xảy ra: {str(e)}",
+#                         'tts_audio': None,
+#                         'status': 'error'
+#                     })
+#     def _realtime_processing_worker(self):
+#         """Worker xử lý real-time để theo dõi timeout"""
+#         while self.processing_active:
 #             try:
 #                 current_time = time.time()
 #                 silence_duration = current_time - self.last_voice_time
+#                 # Xử lý timeout nếu im lặng quá lâu và có partial text
+#                 if (silence_duration > self.silence_timeout and
+#                     self.partial_transcription and
+#                     len(self.partial_transcription) > 2):
 #                     print(f"⏰ Silence timeout, xử lý: '{self.partial_transcription}'")
+#                     # Xử lý partial text như final
+#                     try:
+#                         self.response_queue.put({
+#                             'transcription': self.partial_transcription,
+#                             'timestamp': time.time()
+#                         }, timeout=0.5)
+#                     except queue.Full:
+#                         print("⚠️ Queue đầy, bỏ qua timeout transcription")
+#                     # Reset
 #                     self.partial_transcription = ""
 #                     self.vosk_asr.start_stream()
+#                 time.sleep(0.1)  # Giảm CPU usage
 #             except Exception as e:
+#                 print(f"❌ Lỗi real-time worker: {e}")
+#                 time.sleep(0.5)
 #     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+#         """Xử lý audio streaming manual mode với VOSK - FIXED VOLUME VERSION"""
 #         if not audio_data:
 #             return self._create_error_response("❌ Không có dữ liệu âm thanh")
 #         try:
 #             sample_rate, audio_array = audio_data
+#             print(f"🎤 Manual audio: {len(audio_array)} samples, {sample_rate}Hz, Max: {np.max(audio_array)}")
+#             # FIXED: Tăng cường âm lượng trước khi xử lý
+#             audio_array = self._boost_input_volume(audio_array)
+#             # Kiểm tra âm lượng với ngưỡng thấp hơn
 #             if isinstance(audio_array, np.ndarray):
 #                 if audio_array.dtype in [np.float32, np.float64]:
 #                     audio_rms = np.sqrt(np.mean(audio_array**2))
+#                 else:
+#                     audio_rms = np.sqrt(np.mean(audio_array.astype(np.float32)**2)) / 32768.0
+#                 print(f"📊 Manual audio RMS: {audio_rms:.6f}, Max: {np.max(audio_array)}")
+#                 # FIXED: Giảm ngưỡng âm lượng
+#                 if audio_rms < 0.001:  # Giảm từ 0.01 xuống 0.001
+#                     return {
+#                         'transcription': f"Âm thanh quá nhỏ (RMS: {audio_rms:.6f}), hãy nói to hơn hoặc điều chỉnh microphone",
+#                         'response': "",
+#                         'tts_audio': None,
+#                         'status': 'listening'
+#                     }
 #             # Khởi động VOSK stream tạm thời
 #             if not self.vosk_asr.start_stream():
 #                 }
 #             else:
 #                 return {
+#                     'transcription': "Đang nghe... Hãy nói rõ hơn và gần microphone",
 #                     'response': "",
 #                     'tts_audio': None,
 #                     'status': 'listening'
 #             print(f"❌ Lỗi xử lý streaming audio: {e}")
 #             traceback.print_exc()
 #             return self._create_error_response(f"❌ Lỗi: {str(e)}")
+#     def _boost_input_volume(self, audio_array: np.ndarray, boost_factor: float = 10.0) -> np.ndarray:
+#         """Tăng cường âm lượng input audio"""
+#         try:
+#             if audio_array.dtype in [np.float32, np.float64]:
+#                 # Audio đã ở dạng float
+#                 boosted = audio_array * boost_factor
+#                 boosted = np.clip(boosted, -1.0, 1.0)
+#             else:
+#                 # Audio ở dạng int
+#                 boosted = audio_array.astype(np.float32) * boost_factor
+#                 max_val = np.iinfo(audio_array.dtype).max
+#                 boosted = np.clip(boosted, -max_val, max_val).astype(audio_array.dtype)
+#             print(f"🔊 Input volume boosted: {boost_factor}x")
+#             return boosted
+#         except Exception as e:
+#             print(f"⚠️ Lỗi boost input volume: {e}")
+#             return audio_array
+#     def _generate_ai_response_optimized(self, transcription: str) -> str:
+#         """Tạo phản hồi AI tối ưu hóa"""
+#         try:
+#             # Thêm vào lịch sử hội thoại
+#             self.conversation_history.append({"role": "user", "content": transcription})
+#             # Giới hạn lịch sử hội thoại
+#             if len(self.conversation_history) > 10:
+#                 self.conversation_history = self.conversation_history[-10:]
+#             # Tạo prompt
+#             messages = [
+#                 {"role": "system", "content": "Bạn là trợ lý AI hữu ích. Hãy trả lời ngắn gọn, tự nhiên bằng tiếng Việt."},
+#                 *self.conversation_history
+#             ]
+#             # Gọi Groq API
+#             response = self.client.chat.completions.create(
+#                 model="llama-3.1-8b-instant",
+#                 messages=messages,
+#                 max_tokens=150,
+#                 temperature=0.7
+#             )
+#             ai_response = response.choices[0].message.content.strip()
+#             # Thêm vào lịch sử
+#             self.conversation_history.append({"role": "assistant", "content": ai_response})
+#             return ai_response
+#         except Exception as e:
+#             print(f"❌ Lỗi tạo phản hồi AI: {e}")
+#             return "Xin lỗi, tôi không thể xử lý yêu cầu ngay lúc này."
+#     def _text_to_speech_optimized(self, text: str) -> Optional[str]:
+#         """Chuyển văn bản thành giọng nói tối ưu hóa"""
+#         try:
+#             if not text or len(text.strip()) == 0:
+#                 return None
+#             # Sử dụng TTS service
+#             audio_path = self.tts_service.text_to_speech(
+#                 text=text,
+#                 language='vi',
+#                 speed=1.0
+#             )
+#             return audio_path
+#         except Exception as e:
+#             print(f"❌ Lỗi TTS: {e}")
+#             return None
 #     def _create_error_response(self, message: str) -> Dict[str, Any]:
 #         """Tạo response lỗi chuẩn"""
 #             'status': 'error'
 #         }
+#     def get_conversation_state(self) -> dict:
+#         """Lấy trạng thái hội thoại"""
+#         return {
+#             'is_listening': self.is_listening,
+#             'is_processing': self.is_processing,
+#             'history_length': len(self.conversation_history),
+#             'current_transcription': self.current_transcription,
+#             'partial_transcription': self.partial_transcription,
+#             'queue_size': self.response_queue.qsize(),
+#             'worker_threads': len([t for t in self.processing_threads if t.is_alive()]),
+#             'vosk_active': self.vosk_stream_active,
+#             'last_voice_time': time.strftime("%H:%M:%S", time.localtime(self.last_voice_time)),
+#             'last_update': time.strftime("%H:%M:%S")
+#         }
+#     def clear_conversation(self):
+#         """Xóa lịch sử hội thoại"""
+#         self.conversation_history = []
+#         self.current_transcription = ""
+#         self.partial_transcription = ""
+#         print("🗑️ Đã xóa lịch sử hội thoại")
 #     def get_latency_stats(self) -> dict:
 #         """Lấy thống kê latency"""
 #                     'avg': 0, 'min': 0, 'max': 0, 'count': 0,
 #                     'recent_avg': 0, 'recent_min': 0, 'recent_max': 0
 #                 }
+#         return stats