Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Nov 4, 2025

Commit

4c41d4a

verified ·

1 Parent(s): be8cb43

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +93 -15

services/streaming_voice_service.py CHANGED Viewed

@@ -39,6 +39,15 @@ class StreamingVoiceService:
         self.response_queue = queue.Queue()
         self.current_task = None
     def start_listening(self, speech_callback: Callable) -> bool:
         """Bắt đầu lắng nghe với VAD tối ưu"""
         if self.is_listening:
@@ -99,11 +108,13 @@ class StreamingVoiceService:
         with self.processing_lock:
             self.is_processing = True
         try:
             # Chuyển đổi speech thành text
             transcription = self._transcribe_audio(speech_audio, sample_rate)
             if not transcription or len(transcription.strip()) < 2:
                 print("⚠️ Transcription quá ngắn hoặc trống")
                 return
@@ -112,11 +123,21 @@ class StreamingVoiceService:
             self.current_transcription = transcription
             # Tạo phản hồi AI
             response = self._generate_ai_response(transcription)
             # Tạo TTS
             tts_audio_path = self._text_to_speech(response)
             # Gửi kết quả đến callback
             if self.speech_callback:
                 self.speech_callback({
@@ -132,7 +153,7 @@ class StreamingVoiceService:
         finally:
             with self.processing_lock:
                 self.is_processing = False
-    # THÊM LẠI PHƯƠNG THỨC process_streaming_audio ĐÃ BỊ THIẾU
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming (phương thức cũ cho compatibility với Gradio)"""
         if not audio_data:
@@ -284,6 +305,7 @@ class StreamingVoiceService:
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
         """Chuyển audio -> text với xử lý cải tiến"""
         try:
             # Đảm bảo kiểu dữ liệu và chuẩn hóa
             if audio_data.dtype != np.int16:
@@ -303,13 +325,13 @@ class StreamingVoiceService:
                 sample_rate = target_sample_rate
             # Giới hạn độ dài audio
-            max_duration = 15  # giây
             max_samples = sample_rate * max_duration
             if len(audio_data) > max_samples:
                 audio_data = audio_data[:max_samples]
             # Đảm bảo audio đủ dài
-            min_duration = 0.8  # giây
             min_samples = int(sample_rate * min_duration)
             if len(audio_data) < min_samples:
                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
@@ -322,7 +344,9 @@ class StreamingVoiceService:
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
             # Gọi API Whisper
             try:
                 transcription = self.client.audio.transcriptions.create(
                     model=settings.WHISPER_MODEL,
@@ -334,7 +358,7 @@ class StreamingVoiceService:
             except Exception as e:
                 print(f"❌ Lỗi Whisper API: {e}")
                 return None
             # Xử lý response
             if hasattr(transcription, 'text'):
                 result = transcription.text.strip()
@@ -342,7 +366,8 @@ class StreamingVoiceService:
                 result = transcription.strip()
             else:
                 result = str(transcription).strip()
             print(f"✅ Transcription: '{result}'")
             return result
@@ -352,12 +377,15 @@ class StreamingVoiceService:
     def _generate_ai_response(self, user_input: str) -> str:
         """Sinh phản hồi AI với xử lý lỗi"""
         try:
             # Thêm vào lịch sử
             self.conversation_history.append({"role": "user", "content": user_input})
             # Tìm kiếm RAG
             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
@@ -371,19 +399,21 @@ Thông tin tham khảo:
             messages.extend(self.conversation_history[-6:])
             completion = self.client.chat.completions.create(
-                model="llama-3.1-8b-instant",
                 messages=messages,
-                max_tokens=150,
                 temperature=0.7
             )
             response = completion.choices[0].message.content
             self.conversation_history.append({"role": "assistant", "content": response})
             # Giới hạn lịch sử
             if len(self.conversation_history) > 12:
                 self.conversation_history = self.conversation_history[-12:]
             return response
         except Exception as e:
@@ -391,14 +421,18 @@ Thông tin tham khảo:
             return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
     def _text_to_speech(self, text: str) -> Optional[str]:
-        """Chuyển văn bản thành giọng nói"""
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
             if tts_bytes:
                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
                 print(f"✅ Đã tạo TTS: {audio_path}")
                 return audio_path
         except Exception as e:
@@ -421,7 +455,51 @@ Thông tin tham khảo:
         self.conversation_history = []
         self.current_transcription = ""
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {

         self.response_queue = queue.Queue()
         self.current_task = None
+        #Latency
+        self.latency_metrics = {
+            'asr': [],
+            'rag' : [],
+            'llm' : [],
+            'tts' : [],
+            'total' : []
+        }
     def start_listening(self, speech_callback: Callable) -> bool:
         """Bắt đầu lắng nghe với VAD tối ưu"""
         if self.is_listening:
         with self.processing_lock:
             self.is_processing = True
+        total_start_time = time.time()
         try:
             # Chuyển đổi speech thành text
+            # 1. ASR
+            asr_start = time.time()
             transcription = self._transcribe_audio(speech_audio, sample_rate)
+            asr_latency = time.time() - asr_start
             if not transcription or len(transcription.strip()) < 2:
                 print("⚠️ Transcription quá ngắn hoặc trống")
                 return
             self.current_transcription = transcription
             # Tạo phản hồi AI
+            rag_start = time.time()
             response = self._generate_ai_response(transcription)
+            rag_latency = time.time() - rag_start
             # Tạo TTS
+             tts_start = time.time()
             tts_audio_path = self._text_to_speech(response)
+            tts_latency = time.time() - tts_start
+             total_latency = time.time() - total_start_time
+            # Log latency metrics
+            self._log_latency_metrics({
+                'asr': asr_latency,
+                'rag': rag_latency,
+                'tts': tts_latency,
+                'total': total_latency
+            })
             # Gửi kết quả đến callback
             if self.speech_callback:
                 self.speech_callback({
         finally:
             with self.processing_lock:
                 self.is_processing = False
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming (phương thức cũ cho compatibility với Gradio)"""
         if not audio_data:
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
         """Chuyển audio -> text với xử lý cải tiến"""
+        asr_start = time.time()
         try:
             # Đảm bảo kiểu dữ liệu và chuẩn hóa
             if audio_data.dtype != np.int16:
                 sample_rate = target_sample_rate
             # Giới hạn độ dài audio
+            max_duration = 30  # giây
             max_samples = sample_rate * max_duration
             if len(audio_data) > max_samples:
                 audio_data = audio_data[:max_samples]
             # Đảm bảo audio đủ dài
+            min_duration = 2  # giây
             min_samples = int(sample_rate * min_duration)
             if len(audio_data) < min_samples:
                 padding = np.zeros(min_samples - len(audio_data), dtype=np.int16)
             sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
             # Gọi API Whisper
+            api_start = time.time()
             try:
                 transcription = self.client.audio.transcriptions.create(
                     model=settings.WHISPER_MODEL,
             except Exception as e:
                 print(f"❌ Lỗi Whisper API: {e}")
                 return None
+            api_latency = time.time() - api_start
             # Xử lý response
             if hasattr(transcription, 'text'):
                 result = transcription.text.strip()
                 result = transcription.strip()
             else:
                 result = str(transcription).strip()
+            total_asr_latency = time.time() - asr_start
+            print(f"✅ ASR Latency: {total_asr_latency:.2f}s (API: {api_latency:.2f}s)")
             print(f"✅ Transcription: '{result}'")
             return result
     def _generate_ai_response(self, user_input: str) -> str:
         """Sinh phản hồi AI với xử lý lỗi"""
+        llm_start = time.time()
         try:
             # Thêm vào lịch sử
             self.conversation_history.append({"role": "user", "content": user_input})
             # Tìm kiếm RAG
+            rag_start = time.time()
             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
+            rag_latency = time.time() - rag_start
             context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
             messages.extend(self.conversation_history[-6:])
             completion = self.client.chat.completions.create(
+                model=settings.MULTILINGUAL_LLM_MODEL,
                 messages=messages,
+                max_tokens=300,
                 temperature=0.7
             )
+            ttft = time.time() - llm_inference_start  # Time To First Token
             response = completion.choices[0].message.content
             self.conversation_history.append({"role": "assistant", "content": response})
+            total_llm_latency = time.time() - llm_start
             # Giới hạn lịch sử
             if len(self.conversation_history) > 12:
                 self.conversation_history = self.conversation_history[-12:]
+            print(f"✅ RAG Latency: {rag_latency:.2f}s")
+            print(f"✅ LLM TTFT: {ttft:.2f}s")
+            print(f"✅ Total LLM Latency: {total_llm_latency:.2f}s")
             return response
         except Exception as e:
             return "Xin lỗi, tôi gặp lỗi khi tạo phản hồi. Vui lòng thử lại."
     def _text_to_speech(self, text: str) -> Optional[str]:
+        """Chuyển văn bản thành giọng nói với latency tracking"""
+        tts_start = time.time()
         try:
             if not text or text.startswith("❌") or text.startswith("Xin lỗi"):
                 return None
             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
+            tts_latency = time.time() - tts_start
             if tts_bytes:
                 audio_path = self.tts_service.save_audio_to_file(tts_bytes)
+                print(f"✅ TTS Latency: {tts_latency:.2f}s")
                 print(f"✅ Đã tạo TTS: {audio_path}")
                 return audio_path
         except Exception as e:
         self.conversation_history = []
         self.current_transcription = ""
         print("🗑️ Đã xóa lịch sử hội thoại")
+    def _log_latency_metrics(self, latencies: dict) -> None:
+        """Record the latencies of one request and print a latency report.
+
+        Every value whose key already exists in ``self.latency_metrics`` is
+        appended to that component's history; keys that are not tracked there
+        are ignored. Each history is trimmed to its 100 most recent samples.
+        The per-request report is printed next, followed by the recent
+        averages via ``_print_average_latencies``.
+
+        Args:
+            latencies: Mapping of component name to latency in seconds. The
+                report below indexes 'asr', 'rag', 'tts' and 'total' directly,
+                so a mapping missing any of them raises ``KeyError``.
+        """
+        for key, value in latencies.items():
+            if key in self.latency_metrics:
+                self.latency_metrics[key].append(value)
+                # Keep only the 100 most recent samples
+                if len(self.latency_metrics[key]) > 100:
+                    self.latency_metrics[key] = self.latency_metrics[key][-100:]
+        # Print the summary for this request
+        print("📊 LATENCY REPORT:")
+        print(f"   ASR: {latencies['asr']:.2f}s")
+        print(f"   RAG: {latencies['rag']:.2f}s")
+        print(f"   TTS: {latencies['tts']:.2f}s")
+        print(f"   TOTAL: {latencies['total']:.2f}s")
+        # Compute and print the average latencies
+        self._print_average_latencies()
+    def _print_average_latencies(self) -> None:
+        """Print the average latency of each reported component.
+
+        Nothing is printed until at least one 'total' sample has been
+        recorded. For 'asr', 'rag', 'tts' and 'total', the mean of the last
+        10 recorded samples (or fewer, if fewer exist) is printed; a
+        component with no samples is skipped.
+        """
+        if len(self.latency_metrics['total']) > 0:
+            print("📈 AVERAGE LATENCIES (last 10 requests):")
+            for component in ['asr', 'rag', 'tts', 'total']:
+                recent_latencies = self.latency_metrics[component][-10:]
+                if recent_latencies:
+                    avg = sum(recent_latencies) / len(recent_latencies)
+                    print(f"   {component.upper()}: {avg:.2f}s")
+    def get_latency_stats(self) -> dict:
+        """Return summary latency statistics for every tracked component.
+
+        Returns:
+            A dict keyed by each component in ``self.latency_metrics``. Each
+            value is a dict with 'avg', 'min', 'max' and 'count' over the
+            stored samples, plus 'recent_avg' over the last 10 samples (or
+            fewer). A component with no samples gets zeros for every field.
+        """
+        stats = {}
+        for component, latencies in self.latency_metrics.items():
+            if latencies:
+                stats[component] = {
+                    'avg': sum(latencies) / len(latencies),
+                    'min': min(latencies),
+                    'max': max(latencies),
+                    'count': len(latencies),
+                    'recent_avg': sum(latencies[-10:]) / min(10, len(latencies)) if latencies else 0  # the else branch is unreachable here: latencies is already known to be non-empty
+                }
+            else:
+                stats[component] = {'avg': 0, 'min': 0, 'max': 0, 'count': 0, 'recent_avg': 0}
+        return stats
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {