Spaces:

datbkpro
/

voicebot

Sleeping

App Files Community

datbkpro committed on Nov 13, 2025

Commit

835fc53

verified ·

1 Parent(s): d102321

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +56 -34

services/streaming_voice_service.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import io
 import numpy as np
 import soundfile as sf
@@ -13,7 +12,6 @@ import zipfile
 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
-from config.settings import settings
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
@@ -193,30 +191,15 @@ class StreamingVoiceService:
         self.is_listening = False
         self.current_callback = None
-        # Latency tracking - FIXED
         self.latency_metrics = {
             'asr': [], 'llm': [], 'tts': [], 'total': []
         }
-        self.last_processing_time = 0
-    def start_listening(self, speech_callback: Callable) -> bool:
-        """Bắt đầu lắng nghe"""
-        if self.is_listening:
-            return False
-        self.current_callback = speech_callback
-        if self.vosk_asr.model is None:
-            print("❌ VOSK model không khả dụng")
-            return False
-        if not self.vosk_asr.start_stream():
-            print("❌ Không thể khởi động VOSK stream")
-            return False
-        self.is_listening = True
-        print("🎙️ Đã bắt đầu lắng nghe với VOSK")
-        return True
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming - FIXED LATENCY TRACKING"""
@@ -239,7 +222,7 @@ class StreamingVoiceService:
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
             asr_time = time.time() - asr_start_time
-            # Cập nhật latency metrics
             if 'processing_time' in result and result['processing_time'] > 0:
                 self.latency_metrics['asr'].append(result['processing_time'])
             else:
@@ -252,6 +235,7 @@ class StreamingVoiceService:
             # LUÔN trả về text để hiển thị real-time
             if result['partial']:
                 return {
                     'transcription': result['partial'],
                     'response': "",
@@ -260,6 +244,8 @@ class StreamingVoiceService:
                 }
             elif result['is_final'] and result['text']:
                 # Có kết quả cuối - tạo phản hồi AI với latency tracking
                 print(f"📝 Final transcription: '{result['text']}'")
                 llm_start_time = time.time()
@@ -294,9 +280,16 @@ class StreamingVoiceService:
     def _generate_ai_response(self, transcription: str) -> str:
         """Tạo phản hồi AI"""
         try:
             messages = [
                 {"role": "system", "content": "Bạn là trợ lý AI. Trả lời ngắn gọn bằng tiếng Việt."},
-                {"role": "user", "content": transcription}
             ]
             response = self.client.chat.completions.create(
@@ -306,7 +299,12 @@ class StreamingVoiceService:
                 temperature=0.7
             )
-            return response.choices[0].message.content.strip()
         except Exception as e:
             print(f"❌ Lỗi AI: {e}")
@@ -340,6 +338,23 @@ class StreamingVoiceService:
             'status': 'error'
         }
     def stop_listening(self):
         """Dừng lắng nghe"""
         self.is_listening = False
@@ -349,13 +364,20 @@ class StreamingVoiceService:
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
             'is_listening': self.is_listening,
-            'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False
         }
     def get_latency_stats(self) -> dict:
@@ -363,18 +385,18 @@ class StreamingVoiceService:
         stats = {}
         for component, latencies in self.latency_metrics.items():
             if latencies and len(latencies) > 0:
-                # Lấy 10 giá trị gần nhất
-                recent_latencies = latencies[-10:] if len(latencies) > 10 else latencies
                 stats[component] = {
-                    'avg': sum(recent_latencies) / len(recent_latencies),
-                    'min': min(recent_latencies),
-                    'max': max(recent_latencies),
                     'count': len(recent_latencies),
-                    'recent_values': [f"{x:.3f}s" for x in recent_latencies]
                 }
             else:
                 stats[component] = {
-                    'avg': 0, 'min': 0, 'max': 0, 'count': 0, 'recent_values': []
                 }
         print(f"📊 Latency stats: {stats}")

 import io
 import numpy as np
 import soundfile as sf
 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
         self.is_listening = False
         self.current_callback = None
+        # Conversation context
+        self.conversation_history = []
+        self.current_transcription = ""
+        self.partial_transcription = ""
+        # Latency tracking - FIXED: Đơn giản hoá
         self.latency_metrics = {
             'asr': [], 'llm': [], 'tts': [], 'total': []
         }
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
         """Xử lý audio streaming - FIXED LATENCY TRACKING"""
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
             asr_time = time.time() - asr_start_time
+            # Cập nhật latency metrics - FIXED
             if 'processing_time' in result and result['processing_time'] > 0:
                 self.latency_metrics['asr'].append(result['processing_time'])
             else:
             # LUÔN trả về text để hiển thị real-time
             if result['partial']:
+                self.partial_transcription = result['partial']
                 return {
                     'transcription': result['partial'],
                     'response': "",
                 }
             elif result['is_final'] and result['text']:
                 # Có kết quả cuối - tạo phản hồi AI với latency tracking
+                self.current_transcription = result['text']
+                self.partial_transcription = ""
                 print(f"📝 Final transcription: '{result['text']}'")
                 llm_start_time = time.time()
     def _generate_ai_response(self, transcription: str) -> str:
         """Tạo phản hồi AI"""
         try:
+            # Thêm vào lịch sử hội thoại
+            self.conversation_history.append({"role": "user", "content": transcription})
+            # Giới hạn lịch sử hội thoại
+            if len(self.conversation_history) > 10:
+                self.conversation_history = self.conversation_history[-10:]
             messages = [
                 {"role": "system", "content": "Bạn là trợ lý AI. Trả lời ngắn gọn bằng tiếng Việt."},
+                *self.conversation_history
             ]
             response = self.client.chat.completions.create(
                 temperature=0.7
             )
+            ai_response = response.choices[0].message.content.strip()
+            # Thêm vào lịch sử
+            self.conversation_history.append({"role": "assistant", "content": ai_response})
+            return ai_response
         except Exception as e:
             print(f"❌ Lỗi AI: {e}")
             'status': 'error'
         }
+    def start_listening(self, speech_callback: Callable) -> bool:
+        """Bắt đầu lắng nghe - ĐƠN GIẢN HOÁ"""
+        if self.is_listening:
+            return False
+        self.current_callback = speech_callback
+        if self.vosk_asr.model is None:
+            return False
+        if not self.vosk_asr.start_stream():
+            return False
+        self.is_listening = True
+        print("🎙️ Đã bắt đầu lắng nghe với VOSK")
+        return True
     def stop_listening(self):
         """Dừng lắng nghe"""
         self.is_listening = False
     def clear_conversation(self):
         """Xóa lịch sử hội thoại"""
+        self.conversation_history = []
+        self.current_transcription = ""
+        self.partial_transcription = ""
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
             'is_listening': self.is_listening,
+            'history_length': len(self.conversation_history),
+            'current_transcription': self.current_transcription,
+            'partial_transcription': self.partial_transcription,
+            'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False,
+            'last_update': time.strftime("%H:%M:%S")
         }
     def get_latency_stats(self) -> dict:
         stats = {}
         for component, latencies in self.latency_metrics.items():
             if latencies and len(latencies) > 0:
+                # Lấy 5 giá trị gần nhất
+                recent_latencies = latencies[-5:] if len(latencies) > 5 else latencies
                 stats[component] = {
+                    'avg': f"{sum(recent_latencies) / len(recent_latencies):.3f}s",
+                    'min': f"{min(recent_latencies):.3f}s",
+                    'max': f"{max(recent_latencies):.3f}s",
                     'count': len(recent_latencies),
+                    'recent': [f"{x:.3f}s" for x in recent_latencies]
                 }
             else:
                 stats[component] = {
+                    'avg': "0.000s", 'min': "0.000s", 'max': "0.000s", 'count': 0, 'recent': []
                 }
         print(f"📊 Latency stats: {stats}")