Spaces:

datbkpro
/

voicebot

Sleeping

App Files Community

datbkpro committed on Nov 13, 2025

Commit

19ac002

verified ·

1 Parent(s): 835fc53

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +205 -60

services/streaming_voice_service.py CHANGED Viewed

@@ -12,6 +12,7 @@ import zipfile
 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
@@ -188,21 +189,181 @@ class StreamingVoiceService:
         # Khởi tạo VOSK ASR
         print("🔄 Đang khởi tạo VOSK ASR...")
         self.vosk_asr = VoskStreamingASR()
         self.is_listening = False
-        self.current_callback = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
         self.partial_transcription = ""
-        # Latency tracking - FIXED: Đơn giản hoá
         self.latency_metrics = {
             'asr': [], 'llm': [], 'tts': [], 'total': []
         }
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming - FIXED LATENCY TRACKING"""
         if not audio_data:
             return self._create_error_response("❌ Không có dữ liệu âm thanh")
@@ -211,19 +372,21 @@ class StreamingVoiceService:
         try:
             sample_rate, audio_array = audio_data
-            print(f"🎤 Nhận audio: {len(audio_array)} samples, {sample_rate}Hz")
-            # Đảm bảo VOSK stream đang chạy
             if not self.vosk_asr.is_streaming:
                 self.vosk_asr.start_stream()
-            # Xử lý với VOSK - với latency tracking
             asr_start_time = time.time()
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
             asr_time = time.time() - asr_start_time
-            # Cập nhật latency metrics - FIXED
-            if 'processing_time' in result and result['processing_time'] > 0:
                 self.latency_metrics['asr'].append(result['processing_time'])
             else:
                 self.latency_metrics['asr'].append(asr_time)
@@ -233,8 +396,35 @@ class StreamingVoiceService:
             print(f"⏱️ ASR time: {asr_time:.3f}s, Total: {total_time:.3f}s")
-            # LUÔN trả về text để hiển thị real-time
-            if result['partial']:
                 self.partial_transcription = result['partial']
                 return {
                     'transcription': result['partial'],
@@ -242,29 +432,6 @@ class StreamingVoiceService:
                     'tts_audio': None,
                     'status': 'listening'
                 }
-            elif result['is_final'] and result['text']:
-                # Có kết quả cuối - tạo phản hồi AI với latency tracking
-                self.current_transcription = result['text']
-                self.partial_transcription = ""
-                print(f"📝 Final transcription: '{result['text']}'")
-                llm_start_time = time.time()
-                response = self._generate_ai_response(result['text'])
-                llm_time = time.time() - llm_start_time
-                self.latency_metrics['llm'].append(llm_time)
-                tts_start_time = time.time()
-                tts_audio_path = self._text_to_speech(response)
-                tts_time = time.time() - tts_start_time
-                if tts_time > 0:
-                    self.latency_metrics['tts'].append(tts_time)
-                return {
-                    'transcription': result['text'],
-                    'response': response,
-                    'tts_audio': tts_audio_path,
-                    'status': 'completed'
-                }
             else:
                 return {
                     'transcription': "🎤 Đang nghe... tiếp tục nói",
@@ -280,10 +447,8 @@ class StreamingVoiceService:
     def _generate_ai_response(self, transcription: str) -> str:
         """Tạo phản hồi AI"""
         try:
-            # Thêm vào lịch sử hội thoại
             self.conversation_history.append({"role": "user", "content": transcription})
-            # Giới hạn lịch sử hội thoại
             if len(self.conversation_history) > 10:
                 self.conversation_history = self.conversation_history[-10:]
@@ -301,7 +466,6 @@ class StreamingVoiceService:
             ai_response = response.choices[0].message.content.strip()
-            # Thêm vào lịch sử
             self.conversation_history.append({"role": "assistant", "content": ai_response})
             return ai_response
@@ -316,7 +480,6 @@ class StreamingVoiceService:
             if not text:
                 return None
-            # Sử dụng TTS service
             audio_path = self.tts_service.text_to_speech(
                 text=text,
                 language='vi',
@@ -330,7 +493,6 @@ class StreamingVoiceService:
             return None
     def _create_error_response(self, message: str) -> Dict[str, Any]:
-        """Tạo response lỗi"""
         return {
             'transcription': message,
             'response': "Vui lòng thử lại",
@@ -338,26 +500,11 @@ class StreamingVoiceService:
             'status': 'error'
         }
-    def start_listening(self, speech_callback: Callable) -> bool:
-        """Bắt đầu lắng nghe - ĐƠN GIẢN HOÁ"""
-        if self.is_listening:
-            return False
-        self.current_callback = speech_callback
-        if self.vosk_asr.model is None:
-            return False
-        if not self.vosk_asr.start_stream():
-            return False
-        self.is_listening = True
-        print("🎙️ Đã bắt đầu lắng nghe với VOSK")
-        return True
     def stop_listening(self):
         """Dừng lắng nghe"""
         self.is_listening = False
         if self.vosk_asr:
             self.vosk_asr.stop_stream()
         print("🛑 Đã dừng lắng nghe")
@@ -370,26 +517,25 @@ class StreamingVoiceService:
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
-        """Lấy trạng thái hội thoại"""
         return {
             'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'partial_transcription': self.partial_transcription,
             'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False,
             'last_update': time.strftime("%H:%M:%S")
         }
     def get_latency_stats(self) -> dict:
-        """Lấy thống kê latency - FIXED VERSION"""
         stats = {}
         for component, latencies in self.latency_metrics.items():
             if latencies and len(latencies) > 0:
-                # Lấy 5 giá trị gần nhất
                 recent_latencies = latencies[-5:] if len(latencies) > 5 else latencies
                 stats[component] = {
                     'avg': f"{sum(recent_latencies) / len(recent_latencies):.3f}s",
-                    'min': f"{min(recent_latencies):.3f}s",
                     'max': f"{max(recent_latencies):.3f}s",
                     'count': len(recent_latencies),
                     'recent': [f"{x:.3f}s" for x in recent_latencies]
@@ -399,7 +545,6 @@ class StreamingVoiceService:
                     'avg': "0.000s", 'min': "0.000s", 'max': "0.000s", 'count': 0, 'recent': []
                 }
-        print(f"📊 Latency stats: {stats}")
         return stats
 # import io
 # import numpy as np

 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
+from core.silero_vad import SileroVAD
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
         # Khởi tạo VOSK ASR
         print("🔄 Đang khởi tạo VOSK ASR...")
         self.vosk_asr = VoskStreamingASR()
+        # Khởi tạo VAD - SỬ DỤNG SILERO VAD CỦA MÀY
+        self.vad_processor = SileroVAD()
         self.is_listening = False
+        self.speech_callback = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
         self.partial_transcription = ""
+        # Response queue cho xử lý bất đồng bộ
+        self.response_queue = queue.Queue()
+        self.processing_active = False
+        # Latency tracking
         self.latency_metrics = {
             'asr': [], 'llm': [], 'tts': [], 'total': []
         }
+    def start_listening(self, speech_callback: Callable) -> bool:
+        """Bắt đầu lắng nghe với Silero VAD"""
+        if self.is_listening:
+            print("⚠️ Đã đang lắng nghe")
+            return False
+        self.speech_callback = speech_callback
+        # Kiểm tra VOSK model
+        if self.vosk_asr.model is None:
+            print("❌ VOSK model không khả dụng")
+            return False
+        # Khởi động VOSK stream
+        if not self.vosk_asr.start_stream():
+            print("❌ Không thể khởi động VOSK stream")
+            return False
+        # Khởi động VAD với callback
+        success = self.vad_processor.start_stream(self._on_speech_detected)
+        if success:
+            self.is_listening = True
+            self.processing_active = True
+            # Khởi động worker thread cho xử lý AI response
+            worker_thread = threading.Thread(
+                target=self._process_response_worker,
+                daemon=True,
+                name="AI-Response-Worker"
+            )
+            worker_thread.start()
+            print("🎙️ Đã bắt đầu lắng nghe với Silero VAD")
+            # Thông báo trạng thái
+            if self.speech_callback:
+                self.speech_callback({
+                    'transcription': "Đã bắt đầu lắng nghe... Hãy nói gì đó",
+                    'response': "",
+                    'tts_audio': None,
+                    'status': 'listening'
+                })
+            return True
+        return False
+    def _on_speech_detected(self, speech_audio: np.ndarray, sample_rate: int):
+        """Callback khi Silero VAD phát hiện speech - FIXED VERSION"""
+        if not self.is_listening:
+            return
+        try:
+            print(f"🎯 Silero VAD detected speech: {len(speech_audio)} samples")
+            # Đảm bảo VOSK stream đang chạy
+            if not self.vosk_asr.is_streaming:
+                self.vosk_asr.start_stream()
+            # Xử lý audio với VOSK
+            result = self.vosk_asr.process_audio_chunk(speech_audio, sample_rate)
+            # Xử lý kết quả
+            if result['is_final'] and result['text']:
+                print(f"✅ VOSK Final from VAD: '{result['text']}'")
+                # Đưa vào queue để xử lý AI response
+                try:
+                    self.response_queue.put({
+                        'transcription': result['text'],
+                        'timestamp': time.time(),
+                        'source': 'vad'
+                    }, timeout=0.5)
+                    # Cập nhật UI ngay lập tức
+                    if self.speech_callback:
+                        self.speech_callback({
+                            'transcription': result['text'],
+                            'response': "Đang xử lý...",
+                            'tts_audio': None,
+                            'status': 'processing'
+                        })
+                except queue.Full:
+                    print("⚠️ Queue đầy, bỏ qua transcription")
+                # Reset VOSK stream cho lần tiếp theo
+                self.vosk_asr.start_stream()
+            elif result['partial']:
+                # Hiển thị partial text real-time
+                if self.speech_callback:
+                    self.speech_callback({
+                        'transcription': result['partial'],
+                        'response': "",
+                        'tts_audio': None,
+                        'status': 'partial'
+                    })
+        except Exception as e:
+            print(f"❌ Lỗi trong VAD speech detection: {e}")
+    def _process_response_worker(self):
+        """Worker xử lý phản hồi AI từ queue"""
+        while self.processing_active:
+            try:
+                # Lấy item từ queue với timeout
+                item = self.response_queue.get(timeout=1.0)
+                if item is None:  # Tín hiệu dừng
+                    break
+                transcription = item['transcription']
+                start_time = item['timestamp']
+                print(f"🤖 Processing AI response for: '{transcription}'")
+                # Tạo phản hồi AI với latency tracking
+                llm_start_time = time.time()
+                response = self._generate_ai_response(transcription)
+                llm_time = time.time() - llm_start_time
+                self.latency_metrics['llm'].append(llm_time)
+                tts_start_time = time.time()
+                tts_audio_path = self._text_to_speech(response)
+                tts_time = time.time() - tts_start_time
+                if tts_time > 0:
+                    self.latency_metrics['tts'].append(tts_time)
+                # Gửi kết quả về callback
+                if self.speech_callback:
+                    self.speech_callback({
+                        'transcription': transcription,
+                        'response': response,
+                        'tts_audio': tts_audio_path,
+                        'status': 'completed'
+                    })
+                # Đánh dấu task hoàn thành
+                self.response_queue.task_done()
+            except queue.Empty:
+                continue
+            except Exception as e:
+                print(f"❌ Lỗi trong response worker: {e}")
+                if self.speech_callback:
+                    self.speech_callback({
+                        'transcription': "Lỗi xử lý",
+                        'response': f"Xin lỗi, có lỗi xảy ra: {str(e)}",
+                        'tts_audio': None,
+                        'status': 'error'
+                    })
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming manual mode"""
         if not audio_data:
             return self._create_error_response("❌ Không có dữ liệu âm thanh")
         try:
             sample_rate, audio_array = audio_data
+            print(f"🎤 Manual audio: {len(audio_array)} samples, {sample_rate}Hz")
+            # Đưa audio vào VAD để xử lý (cho manual mode)
+            self.vad_processor.process_stream(audio_array, sample_rate)
+            # Đồng thời xử lý trực tiếp với VOSK để có kết quả real-time
             if not self.vosk_asr.is_streaming:
                 self.vosk_asr.start_stream()
             asr_start_time = time.time()
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
             asr_time = time.time() - asr_start_time
+            # Cập nhật latency
+            if 'processing_time' in result:
                 self.latency_metrics['asr'].append(result['processing_time'])
             else:
                 self.latency_metrics['asr'].append(asr_time)
             print(f"⏱️ ASR time: {asr_time:.3f}s, Total: {total_time:.3f}s")
+            # Xử lý kết quả
+            if result['is_final'] and result['text']:
+                self.current_transcription = result['text']
+                print(f"📝 Manual Final: '{result['text']}'")
+                # Đưa vào queue để xử lý AI response
+                try:
+                    self.response_queue.put({
+                        'transcription': result['text'],
+                        'timestamp': time.time(),
+                        'source': 'manual'
+                    }, timeout=0.5)
+                    return {
+                        'transcription': result['text'],
+                        'response': "Đang xử lý...",
+                        'tts_audio': None,
+                        'status': 'processing'
+                    }
+                except queue.Full:
+                    return {
+                        'transcription': result['text'],
+                        'response': "Hệ thống bận, vui lòng thử lại",
+                        'tts_audio': None,
+                        'status': 'completed'
+                    }
+            elif result['partial']:
                 self.partial_transcription = result['partial']
                 return {
                     'transcription': result['partial'],
                     'tts_audio': None,
                     'status': 'listening'
                 }
             else:
                 return {
                     'transcription': "🎤 Đang nghe... tiếp tục nói",
     def _generate_ai_response(self, transcription: str) -> str:
         """Tạo phản hồi AI"""
         try:
             self.conversation_history.append({"role": "user", "content": transcription})
             if len(self.conversation_history) > 10:
                 self.conversation_history = self.conversation_history[-10:]
             ai_response = response.choices[0].message.content.strip()
             self.conversation_history.append({"role": "assistant", "content": ai_response})
             return ai_response
             if not text:
                 return None
             audio_path = self.tts_service.text_to_speech(
                 text=text,
                 language='vi',
             return None
     def _create_error_response(self, message: str) -> Dict[str, Any]:
         return {
             'transcription': message,
             'response': "Vui lòng thử lại",
             'status': 'error'
         }
     def stop_listening(self):
         """Dừng lắng nghe"""
         self.is_listening = False
+        self.processing_active = False
+        self.vad_processor.stop_stream()
         if self.vosk_asr:
             self.vosk_asr.stop_stream()
         print("🛑 Đã dừng lắng nghe")
         print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         return {
             'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription,
             'partial_transcription': self.partial_transcription,
+            'queue_size': self.response_queue.qsize(),
             'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False,
             'last_update': time.strftime("%H:%M:%S")
         }
     def get_latency_stats(self) -> dict:
+        """Lấy thống kê latency"""
         stats = {}
         for component, latencies in self.latency_metrics.items():
             if latencies and len(latencies) > 0:
                 recent_latencies = latencies[-5:] if len(latencies) > 5 else latencies
                 stats[component] = {
                     'avg': f"{sum(recent_latencies) / len(recent_latencies):.3f}s",
+                    'min': f"{min(recent_latencies):.3f}s",
                     'max': f"{max(recent_latencies):.3f}s",
                     'count': len(recent_latencies),
                     'recent': [f"{x:.3f}s" for x in recent_latencies]
                     'avg': "0.000s", 'min': "0.000s", 'max': "0.000s", 'count': 0, 'recent': []
                 }
         return stats
 # import io
 # import numpy as np