Update services/streaming_voice_service.py
Browse files
services/streaming_voice_service.py
CHANGED
|
@@ -3,7 +3,7 @@ import numpy as np
|
|
| 3 |
import soundfile as sf
|
| 4 |
import threading
|
| 5 |
import time
|
| 6 |
-
import
|
| 7 |
from groq import Groq
|
| 8 |
from typing import Optional, Callable
|
| 9 |
from config.settings import settings
|
|
@@ -11,6 +11,7 @@ from core.speechbrain_vad import SpeechBrainVAD
|
|
| 11 |
from core.rag_system import EnhancedRAGSystem
|
| 12 |
from core.tts_service import EnhancedTTSService
|
| 13 |
|
|
|
|
| 14 |
class StreamingVoiceService:
|
| 15 |
def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
|
| 16 |
self.client = groq_client
|
|
@@ -21,96 +22,82 @@ class StreamingVoiceService:
|
|
| 21 |
# Streaming state
|
| 22 |
self.is_listening = False
|
| 23 |
self.audio_stream = None
|
| 24 |
-
self.pyaudio_instance = None
|
| 25 |
self.callback_handler = None
|
| 26 |
|
| 27 |
# Conversation context
|
| 28 |
self.conversation_history = []
|
| 29 |
self.current_transcription = ""
|
| 30 |
-
|
| 31 |
def start_listening(self, callback_handler: Callable):
|
| 32 |
-
"""Bắt đầu lắng nghe với
|
| 33 |
if self.is_listening:
|
| 34 |
return False
|
| 35 |
-
|
| 36 |
try:
|
| 37 |
self.callback_handler = callback_handler
|
| 38 |
self.is_listening = True
|
| 39 |
self.conversation_history = []
|
| 40 |
-
|
| 41 |
-
#
|
| 42 |
-
self.pyaudio_instance = pyaudio.PyAudio()
|
| 43 |
-
|
| 44 |
-
# Start audio stream
|
| 45 |
-
self.audio_stream = self.pyaudio_instance.open(
|
| 46 |
-
format=pyaudio.paInt16,
|
| 47 |
-
channels=1,
|
| 48 |
-
rate=settings.SAMPLE_RATE,
|
| 49 |
-
input=True,
|
| 50 |
-
frames_per_buffer=1024,
|
| 51 |
-
stream_callback=self._audio_callback
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
# Start VAD processing
|
| 55 |
self.vad_processor.start_stream(self._process_speech_segment)
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
| 58 |
return True
|
| 59 |
-
|
| 60 |
except Exception as e:
|
| 61 |
print(f"❌ Lỗi khởi động stream: {e}")
|
| 62 |
self.stop_listening()
|
| 63 |
return False
|
| 64 |
-
|
| 65 |
def stop_listening(self):
|
| 66 |
"""Dừng lắng nghe"""
|
| 67 |
self.is_listening = False
|
| 68 |
-
|
| 69 |
-
if self.audio_stream:
|
| 70 |
-
self.audio_stream.stop_stream()
|
| 71 |
-
self.audio_stream.close()
|
| 72 |
-
|
| 73 |
-
if self.pyaudio_instance:
|
| 74 |
-
self.pyaudio_instance.terminate()
|
| 75 |
-
|
| 76 |
self.vad_processor.stop_stream()
|
| 77 |
print("🛑 Đã dừng lắng nghe")
|
| 78 |
-
|
| 79 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""Callback xử lý audio input real-time"""
|
| 81 |
if status:
|
| 82 |
-
print(f"
|
| 83 |
-
|
| 84 |
if self.is_listening:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
# Process with VAD
|
| 90 |
-
self.vad_processor.process_stream(audio_float, settings.SAMPLE_RATE)
|
| 91 |
-
|
| 92 |
-
return (in_data, pyaudio.paContinue)
|
| 93 |
-
|
| 94 |
def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
|
| 95 |
-
"""Xử lý
|
| 96 |
if not self.is_listening or len(speech_audio) == 0:
|
| 97 |
return
|
| 98 |
-
|
| 99 |
print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
|
| 100 |
-
|
| 101 |
-
# Transcribe speech segment
|
| 102 |
transcription = self._transcribe_audio(speech_audio, sample_rate)
|
| 103 |
if transcription and len(transcription.strip()) > 0:
|
| 104 |
self.current_transcription = transcription
|
| 105 |
print(f"📝 Transcription: {transcription}")
|
| 106 |
-
|
| 107 |
-
# Generate AI response
|
| 108 |
response = self._generate_ai_response(transcription)
|
| 109 |
-
|
| 110 |
-
# Convert response to speech
|
| 111 |
tts_audio = self._text_to_speech(response)
|
| 112 |
-
|
| 113 |
-
# Call callback with results
|
| 114 |
if self.callback_handler:
|
| 115 |
self.callback_handler({
|
| 116 |
'transcription': transcription,
|
|
@@ -118,73 +105,62 @@ class StreamingVoiceService:
|
|
| 118 |
'tts_audio': tts_audio,
|
| 119 |
'speech_audio': speech_audio
|
| 120 |
})
|
| 121 |
-
|
| 122 |
def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
|
| 123 |
-
"""Chuyển
|
| 124 |
try:
|
| 125 |
-
# Convert numpy array to bytes buffer
|
| 126 |
buffer = io.BytesIO()
|
| 127 |
sf.write(buffer, audio_data, sample_rate, format='wav')
|
| 128 |
buffer.seek(0)
|
| 129 |
-
|
| 130 |
-
# Transcribe with Whisper
|
| 131 |
transcription = self.client.audio.transcriptions.create(
|
| 132 |
model=settings.WHISPER_MODEL,
|
| 133 |
file=("speech.wav", buffer.read()),
|
| 134 |
response_format="text",
|
| 135 |
-
language="vi"
|
| 136 |
)
|
| 137 |
-
|
| 138 |
return transcription.strip()
|
| 139 |
-
|
| 140 |
except Exception as e:
|
| 141 |
print(f"❌ Lỗi transcription: {e}")
|
| 142 |
return None
|
| 143 |
-
|
| 144 |
def _generate_ai_response(self, user_input: str) -> str:
|
| 145 |
-
"""
|
| 146 |
try:
|
| 147 |
-
# Add to conversation history
|
| 148 |
self.conversation_history.append({"role": "user", "content": user_input})
|
| 149 |
-
|
| 150 |
-
# Semantic search với RAG
|
| 151 |
rag_results = self.rag_system.semantic_search(user_input, top_k=2)
|
| 152 |
context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
|
| 157 |
Thông tin tham khảo:
|
| 158 |
{context_text}
|
|
|
|
| 159 |
|
| 160 |
-
Hãy giữ câu trả lời ngắn gọn và tự nhiên như đang trò chuyện."""
|
| 161 |
-
|
| 162 |
messages = [{"role": "system", "content": system_prompt}]
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
recent_history = self.conversation_history[-6:] # Last 3 user-assistant pairs
|
| 166 |
-
messages.extend(recent_history)
|
| 167 |
-
|
| 168 |
-
# Generate response
|
| 169 |
completion = self.client.chat.completions.create(
|
| 170 |
model=settings.LLM_MODEL,
|
| 171 |
messages=messages,
|
| 172 |
-
max_tokens=150,
|
| 173 |
temperature=0.7
|
| 174 |
)
|
| 175 |
-
|
| 176 |
response = completion.choices[0].message.content
|
| 177 |
self.conversation_history.append({"role": "assistant", "content": response})
|
| 178 |
-
|
| 179 |
-
# Keep conversation history manageable
|
| 180 |
if len(self.conversation_history) > 10:
|
| 181 |
self.conversation_history = self.conversation_history[-10:]
|
| 182 |
-
|
| 183 |
return response
|
| 184 |
-
|
| 185 |
except Exception as e:
|
| 186 |
return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
|
| 187 |
-
|
| 188 |
def _text_to_speech(self, text: str) -> Optional[str]:
|
| 189 |
"""Chuyển văn bản thành giọng nói"""
|
| 190 |
try:
|
|
@@ -193,13 +169,12 @@ Hãy giữ câu trả lời ngắn gọn và tự nhiên như đang trò chuyệ
|
|
| 193 |
return self.tts_service.save_audio_to_file(tts_bytes)
|
| 194 |
except Exception as e:
|
| 195 |
print(f"❌ Lỗi TTS: {e}")
|
| 196 |
-
|
| 197 |
return None
|
| 198 |
-
|
| 199 |
def get_conversation_state(self) -> dict:
|
| 200 |
-
"""Lấy trạng thái
|
| 201 |
return {
|
| 202 |
'is_listening': self.is_listening,
|
| 203 |
'history_length': len(self.conversation_history),
|
| 204 |
'current_transcription': self.current_transcription
|
| 205 |
-
}
|
|
|
|
| 3 |
import soundfile as sf
|
| 4 |
import threading
|
| 5 |
import time
|
| 6 |
+
import sounddevice as sd
|
| 7 |
from groq import Groq
|
| 8 |
from typing import Optional, Callable
|
| 9 |
from config.settings import settings
|
|
|
|
| 11 |
from core.rag_system import EnhancedRAGSystem
|
| 12 |
from core.tts_service import EnhancedTTSService
|
| 13 |
|
| 14 |
+
|
| 15 |
class StreamingVoiceService:
|
| 16 |
def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
|
| 17 |
self.client = groq_client
|
|
|
|
| 22 |
# Streaming state
|
| 23 |
self.is_listening = False
|
| 24 |
self.audio_stream = None
|
|
|
|
| 25 |
self.callback_handler = None
|
| 26 |
|
| 27 |
# Conversation context
|
| 28 |
self.conversation_history = []
|
| 29 |
self.current_transcription = ""
|
| 30 |
+
|
| 31 |
def start_listening(self, callback_handler: Callable):
    """Begin streaming capture: arm the VAD pipeline and spawn the mic thread.

    Returns True on success, False when already listening or startup fails.
    """
    # Guard clause: starting twice is a no-op failure.
    if self.is_listening:
        return False
    try:
        self.callback_handler = callback_handler
        self.is_listening = True
        self.conversation_history = []

        # Route detected speech segments into the processing pipeline.
        self.vad_processor.start_stream(self._process_speech_segment)

        # Capture runs on a daemon thread so this call returns immediately.
        listener = threading.Thread(target=self._listen_loop, daemon=True)
        listener.start()

        print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
        return True
    except Exception as e:
        print(f"❌ Lỗi khởi động stream: {e}")
        self.stop_listening()
        return False
|
| 54 |
+
|
| 55 |
def stop_listening(self):
    """Stop streaming: drop the flag, then halt the VAD stream."""
    # The capture thread polls this flag and exits its stream context.
    self.is_listening = False
    self.vad_processor.stop_stream()
    print("🛑 Đã dừng lắng nghe")
|
| 60 |
+
|
| 61 |
+
def _listen_loop(self):
    """Continuous microphone sampling loop, run on its own thread.

    NOTE(review): `sd` and `settings` are module-level names from the
    file header; presumably sounddevice and app config — confirm.
    """
    try:
        mic_stream = sd.InputStream(
            samplerate=settings.SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=1024,
            callback=self._audio_callback,
        )
        with mic_stream:
            # Idle-wait; _audio_callback does the work on PortAudio's thread.
            while self.is_listening:
                time.sleep(0.05)
    except Exception as e:
        print(f"❌ Lỗi luồng âm thanh: {e}")
        self.stop_listening()
|
| 76 |
+
|
| 77 |
+
def _audio_callback(self, in_data, frames, time_info, status):
|
| 78 |
"""Callback xử lý audio input real-time"""
|
| 79 |
if status:
|
| 80 |
+
print(f"⚠️ Trạng thái âm thanh: {status}")
|
| 81 |
+
|
| 82 |
if self.is_listening:
|
| 83 |
+
audio_data = np.copy(in_data[:, 0]) # Mono
|
| 84 |
+
self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
|
| 85 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
|
| 87 |
+
"""Xử lý đoạn giọng nói"""
|
| 88 |
if not self.is_listening or len(speech_audio) == 0:
|
| 89 |
return
|
| 90 |
+
|
| 91 |
print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
|
| 92 |
+
|
|
|
|
| 93 |
transcription = self._transcribe_audio(speech_audio, sample_rate)
|
| 94 |
if transcription and len(transcription.strip()) > 0:
|
| 95 |
self.current_transcription = transcription
|
| 96 |
print(f"📝 Transcription: {transcription}")
|
| 97 |
+
|
|
|
|
| 98 |
response = self._generate_ai_response(transcription)
|
|
|
|
|
|
|
| 99 |
tts_audio = self._text_to_speech(response)
|
| 100 |
+
|
|
|
|
| 101 |
if self.callback_handler:
|
| 102 |
self.callback_handler({
|
| 103 |
'transcription': transcription,
|
|
|
|
| 105 |
'tts_audio': tts_audio,
|
| 106 |
'speech_audio': speech_audio
|
| 107 |
})
|
| 108 |
+
|
| 109 |
def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
|
| 110 |
+
"""Chuyển audio -> text"""
|
| 111 |
try:
|
|
|
|
| 112 |
buffer = io.BytesIO()
|
| 113 |
sf.write(buffer, audio_data, sample_rate, format='wav')
|
| 114 |
buffer.seek(0)
|
| 115 |
+
|
|
|
|
| 116 |
transcription = self.client.audio.transcriptions.create(
|
| 117 |
model=settings.WHISPER_MODEL,
|
| 118 |
file=("speech.wav", buffer.read()),
|
| 119 |
response_format="text",
|
| 120 |
+
language="vi"
|
| 121 |
)
|
| 122 |
+
|
| 123 |
return transcription.strip()
|
|
|
|
| 124 |
except Exception as e:
|
| 125 |
print(f"❌ Lỗi transcription: {e}")
|
| 126 |
return None
|
| 127 |
+
|
| 128 |
def _generate_ai_response(self, user_input: str) -> str:
|
| 129 |
+
"""Sinh phản hồi AI"""
|
| 130 |
try:
|
|
|
|
| 131 |
self.conversation_history.append({"role": "user", "content": user_input})
|
| 132 |
+
|
|
|
|
| 133 |
rag_results = self.rag_system.semantic_search(user_input, top_k=2)
|
| 134 |
context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
|
| 135 |
+
|
| 136 |
+
system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
|
| 137 |
+
Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
|
| 138 |
|
| 139 |
Thông tin tham khảo:
|
| 140 |
{context_text}
|
| 141 |
+
"""
|
| 142 |
|
|
|
|
|
|
|
| 143 |
messages = [{"role": "system", "content": system_prompt}]
|
| 144 |
+
messages.extend(self.conversation_history[-6:])
|
| 145 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
completion = self.client.chat.completions.create(
|
| 147 |
model=settings.LLM_MODEL,
|
| 148 |
messages=messages,
|
| 149 |
+
max_tokens=150,
|
| 150 |
temperature=0.7
|
| 151 |
)
|
| 152 |
+
|
| 153 |
response = completion.choices[0].message.content
|
| 154 |
self.conversation_history.append({"role": "assistant", "content": response})
|
| 155 |
+
|
|
|
|
| 156 |
if len(self.conversation_history) > 10:
|
| 157 |
self.conversation_history = self.conversation_history[-10:]
|
| 158 |
+
|
| 159 |
return response
|
| 160 |
+
|
| 161 |
except Exception as e:
|
| 162 |
return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
|
| 163 |
+
|
| 164 |
def _text_to_speech(self, text: str) -> Optional[str]:
|
| 165 |
"""Chuyển văn bản thành giọng nói"""
|
| 166 |
try:
|
|
|
|
| 169 |
return self.tts_service.save_audio_to_file(tts_bytes)
|
| 170 |
except Exception as e:
|
| 171 |
print(f"❌ Lỗi TTS: {e}")
|
|
|
|
| 172 |
return None
|
| 173 |
+
|
| 174 |
def get_conversation_state(self) -> dict:
    """Return a snapshot of the service's current conversation state."""
    state = {}
    state['is_listening'] = self.is_listening
    state['history_length'] = len(self.conversation_history)
    state['current_transcription'] = self.current_transcription
    return state
|