Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Oct 22, 2025

Commit

adfdb5e

verified ·

1 Parent(s): 55d88b0

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +259 -91

services/streaming_voice_service.py CHANGED Viewed

@@ -1,13 +1,192 @@
 import io
 import numpy as np
 import soundfile as sf
 import threading
 import time
-import sounddevice as sd
 from groq import Groq
 from typing import Optional, Callable
 from config.settings import settings
-from core.speechbrain_vad import SpeechBrainVAD
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
@@ -17,110 +196,92 @@ class StreamingVoiceService:
         self.client = groq_client
         self.rag_system = rag_system
         self.tts_service = tts_service
-        self.vad_processor = SpeechBrainVAD()
         # Streaming state
         self.is_listening = False
-        self.audio_stream = None
         self.callback_handler = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
-    def start_listening(self, callback_handler: Callable):
-        """Bắt đầu lắng nghe với sounddevice"""
-        if self.is_listening:
-            return False
-        try:
-            self.callback_handler = callback_handler
-            self.is_listening = True
-            self.conversation_history = []
-            # Start VAD processing thread
-            self.vad_processor.start_stream(self._process_speech_segment)
-            # Khởi động thread lắng nghe
-            threading.Thread(target=self._listen_loop, daemon=True).start()
-            print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
-            return True
-        except Exception as e:
-            print(f"❌ Lỗi khởi động stream: {e}")
-            self.stop_listening()
-            return False
-    def stop_listening(self):
-        """Dừng lắng nghe"""
-        self.is_listening = False
-        self.vad_processor.stop_stream()
-        print("🛑 Đã dừng lắng nghe")
-    def _listen_loop(self):
-        """Luồng lấy mẫu âm thanh liên tục"""
         try:
-            with sd.InputStream(
-                samplerate=settings.SAMPLE_RATE,
-                channels=1,
-                dtype="float32",
-                blocksize=1024,
-                callback=self._audio_callback
-            ):
-                while self.is_listening:
-                    time.sleep(0.05)
-        except Exception as e:
-            print(f"❌ Lỗi luồng âm thanh: {e}")
-            self.stop_listening()
-    def _audio_callback(self, in_data, frames, time_info, status):
-        """Callback xử lý audio input real-time"""
-        if status:
-            print(f"⚠️ Trạng thái âm thanh: {status}")
-        if self.is_listening:
-            audio_data = np.copy(in_data[:, 0])  # Mono
-            self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
-    def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
-        """Xử lý đoạn giọng nói"""
-        if not self.is_listening or len(speech_audio) == 0:
-            return
-        print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
-        transcription = self._transcribe_audio(speech_audio, sample_rate)
-        if transcription and len(transcription.strip()) > 0:
-            self.current_transcription = transcription
-            print(f"📝 Transcription: {transcription}")
             response = self._generate_ai_response(transcription)
-            tts_audio = self._text_to_speech(response)
-            if self.callback_handler:
-                self.callback_handler({
-                    'transcription': transcription,
-                    'response': response,
-                    'tts_audio': tts_audio,
-                    'speech_audio': speech_audio
-                })
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
         """Chuyển audio -> text"""
         try:
             buffer = io.BytesIO()
-            sf.write(buffer, audio_data, sample_rate, format='wav')
             buffer.seek(0)
             transcription = self.client.audio.transcriptions.create(
                 model=settings.WHISPER_MODEL,
-                file=("speech.wav", buffer.read()),
                 response_format="text",
                 language="vi"
             )
-            return transcription.strip()
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
             return None
@@ -128,23 +289,25 @@ class StreamingVoiceService:
     def _generate_ai_response(self, user_input: str) -> str:
         """Sinh phản hồi AI"""
         try:
             self.conversation_history.append({"role": "user", "content": user_input})
             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
-            context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
-Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
 Thông tin tham khảo:
 {context_text}
 """
             messages = [{"role": "system", "content": system_prompt}]
-            messages.extend(self.conversation_history[-6:])
             completion = self.client.chat.completions.create(
-                model=settings.LLM_MODEL,
                 messages=messages,
                 max_tokens=150,
                 temperature=0.7
@@ -153,8 +316,9 @@ Thông tin tham khảo:
             response = completion.choices[0].message.content
             self.conversation_history.append({"role": "assistant", "content": response})
-            if len(self.conversation_history) > 10:
-                self.conversation_history = self.conversation_history[-10:]
             return response
@@ -171,10 +335,14 @@ Thông tin tham khảo:
             print(f"❌ Lỗi TTS: {e}")
         return None
     def get_conversation_state(self) -> dict:
         """Lấy trạng thái hội thoại"""
         return {
-            'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription
-        }

+# import io
+# import numpy as np
+# import soundfile as sf
+# import threading
+# import time
+# import sounddevice as sd
+# from groq import Groq
+# from typing import Optional, Callable
+# from config.settings import settings
+# from core.speechbrain_vad import SpeechBrainVAD
+# from core.rag_system import EnhancedRAGSystem
+# from core.tts_service import EnhancedTTSService
+# class StreamingVoiceService:
+#     def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
+#         self.client = groq_client
+#         self.rag_system = rag_system
+#         self.tts_service = tts_service
+#         self.vad_processor = SpeechBrainVAD()
+#         # Streaming state
+#         self.is_listening = False
+#         self.audio_stream = None
+#         self.callback_handler = None
+#         # Conversation context
+#         self.conversation_history = []
+#         self.current_transcription = ""
+#     def start_listening(self, callback_handler: Callable):
+#         """Bắt đầu lắng nghe với sounddevice"""
+#         if self.is_listening:
+#             return False
+#         try:
+#             self.callback_handler = callback_handler
+#             self.is_listening = True
+#             self.conversation_history = []
+#             # Start VAD processing thread
+#             self.vad_processor.start_stream(self._process_speech_segment)
+#             # Khởi động thread lắng nghe
+#             threading.Thread(target=self._listen_loop, daemon=True).start()
+#             print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
+#             return True
+#         except Exception as e:
+#             print(f"❌ Lỗi khởi động stream: {e}")
+#             self.stop_listening()
+#             return False
+#     def stop_listening(self):
+#         """Dừng lắng nghe"""
+#         self.is_listening = False
+#         self.vad_processor.stop_stream()
+#         print("🛑 Đã dừng lắng nghe")
+#     def _listen_loop(self):
+#         """Luồng lấy mẫu âm thanh liên tục"""
+#         try:
+#             with sd.InputStream(
+#                 samplerate=settings.SAMPLE_RATE,
+#                 channels=1,
+#                 dtype="float32",
+#                 blocksize=1024,
+#                 callback=self._audio_callback
+#             ):
+#                 while self.is_listening:
+#                     time.sleep(0.05)
+#         except Exception as e:
+#             print(f"❌ Lỗi luồng âm thanh: {e}")
+#             self.stop_listening()
+#     def _audio_callback(self, in_data, frames, time_info, status):
+#         """Callback xử lý audio input real-time"""
+#         if status:
+#             print(f"⚠️ Trạng thái âm thanh: {status}")
+#         if self.is_listening:
+#             audio_data = np.copy(in_data[:, 0])  # Mono
+#             self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
+#     def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
+#         """Xử lý đoạn giọng nói"""
+#         if not self.is_listening or len(speech_audio) == 0:
+#             return
+#         print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
+#         transcription = self._transcribe_audio(speech_audio, sample_rate)
+#         if transcription and len(transcription.strip()) > 0:
+#             self.current_transcription = transcription
+#             print(f"📝 Transcription: {transcription}")
+#             response = self._generate_ai_response(transcription)
+#             tts_audio = self._text_to_speech(response)
+#             if self.callback_handler:
+#                 self.callback_handler({
+#                     'transcription': transcription,
+#                     'response': response,
+#                     'tts_audio': tts_audio,
+#                     'speech_audio': speech_audio
+#                 })
+#     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
+#         """Chuyển audio -> text"""
+#         try:
+#             buffer = io.BytesIO()
+#             sf.write(buffer, audio_data, sample_rate, format='wav')
+#             buffer.seek(0)
+#             transcription = self.client.audio.transcriptions.create(
+#                 model=settings.WHISPER_MODEL,
+#                 file=("speech.wav", buffer.read()),
+#                 response_format="text",
+#                 language="vi"
+#             )
+#             return transcription.strip()
+#         except Exception as e:
+#             print(f"❌ Lỗi transcription: {e}")
+#             return None
+#     def _generate_ai_response(self, user_input: str) -> str:
+#         """Sinh phản hồi AI"""
+#         try:
+#             self.conversation_history.append({"role": "user", "content": user_input})
+#             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
+#             context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
+#             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
+# Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
+# Thông tin tham khảo:
+# {context_text}
+# """
+#             messages = [{"role": "system", "content": system_prompt}]
+#             messages.extend(self.conversation_history[-6:])
+#             completion = self.client.chat.completions.create(
+#                 model=settings.LLM_MODEL,
+#                 messages=messages,
+#                 max_tokens=150,
+#                 temperature=0.7
+#             )
+#             response = completion.choices[0].message.content
+#             self.conversation_history.append({"role": "assistant", "content": response})
+#             if len(self.conversation_history) > 10:
+#                 self.conversation_history = self.conversation_history[-10:]
+#             return response
+#         except Exception as e:
+#             return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
+#     def _text_to_speech(self, text: str) -> Optional[str]:
+#         """Chuyển văn bản thành giọng nói"""
+#         try:
+#             tts_bytes = self.tts_service.text_to_speech(text, 'vi')
+#             if tts_bytes:
+#                 return self.tts_service.save_audio_to_file(tts_bytes)
+#         except Exception as e:
+#             print(f"❌ Lỗi TTS: {e}")
+#         return None
+#     def get_conversation_state(self) -> dict:
+#         """Lấy trạng thái hội thoại"""
+#         return {
+#             'is_listening': self.is_listening,
+#             'history_length': len(self.conversation_history),
+#             'current_transcription': self.current_transcription
+#         }
 import io
 import numpy as np
 import soundfile as sf
 import threading
 import time
+import traceback
 from groq import Groq
 from typing import Optional, Callable
 from config.settings import settings
 from core.rag_system import EnhancedRAGSystem
 from core.tts_service import EnhancedTTSService
         self.client = groq_client
         self.rag_system = rag_system
         self.tts_service = tts_service
         # Streaming state
         self.is_listening = False
         self.callback_handler = None
         # Conversation context
         self.conversation_history = []
         self.current_transcription = ""
+    def process_streaming_audio(self, audio_data: tuple) -> dict:
+        """Xử lý audio streaming từ Gradio microphone component"""
+        if not audio_data:
+            return {
+                'transcription': "❌ Không có dữ liệu âm thanh",
+                'response': "Vui lòng nói lại",
+                'tts_audio': None
+            }
         try:
+            # Lấy dữ liệu audio từ Gradio
+            sample_rate, audio_array = audio_data
+            print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")
+            # Chuyển đổi thành văn bản
+            transcription = self._transcribe_audio(audio_array, sample_rate)
+            if not transcription or len(transcription.strip()) == 0:
+                return {
+                    'transcription': "❌ Không nghe rõ",
+                    'response': "Xin vui lòng nói lại rõ hơn",
+                    'tts_audio': None
+                }
+            print(f"📝 Đã chuyển đổi: {transcription}")
+            # Tạo phản hồi AI
             response = self._generate_ai_response(transcription)
+            # Tạo TTS
+            tts_audio_path = self._text_to_speech(response)
+            return {
+                'transcription': transcription,
+                'response': response,
+                'tts_audio': tts_audio_path
+            }
+        except Exception as e:
+            print(f"❌ Lỗi xử lý streaming audio: {e}")
+            return {
+                'transcription': f"❌ Lỗi: {str(e)}",
+                'response': "Xin lỗi, có lỗi xảy ra",
+                'tts_audio': None
+            }
     def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
         """Chuyển audio -> text"""
         try:
+            # Chuẩn hóa audio data
+            if audio_data.ndim > 1:
+                audio_data = np.mean(audio_data, axis=1)  # Chuyển sang mono
+            # Normalize
+            if np.max(np.abs(audio_data)) > 0:
+                audio_data = audio_data / np.max(np.abs(audio_data))
             buffer = io.BytesIO()
+            sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
             buffer.seek(0)
             transcription = self.client.audio.transcriptions.create(
                 model=settings.WHISPER_MODEL,
+                file=("speech.wav", buffer.read(), "audio/wav"),
                 response_format="text",
                 language="vi"
             )
+            # Xử lý response
+            if hasattr(transcription, 'text'):
+                return transcription.text.strip()
+            elif isinstance(transcription, str):
+                return transcription.strip()
+            else:
+                return str(transcription).strip()
         except Exception as e:
             print(f"❌ Lỗi transcription: {e}")
             return None
     def _generate_ai_response(self, user_input: str) -> str:
         """Sinh phản hồi AI"""
         try:
+            # Thêm vào lịch sử
             self.conversation_history.append({"role": "user", "content": user_input})
+            # Tìm kiếm RAG
             rag_results = self.rag_system.semantic_search(user_input, top_k=2)
+            context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
             system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
+Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
 Thông tin tham khảo:
 {context_text}
 """
             messages = [{"role": "system", "content": system_prompt}]
+            # Giữ lại 4 tin nhắn gần nhất
+            messages.extend(self.conversation_history[-4:])
             completion = self.client.chat.completions.create(
+                model="llama-3.1-8b-instant",
                 messages=messages,
                 max_tokens=150,
                 temperature=0.7
             response = completion.choices[0].message.content
             self.conversation_history.append({"role": "assistant", "content": response})
+            # Giới hạn lịch sử
+            if len(self.conversation_history) > 8:
+                self.conversation_history = self.conversation_history[-8:]
             return response
             print(f"❌ Lỗi TTS: {e}")
         return None
+    def clear_conversation(self):
+        """Xóa lịch sử hội thoại"""
+        self.conversation_history = []
+        print("🗑️ Đã xóa lịch sử hội thoại")
     def get_conversation_state(self) -> dict:
         """Return a snapshot of the conversation state.

         Returns:
             A dict with ``'is_listening'`` (the current flag value),
             ``'history_length'`` (number of stored history messages) and
             ``'current_transcription'``.
         """
         return {
             # Kept for callers written against the earlier version of this
             # method, which included this key.
             'is_listening': self.is_listening,
             'history_length': len(self.conversation_history),
             'current_transcription': self.current_transcription
         }