Update services/streaming_voice_service.py
Browse files
services/streaming_voice_service.py
CHANGED
|
@@ -3,7 +3,7 @@ import numpy as np
|
|
| 3 |
import soundfile as sf
|
| 4 |
import threading
|
| 5 |
import time
|
| 6 |
-
import
|
| 7 |
from groq import Groq
|
| 8 |
from typing import Optional, Callable
|
| 9 |
from config.settings import settings
|
|
@@ -11,6 +11,7 @@ from core.speechbrain_vad import SpeechBrainVAD
|
|
| 11 |
from core.rag_system import EnhancedRAGSystem
|
| 12 |
from core.tts_service import EnhancedTTSService
|
| 13 |
|
|
|
|
| 14 |
class StreamingVoiceService:
|
| 15 |
def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
|
| 16 |
self.client = groq_client
|
|
@@ -21,96 +22,82 @@ class StreamingVoiceService:
|
|
| 21 |
# Streaming state
|
| 22 |
self.is_listening = False
|
| 23 |
self.audio_stream = None
|
| 24 |
-
self.pyaudio_instance = None
|
| 25 |
self.callback_handler = None
|
| 26 |
|
| 27 |
# Conversation context
|
| 28 |
self.conversation_history = []
|
| 29 |
self.current_transcription = ""
|
| 30 |
-
|
| 31 |
def start_listening(self, callback_handler: Callable):
|
| 32 |
-
"""Bắt đầu lắng nghe với
|
| 33 |
if self.is_listening:
|
| 34 |
return False
|
| 35 |
-
|
| 36 |
try:
|
| 37 |
self.callback_handler = callback_handler
|
| 38 |
self.is_listening = True
|
| 39 |
self.conversation_history = []
|
| 40 |
-
|
| 41 |
-
#
|
| 42 |
-
self.pyaudio_instance = pyaudio.PyAudio()
|
| 43 |
-
|
| 44 |
-
# Start audio stream
|
| 45 |
-
self.audio_stream = self.pyaudio_instance.open(
|
| 46 |
-
format=pyaudio.paInt16,
|
| 47 |
-
channels=1,
|
| 48 |
-
rate=settings.SAMPLE_RATE,
|
| 49 |
-
input=True,
|
| 50 |
-
frames_per_buffer=1024,
|
| 51 |
-
stream_callback=self._audio_callback
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
# Start VAD processing
|
| 55 |
self.vad_processor.start_stream(self._process_speech_segment)
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
| 58 |
return True
|
| 59 |
-
|
| 60 |
except Exception as e:
|
| 61 |
print(f"❌ Lỗi khởi động stream: {e}")
|
| 62 |
self.stop_listening()
|
| 63 |
return False
|
| 64 |
-
|
| 65 |
def stop_listening(self):
|
| 66 |
"""Dừng lắng nghe"""
|
| 67 |
self.is_listening = False
|
| 68 |
-
|
| 69 |
-
if self.audio_stream:
|
| 70 |
-
self.audio_stream.stop_stream()
|
| 71 |
-
self.audio_stream.close()
|
| 72 |
-
|
| 73 |
-
if self.pyaudio_instance:
|
| 74 |
-
self.pyaudio_instance.terminate()
|
| 75 |
-
|
| 76 |
self.vad_processor.stop_stream()
|
| 77 |
print("🛑 Đã dừng lắng nghe")
|
| 78 |
-
|
| 79 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""Callback xử lý audio input real-time"""
|
| 81 |
if status:
|
| 82 |
-
print(f"
|
| 83 |
-
|
| 84 |
if self.is_listening:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
# Process with VAD
|
| 90 |
-
self.vad_processor.process_stream(audio_float, settings.SAMPLE_RATE)
|
| 91 |
-
|
| 92 |
-
return (in_data, pyaudio.paContinue)
|
| 93 |
-
|
| 94 |
def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
|
| 95 |
-
"""Xử lý
|
| 96 |
if not self.is_listening or len(speech_audio) == 0:
|
| 97 |
return
|
| 98 |
-
|
| 99 |
print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
|
| 100 |
-
|
| 101 |
-
# Transcribe speech segment
|
| 102 |
transcription = self._transcribe_audio(speech_audio, sample_rate)
|
| 103 |
if transcription and len(transcription.strip()) > 0:
|
| 104 |
self.current_transcription = transcription
|
| 105 |
print(f"📝 Transcription: {transcription}")
|
| 106 |
-
|
| 107 |
-
# Generate AI response
|
| 108 |
response = self._generate_ai_response(transcription)
|
| 109 |
-
|
| 110 |
-
# Convert response to speech
|
| 111 |
tts_audio = self._text_to_speech(response)
|
| 112 |
-
|
| 113 |
-
# Call callback with results
|
| 114 |
if self.callback_handler:
|
| 115 |
self.callback_handler({
|
| 116 |
'transcription': transcription,
|
|
@@ -118,73 +105,62 @@ class StreamingVoiceService:
|
|
| 118 |
'tts_audio': tts_audio,
|
| 119 |
'speech_audio': speech_audio
|
| 120 |
})
|
| 121 |
-
|
| 122 |
def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
|
| 123 |
-
"""Chuyển
|
| 124 |
try:
|
| 125 |
-
# Convert numpy array to bytes buffer
|
| 126 |
buffer = io.BytesIO()
|
| 127 |
sf.write(buffer, audio_data, sample_rate, format='wav')
|
| 128 |
buffer.seek(0)
|
| 129 |
-
|
| 130 |
-
# Transcribe with Whisper
|
| 131 |
transcription = self.client.audio.transcriptions.create(
|
| 132 |
model=settings.WHISPER_MODEL,
|
| 133 |
file=("speech.wav", buffer.read()),
|
| 134 |
response_format="text",
|
| 135 |
-
language="vi"
|
| 136 |
)
|
| 137 |
-
|
| 138 |
return transcription.strip()
|
| 139 |
-
|
| 140 |
except Exception as e:
|
| 141 |
print(f"❌ Lỗi transcription: {e}")
|
| 142 |
return None
|
| 143 |
-
|
| 144 |
def _generate_ai_response(self, user_input: str) -> str:
|
| 145 |
-
"""
|
| 146 |
try:
|
| 147 |
-
# Add to conversation history
|
| 148 |
self.conversation_history.append({"role": "user", "content": user_input})
|
| 149 |
-
|
| 150 |
-
# Semantic search với RAG
|
| 151 |
rag_results = self.rag_system.semantic_search(user_input, top_k=2)
|
| 152 |
context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
|
| 157 |
Thông tin tham khảo:
|
| 158 |
{context_text}
|
|
|
|
| 159 |
|
| 160 |
-
Hãy giữ câu trả lời ngắn gọn và tự nhiên như đang trò chuyện."""
|
| 161 |
-
|
| 162 |
messages = [{"role": "system", "content": system_prompt}]
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
recent_history = self.conversation_history[-6:] # Last 3 user-assistant pairs
|
| 166 |
-
messages.extend(recent_history)
|
| 167 |
-
|
| 168 |
-
# Generate response
|
| 169 |
completion = self.client.chat.completions.create(
|
| 170 |
model=settings.LLM_MODEL,
|
| 171 |
messages=messages,
|
| 172 |
-
max_tokens=150,
|
| 173 |
temperature=0.7
|
| 174 |
)
|
| 175 |
-
|
| 176 |
response = completion.choices[0].message.content
|
| 177 |
self.conversation_history.append({"role": "assistant", "content": response})
|
| 178 |
-
|
| 179 |
-
# Keep conversation history manageable
|
| 180 |
if len(self.conversation_history) > 10:
|
| 181 |
self.conversation_history = self.conversation_history[-10:]
|
| 182 |
-
|
| 183 |
return response
|
| 184 |
-
|
| 185 |
except Exception as e:
|
| 186 |
return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
|
| 187 |
-
|
| 188 |
def _text_to_speech(self, text: str) -> Optional[str]:
|
| 189 |
"""Chuyển văn bản thành giọng nói"""
|
| 190 |
try:
|
|
@@ -193,13 +169,12 @@ Hãy giữ câu trả lời ngắn gọn và tự nhiên như đang trò chuyệ
|
|
| 193 |
return self.tts_service.save_audio_to_file(tts_bytes)
|
| 194 |
except Exception as e:
|
| 195 |
print(f"❌ Lỗi TTS: {e}")
|
| 196 |
-
|
| 197 |
return None
|
| 198 |
-
|
| 199 |
def get_conversation_state(self) -> dict:
|
| 200 |
-
"""Lấy trạng thái
|
| 201 |
return {
|
| 202 |
'is_listening': self.is_listening,
|
| 203 |
'history_length': len(self.conversation_history),
|
| 204 |
'current_transcription': self.current_transcription
|
| 205 |
-
}
|
|
|
|
| 3 |
import soundfile as sf
|
| 4 |
import threading
|
| 5 |
import time
|
| 6 |
+
import sounddevice as sd
|
| 7 |
from groq import Groq
|
| 8 |
from typing import Optional, Callable
|
| 9 |
from config.settings import settings
|
|
|
|
| 11 |
from core.rag_system import EnhancedRAGSystem
|
| 12 |
from core.tts_service import EnhancedTTSService
|
| 13 |
|
| 14 |
+
|
| 15 |
class StreamingVoiceService:
|
| 16 |
def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
|
| 17 |
self.client = groq_client
|
|
|
|
| 22 |
# Streaming state
|
| 23 |
self.is_listening = False
|
| 24 |
self.audio_stream = None
|
|
|
|
| 25 |
self.callback_handler = None
|
| 26 |
|
| 27 |
# Conversation context
|
| 28 |
self.conversation_history = []
|
| 29 |
self.current_transcription = ""
|
| 30 |
+
|
| 31 |
def start_listening(self, callback_handler: Callable):
    """Begin streaming capture: arm the VAD pipeline and spawn the mic thread.

    Returns True on success, False when already listening or startup fails.
    """
    # Guard clause: starting twice is a no-op failure.
    if self.is_listening:
        return False
    try:
        self.callback_handler = callback_handler
        self.is_listening = True
        self.conversation_history = []

        # Route detected speech segments into the processing pipeline.
        self.vad_processor.start_stream(self._process_speech_segment)

        # Capture runs on a daemon thread so this call returns immediately.
        listener = threading.Thread(target=self._listen_loop, daemon=True)
        listener.start()

        print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
        return True
    except Exception as e:
        print(f"❌ Lỗi khởi động stream: {e}")
        self.stop_listening()
        return False
|
| 54 |
+
|
| 55 |
def stop_listening(self):
    """Stop streaming: drop the flag, then halt the VAD stream."""
    # The capture thread polls this flag and exits its stream context.
    self.is_listening = False
    self.vad_processor.stop_stream()
    print("🛑 Đã dừng lắng nghe")
|
| 60 |
+
|
| 61 |
+
def _listen_loop(self):
    """Continuous microphone sampling loop, run on its own thread.

    NOTE(review): `sd` and `settings` are module-level names from the
    file header; presumably sounddevice and app config — confirm.
    """
    try:
        mic_stream = sd.InputStream(
            samplerate=settings.SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=1024,
            callback=self._audio_callback,
        )
        with mic_stream:
            # Idle-wait; _audio_callback does the work on PortAudio's thread.
            while self.is_listening:
                time.sleep(0.05)
    except Exception as e:
        print(f"❌ Lỗi luồng âm thanh: {e}")
        self.stop_listening()
|
| 76 |
+
|
| 77 |
+
def _audio_callback(self, in_data, frames, time_info, status):
|
| 78 |
"""Callback xử lý audio input real-time"""
|
| 79 |
if status:
|
| 80 |
+
print(f"⚠️ Trạng thái âm thanh: {status}")
|
| 81 |
+
|
| 82 |
if self.is_listening:
|
| 83 |
+
audio_data = np.copy(in_data[:, 0]) # Mono
|
| 84 |
+
self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
|
| 85 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
|
| 87 |
+
"""Xử lý đoạn giọng nói"""
|
| 88 |
if not self.is_listening or len(speech_audio) == 0:
|
| 89 |
return
|
| 90 |
+
|
| 91 |
print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
|
| 92 |
+
|
|
|
|
| 93 |
transcription = self._transcribe_audio(speech_audio, sample_rate)
|
| 94 |
if transcription and len(transcription.strip()) > 0:
|
| 95 |
self.current_transcription = transcription
|
| 96 |
print(f"📝 Transcription: {transcription}")
|
| 97 |
+
|
|
|
|
| 98 |
response = self._generate_ai_response(transcription)
|
|
|
|
|
|
|
| 99 |
tts_audio = self._text_to_speech(response)
|
| 100 |
+
|
|
|
|
| 101 |
if self.callback_handler:
|
| 102 |
self.callback_handler({
|
| 103 |
'transcription': transcription,
|
|
|
|
| 105 |
'tts_audio': tts_audio,
|
| 106 |
'speech_audio': speech_audio
|
| 107 |
})
|
| 108 |
+
|
| 109 |
def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
|
| 110 |
+
"""Chuyển audio -> text"""
|
| 111 |
try:
|
|
|
|
| 112 |
buffer = io.BytesIO()
|
| 113 |
sf.write(buffer, audio_data, sample_rate, format='wav')
|
| 114 |
buffer.seek(0)
|
| 115 |
+
|
|
|
|
| 116 |
transcription = self.client.audio.transcriptions.create(
|
| 117 |
model=settings.WHISPER_MODEL,
|
| 118 |
file=("speech.wav", buffer.read()),
|
| 119 |
response_format="text",
|
| 120 |
+
language="vi"
|
| 121 |
)
|
| 122 |
+
|
| 123 |
return transcription.strip()
|
|
|
|
| 124 |
except Exception as e:
|
| 125 |
print(f"❌ Lỗi transcription: {e}")
|
| 126 |
return None
|
| 127 |
+
|
| 128 |
def _generate_ai_response(self, user_input: str) -> str:
|
| 129 |
+
"""Sinh phản hồi AI"""
|
| 130 |
try:
|
|
|
|
| 131 |
self.conversation_history.append({"role": "user", "content": user_input})
|
| 132 |
+
|
|
|
|
| 133 |
rag_results = self.rag_system.semantic_search(user_input, top_k=2)
|
| 134 |
context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
|
| 135 |
+
|
| 136 |
+
system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
|
| 137 |
+
Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
|
| 138 |
|
| 139 |
Thông tin tham khảo:
|
| 140 |
{context_text}
|
| 141 |
+
"""
|
| 142 |
|
|
|
|
|
|
|
| 143 |
messages = [{"role": "system", "content": system_prompt}]
|
| 144 |
+
messages.extend(self.conversation_history[-6:])
|
| 145 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
completion = self.client.chat.completions.create(
|
| 147 |
model=settings.LLM_MODEL,
|
| 148 |
messages=messages,
|
| 149 |
+
max_tokens=150,
|
| 150 |
temperature=0.7
|
| 151 |
)
|
| 152 |
+
|
| 153 |
response = completion.choices[0].message.content
|
| 154 |
self.conversation_history.append({"role": "assistant", "content": response})
|
| 155 |
+
|
|
|
|
| 156 |
if len(self.conversation_history) > 10:
|
| 157 |
self.conversation_history = self.conversation_history[-10:]
|
| 158 |
+
|
| 159 |
return response
|
| 160 |
+
|
| 161 |
except Exception as e:
|
| 162 |
return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
|
| 163 |
+
|
| 164 |
def _text_to_speech(self, text: str) -> Optional[str]:
|
| 165 |
"""Chuyển văn bản thành giọng nói"""
|
| 166 |
try:
|
|
|
|
| 169 |
return self.tts_service.save_audio_to_file(tts_bytes)
|
| 170 |
except Exception as e:
|
| 171 |
print(f"❌ Lỗi TTS: {e}")
|
|
|
|
| 172 |
return None
|
| 173 |
+
|
| 174 |
def get_conversation_state(self) -> dict:
    """Return a snapshot of the service's current conversation state."""
    state = {}
    state['is_listening'] = self.is_listening
    state['history_length'] = len(self.conversation_history)
    state['current_transcription'] = self.current_transcription
    return state
|