datbkpro committed on
Commit
242645b
·
verified ·
1 Parent(s): 09266d3

Update services/streaming_voice_service.py

Browse files
Files changed (1) hide show
  1. services/streaming_voice_service.py +65 -90
services/streaming_voice_service.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import soundfile as sf
4
  import threading
5
  import time
6
- import pyaudio
7
  from groq import Groq
8
  from typing import Optional, Callable
9
  from config.settings import settings
@@ -11,6 +11,7 @@ from core.speechbrain_vad import SpeechBrainVAD
11
  from core.rag_system import EnhancedRAGSystem
12
  from core.tts_service import EnhancedTTSService
13
 
 
14
  class StreamingVoiceService:
15
  def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
16
  self.client = groq_client
@@ -21,96 +22,82 @@ class StreamingVoiceService:
21
  # Streaming state
22
  self.is_listening = False
23
  self.audio_stream = None
24
- self.pyaudio_instance = None
25
  self.callback_handler = None
26
 
27
  # Conversation context
28
  self.conversation_history = []
29
  self.current_transcription = ""
30
-
31
  def start_listening(self, callback_handler: Callable):
32
- """Bắt đầu lắng nghe với VAD"""
33
  if self.is_listening:
34
  return False
35
-
36
  try:
37
  self.callback_handler = callback_handler
38
  self.is_listening = True
39
  self.conversation_history = []
40
-
41
- # Initialize PyAudio
42
- self.pyaudio_instance = pyaudio.PyAudio()
43
-
44
- # Start audio stream
45
- self.audio_stream = self.pyaudio_instance.open(
46
- format=pyaudio.paInt16,
47
- channels=1,
48
- rate=settings.SAMPLE_RATE,
49
- input=True,
50
- frames_per_buffer=1024,
51
- stream_callback=self._audio_callback
52
- )
53
-
54
- # Start VAD processing
55
  self.vad_processor.start_stream(self._process_speech_segment)
56
-
57
- print("🎙️ Bắt đầu lắng nghe...")
 
 
 
58
  return True
59
-
60
  except Exception as e:
61
  print(f"❌ Lỗi khởi động stream: {e}")
62
  self.stop_listening()
63
  return False
64
-
65
  def stop_listening(self):
66
  """Dừng lắng nghe"""
67
  self.is_listening = False
68
-
69
- if self.audio_stream:
70
- self.audio_stream.stop_stream()
71
- self.audio_stream.close()
72
-
73
- if self.pyaudio_instance:
74
- self.pyaudio_instance.terminate()
75
-
76
  self.vad_processor.stop_stream()
77
  print("🛑 Đã dừng lắng nghe")
78
-
79
- def _audio_callback(self, in_data, frame_count, time_info, status):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  """Callback xử lý audio input real-time"""
81
  if status:
82
- print(f"Audio stream status: {status}")
83
-
84
  if self.is_listening:
85
- # Convert audio data to numpy array
86
- audio_data = np.frombuffer(in_data, dtype=np.int16)
87
- audio_float = audio_data.astype(np.float32) / 32768.0
88
-
89
- # Process with VAD
90
- self.vad_processor.process_stream(audio_float, settings.SAMPLE_RATE)
91
-
92
- return (in_data, pyaudio.paContinue)
93
-
94
  def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
95
- """Xử lý segment giọng nói đã được VAD phát hiện"""
96
  if not self.is_listening or len(speech_audio) == 0:
97
  return
98
-
99
  print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
100
-
101
- # Transcribe speech segment
102
  transcription = self._transcribe_audio(speech_audio, sample_rate)
103
  if transcription and len(transcription.strip()) > 0:
104
  self.current_transcription = transcription
105
  print(f"📝 Transcription: {transcription}")
106
-
107
- # Generate AI response
108
  response = self._generate_ai_response(transcription)
109
-
110
- # Convert response to speech
111
  tts_audio = self._text_to_speech(response)
112
-
113
- # Call callback with results
114
  if self.callback_handler:
115
  self.callback_handler({
116
  'transcription': transcription,
@@ -118,73 +105,62 @@ class StreamingVoiceService:
118
  'tts_audio': tts_audio,
119
  'speech_audio': speech_audio
120
  })
121
-
122
  def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
123
- """Chuyển đổi audio thành văn bản sử dụng Whisper"""
124
  try:
125
- # Convert numpy array to bytes buffer
126
  buffer = io.BytesIO()
127
  sf.write(buffer, audio_data, sample_rate, format='wav')
128
  buffer.seek(0)
129
-
130
- # Transcribe with Whisper
131
  transcription = self.client.audio.transcriptions.create(
132
  model=settings.WHISPER_MODEL,
133
  file=("speech.wav", buffer.read()),
134
  response_format="text",
135
- language="vi" # Focus on Vietnamese
136
  )
137
-
138
  return transcription.strip()
139
-
140
  except Exception as e:
141
  print(f"❌ Lỗi transcription: {e}")
142
  return None
143
-
144
  def _generate_ai_response(self, user_input: str) -> str:
145
- """Tạo phản hồi AI với RAG context"""
146
  try:
147
- # Add to conversation history
148
  self.conversation_history.append({"role": "user", "content": user_input})
149
-
150
- # Semantic search với RAG
151
  rag_results = self.rag_system.semantic_search(user_input, top_k=2)
152
  context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
153
-
154
- # Prepare messages với conversation history
155
- system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt. Hãy trả lời ngắn gọn, tự nhiên và hữu ích. Sử dụng thông tin từ cơ sở kiến thức khi có liên quan.
156
 
157
  Thông tin tham khảo:
158
  {context_text}
 
159
 
160
- Hãy giữ câu trả lời ngắn gọn và tự nhiên như đang trò chuyện."""
161
-
162
  messages = [{"role": "system", "content": system_prompt}]
163
-
164
- # Add last 3 exchanges for context (to keep it manageable)
165
- recent_history = self.conversation_history[-6:] # Last 3 user-assistant pairs
166
- messages.extend(recent_history)
167
-
168
- # Generate response
169
  completion = self.client.chat.completions.create(
170
  model=settings.LLM_MODEL,
171
  messages=messages,
172
- max_tokens=150, # Keep responses concise for streaming
173
  temperature=0.7
174
  )
175
-
176
  response = completion.choices[0].message.content
177
  self.conversation_history.append({"role": "assistant", "content": response})
178
-
179
- # Keep conversation history manageable
180
  if len(self.conversation_history) > 10:
181
  self.conversation_history = self.conversation_history[-10:]
182
-
183
  return response
184
-
185
  except Exception as e:
186
  return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
187
-
188
  def _text_to_speech(self, text: str) -> Optional[str]:
189
  """Chuyển văn bản thành giọng nói"""
190
  try:
@@ -193,13 +169,12 @@ Hãy giữ câu trả lời ngắn gọn và tự nhiên như đang trò chuyệ
193
  return self.tts_service.save_audio_to_file(tts_bytes)
194
  except Exception as e:
195
  print(f"❌ Lỗi TTS: {e}")
196
-
197
  return None
198
-
199
  def get_conversation_state(self) -> dict:
200
- """Lấy trạng thái cuộc hội thoại hiện tại"""
201
  return {
202
  'is_listening': self.is_listening,
203
  'history_length': len(self.conversation_history),
204
  'current_transcription': self.current_transcription
205
- }
 
3
  import soundfile as sf
4
  import threading
5
  import time
6
+ import sounddevice as sd
7
  from groq import Groq
8
  from typing import Optional, Callable
9
  from config.settings import settings
 
11
  from core.rag_system import EnhancedRAGSystem
12
  from core.tts_service import EnhancedTTSService
13
 
14
+
15
  class StreamingVoiceService:
16
  def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
17
  self.client = groq_client
 
22
  # Streaming state
23
  self.is_listening = False
24
  self.audio_stream = None
 
25
  self.callback_handler = None
26
 
27
  # Conversation context
28
  self.conversation_history = []
29
  self.current_transcription = ""
30
+
31
  def start_listening(self, callback_handler: Callable):
32
+ """Bắt đầu lắng nghe với sounddevice"""
33
  if self.is_listening:
34
  return False
35
+
36
  try:
37
  self.callback_handler = callback_handler
38
  self.is_listening = True
39
  self.conversation_history = []
40
+
41
+ # Start VAD processing thread
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  self.vad_processor.start_stream(self._process_speech_segment)
43
+
44
+ # Khởi động thread lắng nghe
45
+ threading.Thread(target=self._listen_loop, daemon=True).start()
46
+
47
+ print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
48
  return True
49
+
50
  except Exception as e:
51
  print(f"❌ Lỗi khởi động stream: {e}")
52
  self.stop_listening()
53
  return False
54
+
55
  def stop_listening(self):
56
  """Dừng lắng nghe"""
57
  self.is_listening = False
 
 
 
 
 
 
 
 
58
  self.vad_processor.stop_stream()
59
  print("🛑 Đã dừng lắng nghe")
60
+
61
    def _listen_loop(self):
        """Keep a microphone input stream open until ``is_listening`` is cleared.

        Runs on a daemon thread started by ``start_listening``. Opens a mono
        float32 sounddevice input stream; the actual audio handling happens in
        ``_audio_callback``, so this loop only holds the stream open and polls
        the stop flag.
        """
        try:
            with sd.InputStream(
                samplerate=settings.SAMPLE_RATE,
                channels=1,
                dtype="float32",
                blocksize=1024,
                callback=self._audio_callback
            ):
                # The callback does the work; just wait for the stop signal.
                while self.is_listening:
                    time.sleep(0.05)
        except Exception as e:
            # Device or stream failure: report it and tear down listening state.
            print(f"❌ Lỗi luồng âm thanh: {e}")
            self.stop_listening()
76
+
77
+ def _audio_callback(self, in_data, frames, time_info, status):
78
  """Callback xử lý audio input real-time"""
79
  if status:
80
+ print(f"⚠️ Trạng thái âm thanh: {status}")
81
+
82
  if self.is_listening:
83
+ audio_data = np.copy(in_data[:, 0]) # Mono
84
+ self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
85
+
 
 
 
 
 
 
86
    def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
        """Handle one VAD-detected speech segment end to end.

        Transcribes the segment, generates an AI response, synthesizes TTS
        audio, and delivers everything to the registered callback handler.

        Args:
            speech_audio: Mono audio samples for the detected segment.
            sample_rate: Sample rate of ``speech_audio`` in Hz.
        """
        # Ignore spurious/empty segments or segments arriving after stop.
        if not self.is_listening or len(speech_audio) == 0:
            return

        print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")

        transcription = self._transcribe_audio(speech_audio, sample_rate)
        if transcription and len(transcription.strip()) > 0:
            self.current_transcription = transcription
            print(f"📝 Transcription: {transcription}")

            response = self._generate_ai_response(transcription)
            tts_audio = self._text_to_speech(response)

            if self.callback_handler:
                self.callback_handler({
                    'transcription': transcription,
                    'response': response,
                    'tts_audio': tts_audio,
                    'speech_audio': speech_audio
                })
108
+
109
  def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
110
+ """Chuyển audio -> text"""
111
  try:
 
112
  buffer = io.BytesIO()
113
  sf.write(buffer, audio_data, sample_rate, format='wav')
114
  buffer.seek(0)
115
+
 
116
  transcription = self.client.audio.transcriptions.create(
117
  model=settings.WHISPER_MODEL,
118
  file=("speech.wav", buffer.read()),
119
  response_format="text",
120
+ language="vi"
121
  )
122
+
123
  return transcription.strip()
 
124
  except Exception as e:
125
  print(f"❌ Lỗi transcription: {e}")
126
  return None
127
+
128
  def _generate_ai_response(self, user_input: str) -> str:
129
+ """Sinh phản hồi AI"""
130
  try:
 
131
  self.conversation_history.append({"role": "user", "content": user_input})
132
+
 
133
  rag_results = self.rag_system.semantic_search(user_input, top_k=2)
134
  context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
135
+
136
+ system_prompt = f"""Bạn trợ lý AI thông minh chuyên về tiếng Việt.
137
+ Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
138
 
139
  Thông tin tham khảo:
140
  {context_text}
141
+ """
142
 
 
 
143
  messages = [{"role": "system", "content": system_prompt}]
144
+ messages.extend(self.conversation_history[-6:])
145
+
 
 
 
 
146
  completion = self.client.chat.completions.create(
147
  model=settings.LLM_MODEL,
148
  messages=messages,
149
+ max_tokens=150,
150
  temperature=0.7
151
  )
152
+
153
  response = completion.choices[0].message.content
154
  self.conversation_history.append({"role": "assistant", "content": response})
155
+
 
156
  if len(self.conversation_history) > 10:
157
  self.conversation_history = self.conversation_history[-10:]
158
+
159
  return response
160
+
161
  except Exception as e:
162
  return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
163
+
164
  def _text_to_speech(self, text: str) -> Optional[str]:
165
  """Chuyển văn bản thành giọng nói"""
166
  try:
 
169
  return self.tts_service.save_audio_to_file(tts_bytes)
170
  except Exception as e:
171
  print(f"❌ Lỗi TTS: {e}")
 
172
  return None
173
+
174
  def get_conversation_state(self) -> dict:
175
+ """Lấy trạng thái hội thoại"""
176
  return {
177
  'is_listening': self.is_listening,
178
  'history_length': len(self.conversation_history),
179
  'current_transcription': self.current_transcription
180
+ }