datbkpro committed on
Commit
adfdb5e
·
verified ·
1 Parent(s): 55d88b0

Update services/streaming_voice_service.py

Browse files
Files changed (1) hide show
  1. services/streaming_voice_service.py +259 -91
services/streaming_voice_service.py CHANGED
@@ -1,13 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import io
2
  import numpy as np
3
  import soundfile as sf
4
  import threading
5
  import time
6
- import sounddevice as sd
7
  from groq import Groq
8
  from typing import Optional, Callable
9
  from config.settings import settings
10
- from core.speechbrain_vad import SpeechBrainVAD
11
  from core.rag_system import EnhancedRAGSystem
12
  from core.tts_service import EnhancedTTSService
13
 
@@ -17,110 +196,92 @@ class StreamingVoiceService:
17
  self.client = groq_client
18
  self.rag_system = rag_system
19
  self.tts_service = tts_service
20
- self.vad_processor = SpeechBrainVAD()
21
 
22
  # Streaming state
23
  self.is_listening = False
24
- self.audio_stream = None
25
  self.callback_handler = None
26
 
27
  # Conversation context
28
  self.conversation_history = []
29
  self.current_transcription = ""
30
 
31
- def start_listening(self, callback_handler: Callable):
32
- """Bắt đầu lắng nghe với sounddevice"""
33
- if self.is_listening:
34
- return False
35
-
36
- try:
37
- self.callback_handler = callback_handler
38
- self.is_listening = True
39
- self.conversation_history = []
40
-
41
- # Start VAD processing thread
42
- self.vad_processor.start_stream(self._process_speech_segment)
43
-
44
- # Khởi động thread lắng nghe
45
- threading.Thread(target=self._listen_loop, daemon=True).start()
46
-
47
- print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
48
- return True
49
-
50
- except Exception as e:
51
- print(f"❌ Lỗi khởi động stream: {e}")
52
- self.stop_listening()
53
- return False
54
-
55
- def stop_listening(self):
56
- """Dừng lắng nghe"""
57
- self.is_listening = False
58
- self.vad_processor.stop_stream()
59
- print("🛑 Đã dừng lắng nghe")
60
-
61
- def _listen_loop(self):
62
- """Luồng lấy mẫu âm thanh liên tục"""
63
  try:
64
- with sd.InputStream(
65
- samplerate=settings.SAMPLE_RATE,
66
- channels=1,
67
- dtype="float32",
68
- blocksize=1024,
69
- callback=self._audio_callback
70
- ):
71
- while self.is_listening:
72
- time.sleep(0.05)
73
- except Exception as e:
74
- print(f"❌ Lỗi luồng âm thanh: {e}")
75
- self.stop_listening()
76
-
77
- def _audio_callback(self, in_data, frames, time_info, status):
78
- """Callback xử lý audio input real-time"""
79
- if status:
80
- print(f"⚠️ Trạng thái âm thanh: {status}")
81
-
82
- if self.is_listening:
83
- audio_data = np.copy(in_data[:, 0]) # Mono
84
- self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
85
-
86
- def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
87
- """Xử lý đoạn giọng nói"""
88
- if not self.is_listening or len(speech_audio) == 0:
89
- return
90
-
91
- print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
92
-
93
- transcription = self._transcribe_audio(speech_audio, sample_rate)
94
- if transcription and len(transcription.strip()) > 0:
95
- self.current_transcription = transcription
96
- print(f"📝 Transcription: {transcription}")
97
-
98
  response = self._generate_ai_response(transcription)
99
- tts_audio = self._text_to_speech(response)
100
-
101
- if self.callback_handler:
102
- self.callback_handler({
103
- 'transcription': transcription,
104
- 'response': response,
105
- 'tts_audio': tts_audio,
106
- 'speech_audio': speech_audio
107
- })
 
 
 
 
 
 
 
 
108
 
109
  def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
110
  """Chuyển audio -> text"""
111
  try:
 
 
 
 
 
 
 
 
112
  buffer = io.BytesIO()
113
- sf.write(buffer, audio_data, sample_rate, format='wav')
114
  buffer.seek(0)
115
 
116
  transcription = self.client.audio.transcriptions.create(
117
  model=settings.WHISPER_MODEL,
118
- file=("speech.wav", buffer.read()),
119
  response_format="text",
120
  language="vi"
121
  )
122
 
123
- return transcription.strip()
 
 
 
 
 
 
 
124
  except Exception as e:
125
  print(f"❌ Lỗi transcription: {e}")
126
  return None
@@ -128,23 +289,25 @@ class StreamingVoiceService:
128
  def _generate_ai_response(self, user_input: str) -> str:
129
  """Sinh phản hồi AI"""
130
  try:
 
131
  self.conversation_history.append({"role": "user", "content": user_input})
132
 
 
133
  rag_results = self.rag_system.semantic_search(user_input, top_k=2)
134
- context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
135
 
136
  system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
137
- Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
138
-
139
  Thông tin tham khảo:
140
  {context_text}
141
  """
142
 
143
  messages = [{"role": "system", "content": system_prompt}]
144
- messages.extend(self.conversation_history[-6:])
 
145
 
146
  completion = self.client.chat.completions.create(
147
- model=settings.LLM_MODEL,
148
  messages=messages,
149
  max_tokens=150,
150
  temperature=0.7
@@ -153,8 +316,9 @@ Thông tin tham khảo:
153
  response = completion.choices[0].message.content
154
  self.conversation_history.append({"role": "assistant", "content": response})
155
 
156
- if len(self.conversation_history) > 10:
157
- self.conversation_history = self.conversation_history[-10:]
 
158
 
159
  return response
160
 
@@ -171,10 +335,14 @@ Thông tin tham khảo:
171
  print(f"❌ Lỗi TTS: {e}")
172
  return None
173
 
 
 
 
 
 
174
  def get_conversation_state(self) -> dict:
175
  """Lấy trạng thái hội thoại"""
176
  return {
177
- 'is_listening': self.is_listening,
178
  'history_length': len(self.conversation_history),
179
  'current_transcription': self.current_transcription
180
- }
 
1
+ # import io
2
+ # import numpy as np
3
+ # import soundfile as sf
4
+ # import threading
5
+ # import time
6
+ # import sounddevice as sd
7
+ # from groq import Groq
8
+ # from typing import Optional, Callable
9
+ # from config.settings import settings
10
+ # from core.speechbrain_vad import SpeechBrainVAD
11
+ # from core.rag_system import EnhancedRAGSystem
12
+ # from core.tts_service import EnhancedTTSService
13
+
14
+
15
+ # class StreamingVoiceService:
16
+ # def __init__(self, groq_client: Groq, rag_system: EnhancedRAGSystem, tts_service: EnhancedTTSService):
17
+ # self.client = groq_client
18
+ # self.rag_system = rag_system
19
+ # self.tts_service = tts_service
20
+ # self.vad_processor = SpeechBrainVAD()
21
+
22
+ # # Streaming state
23
+ # self.is_listening = False
24
+ # self.audio_stream = None
25
+ # self.callback_handler = None
26
+
27
+ # # Conversation context
28
+ # self.conversation_history = []
29
+ # self.current_transcription = ""
30
+
31
+ # def start_listening(self, callback_handler: Callable):
32
+ # """Bắt đầu lắng nghe với sounddevice"""
33
+ # if self.is_listening:
34
+ # return False
35
+
36
+ # try:
37
+ # self.callback_handler = callback_handler
38
+ # self.is_listening = True
39
+ # self.conversation_history = []
40
+
41
+ # # Start VAD processing thread
42
+ # self.vad_processor.start_stream(self._process_speech_segment)
43
+
44
+ # # Khởi động thread lắng nghe
45
+ # threading.Thread(target=self._listen_loop, daemon=True).start()
46
+
47
+ # print("🎙️ Bắt đầu lắng nghe (sounddevice)...")
48
+ # return True
49
+
50
+ # except Exception as e:
51
+ # print(f"❌ Lỗi khởi động stream: {e}")
52
+ # self.stop_listening()
53
+ # return False
54
+
55
+ # def stop_listening(self):
56
+ # """Dừng lắng nghe"""
57
+ # self.is_listening = False
58
+ # self.vad_processor.stop_stream()
59
+ # print("🛑 Đã dừng lắng nghe")
60
+
61
+ # def _listen_loop(self):
62
+ # """Luồng lấy mẫu âm thanh liên tục"""
63
+ # try:
64
+ # with sd.InputStream(
65
+ # samplerate=settings.SAMPLE_RATE,
66
+ # channels=1,
67
+ # dtype="float32",
68
+ # blocksize=1024,
69
+ # callback=self._audio_callback
70
+ # ):
71
+ # while self.is_listening:
72
+ # time.sleep(0.05)
73
+ # except Exception as e:
74
+ # print(f"❌ Lỗi luồng âm thanh: {e}")
75
+ # self.stop_listening()
76
+
77
+ # def _audio_callback(self, in_data, frames, time_info, status):
78
+ # """Callback xử lý audio input real-time"""
79
+ # if status:
80
+ # print(f"⚠️ Trạng thái âm thanh: {status}")
81
+
82
+ # if self.is_listening:
83
+ # audio_data = np.copy(in_data[:, 0]) # Mono
84
+ # self.vad_processor.process_stream(audio_data, settings.SAMPLE_RATE)
85
+
86
+ # def _process_speech_segment(self, speech_audio: np.ndarray, sample_rate: int):
87
+ # """Xử lý đoạn giọng nói"""
88
+ # if not self.is_listening or len(speech_audio) == 0:
89
+ # return
90
+
91
+ # print(f"🎯 Đang xử lý segment giọng nói ({len(speech_audio)} samples)...")
92
+
93
+ # transcription = self._transcribe_audio(speech_audio, sample_rate)
94
+ # if transcription and len(transcription.strip()) > 0:
95
+ # self.current_transcription = transcription
96
+ # print(f"📝 Transcription: {transcription}")
97
+
98
+ # response = self._generate_ai_response(transcription)
99
+ # tts_audio = self._text_to_speech(response)
100
+
101
+ # if self.callback_handler:
102
+ # self.callback_handler({
103
+ # 'transcription': transcription,
104
+ # 'response': response,
105
+ # 'tts_audio': tts_audio,
106
+ # 'speech_audio': speech_audio
107
+ # })
108
+
109
+ # def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
110
+ # """Chuyển audio -> text"""
111
+ # try:
112
+ # buffer = io.BytesIO()
113
+ # sf.write(buffer, audio_data, sample_rate, format='wav')
114
+ # buffer.seek(0)
115
+
116
+ # transcription = self.client.audio.transcriptions.create(
117
+ # model=settings.WHISPER_MODEL,
118
+ # file=("speech.wav", buffer.read()),
119
+ # response_format="text",
120
+ # language="vi"
121
+ # )
122
+
123
+ # return transcription.strip()
124
+ # except Exception as e:
125
+ # print(f"❌ Lỗi transcription: {e}")
126
+ # return None
127
+
128
+ # def _generate_ai_response(self, user_input: str) -> str:
129
+ # """Sinh phản hồi AI"""
130
+ # try:
131
+ # self.conversation_history.append({"role": "user", "content": user_input})
132
+
133
+ # rag_results = self.rag_system.semantic_search(user_input, top_k=2)
134
+ # context_text = "\n".join([f"- {doc.text}" for doc in rag_results]) if rag_results else ""
135
+
136
+ # system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
137
+ # Hãy trả lời ngắn gọn, tự nhiên và hữu ích.
138
+
139
+ # Thông tin tham khảo:
140
+ # {context_text}
141
+ # """
142
+
143
+ # messages = [{"role": "system", "content": system_prompt}]
144
+ # messages.extend(self.conversation_history[-6:])
145
+
146
+ # completion = self.client.chat.completions.create(
147
+ # model=settings.LLM_MODEL,
148
+ # messages=messages,
149
+ # max_tokens=150,
150
+ # temperature=0.7
151
+ # )
152
+
153
+ # response = completion.choices[0].message.content
154
+ # self.conversation_history.append({"role": "assistant", "content": response})
155
+
156
+ # if len(self.conversation_history) > 10:
157
+ # self.conversation_history = self.conversation_history[-10:]
158
+
159
+ # return response
160
+
161
+ # except Exception as e:
162
+ # return f"Xin lỗi, tôi gặp lỗi: {str(e)}"
163
+
164
+ # def _text_to_speech(self, text: str) -> Optional[str]:
165
+ # """Chuyển văn bản thành giọng nói"""
166
+ # try:
167
+ # tts_bytes = self.tts_service.text_to_speech(text, 'vi')
168
+ # if tts_bytes:
169
+ # return self.tts_service.save_audio_to_file(tts_bytes)
170
+ # except Exception as e:
171
+ # print(f"❌ Lỗi TTS: {e}")
172
+ # return None
173
+
174
+ # def get_conversation_state(self) -> dict:
175
+ # """Lấy trạng thái hội thoại"""
176
+ # return {
177
+ # 'is_listening': self.is_listening,
178
+ # 'history_length': len(self.conversation_history),
179
+ # 'current_transcription': self.current_transcription
180
+ # }
181
  import io
182
  import numpy as np
183
  import soundfile as sf
184
  import threading
185
  import time
186
+ import traceback
187
  from groq import Groq
188
  from typing import Optional, Callable
189
  from config.settings import settings
 
190
  from core.rag_system import EnhancedRAGSystem
191
  from core.tts_service import EnhancedTTSService
192
 
 
196
  self.client = groq_client
197
  self.rag_system = rag_system
198
  self.tts_service = tts_service
 
199
 
200
  # Streaming state
201
  self.is_listening = False
 
202
  self.callback_handler = None
203
 
204
  # Conversation context
205
  self.conversation_history = []
206
  self.current_transcription = ""
207
 
208
def process_streaming_audio(self, audio_data: tuple) -> dict:
    """Run one audio clip through transcription, AI response and TTS.

    Args:
        audio_data: ``(sample_rate, audio_array)`` pair as delivered by the
            Gradio microphone component; a falsy value means no audio.

    Returns:
        A dict with the keys ``'transcription'``, ``'response'`` and
        ``'tts_audio'``. ``'tts_audio'`` is whatever ``_text_to_speech``
        returns on the success path and ``None`` on every failure path.
        Exceptions raised while processing are reported inside the
        returned dict instead of propagating.
    """
    # Built per call so callers may freely mutate the returned dict.
    no_audio_result = {
        'transcription': "❌ Không có dữ liệu âm thanh",
        'response': "Vui lòng nói lại",
        'tts_audio': None
    }
    if not audio_data:
        return no_audio_result

    try:
        # Unpack the (sample_rate, samples) pair.
        sample_rate, audio_array = audio_data

        # An empty recording is "no audio", not an internal error.
        if audio_array is None or len(audio_array) == 0:
            return no_audio_result

        print(f"🎯 Nhận audio: {len(audio_array)} samples, SR: {sample_rate}")

        # Speech -> text
        transcription = self._transcribe_audio(audio_array, sample_rate)

        if not transcription or not transcription.strip():
            return {
                'transcription': "❌ Không nghe ",
                'response': "Xin vui lòng nói lại rõ hơn",
                'tts_audio': None
            }

        # Keep the state reported by get_conversation_state() up to date.
        self.current_transcription = transcription
        print(f"📝 Đã chuyển đổi: {transcription}")

        # Text -> AI reply
        response = self._generate_ai_response(transcription)

        # AI reply -> synthesized speech
        tts_audio_path = self._text_to_speech(response)

        return {
            'transcription': transcription,
            'response': response,
            'tts_audio': tts_audio_path
        }

    except Exception as e:
        # Top-level boundary for the UI: report the failure in the result.
        print(f"❌ Lỗi xử lý streaming audio: {e}")
        return {
            'transcription': f"❌ Lỗi: {str(e)}",
            'response': "Xin lỗi, có lỗi xảy ra",
            'tts_audio': None
        }
254
 
255
  def _transcribe_audio(self, audio_data: np.ndarray, sample_rate: int) -> Optional[str]:
256
  """Chuyển audio -> text"""
257
  try:
258
+ # Chuẩn hóa audio data
259
+ if audio_data.ndim > 1:
260
+ audio_data = np.mean(audio_data, axis=1) # Chuyển sang mono
261
+
262
+ # Normalize
263
+ if np.max(np.abs(audio_data)) > 0:
264
+ audio_data = audio_data / np.max(np.abs(audio_data))
265
+
266
  buffer = io.BytesIO()
267
+ sf.write(buffer, audio_data, sample_rate, format='wav', subtype='PCM_16')
268
  buffer.seek(0)
269
 
270
  transcription = self.client.audio.transcriptions.create(
271
  model=settings.WHISPER_MODEL,
272
+ file=("speech.wav", buffer.read(), "audio/wav"),
273
  response_format="text",
274
  language="vi"
275
  )
276
 
277
+ # Xử lý response
278
+ if hasattr(transcription, 'text'):
279
+ return transcription.text.strip()
280
+ elif isinstance(transcription, str):
281
+ return transcription.strip()
282
+ else:
283
+ return str(transcription).strip()
284
+
285
  except Exception as e:
286
  print(f"❌ Lỗi transcription: {e}")
287
  return None
 
289
  def _generate_ai_response(self, user_input: str) -> str:
290
  """Sinh phản hồi AI"""
291
  try:
292
+ # Thêm vào lịch sử
293
  self.conversation_history.append({"role": "user", "content": user_input})
294
 
295
+ # Tìm kiếm RAG
296
  rag_results = self.rag_system.semantic_search(user_input, top_k=2)
297
+ context_text = "\n".join([f"- {result.get('text', str(result))}" for result in rag_results]) if rag_results else ""
298
 
299
  system_prompt = f"""Bạn là trợ lý AI thông minh chuyên về tiếng Việt.
300
+ Hãy trả lời ngắn gọn, tự nhiên và hữu ích (dưới 100 từ).
 
301
  Thông tin tham khảo:
302
  {context_text}
303
  """
304
 
305
  messages = [{"role": "system", "content": system_prompt}]
306
+ # Giữ lại 4 tin nhắn gần nhất
307
+ messages.extend(self.conversation_history[-4:])
308
 
309
  completion = self.client.chat.completions.create(
310
+ model="llama-3.1-8b-instant",
311
  messages=messages,
312
  max_tokens=150,
313
  temperature=0.7
 
316
  response = completion.choices[0].message.content
317
  self.conversation_history.append({"role": "assistant", "content": response})
318
 
319
+ # Giới hạn lịch sử
320
+ if len(self.conversation_history) > 8:
321
+ self.conversation_history = self.conversation_history[-8:]
322
 
323
  return response
324
 
 
335
  print(f"❌ Lỗi TTS: {e}")
336
  return None
337
 
338
def clear_conversation(self):
    """Discard the stored conversation history and log that it was cleared."""
    # Rebind to a fresh list rather than mutating the existing one in place.
    fresh_history = []
    self.conversation_history = fresh_history
    print("🗑️ Đã xóa lịch sử hội thoại")
342
+
343
  def get_conversation_state(self) -> dict:
344
  """Lấy trạng thái hội thoại"""
345
  return {
 
346
  'history_length': len(self.conversation_history),
347
  'current_transcription': self.current_transcription
348
+ }