Spaces:

datbkpro
/

voicebot

Sleeping

App Files Files Community

datbkpro committed on Nov 13, 2025

Commit

0304fae

verified ·

1 Parent(s): 361b4a9

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +121 -41

services/streaming_voice_service.py CHANGED Viewed

@@ -8,13 +8,12 @@ import threading
 import queue
 import json
 import os
 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
-from config.settings import settings
-from core.rag_system import EnhancedRAGSystem
-from core.tts_service import EnhancedTTSService
-from core.silero_vad import SileroVAD
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
         self.model = None
@@ -22,10 +21,8 @@ class VoskStreamingASR:
         self.sample_rate = 16000
         self.is_streaming = False
-        # Buffer để tích luỹ audio - QUAN TRỌNG
         self.audio_buffer = []
-        self.buffer_duration = 2.0  # tích luỹ 2 giây audio
-        self.min_samples_for_recognition = 32000  # ít nhất 2 giây audio
         if model_path is None:
             model_path = self._download_vosk_model()
@@ -40,7 +37,38 @@ class VoskStreamingASR:
             except Exception as e:
                 print(f"❌ Lỗi khởi tạo VOSK model: {e}")
         else:
-            print(f"❌ Không tìm thấy VOSK model tại: {model_path}")
     def start_stream(self):
         """Bắt đầu stream mới"""
@@ -59,45 +87,45 @@ class VoskStreamingASR:
             return False
     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
-        """Xử lý audio chunk - SIMPLIFIED VERSION"""
         if self.recognizer is None or not self.is_streaming:
             return {"text": "", "partial": "", "is_final": False}
         try:
-            # Đơn giản hoá: luôn xử lý, không check âm lượng
             if sample_rate and sample_rate != self.sample_rate:
                 audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
             # Đảm bảo là int16
             if audio_chunk.dtype != np.int16:
-                audio_chunk = audio_chunk.astype(np.int16)
             # THÊM VÀO BUFFER - QUAN TRỌNG
             self.audio_buffer.extend(audio_chunk)
-            # Chỉ xử lý khi có đủ audio
-            if len(self.audio_buffer) < 16000:  # ít nhất 1 giây
                 return {"text": "", "partial": "Đang nghe...", "is_final": False}
-            # Lấy audio từ buffer để xử lý
-            process_audio = np.array(self.audio_buffer[-32000:], dtype=np.int16)  # lấy 2 giây gần nhất
             # Chuyển sang bytes
             audio_bytes = process_audio.tobytes()
-            # Xử lý với VOSK - GỬI NHIỀU LẦN
-            for i in range(0, len(audio_bytes), 8000):  # gửi từng chunk nhỏ
-                chunk = audio_bytes[i:i+8000]
-                if len(chunk) > 0:
-                    if self.recognizer.AcceptWaveform(chunk):
-                        result_json = self.recognizer.Result()
-                        result = json.loads(result_json)
-                        text = result.get('text', '').strip()
-                        if text:
-                            print(f"✅ VOSK Final: '{text}'")
-                            # Reset buffer sau khi có kết quả
-                            self.audio_buffer = []
-                            return {"text": text, "partial": "", "is_final": True}
             # Kiểm tra partial result
             partial_json = self.recognizer.PartialResult()
@@ -107,13 +135,38 @@ class VoskStreamingASR:
             if partial_text:
                 print(f"🎯 VOSK Partial: '{partial_text}'")
                 return {"text": "", "partial": partial_text, "is_final": False}
-            else:
-                return {"text": "", "partial": "Đang xử lý âm thanh...", "is_final": False}
         except Exception as e:
             print(f"❌ Lỗi VOSK processing: {e}")
-        return {"text": "", "partial": "", "is_final": False}
 class StreamingVoiceService:
     def __init__(self, groq_client: Groq, rag_system, tts_service):
@@ -128,30 +181,39 @@ class StreamingVoiceService:
         self.current_callback = None
     def start_listening(self, speech_callback: Callable) -> bool:
-        """Bắt đầu lắng nghe - ĐƠN GIẢN"""
         if self.is_listening:
             return False
         self.current_callback = speech_callback
         if self.vosk_asr.model is None:
             return False
         if not self.vosk_asr.start_stream():
             return False
         self.is_listening = True
-        print("🎙️ Đã bắt đầu lắng nghe")
         return True
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming - ĐƠN GIẢN VÀ HIỆU QUẢ"""
         if not audio_data:
-            return {"transcription": "Không có âm thanh", "response": "", "tts_audio": None, "status": "error"}
         try:
             sample_rate, audio_array = audio_data
             # Đảm bảo VOSK stream đang chạy
             if not self.vosk_asr.is_streaming:
                 self.vosk_asr.start_stream()
@@ -159,7 +221,7 @@ class StreamingVoiceService:
             # Xử lý với VOSK
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
-            # LUÔN trả về kết quả partial để hiển thị real-time
             if result['partial']:
                 return {
                     'transcription': result['partial'],
@@ -169,30 +231,36 @@ class StreamingVoiceService:
                 }
             elif result['is_final'] and result['text']:
                 # Có kết quả cuối - tạo phản hồi AI
                 response = self._generate_ai_response(result['text'])
                 return {
                     'transcription': result['text'],
                     'response': response,
-                    'tts_audio': None,  # có thể thêm TTS sau
                     'status': 'completed'
                 }
             else:
                 return {
-                    'transcription': "🎤 Đang nghe... nói tiếp đi",
                     'response': "",
                     'tts_audio': None,
                     'status': 'listening'
                 }
         except Exception as e:
-            print(f"❌ Lỗi: {e}")
-            return {"transcription": f"Lỗi: {e}", "response": "", "tts_audio": None, "status": "error"}
     def _generate_ai_response(self, transcription: str) -> str:
         """Tạo phản hồi AI đơn giản"""
         try:
             messages = [
-                {"role": "system", "content": "Bạn là trợ lý AI. Trả lời ngắn gọn bằng tiếng Việt."},
                 {"role": "user", "content": transcription}
             ]
@@ -206,6 +274,7 @@ class StreamingVoiceService:
             return response.choices[0].message.content.strip()
         except Exception as e:
             return "Xin lỗi, tôi không thể trả lời ngay lúc này."
     def stop_listening(self):
@@ -214,6 +283,17 @@ class StreamingVoiceService:
         if self.vosk_asr:
             self.vosk_asr.stop_stream()
         print("🛑 Đã dừng lắng nghe")
 # import io
 # import numpy as np
 # import soundfile as sf

 import queue
 import json
 import os
+import urllib.request
+import zipfile
 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
         self.model = None
         self.sample_rate = 16000
         self.is_streaming = False
+        # Buffer để tích luỹ audio
         self.audio_buffer = []
         if model_path is None:
             model_path = self._download_vosk_model()
             except Exception as e:
                 print(f"❌ Lỗi khởi tạo VOSK model: {e}")
         else:
+            print(f"❌ Không tìm thấy VOSK model")
+    def _download_vosk_model(self):
+        """Tải VOSK model tiếng Việt tự động"""
+        try:
+            model_url = "https://alphacephei.com/vosk/models/vosk-model-small-vn-0.4.zip"
+            model_dir = "models/vosk-model-small-vn-0.4"
+            zip_path = "models/vosk-model-small-vn-0.4.zip"
+            # Tạo thư mục nếu chưa có
+            os.makedirs("models", exist_ok=True)
+            if not os.path.exists(model_dir):
+                print("📥 Đang tải VOSK Vietnamese model...")
+                urllib.request.urlretrieve(model_url, zip_path)
+                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                    zip_ref.extractall("models/")
+                # Đảm bảo thư mục tồn tại
+                if os.path.exists("models/vosk-model-small-vn-0.4"):
+                    os.rename("models/vosk-model-small-vn-0.4", model_dir)
+                if os.path.exists(zip_path):
+                    os.remove(zip_path)
+                print("✅ Đã tải VOSK model thành công")
+            return model_dir if os.path.exists(model_dir) else None
+        except Exception as e:
+            print(f"❌ Lỗi tải VOSK model: {e}")
+            return None
     def start_stream(self):
         """Bắt đầu stream mới"""
             return False
     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
+        """Xử lý audio chunk - SIMPLE & EFFECTIVE"""
         if self.recognizer is None or not self.is_streaming:
             return {"text": "", "partial": "", "is_final": False}
         try:
+            # Resample nếu cần
             if sample_rate and sample_rate != self.sample_rate:
                 audio_chunk = self._resample_audio(audio_chunk, sample_rate, self.sample_rate)
             # Đảm bảo là int16
             if audio_chunk.dtype != np.int16:
+                if audio_chunk.dtype in [np.float32, np.float64]:
+                    audio_chunk = (audio_chunk * 32767).astype(np.int16)
+                else:
+                    audio_chunk = audio_chunk.astype(np.int16)
             # THÊM VÀO BUFFER - QUAN TRỌNG
             self.audio_buffer.extend(audio_chunk)
+            # Chỉ xử lý khi có đủ audio (ít nhất 1 giây)
+            if len(self.audio_buffer) < 16000:
                 return {"text": "", "partial": "Đang nghe...", "is_final": False}
+            # Lấy audio từ buffer để xử lý (2 giây gần nhất)
+            process_audio = np.array(self.audio_buffer[-32000:], dtype=np.int16)
             # Chuyển sang bytes
             audio_bytes = process_audio.tobytes()
+            # Xử lý với VOSK
+            if self.recognizer.AcceptWaveform(audio_bytes):
+                result_json = self.recognizer.Result()
+                result = json.loads(result_json)
+                text = result.get('text', '').strip()
+                if text:
+                    print(f"✅ VOSK Final: '{text}'")
+                    # Reset buffer sau khi có kết quả
+                    self.audio_buffer = []
+                    return {"text": text, "partial": "", "is_final": True}
             # Kiểm tra partial result
             partial_json = self.recognizer.PartialResult()
             if partial_text:
                 print(f"🎯 VOSK Partial: '{partial_text}'")
                 return {"text": "", "partial": partial_text, "is_final": False}
         except Exception as e:
             print(f"❌ Lỗi VOSK processing: {e}")
+        return {"text": "", "partial": "Nói tiếp đi...", "is_final": False}
+    def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+        """Resample audio"""
+        if orig_sr == target_sr:
+            return audio
+        try:
+            from scipy import signal
+            num_samples = int(len(audio) * target_sr / orig_sr)
+            resampled_audio = signal.resample(audio, num_samples)
+            return resampled_audio.astype(np.int16)
+        except Exception as e:
+            print(f"❌ Lỗi resample audio: {e}")
+            return audio
+    def stop_stream(self) -> str:
+        """Kết thúc stream và lấy kết quả cuối"""
+        if self.recognizer:
+            try:
+                result_json = self.recognizer.FinalResult()
+                result = json.loads(result_json)
+                text = result.get('text', '').strip()
+                self.is_streaming = False
+                print(f"🛑 VOSK Final: '{text}'")
+                return text
+            except Exception as e:
+                print(f"❌ Lỗi khi dừng VOSK stream: {e}")
+        return ""
 class StreamingVoiceService:
     def __init__(self, groq_client: Groq, rag_system, tts_service):
         self.current_callback = None
     def start_listening(self, speech_callback: Callable) -> bool:
+        """Bắt đầu lắng nghe"""
         if self.is_listening:
             return False
         self.current_callback = speech_callback
         if self.vosk_asr.model is None:
+            print("❌ VOSK model không khả dụng")
             return False
         if not self.vosk_asr.start_stream():
+            print("❌ Không thể khởi động VOSK stream")
             return False
         self.is_listening = True
+        print("🎙️ Đã bắt đầu lắng nghe với VOSK")
         return True
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming - ĐƠN GIẢN & HIỆU QUẢ"""
         if not audio_data:
+            return {
+                'transcription': "Không có âm thanh",
+                'response': "",
+                'tts_audio': None,
+                'status': 'error'
+            }
         try:
             sample_rate, audio_array = audio_data
+            print(f"🎤 Nhận audio: {len(audio_array)} samples")
             # Đảm bảo VOSK stream đang chạy
             if not self.vosk_asr.is_streaming:
                 self.vosk_asr.start_stream()
             # Xử lý với VOSK
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
+            # LUÔN trả về text để hiển thị real-time
             if result['partial']:
                 return {
                     'transcription': result['partial'],
                 }
             elif result['is_final'] and result['text']:
                 # Có kết quả cuối - tạo phản hồi AI
+                print(f"📝 Final transcription: '{result['text']}'")
                 response = self._generate_ai_response(result['text'])
                 return {
                     'transcription': result['text'],
                     'response': response,
+                    'tts_audio': None,
                     'status': 'completed'
                 }
             else:
                 return {
+                    'transcription': "🎤 Đang nghe... tiếp tục nói",
                     'response': "",
                     'tts_audio': None,
                     'status': 'listening'
                 }
         except Exception as e:
+            print(f"❌ Lỗi xử lý audio: {e}")
+            return {
+                'transcription': f"Lỗi: {e}",
+                'response': "",
+                'tts_audio': None,
+                'status': 'error'
+            }
     def _generate_ai_response(self, transcription: str) -> str:
         """Tạo phản hồi AI đơn giản"""
         try:
             messages = [
+                {"role": "system", "content": "Bạn là trợ lý AI thân thiện. Trả lời ngắn gọn bằng tiếng Việt."},
                 {"role": "user", "content": transcription}
             ]
             return response.choices[0].message.content.strip()
         except Exception as e:
+            print(f"❌ Lỗi AI: {e}")
             return "Xin lỗi, tôi không thể trả lời ngay lúc này."
     def stop_listening(self):
         if self.vosk_asr:
             self.vosk_asr.stop_stream()
         print("🛑 Đã dừng lắng nghe")
+    def clear_conversation(self):
+        """Log that the conversation history was cleared; this method resets no state itself."""
+        print("🗑️ Đã xóa lịch sử hội thoại")
+    def get_conversation_state(self) -> dict:
+        """Lấy trạng thái hội thoại"""
+        return {
+            'is_listening': self.is_listening,
+            'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False
+        }
 # import io
 # import numpy as np
 # import soundfile as sf