Spaces:

Nguyen5
/

chatbot1

Sleeping

App Files Community

Nguyen5 committed on Dec 8, 2025

Commit

4e44ffc

1 Parent(s): 5724c84

commit

Browse files

Files changed (2) hide show

app.py +9 -24
speech_io.py +48 -112

app.py CHANGED Viewed

@@ -14,12 +14,10 @@ from vectorstore import build_vectorstore
 from retriever import get_retriever
 from llm import load_llm
 from rag_pipeline import answer
-from speech_io import transcribe_audio, synthesize_speech, transcribe_with_groq, detect_voice_activity
 # Cấu hình môi trường
 ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
-USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
-GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
 ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
 VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
@@ -137,15 +135,9 @@ def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) ->
 # TRANSCRIBE WITH OPTIMIZED PIPELINE
 # =====================================================
 def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
-    """Transcribe audio với pipeline tối ưu"""
     if not audio_path or not os.path.exists(audio_path):
         return ""
-    if USE_GROQ and GROQ_MODEL:
-        print("Using Groq for transcription...")
-        return transcribe_with_groq(audio_path, language=language)
-    else:
-        return transcribe_audio(audio_path, language=language)
 # =====================================================
 # CONVERSATIONAL INTELLIGENCE
@@ -251,7 +243,7 @@ def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
     if not text_to_process:
         print("DEBUG: No text to process")
         # Trả về history hiện tại và status
-        status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: {state.whisper_model}"
         if history is None:
             history = []
         return history, "", None, status_text
@@ -283,7 +275,7 @@ def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
         history.append({"role": "user", "content": text_to_process})
         history.append({"role": "assistant", "content": error_msg})
-    status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: {state.whisper_model}"
     return history, "", None, status_text
 # =====================================================
@@ -294,13 +286,13 @@ def toggle_vad(use_vad):
     global ENABLE_VAD
     ENABLE_VAD = use_vad
     status = "EIN" if use_vad else "AUS"
-    return f"Voice Activity Detection: {status} | Model: {state.whisper_model}"
 def change_whisper_model(model_size):
     """Đổi Whisper model"""
     state.whisper_model = model_size
     os.environ["WHISPER_MODEL"] = model_size
-    return f"Whisper Model: {model_size} | VAD: {'On' if ENABLE_VAD else 'Off'}"
 def clear_conversation():
     """Xóa hội thoại"""
@@ -597,7 +589,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
                 sources=["microphone"],
                 type="filepath",
                 format="wav",
-                streaming=True,
                 interactive=True,
                 show_label=False,
                 scale=1,
@@ -701,12 +693,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
         outputs=[chat_text, vad_indicator, status_display]
     )
-    # Audio Streaming
-    chat_audio.stream(
-        on_audio_change,
-        inputs=[chat_audio, vad_toggle],
-        outputs=[chat_text, vad_indicator, status_display]
-    )
     # TTS Button
     def handle_tts(history):
@@ -729,6 +716,4 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
     )
 if __name__ == "__main__":
-    demo.queue().launch(ssr_mode=False, show_error=True)

 from retriever import get_retriever
 from llm import load_llm
 from rag_pipeline import answer
+from speech_io import transcribe_audio, synthesize_speech, detect_voice_activity
 # Cấu hình môi trường
 ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
 ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
 VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
 # TRANSCRIBE WITH OPTIMIZED PIPELINE
 # =====================================================
 def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
     if not audio_path or not os.path.exists(audio_path):
         return ""
+    return transcribe_audio(audio_path, language=language)
 # =====================================================
 # CONVERSATIONAL INTELLIGENCE
     if not text_to_process:
         print("DEBUG: No text to process")
         # Trả về history hiện tại và status
+        status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
         if history is None:
             history = []
         return history, "", None, status_text
         history.append({"role": "user", "content": text_to_process})
         history.append({"role": "assistant", "content": error_msg})
+    status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
     return history, "", None, status_text
 # =====================================================
     global ENABLE_VAD
     ENABLE_VAD = use_vad
     status = "EIN" if use_vad else "AUS"
+    return f"Voice Activity Detection: {status} | Model: OpenAI whisper-1"
 def change_whisper_model(model_size):
     """Đổi Whisper model"""
     state.whisper_model = model_size
     os.environ["WHISPER_MODEL"] = model_size
+    return f"Whisper Model: OpenAI whisper-1 | VAD: {'On' if ENABLE_VAD else 'Off'}"
 def clear_conversation():
     """Xóa hội thoại"""
                 sources=["microphone"],
                 type="filepath",
                 format="wav",
+                streaming=False,
                 interactive=True,
                 show_label=False,
                 scale=1,
         outputs=[chat_text, vad_indicator, status_display]
     )
+    # Streaming handler removed; process on change after user stops recording
     # TTS Button
     def handle_tts(history):
     )
 if __name__ == "__main__":
+    demo.queue().launch(show_error=True)

speech_io.py CHANGED Viewed

@@ -24,10 +24,8 @@ WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
 ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
 TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
-# Groq Configuration
-USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
-GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
-GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
 # VAD Configuration
 ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
@@ -279,65 +277,37 @@ def detect_voice_activity(
 # ========================================================
 # SPEECH-TO-TEXT FUNCTIONS
 # ========================================================
-def get_asr_pipeline():
-    """Lấy ASR pipeline"""
-    global _asr
-    if _asr is None:
-        print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
-        from transformers import pipeline
-        _asr = pipeline(
-            task="automatic-speech-recognition",
-            model=ASR_MODEL_ID,
-            device="cpu",
-            return_timestamps=False,
-            chunk_length_s=8,
-            stride_length_s=(1, 1),
-        )
-    return _asr
-def transcribe_with_groq(audio_path: str, language: Optional[str] = None) -> str:
     """
-    Transcribe audio sử dụng Groq Cloud API
     """
-    if not GROQ_API_KEY:
-        print(">>> Groq API key nicht gefunden. Verwende lokales Modell.")
-        return transcribe_audio(audio_path, language)
     try:
-        import requests
-        with open(audio_path, 'rb') as audio_file:
-            files = {'file': audio_file}
-            data = {'model': GROQ_MODEL}
-            if language and language != 'auto':
-                data['language'] = language
-            headers = {'Authorization': f'Bearer {GROQ_API_KEY}'}
-            print(f">>> Sende Anfrage an Groq API...")
-            response = requests.post(
-                "https://api.groq.com/openai/v1/audio/transcriptions",
-                headers=headers,
-                files=files,
-                data=data,
-                timeout=30
             )
-            if response.status_code == 200:
-                result = response.json()
-                text = result.get('text', '').strip()
-                print(f">>> Groq Transkription: {text}")
-                return text
-            else:
-                print(f">>> Groq Fehler {response.status_code}")
-                return transcribe_audio(audio_path, language)
     except Exception as e:
-        print(f">>> Groq Fehler: {e}")
-        return transcribe_audio(audio_path, language)
 def transcribe_audio(
     audio_path: str,
@@ -421,65 +391,32 @@ def transcribe_audio(
 # ========================================================
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
-def get_tts_pipeline():
-    """Lấy TTS pipeline"""
-    global _tts
-    if _tts is None:
-        print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
-        from transformers import pipeline
-        _tts = pipeline(
-            task="text-to-speech",
-            model=TTS_MODEL_ID,
-        )
-    return _tts
 def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
     """
-    Chuyển text sang speech
     """
-    if not text or not text.strip() or not TTS_ENABLED:
         return None
     try:
-        tts = get_tts_pipeline()
-        out = tts(text)
-        audio = np.array(out["audio"], dtype=np.float32)
-        sr = out.get("sampling_rate", 16000)
-        # Ensure valid sample rate
-        if sr is None or sr <= 0:
-            sr = 16000
-        # Ensure mono
-        if audio.ndim > 1:
-            audio = audio.squeeze()
-        if audio.ndim > 1:
-            audio = audio[:, 0]
-        # Apply processing
-        try:
-            audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
-        except:
-            pass
-        # Normalize
-        max_val = np.max(np.abs(audio))
-        if max_val > 0:
-            audio = audio / max_val
-        # Apply fade
-        audio = apply_fade(audio, sr)
-        # Convert to int16
-        audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
-        return (sr, audio_int16)
     except Exception as e:
-        print(f">>> TTS Fehler: {e}")
         return None
 # ========================================================
@@ -511,7 +448,6 @@ def fix_domain_terms(text: str) -> str:
 # ========================================================
 __all__ = [
     'transcribe_audio',
-    'transcribe_with_groq',
     'synthesize_speech',
     'detect_voice_activity',
     'normalize_audio',

 ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
 TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
+# OpenAI Configuration
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 # VAD Configuration
 ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
 # ========================================================
 # SPEECH-TO-TEXT FUNCTIONS
 # ========================================================
+def transcribe_audio(
+    audio_path: str,
+    language: Optional[str] = None,
+    max_duration_s: int = ASR_MAX_DURATION_S
+) -> str:
     """
+    Transcribe audio bằng OpenAI Whisper API
     """
+    if not audio_path or not os.path.exists(audio_path):
+        print(">>> Kein Audio gefunden.")
+        return ""
+    if not OPENAI_API_KEY:
+        print(">>> OPENAI_API_KEY nicht gesetzt.")
+        return ""
     try:
+        from openai import OpenAI
+        client = OpenAI(api_key=OPENAI_API_KEY)
+        with open(audio_path, "rb") as f:
+            resp = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=f,
+                language=language if language and language != "auto" else None,
+                response_format="text"
             )
+        text = resp.text if hasattr(resp, "text") else (resp.get("text", "") if isinstance(resp, dict) else str(resp))
+        text = fix_domain_terms(text.strip())
+        print(f">>> Transkription (OpenAI): {text}")
+        return text
     except Exception as e:
+        print(f">>> Transkriptionsfehler (OpenAI): {e}")
+        return ""
 def transcribe_audio(
     audio_path: str,
 # ========================================================
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
 def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
     """
+    Chuyển text sang speech bằng OpenAI TTS
     """
+    if not text or not text.strip() or not TTS_ENABLED or not OPENAI_API_KEY:
         return None
     try:
+        from openai import OpenAI
+        client = OpenAI(api_key=OPENAI_API_KEY)
+        response = client.audio.speech.create(
+            model="tts-1",
+            voice="nova",
+            input=text[:4000],
+            response_format="wav"
+        )
+        import io
+        audio_bytes = response.content
+        with io.BytesIO(audio_bytes) as f:
+            data, sr = sf.read(f)
+        if len(data.shape) > 1:
+            data = np.mean(data, axis=1)
+        if data.dtype == np.float32 or data.dtype == np.float64:
+            data = np.clip(data * 32767, -32768, 32767).astype(np.int16)
+        return (sr, data)
     except Exception as e:
+        print(f">>> TTS Fehler (OpenAI): {e}")
         return None
 # ========================================================
 # ========================================================
 __all__ = [
     'transcribe_audio',
     'synthesize_speech',
     'detect_voice_activity',
     'normalize_audio',