Spaces:

Nguyen5
/

chatbot1

Sleeping

App Files Files Community

Nguyen5 committed on Dec 7, 2025

Commit

fcc2090

1 Parent(s): 090a936

commit

Browse files

Files changed (2) hide show

app.py +566 -228
speech_io.py +593 -215

app.py CHANGED Viewed

@@ -1,13 +1,12 @@
-# app.py – Prüfungsrechts-Chatbot (RAG + Sprache, UI kiểu ChatGPT + VAD)
-from __future__ import annotations
 import os
-from dataclasses import dataclass, field
-from typing import Any, List
 import gradio as gr
 from gradio_pdf import PDF
 from load_documents import load_all_documents
 from split_documents import split_documents
@@ -15,9 +14,72 @@ from vectorstore import build_vectorstore
 from retriever import get_retriever
 from llm import load_llm
 from rag_pipeline import answer
-from speech_io import transcribe_audio, synthesize_speech
-ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")  # "auto" = Auto-Detect
 # =====================================================
 # INITIALISIERUNG (global)
@@ -43,6 +105,121 @@ pdf_meta = next(d.metadata for d in docs if d.metadata.get("type") == "pdf")
 hg_meta = next(d.metadata for d in docs if d.metadata.get("type") == "hg")
 hg_url = hg_meta.get("viewer_url")
 # =====================================================
 # Quellen formatieren – Markdown für Chat
 # =====================================================
@@ -61,255 +238,416 @@ def format_sources(src):
     return "\n".join(out)
 # =====================================================
-# State Management (wie Gradio Guide)
-# =====================================================
-@dataclass
-class AppState:
-    conversation: list = field(default_factory=list)  # LLM-History (role/content)
-    stopped: bool = False
-    model_outs: Any = None
-# =====================================================
-# CORE CHAT-FUNKTION (Text + Mikro)
 # =====================================================
-def chat_fn(
-    text_input: str,
-    audio_path: str,
-    history: list,
-    state: AppState,
-):
-    # Ensure history is list of dicts
-    if history is None or not isinstance(history, list):
-        history = []
-    # Convert old style [[u, a], ...] → new style messages
-    new_history = []
-    for h in history:
-        if isinstance(h, dict):
-            new_history.append(h)
-        elif isinstance(h, list) and len(h) == 2:
-            new_history.append({"role": "user", "content": h[0]})
-            new_history.append({"role": "assistant", "content": h[1]})
-    history = new_history
     text = (text_input or "").strip()
-    # Audio-only input → transcribe
-    if audio_path and not text:
-        text = transcribe_audio(audio_path, language=ASR_LANGUAGE_HINT).strip()
     if not text:
-        return history, state, "", None
-    # Retrieve RAG answer
-    ans, sources = answer(text, retriever, llm)
     bot_msg = ans + format_sources(sources)
-    # Append user / assistant messages
-    history.append({"role": "user", "content": text})
-    history.append({"role": "assistant", "content": bot_msg})
-    # Also update state
-    state.conversation.append({"role": "user", "content": text})
-    state.conversation.append({"role": "assistant", "content": bot_msg})
-    return history, state, "", None
 # =====================================================
-# CSS + JS (VAD) – nach Gradio Guide adaptiert
 # =====================================================
-CUSTOM_STYLE_AND_VAD = """
-<style>
-html, body {height: auto !important; overflow-y: auto !important;}
-.gradio-container {max-width: 960px; margin: 0 auto; padding: 12px;}
-#chat-wrap {position: relative;}
-#chat-input-row {transform: translateY(-28px); margin-bottom: -28px;}
-/* ChatGPT-like Bottom Bar */
-#chat-input-row {
-    align-items: center;
-    gap: 8px;
-    padding: 8px 12px;
-    border: 1px solid rgba(0,0,0,0.08);
-    border-radius: 9999px;
-    background: var(--background-primary);
-    box-shadow: 0 2px 6px rgba(0,0,0,0.06);
-}
-/* Textbox inside pill */
-#chat-textbox textarea {
-    min-height: 42px;
-    max-height: 120px;
-    border: none !important;
-    background: transparent !important;
-    box-shadow: none !important;
-    resize: none;
-    padding-left: 0;
-}
-/* Icon buttons (plus, mic, send) */
-.icon-btn, .compact-btn {
-    width: 32px;
-    height: 32px;
-    border-radius: 9999px !important;
-    display: inline-flex;
-    align-items: center;
-    justify-content: center;
-    border: 1px solid rgba(0,0,0,0.08) !important;
-    background: #f7f7f8 !important;
-    box-shadow: none !important;
-}
-.send-btn {
-    background: #111 !important;
-    color: #fff !important;
-    border-color: #111 !important;
-}
-/* Make audio mic compact and borderless */
-#chat-audio {min-width: 32px; border: none !important; background: transparent !important;}
-#chat-audio .wrap, #chat-audio .audio-wrap, #chat-audio .audio-controls {max-width: 32px;}
-#chat-textbox textarea {border: none !important; outline: none !important;}
-@media (max-width: 768px) { #chat-input-row {transform: none; margin-bottom: 0;} }
-</style>
-<script>
-/*
- * Voice Activity Detection (VAD) nach Gradio Guide:
- * Nutzt @ricky0123/vad-web, um automatisch auf die
- * .record-button / .stop-button der Audio-Komponente zu klicken.
- */
-async function init_vad() {
-  try {
-    const script1 = document.createElement("script");
-    script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
-    document.head.appendChild(script1);
-    const script2 = document.createElement("script");
-    script2.onload = async () =>  {
-      console.log("VAD JS geladen");
-      const recordButton = document.querySelector('.record-button');
-      if (recordButton) {
-        recordButton.textContent = "Just start talking";
-      }
-      const myvad = await vad.MicVAD.new({
-        onSpeechStart: () => {
-          const record = document.querySelector('.record-button');
-          const player = document.querySelector('#streaming-out');
-          if (record && (!player || player.paused)) {
-            console.log("VAD: speech start → record.click()");
-            record.click();
-          }
-        },
-        onSpeechEnd: (audio) => {
-          const stop = document.querySelector('.stop-button');
-          if (stop) {
-            console.log("VAD: speech end → stop.click()");
-            stop.click();
-          }
-        }
-      });
-      myvad.start();
-    };
-    script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.7/dist/bundle.min.js";
-    document.head.appendChild(script2);
-  } catch (e) {
-    console.log("VAD init Fehler:", e);
-  }
-}
-if (typeof window !== "undefined") {
-  window.addEventListener("load", init_vad);
-}
-</script>
-"""
 # =====================================================
-# UI – GRADIO (ChatGPT-artig + VAD)
 # =====================================================
-with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
-    gr.HTML(CUSTOM_STYLE_AND_VAD)
-    gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
-    gr.Markdown(
-        "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
-        "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW. "
-        "Du kannst Text eingeben oder einfach anfangen zu sprechen – "
-        "die Aufnahme startet/stopt automatisch (Voice Activity Detection)."
-    )
-    with gr.Column(elem_id="chat-wrap"):
         chatbot = gr.Chatbot(
-            label="Chat",
-            height=380,
         )
-        # globaler State für Konversation usw.
-        state = gr.State(value=AppState())
-        # Eingabezeile à la ChatGPT: Plus + Text + Mikro + Senden
-        with gr.Row(elem_id="chat-input-row"):
-            attach_btn = gr.UploadButton(
-                "＋",
-                file_types=["file"],
-                file_count="multiple",
-                elem_classes=["icon-btn"],
-                scale=1,
             )
             chat_text = gr.Textbox(
-                elem_id="chat-textbox",
                 label=None,
-                placeholder="Stelle irgendeine Frage oder sprich einfach los …",
                 lines=1,
-                max_lines=6,
-                autofocus=True,
                 scale=8,
             )
             chat_audio = gr.Audio(
-                elem_id="chat-audio",
-                label="🎤",
                 sources=["microphone"],
                 type="filepath",
                 format="wav",
-                streaming=False,  # wichtig: record/stop Buttons für VAD
                 interactive=True,
-                scale=1,
                 show_label=False,
             )
-            send_btn = gr.Button(
-                "➤",
-                elem_classes=["compact-btn", "send-btn"],
-                scale=1,
-            )
-        # Senden bei Enter (Text)
-        chat_text.submit(
-            chat_fn,
-            [chat_text, chat_audio, chatbot, state],
-            [chatbot, state, chat_text, chat_audio],
-        )
-        # Audio-Stop (manuell oder durch VAD) → ganze Pipeline
-        chat_audio.change(
-            chat_fn,
-            [chat_text, chat_audio, chatbot, state],
-            [chatbot, state, chat_text, chat_audio],
-        )
-        # Senden-Button
-        send_btn.click(
-            chat_fn,
-            [chat_text, chat_audio, chatbot, state],
-            [chatbot, state, chat_text, chat_audio],
-        )
-        # Quellen & Dokumente kompakt unterhalb
-        with gr.Accordion("Quellen & Dokumente", open=False):
-            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
-            PDF(pdf_meta["pdf_url"], height=250)
-            gr.Markdown("### 📘 Hochschulgesetz NRW")
-            if isinstance(hg_url, str) and hg_url.startswith("http"):
-                gr.Markdown(f"[Im Viewer öffnen]({hg_url})")
-            else:
-                gr.Markdown("Viewer-Link nicht verfügbar.")
 if __name__ == "__main__":
-    demo.queue().launch(ssr_mode=False, show_error=True)

+# app.py – Prüfungsrechts-Chatbot (RAG + speech, ChatGPT-style UI) with advanced features
+#
 import os
+import time
+from dataclasses import dataclass
+from typing import Optional, Dict, Any
 import gradio as gr
 from gradio_pdf import PDF
+import numpy as np
 from load_documents import load_all_documents
 from split_documents import split_documents
 from retriever import get_retriever
 from llm import load_llm
 from rag_pipeline import answer
+from speech_io import transcribe_audio, synthesize_speech, transcribe_with_groq, detect_voice_activity
+# Environment-driven configuration (read once at import time)
+ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
+# Boolean flags are true only when the variable equals "true" (case-insensitive).
+USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
+# NOTE(review): GROQ_MODEL is not referenced in the visible part of app.py — confirm it is still needed here.
+GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
+ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
+# float() raises ValueError at import time if VAD_THRESHOLD is set to a non-numeric value.
+VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.5"))
+# =====================================================
+# STATE MANAGEMENT - keeps the conversation state across turns
+# =====================================================
+@dataclass
+class ConversationState:
+    """Quản lý trạng thái hội thoại"""
+    messages: list
+    last_audio_time: float
+    is_listening: bool
+    vad_confidence: float
+    conversation_context: str
+    whisper_model: str
+    language: str
+    def __init__(self):
+        self.messages = []
+        self.last_audio_time = 0
+        self.is_listening = False
+        self.vad_confidence = 0.0
+        self.conversation_context = ""
+        self.whisper_model = os.getenv("WHISPER_MODEL", "base")
+        self.language = ASR_LANGUAGE_HINT
+    def add_message(self, role: str, content: str):
+        """Thêm message vào hội thoại"""
+        self.messages.append({
+            "role": role,
+            "content": content,
+            "timestamp": time.time()
+        })
+        # Cập nhật context (giữ lại 5 message gần nhất)
+        if len(self.messages) > 10:
+            self.messages = self.messages[-10:]
+        # Cập nhật context cho hội thoại
+        self._update_context()
+    def _update_context(self):
+        """Cập nhật context từ hội thoại"""
+        context_parts = []
+        for msg in self.messages[-5:]:  # Giữ 5 message gần nhất
+            prefix = "User" if msg["role"] == "user" else "Assistant"
+            context_parts.append(f"{prefix}: {msg['content']}")
+        self.conversation_context = "\n".join(context_parts)
+    def get_recent_context(self, num_messages: int = 3) -> str:
+        """Lấy context gần đây"""
+        recent = self.messages[-num_messages:] if self.messages else []
+        return "\n".join([f"{m['role']}: {m['content']}" for m in recent])
+    def reset(self):
+        """Reset trạng thái hội thoại"""
+        self.messages = []
+        self.conversation_context = ""
+# Create the module-level state (a single instance for the whole process)
+state = ConversationState()
 # =====================================================
 # INITIALISIERUNG (global)
 hg_meta = next(d.metadata for d in docs if d.metadata.get("type") == "hg")
 hg_url = hg_meta.get("viewer_url")
+# =====================================================
+# BENCHMARK WHISPER MODELS
+# =====================================================
+def benchmark_whisper_models(audio_path: str) -> Dict[str, Any]:
+    """Benchmark các model Whisper khác nhau"""
+    import torch
+    from transformers import pipeline
+    models_to_test = ["tiny", "base", "small", "medium"]
+    results = {}
+    for model_size in models_to_test:
+        model_id = f"openai/whisper-{model_size}"
+        try:
+            print(f"Testing {model_id}...")
+            # Measure memory usage
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
+            memory_before = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
+            # Load and transcribe
+            start_time = time.time()
+            asr_pipeline = pipeline(
+                task="automatic-speech-recognition",
+                model=model_id,
+                device="cpu",
+                return_timestamps=False,
+                chunk_length_s=8,
+                stride_length_s=(1, 1),
+            )
+            # Load audio
+            import soundfile as sf
+            data, sr = sf.read(audio_path)
+            # Transcribe
+            result = asr_pipeline({"array": data, "sampling_rate": sr})
+            transcription = result.get("text", "")
+            end_time = time.time()
+            # Memory after
+            memory_after = torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
+            results[model_size] = {
+                "transcription": transcription,
+                "time_taken": end_time - start_time,
+                "memory_used": memory_after - memory_before,
+                "model_size": model_size
+            }
+            print(f"  Time: {end_time - start_time:.2f}s")
+        except Exception as e:
+            print(f"  Error with {model_id}: {e}")
+            results[model_size] = {"error": str(e)}
+    return results
+# =====================================================
+# VOICE ACTIVITY DETECTION
+# =====================================================
+def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) -> Dict[str, Any]:
+    """Run voice-activity detection on a buffer and record hits in the global state.
+    Returns {"is_speech": False, "confidence": 0.0} for a missing or empty
+    buffer; otherwise returns the result of detect_voice_activity unchanged.
+    """
+    if audio_data is None or len(audio_data) == 0:
+        return {"is_speech": False, "confidence": 0.0}
+    vad_result = detect_voice_activity(audio_data, sample_rate, threshold=VAD_THRESHOLD)
+    # Update the state only when speech was detected
+    if vad_result["is_speech"]:
+        state.last_audio_time = time.time()
+        state.vad_confidence = vad_result["confidence"]
+    return vad_result
+# =====================================================
+# TRANSCRIBE WITH OPTIMIZED PIPELINE
+# =====================================================
+def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
+    """Transcribe audio_path with Groq when USE_GROQ is set, otherwise with transcribe_audio.
+    The language argument is forwarded unchanged to the selected backend.
+    """
+    if USE_GROQ:
+        print("Using Groq for transcription...")
+        return transcribe_with_groq(audio_path, language=language)
+    else:
+        return transcribe_audio(audio_path, language=language)
+# =====================================================
+# CONVERSATIONAL INTELLIGENCE
+# =====================================================
+def enhance_conversation_context(user_input: str, history: list) -> str:
+    """Tăng cường context hội thoại với LLM"""
+    # Tạo prompt có context
+    context = state.get_recent_context(3)
+    prompt = f"""Context from previous conversation:
+{context}
+Current user input: {user_input}
+Based on the context, provide a concise summary or additional context that might help answer this question better:"""
+    # Gọi LLM để xử lý context (có thể dùng model nhỏ hơn cho việc này)
+    try:
+        # Ở đây có thể tích hợp với một LLM nhỏ để xử lý context
+        # Tạm thời trả về context đơn giản
+        if context:
+            return f"Context from conversation: {context}\n\nQuestion: {user_input}"
+        else:
+            return user_input
+    except:
+        return user_input
 # =====================================================
 # Quellen formatieren – Markdown für Chat
 # =====================================================
     return "\n".join(out)
 # =====================================================
+# CORE CHAT FUNCTION with all new features
 # =====================================================
+def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
+    """
+    text_input: Textbox-Inhalt (str)
+    audio_path: Pfad zu WAV/FLAC vom Mikro (gr.Audio, type="filepath")
+    history: Liste von OpenAI-ähnlichen Messages (role, content)
+    """
     text = (text_input or "").strip()
+    # Xử lý VAD nếu được bật
+    if use_vad and ENABLE_VAD and audio_path:
+        import soundfile as sf
+        try:
+            audio_data, sample_rate = sf.read(audio_path)
+            vad_result = handle_voice_activity(audio_data, sample_rate)
+            if vad_result["is_speech"]:
+                print(f"Voice activity detected with confidence: {vad_result['confidence']:.2f}")
+            else:
+                print("No voice activity detected")
+                if not text:
+                    return history, "", None, "Bereit (keine Sprache erkannt)"
+        except Exception as e:
+            print(f"VAD error: {e}")
+    # Transcribe audio nếu có
+    if (not text) and audio_path:
+        state.last_audio_time = time.time()
+        # Chọn phương thức transcribe
+        if USE_GROQ:
+            spoken = transcribe_with_groq(audio_path, language=lang_sel)
+        else:
+            spoken = transcribe_audio(audio_path, language=lang_sel)
+        text = spoken.strip()
+        if text:
+            # Tăng cường context hội thoại
+            enhanced_text = enhance_conversation_context(text, history)
+            state.add_message("user", text)
+            print(f"✅ Transkribiert: {text}")
     if not text:
+        return history, "", None, "Bereit"
+    # Tăng cường context cho câu hỏi
+    question_with_context = enhance_conversation_context(text, history)
+    # RAG-Antwort berechnen với context
+    ans, sources = answer(question_with_context, retriever, llm)
     bot_msg = ans + format_sources(sources)
+    # Thêm vào state
+    state.add_message("assistant", ans)
+    # History aktualisieren (ChatGPT-Style)
+    history = history + [
+        {"role": "user", "content": text},
+        {"role": "assistant", "content": bot_msg},
+    ]
+    status_text = f"Bereit | Model: {state.whisper_model} | VAD: {'On' if use_vad else 'Off'}"
+    return history, "", None, status_text
+# =====================================================
+# FUNCTIONS FOR UI CONTROLS
+# =====================================================
+def toggle_vad(use_vad):
+    """Set the module-level ENABLE_VAD flag and return a status line for the UI.
+    NOTE(review): this rebinds a process-wide global, so the toggle presumably
+    affects every session, not only the one that clicked it — confirm.
+    """
+    global ENABLE_VAD
+    ENABLE_VAD = use_vad
+    status = "EIN" if use_vad else "AUS"
+    return f"Voice Activity Detection: {status}"
+def change_whisper_model(model_size):
+    """Store the selected Whisper model size in the state and in WHISPER_MODEL.
+    Returns a status line for the UI. Whether the speech backend re-reads
+    WHISPER_MODEL after startup is not visible here — TODO confirm.
+    """
+    state.whisper_model = model_size
+    os.environ["WHISPER_MODEL"] = model_size
+    return f"Whisper Model: {model_size}"
+def run_benchmark(audio_path):
+    """Run the Whisper model benchmark and format the results as Markdown.
+    Returns a hint message when audio_path is empty; otherwise one line per
+    model with its time, memory delta and the first 100 characters of the
+    transcription, or the error text for a model that failed.
+    """
+    if not audio_path:
+        return "Bitte wählen Sie eine Audiodatei für den Benchmark aus."
+    results = benchmark_whisper_models(audio_path)
+    # Format results
+    report = ["## 📊 Whisper Model Benchmark", ""]
+    for model_size, result in results.items():
+        if "error" in result:
+            report.append(f"**{model_size}**: Fehler - {result['error']}")
+        else:
+            report.append(
+                f"**{model_size}**: {result['time_taken']:.2f}s | "
+                # memory_used is in bytes; 1024**2 converts it to MB
+                f"Speicher: {result['memory_used'] / 1024**2:.1f}MB | "
+                # the "..." suffix is appended even when the text is shorter than 100 characters
+                f"Text: {result['transcription'][:100]}..."
+            )
+    return "\n".join(report)
+def clear_conversation():
+    """Reset the global conversation state; return an empty chat history and a status message."""
+    state.reset()
+    return [], "Hội thoại đã được xóa"
 # =====================================================
+# LAST ANSWER → TTS (für Button "Antwort erneut vorlesen")
 # =====================================================
+def read_last_answer(history):
+    """Synthesize speech for the most recent assistant message in history.
+    Returns None when history is empty or holds no assistant message;
+    otherwise returns whatever synthesize_speech returns for that content.
+    """
+    if not history:
+        return None
+    # Walk backwards so the newest assistant message wins.
+    for msg in reversed(history):
+        if msg.get("role") == "assistant":
+            return synthesize_speech(msg.get("content", ""))
+    return None
 # =====================================================
+# UI – GRADIO with all new features
 # =====================================================
+with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced", theme=gr.themes.Soft()) as demo:
+    # Advanced CSS styling
+    gr.HTML("""
+    <style>
+    .gradio-container {
+        max-width: 1200px;
+        margin: 0 auto;
+        padding: 20px;
+    }
+    .header {
+        text-align: center;
+        margin-bottom: 30px;
+    }
+    .control-panel {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        padding: 20px;
+        border-radius: 15px;
+        margin-bottom: 20px;
+        color: white;
+    }
+    .stats-bar {
+        background: #f8f9fa;
+        border-radius: 10px;
+        padding: 10px;
+        margin: 10px 0;
+        border-left: 4px solid #667eea;
+    }
+    .vad-indicator {
+        display: inline-block;
+        width: 12px;
+        height: 12px;
+        border-radius: 50%;
+        margin-right: 8px;
+    }
+    .vad-active {
+        background-color: #10b981;
+        box-shadow: 0 0 10px #10b981;
+    }
+    .vad-inactive {
+        background-color: #ef4444;
+    }
+    .model-selector {
+        background: white;
+        padding: 15px;
+        border-radius: 10px;
+        margin: 10px 0;
+    }
+    .chat-container {
+        background: white;
+        border-radius: 15px;
+        padding: 20px;
+        box-shadow: 0 10px 40px rgba(0,0,0,0.1);
+    }
+    .input-row {
+        background: #f8fafc;
+        border-radius: 25px;
+        padding: 5px 20px;
+        border: 2px solid #e2e8f0;
+        transition: all 0.3s ease;
+    }
+    .input-row:focus-within {
+        border-color: #667eea;
+        box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
+    }
+    .feature-badge {
+        display: inline-block;
+        padding: 4px 12px;
+        background: #e0e7ff;
+        color: #4f46e5;
+        border-radius: 20px;
+        font-size: 12px;
+        margin: 2px;
+    }
+    </style>
+    """)
+    # Header
+    with gr.Column(elem_classes=["header"]):
+        gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot - Enhanced")
+        gr.Markdown("### Intelligent Voice Interface with Advanced Features")
+        # Feature badges
+        gr.HTML("""
+        <div style="text-align: center; margin: 10px 0;">
+            <span class="feature-badge">🎤 Voice Activity Detection</span>
+            <span class="feature-badge">⚡ Groq Optimization</span>
+            <span class="feature-badge">🧠 Conversational AI</span>
+            <span class="feature-badge">📊 Model Benchmarking</span>
+            <span class="feature-badge">🔄 State Management</span>
+        </div>
+        """)
+    # Control Panel
+    with gr.Column(elem_classes=["control-panel"]):
+        gr.Markdown("### 🎛️ Control Panel")
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Model Selection
+                model_selector = gr.Dropdown(
+                    choices=["tiny", "base", "small", "medium"],
+                    value=state.whisper_model,
+                    label="Whisper Model",
+                    info="Chọn model cho speech recognition"
+                )
+                # VAD Control
+                vad_toggle = gr.Checkbox(
+                    value=ENABLE_VAD,
+                    label="Enable Voice Activity Detection",
+                    info="Tự động phát hiện khi người dùng nói"
+                )
+                # Language Selection
+                lang_selector = gr.Dropdown(
+                    choices=["de", "en", "auto"],
+                    value=ASR_LANGUAGE_HINT,
+                    label="Speech Recognition Language"
+                )
+            with gr.Column(scale=1):
+                # Stats Display
+                status_display = gr.Textbox(
+                    label="System Status",
+                    value="Bereit",
+                    interactive=False
+                )
+                # Clear Conversation Button
+                clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
+                # Benchmark Section
+                benchmark_audio = gr.Audio(
+                    label="Benchmark Audio",
+                    type="filepath",
+                    visible=False
+                )
+                benchmark_btn = gr.Button("📊 Run Model Benchmark", variant="secondary")
+                benchmark_output = gr.Markdown()
+    # Main Chat Interface
+    with gr.Column(elem_classes=["chat-container"]):
         chatbot = gr.Chatbot(
+            label="Conversation",
+            height=400,
+            bubble_full_width=False,
+            show_copy_button=True
         )
+        # Input row with VAD indicator
+        with gr.Row(elem_classes=["input-row"]):
+            # VAD Indicator
+            vad_indicator = gr.HTML(
+                f"""
+                <div class="vad-indicator {'vad-active' if state.is_listening else 'vad-inactive'}"></div>
+                <span>VAD: {'Active' if state.is_listening else 'Inactive'}</span>
+                """
             )
+            # Text Input
             chat_text = gr.Textbox(
                 label=None,
+                placeholder="Stelle eine Frage oder spreche ins Mikrofon...",
                 lines=1,
+                max_lines=4,
                 scale=8,
+                container=False
             )
+            # Audio Input
             chat_audio = gr.Audio(
                 sources=["microphone"],
                 type="filepath",
                 format="wav",
+                streaming=True,
                 interactive=True,
                 show_label=False,
+                scale=1
             )
+            # Send Button
+            send_btn = gr.Button("Senden", variant="primary", scale=1)
+        # TTS Controls
+        with gr.Row():
+            tts_btn = gr.Button("🔊 Antwort vorlesen", variant="secondary")
+            tts_audio = gr.Audio(label="Audio Output", interactive=False)
+    # Documents Section
+    with gr.Accordion("📚 Quellen & Dokumente", open=False):
+        with gr.Tabs():
+            with gr.TabItem("Prüfungsordnung (PDF)"):
+                PDF(pdf_meta["pdf_url"], height=300)
+            with gr.TabItem("Hochschulgesetz NRW"):
+                if isinstance(hg_url, str) and hg_url.startswith("http"):
+                    gr.Markdown(f"### [Im Viewer öffnen]({hg_url})")
+                    gr.HTML(f'<iframe src="{hg_url}" width="100%" height="500px"></iframe>')
+                else:
+                    gr.Markdown("Viewer-Link nicht verfügbar.")
+    # Event Handlers
+    # Model Selection
+    model_selector.change(
+        change_whisper_model,
+        inputs=[model_selector],
+        outputs=[status_display]
+    )
+    # VAD Toggle
+    vad_toggle.change(
+        toggle_vad,
+        inputs=[vad_toggle],
+        outputs=[status_display]
+    )
+    # Clear Conversation
+    clear_btn.click(
+        clear_conversation,
+        outputs=[chatbot, status_display]
+    )
+    # Benchmark
+    benchmark_btn.click(
+        run_benchmark,
+        inputs=[benchmark_audio],
+        outputs=[benchmark_output]
+    )
+    # Main Chat Function
+    send_btn.click(
+        chat_fn,
+        inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
+        outputs=[chatbot, chat_text, chat_audio, status_display]
+    )
+    chat_text.submit(
+        chat_fn,
+        inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
+        outputs=[chatbot, chat_text, chat_audio, status_display]
+    )
+    # Real-time transcription với VAD
+    def handle_streaming_audio(audio_path, use_vad):
+        if audio_path and use_vad:
+            import soundfile as sf
+            try:
+                audio_data, sr = sf.read(audio_path)
+                vad_result = handle_voice_activity(audio_data, sr)
+                if vad_result["is_speech"]:
+                    text = transcribe_audio_optimized(audio_path, language=lang_selector.value)
+                    return text, f"VAD Active (Confidence: {vad_result['confidence']:.2f})"
+            except Exception as e:
+                print(f"Streaming error: {e}")
+        return "", status_display.value
+    chat_audio.stream(
+        handle_streaming_audio,
+        inputs=[chat_audio, vad_toggle],
+        outputs=[chat_text, status_display]
+    )
+    # TTS
+    tts_btn.click(
+        read_last_answer,
+        inputs=[chatbot],
+        outputs=[tts_audio]
+    )
 if __name__ == "__main__":
+    # Queue at most 20 pending requests and listen on all interfaces on port 7860.
+    # NOTE(review): debug=True is presumably meant for development only — confirm for the deployed Space.
+    demo.queue(max_size=20).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )

speech_io.py CHANGED Viewed

@@ -1,248 +1,626 @@
 """
-speech_io.py – Final FIXED Version
-✔ No 'prompt' in generate_kwargs
-✔ Fully HF Whisper-compatible
-✔ Supports Groq Whisper
-✔ Stable for HuggingFace Spaces
 """
 import os
 import numpy as np
 import soundfile as sf
-import difflib
 import re
-from scipy.signal import butter, filtfilt, resample
-from transformers import pipeline
-# Optional Groq import
-try:
-    from groq import Groq
-except:
-    Groq = None
-ASR_MODEL_ID = os.getenv("ASR_MODEL_ID", "openai/whisper-tiny")
-USE_GROQ = os.getenv("USE_GROQ_WHISPER", "0").lower() in ("1", "true", "yes")
-GROQ_MODEL = os.getenv("GROQ_WHISPER_MODEL", "whisper-large-v3-turbo")
-ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de")
-ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
 TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
-TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
-_asr = None
-_tts = None
-_groq = None
-# ======================================================
-# Helpers
-# ======================================================
-def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
-    nyq = 0.5 * fs
-    norm_cutoff = cutoff / nyq
-    b, a = butter(order, norm_cutoff, btype="high")
-    return filtfilt(b, a, data)
-def apply_fade(audio, sr, ms=10):
-    n = int(sr * ms / 1000)
-    if n * 2 >= len(audio):
-        return audio
-    fadein = np.linspace(0, 1, n)
-    fadeout = np.linspace(1, 0, n)
-    audio[:n] *= fadein
-    audio[-n:] *= fadeout
-    return audio
-# ======================================================
-# Whisper LOCAL
-# ======================================================
-def get_asr_pipeline():
-    global _asr
-    if _asr is None:
-        print(f">>> Lade lokales Whisper-Modell: {ASR_MODEL_ID}")
-        _asr = pipeline(
             task="automatic-speech-recognition",
-            model=ASR_MODEL_ID,
             device="cpu",
             return_timestamps=False,
             chunk_length_s=8,
             stride_length_s=(1, 1),
         )
-    return _asr
-# ======================================================
-# Whisper GROQ
-# ======================================================
-def get_groq_client():
-    global _groq
-    if _groq is None:
-        key = os.getenv("GROQ_API_KEY")
-        if not key:
-            raise RuntimeError("GROQ_API_KEY fehlt.")
-        _groq = Groq(api_key=key)
-        print(">>> Groq Client bereit.")
-    return _groq
-def _groq_transcribe(audio_path, language):
-    client = get_groq_client()
-    lang = None
-    if language and language.lower() != "auto":
-        lang = language
-    with open(audio_path, "rb") as f:
-        resp = client.audio.transcriptions.with_raw_response.create(
-            file=("audio.wav", f),
-            model=GROQ_MODEL,
-            response_format="verbose_json",
-            language=lang,
-        ).parse()
-    segments = resp.segments or []
-    if not segments:
-        return ""
-    if segments[0].get("no_speech_prob", 0) > 0.7:
-        return ""
-    return resp.text.strip()
-# ======================================================
-# LOCAL WHISPER STT
-# ======================================================
-def _local_transcribe(audio_path, language, max_duration_s):
-    data, sr = sf.read(audio_path, always_2d=False)
-    if data.ndim > 1:
-        data = data.mean(axis=1)
-    data = np.asarray(data, dtype=np.float32)
-    data = np.clip(data, -1, 1)
     try:
-        data = butter_highpass_filter(data, 60, sr)
-    except:
-        pass
-    m = np.max(np.abs(data))
-    if m > 0:
-        data = data / m
-    rms = float(np.sqrt(np.mean(data ** 2)))
-    if rms < 5e-5:
-        print(">>> Audio zu leise.")
         return ""
-    if sr != 16000:
-        target_len = int(len(data) * 16000 / sr)
-        data = resample(data, target_len)
-        sr = 16000
-    idx = np.where(np.abs(data) > 0.02)[0]
-    if idx.size:
-        data = data[idx[0]: idx[-1] + 1]
-    dur = len(data) / sr
-    if dur < 0.3:
-        print(">>> Audio zu kurz.")
         return ""
-    if len(data) > sr * max_duration_s:
-        data = data[: sr * max_duration_s]
-    asr = get_asr_pipeline()
-    # ---- FIXED generate_kwargs: ALLOWED ONLY ----
-    gen = {
-        "task": "transcribe",
-        "temperature": 0.0,
-        "num_beams": 1,
-        "compression_ratio_threshold": 2.4,
-        "logprob_threshold": -1.0,
-        "no_speech_threshold": 0.6,
-        "no_repeat_ngram_size": 3,
-    }
-    if language and language.lower() != "auto":
-        gen["language"] = language
-    print(">>> Transkribiere Audio (lokal)…")
-    result = asr(
-        {"array": data, "sampling_rate": sr},
-        generate_kwargs=gen,
-    )
-    text = (result.get("text", "") if isinstance(result, dict) else result).strip()
-    # Domain cleanup
-    vocab = [
-        "prüfung","prüfungsordnung","hochschulgesetz","modul","klausur",
-        "immatrikulation","exmatrikulation","anmeldung","wiederholung"
     ]
     tokens = text.split()
-    fixed = []
-    for t in tokens:
-        m = difflib.get_close_matches(t.lower(), vocab, n=1, cutoff=0.82)
-        fixed.append(m[0] if m else t)
-    return " ".join(fixed)
-# ======================================================
-# PUBLIC STT WRAPPER
-# ======================================================
-def transcribe_audio(audio_path, language=None, max_duration_s=ASR_MAX_DURATION_S):
-    if not audio_path:
-        return ""
-    # Try Groq first
-    if USE_GROQ:
         try:
-            return _groq_transcribe(audio_path, language)
         except Exception as e:
-            print("Groq Fehler → fallback lokal:", e)
-    return _local_transcribe(audio_path, language, max_duration_s)
-# ======================================================
-# TTS
-# ======================================================
-def get_tts_pipeline():
-    global _tts
-    if _tts is None:
-        print(">>> Lade TTS:", TTS_MODEL_ID)
-        _tts = pipeline("text-to-speech", model=TTS_MODEL_ID)
-    return _tts
-def synthesize_speech(text: str):
-    if not text or not TTS_ENABLED:
-        return None
-    tts = get_tts_pipeline()
-    out = tts(text)
-    audio = np.array(out["audio"], dtype=np.float32)
-    sr = out.get("sampling_rate", 16000)
-    if audio.ndim > 1:
-        audio = audio.squeeze()
-    try:
-        audio = butter_highpass_filter(audio, 60, sr)
-    except:
-        pass
-    maxv = np.max(np.abs(audio))
-    if maxv > 0:
-        audio = audio / maxv
-    audio = apply_fade(audio, sr)
-    audio = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
-    return sr, audio

 """
+speech_io.py - Enhanced Version
+Speech-based input/output with:
+- Speech-to-Text (STT) using Whisper (several model sizes + Groq)
+- Text-to-Speech (TTS) using MMS-TTS German
+- Voice Activity Detection (VAD)
+- Model Benchmarking
 """
 import os
+import time
+from typing import Optional, Tuple, Dict, Any, Union
 import numpy as np
 import soundfile as sf
+from scipy.signal import butter, filtfilt, resample, sosfiltfilt
 import re
+import difflib
+import requests
+import json
+from dataclasses import dataclass
+# ========================================================
+# CONFIGURATION
+# ========================================================
+# All settings are read from environment variables once, at import time.
+# Model Selection
+WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")  # tiny, base, small, medium
+# NOTE: get_asr_pipeline() rebuilds "openai/whisper-<size>" itself and does not
+# read this constant.
+ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
 TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
+# Groq Configuration
+# Only the literal string "true" (case-insensitive) enables this flag.
+# NOTE(review): TTS_ENABLED below uses a different truthiness convention
+# (everything except "0"/"false"/"no" enables it) - confirm this is intended.
+USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
+GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
+GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
+GROQ_API_URL = "https://api.groq.com/openai/v1/audio/transcriptions"
+# VAD Configuration
+ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
+# NOTE: detect_voice_activity() has its own default threshold of 0.5 and does
+# not read VAD_THRESHOLD unless a caller passes it explicitly.
+VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.5"))
+# Other Configs
+ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de")
+TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
+ASR_PROMPT = os.getenv("ASR_PROMPT", "Dies ist ein Diktat in deutscher Sprache.")
+ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
+# Cache for models
+_asr_cache = {}  # model id -> loaded ASR pipeline (filled by get_asr_pipeline)
+_tts = None  # lazily created TTS pipeline (see get_tts_pipeline)
+# ========================================================
+# DATA CLASSES
+# ========================================================
+@dataclass
+class TranscriptionResult:
+    """Record describing one transcription.
+    Exported via ``__all__``; none of the functions in this module construct it.
+    """
+    text: str  # transcribed text
+    confidence: float  # presumably a 0..1 score - TODO confirm with callers
+    language: str  # language of the transcription
+    processing_time: float  # presumably seconds - TODO confirm with callers
+    model: str  # identifier of the model that produced the text
+@dataclass
+class VADResult:
+    """Result of ``detect_voice_activity``."""
+    is_speech: bool  # True when more than 10% of the frames were classified as speech
+    confidence: float  # fraction of analysed frames classified as speech
+    speech_segments: list  # [start_s, end_s] pairs of consecutive speech frames
+    energy: float  # mean squared amplitude of the whole signal
+# ========================================================
+# MODEL LOADING WITH CACHE
+# ========================================================
+def get_asr_pipeline(model_size: str = None):
+    """Return the Whisper ASR pipeline for ``model_size``, loading it on first use.
+    Args:
+        model_size: Whisper size suffix (e.g. "tiny", "base"); ``None`` selects
+            the module-level ``WHISPER_MODEL``.
+    Returns:
+        The ``transformers`` pipeline cached in ``_asr_cache`` under
+        ``"openai/whisper-<model_size>"``.
+    """
+    global _asr_cache
+    if model_size is None:
+        model_size = WHISPER_MODEL
+    model_id = f"openai/whisper-{model_size}"
+    if model_id not in _asr_cache:
+        print(f">>> Lade ASR Modell: {model_id}")
+        # Imported lazily so that importing this module does not pull in transformers.
+        from transformers import pipeline
+        _asr_cache[model_id] = pipeline(
             task="automatic-speech-recognition",
+            model=model_id,
             device="cpu",
             return_timestamps=False,
             chunk_length_s=8,
             stride_length_s=(1, 1),
         )
+    return _asr_cache[model_id]
+def get_tts_pipeline():
+    """Return the text-to-speech pipeline for ``TTS_MODEL_ID``.
+    The pipeline is created on the first call and stored in the module-level
+    ``_tts`` variable; later calls return the same object.
+    """
+    global _tts
+    if _tts is None:
+        print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
+        # Imported lazily so that importing this module does not pull in transformers.
+        from transformers import pipeline
+        _tts = pipeline(
+            task="text-to-speech",
+            model=TTS_MODEL_ID,
+        )
+    return _tts
+# ========================================================
+# AUDIO PROCESSING UTILITIES
+# ========================================================
+def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
+    """High-pass filter ``data`` to remove low-frequency noise.
+    Args:
+        data: audio samples; an empty input is returned unchanged.
+        cutoff: cut-off frequency, in the same unit as ``fs``.
+        fs: sampling rate of ``data``.
+        order: order of the Butterworth filter.
+    Returns:
+        The filtered signal produced by ``sosfiltfilt``.
+    """
+    if len(data) == 0:
+        return data
+    # butter() expects the cut-off normalised to the Nyquist frequency.
+    nyq = 0.5 * fs
+    norm_cutoff = cutoff / nyq
+    sos = butter(order, norm_cutoff, btype="high", output='sos')
+    # NOTE(review): several callers wrap this call in try/except, presumably
+    # because very short inputs make the filter raise - confirm.
+    return sosfiltfilt(sos, data)
+def apply_fade(audio, sr, fade_in_ms=10, fade_out_ms=10):
+    """Áp dụng fade in/out để tránh pop"""
+    if len(audio) == 0:
+        return audio
+    fade_in_samples = int(sr * fade_in_ms / 1000)
+    fade_out_samples = int(sr * fade_out_ms / 1000)
+    if fade_in_samples * 2 >= len(audio):
+        return audio
+    # Fade in
+    fade_in_curve = np.linspace(0, 1, fade_in_samples)
+    audio[:fade_in_samples] *= fade_in_curve
+    # Fade out
+    fade_out_curve = np.linspace(1, 0, fade_out_samples)
+    audio[-fade_out_samples:] *= fade_out_curve
+    return audio
+def normalize_audio(audio_data: np.ndarray) -> np.ndarray:
+    """Peak-normalise audio to the range [-1, 1].
+    Args:
+        audio_data: audio samples; an empty array is returned unchanged.
+    Returns:
+        A float32 array scaled so that its largest absolute value is 1.
+        An all-zero signal is only converted to float32, not scaled.
+    """
+    if len(audio_data) == 0:
+        return audio_data
+    # Convert to float32 if needed
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+    # Normalize to [-1, 1]
+    max_val = np.max(np.abs(audio_data))
+    if max_val > 0:
+        audio_data = audio_data / max_val
+    return audio_data
+def resample_audio(audio_data: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
+    """Resample audio to the target sample rate.
+    Args:
+        audio_data: audio samples.
+        orig_sr: sample rate of ``audio_data``.
+        target_sr: desired sample rate.
+    Returns:
+        ``audio_data`` itself when both rates are equal, otherwise the output of
+        ``scipy.signal.resample`` with ``int(len(audio_data) * target_sr / orig_sr)``
+        samples.
+    """
+    if orig_sr == target_sr:
+        return audio_data
+    target_len = int(len(audio_data) * target_sr / orig_sr)
+    return resample(audio_data, target_len)
+# ========================================================
+# VOICE ACTIVITY DETECTION (VAD)
+# ========================================================
+def detect_voice_activity(
+    audio_data: np.ndarray,
+    sample_rate: int,
+    threshold: float = 0.5,
+    frame_duration_ms: int = 30
+) -> VADResult:
+    """
+    Phát hiện hoạt động giọng nói
+    """
+    if len(audio_data) == 0:
+        return VADResult(
+            is_speech=False,
+            confidence=0.0,
+            speech_segments=[],
+            energy=0.0
+        )
+    # Tính toán energy
+    energy = np.mean(audio_data ** 2)
+    # Frame-based analysis
+    frame_size = int(sample_rate * frame_duration_ms / 1000)
+    num_frames = len(audio_data) // frame_size
+    speech_frames = 0
+    speech_segments = []
+    current_segment = None
+    for i in range(num_frames):
+        start_idx = i * frame_size
+        end_idx = start_idx + frame_size
+        frame = audio_data[start_idx:end_idx]
+        # Tính frame energy
+        frame_energy = np.mean(frame ** 2)
+        # Kiểm tra zero-crossing rate (ZCR) để phân biệt speech/noise
+        zcr = np.mean(np.abs(np.diff(np.sign(frame))))
+        # Kết hợp các đặc trưng để phát hiện speech
+        is_speech_frame = (frame_energy > threshold * energy) and (zcr < 0.3)
+        if is_speech_frame:
+            speech_frames += 1
+            if current_segment is None:
+                current_segment = [start_idx / sample_rate, end_idx / sample_rate]
+            else:
+                current_segment[1] = end_idx / sample_rate
+        else:
+            if current_segment is not None:
+                speech_segments.append(current_segment)
+                current_segment = None
+    if current_segment is not None:
+        speech_segments.append(current_segment)
+    # Tính confidence
+    confidence = speech_frames / max(num_frames, 1)
+    is_speech = confidence > 0.1  # Ít nhất 10% frames là speech
+    return VADResult(
+        is_speech=is_speech,
+        confidence=confidence,
+        speech_segments=speech_segments,
+        energy=energy
+    )
+def vad_preprocess(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
+    """Pre-process audio for VAD: peak-normalise, then high-pass filter at 80 Hz.
+    Unlike other callers in this module, the filter call is not wrapped in
+    try/except, so any filter error propagates to the caller.
+    """
+    # Normalize
+    audio_data = normalize_audio(audio_data)
+    # Highpass filter
+    audio_data = butter_highpass_filter(audio_data, cutoff=80, fs=sample_rate)
+    return audio_data
+# ========================================================
+# SPEECH-TO-TEXT CORE FUNCTIONS
+# ========================================================
+def transcribe_with_groq(
+    audio_path: str,
+    language: Optional[str] = None,
+    prompt: Optional[str] = None
+) -> str:
+    """
+    Transcribe audio through the Groq Cloud API.
+    Args:
+        audio_path: path of the audio file to upload.
+        language: language code sent to the API; omitted when empty or "auto".
+        prompt: optional prompt sent to the API; omitted when empty.
+            (``ASR_PROMPT`` is not applied automatically.)
+    Returns:
+        The stripped ``text`` field of the API response. Falls back to the
+        local ``transcribe_audio`` when no API key is configured, when the API
+        answers with a non-200 status, or when any exception occurs.
+    """
+    if not GROQ_API_KEY:
+        print(">>> Groq API key nicht gefunden. Verwende lokales Modell.")
+        return transcribe_audio(audio_path, language)
     try:
+        # Read the audio file
+        with open(audio_path, 'rb') as audio_file:
+            # NOTE(review): the part is always declared as 'audio/wav',
+            # whatever the real file type is - confirm the API tolerates this.
+            files = {
+                'file': (os.path.basename(audio_path), audio_file, 'audio/wav')
+            }
+            data = {
+                'model': GROQ_MODEL,
+                'response_format': 'json',
+            }
+            if language and language != 'auto':
+                data['language'] = language
+            if prompt:
+                data['prompt'] = prompt
+            headers = {
+                'Authorization': f'Bearer {GROQ_API_KEY}'
+            }
+            print(f">>> Sende Anfrage an Groq API (Modell: {GROQ_MODEL})...")
+            start_time = time.time()
+            response = requests.post(
+                GROQ_API_URL,
+                headers=headers,
+                files=files,
+                data=data,
+                timeout=30
+            )
+            processing_time = time.time() - start_time
+            if response.status_code == 200:
+                result = response.json()
+                text = result.get('text', '').strip()
+                print(f">>> Groq Transkription ({processing_time:.2f}s): {text}")
+                return text
+            else:
+                print(f">>> Groq Fehler {response.status_code}: {response.text}")
+                # Fallback to local model
+                return transcribe_audio(audio_path, language)
+    except Exception as e:
+        # Network errors, unreadable files and invalid JSON all end up here.
+        print(f">>> Groq Fehler: {e}")
+        return transcribe_audio(audio_path, language)
+def transcribe_audio(
+    audio_path: str,
+    language: Optional[str] = None,
+    model_size: Optional[str] = None,
+    max_duration_s: int = ASR_MAX_DURATION_S
+) -> str:
+    """
+    Transcribe audio với Whisper local
+    """
+    if audio_path is None or not os.path.exists(audio_path):
+        print(">>> Kein Audio gefunden.")
         return ""
+    try:
+        # Đọc audio file
+        data, sr = sf.read(audio_path, always_2d=False)
+        if data is None or data.size == 0:
+            print(">>> Audio leer.")
+            return ""
+        # Chuyển sang mono nếu cần
+        if len(data.shape) > 1:
+            data = np.mean(data, axis=1)
+        # Tiền xử lý audio
+        data = normalize_audio(data)
+        # Resample về 16kHz nếu cần
+        TARGET_SR = 16000
+        if sr != TARGET_SR:
+            data = resample_audio(data, sr, TARGET_SR)
+            sr = TARGET_SR
+        # Lọc noise
+        try:
+            data = butter_highpass_filter(data, cutoff=60, fs=sr)
+        except:
+            pass
+        # Kiểm tra audio quality
+        duration_s = len(data) / sr
+        rms = float(np.sqrt(np.mean(data ** 2)))
+        peak = float(np.max(np.abs(data)))
+        print(f">>> Audio stats – Dauer: {duration_s:.2f}s, RMS: {rms:.6f}, Peak: {peak:.6f}")
+        # Kiểm tra điều kiện tối thiểu
+        if duration_s < 0.3 or rms < 3e-4 or peak < 8e-4:
+            print(">>> Audio zu kurz oder zu leise.")
+            return ""
+        # Giới hạn độ dài
+        MAX_SAMPLES = sr * max_duration_s
+        if len(data) > MAX_SAMPLES:
+            data = data[:MAX_SAMPLES]
+            print(f">>> Audio auf {max_duration_s}s gekürzt.")
+        # Chọn model
+        if model_size is None:
+            model_size = WHISPER_MODEL
+        asr = get_asr_pipeline(model_size)
+        # Cấu hình transcribe
+        lang = language
+        if not lang and ASR_DEFAULT_LANGUAGE and ASR_DEFAULT_LANGUAGE.lower() != "auto":
+            lang = ASR_DEFAULT_LANGUAGE
+        if isinstance(lang, str) and lang.lower() == "auto":
+            lang = None
+        call_kwargs = {}
+        # Dynamic token budget based on audio length
+        token_budget = min(120, int(duration_s * 20))
+        if duration_s < 2.0:
+            token_budget = 60
+        if duration_s < 1.0:
+            token_budget = 36
+        if lang:
+            call_kwargs["generate_kwargs"] = {
+                "language": lang,
+                "task": "transcribe",
+                "max_new_tokens": token_budget,
+                "temperature": 0.0,
+                "num_beams": 1,
+                "compression_ratio_threshold": 2.4,
+                "logprob_threshold": -1.0,
+                "no_speech_threshold": 0.6,
+                "no_repeat_ngram_size": 3,
+            }
+        print(f">>> Transkribiere mit Whisper-{model_size}...")
+        start_time = time.time()
+        result = asr({"array": data, "sampling_rate": sr}, **call_kwargs)
+        processing_time = time.time() - start_time
+        text = result.get("text", "") if isinstance(result, dict) else str(result)
+        text = text.strip()
+        # Sửa lỗi domain terms
+        text = fix_domain_terms(text)
+        print(f">>> Transkription ({processing_time:.2f}s): {text}")
+        return text
+    except Exception as e:
+        print(f">>> Transkriptionsfehler: {e}")
         return ""
+# ========================================================
+# TEXT-TO-SPEECH (TTS)
+# ========================================================
+def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
+    """
+    Convert text to speech.
+    Args:
+        text: text to speak.
+    Returns:
+        ``(sample_rate, samples)`` with mono int16 samples, or ``None`` when
+        the text is empty/blank, TTS is disabled, or any error occurs.
+    """
+    if not text or not text.strip() or not TTS_ENABLED:
+        return None
+    try:
+        tts = get_tts_pipeline()
+        # TTS inference
+        out = tts(text)
+        # Extract audio data
+        audio = np.array(out["audio"], dtype=np.float32)
+        sr = out.get("sampling_rate", 16000)
+        # Ensure valid sample rate
+        if sr is None or sr <= 0 or sr > 65535:
+            sr = 16000
+        # Ensure mono: drop singleton axes first, then keep the first column
+        # if the array is still multi-dimensional.
+        if audio.ndim > 1:
+            audio = audio.squeeze()
+        if audio.ndim > 1:
+            audio = audio[:, 0]
+        # Apply processing
+        # NOTE(review): the bare except also hides KeyboardInterrupt/SystemExit
+        # and gives no hint why filtering was skipped - consider narrowing it.
+        try:
+            audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
+        except:
+            pass
+        # Normalize
+        max_val = np.max(np.abs(audio))
+        if max_val > 0:
+            audio = audio / max_val
+        # Apply fade
+        audio = apply_fade(audio, sr)
+        # Convert to int16
+        audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
+        return (sr, audio_int16)
+    except Exception as e:
+        print(f">>> TTS Fehler: {e}")
+        return None
+# ========================================================
+# DOMAIN-SPECIFIC TEXT PROCESSING
+# ========================================================
+def fix_domain_terms(text: str) -> str:
+    """
+    Sửa lỗi các thuật ngữ chuyên ngành
+    """
+    if not text:
+        return text
+    # Common mis-transcriptions in German academic/legal context
+    correction_pairs = [
+        (r"\bbriefe\s*um\b", "prüfung"),
+        (r"\bbrieft\s*um\b", "prüfung"),
+        (r"\bbriefung\b", "prüfung"),
+        (r"\bpruefung\b", "prüfung"),
+        (r"\bhochschule\s*gesetz\b", "hochschulgesetz"),
+        (r"\bmodule\b", "modul"),
+        (r"\bklausuren\b", "klausur"),
+        (r"\bimmatrikulations\b", "immatrikulation"),
+        (r"\bexmatrikulations\b", "exmatrikulation"),
     ]
+    for pattern, replacement in correction_pairs:
+        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+    # Vocabulary matching for domain terms
+    domain_vocabulary = [
+        "prüfung", "prüfungsordnung", "hochschulgesetz", "modul", "klausur",
+        "immatrikulation", "exmatrikulation", "anmeldung", "wiederholung",
+        "noten", "semester", "vorlesung", "übung", "praktikum",
+        "bachelor", "master", "promotion", "habilitation"
+    ]
+    # Simple word-by-word correction
     tokens = text.split()
+    corrected_tokens = []
+    for token in tokens:
+        # Check if token is likely a domain term
+        if len(token) > 3:  # Only check longer tokens
+            matches = difflib.get_close_matches(
+                token.lower(),
+                domain_vocabulary,
+                n=1,
+                cutoff=0.8
+            )
+            if matches:
+                corrected_tokens.append(matches[0])
+            else:
+                corrected_tokens.append(token)
+        else:
+            corrected_tokens.append(token)
+    return " ".join(corrected_tokens)
+# ========================================================
+# BENCHMARKING UTILITIES
+# ========================================================
+def benchmark_transcription(
+    audio_path: str,
+    models: list = ["tiny", "base", "small", "medium"]
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Benchmark several Whisper model sizes on the same audio file.
+    Args:
+        audio_path: audio file passed to ``transcribe_audio`` for every model.
+        models: Whisper size suffixes to compare. The default list is a shared
+            mutable default; it is only read here, never modified.
+    Returns:
+        A dict keyed by model size. Each value holds "text", "time" (seconds),
+        "quality_score", "word_count" and "chars_per_second", or a single
+        "error" entry when an exception was raised.
+    """
+    results = {}
+    for model_size in models:
         try:
+            print(f"\n>>> Benchmarking Whisper-{model_size}...")
+            # The measured time covers the whole transcribe_audio call,
+            # including loading the model when it is not cached yet.
+            start_time = time.time()
+            text = transcribe_audio(audio_path, model_size=model_size)
+            processing_time = time.time() - start_time
+            # Rough quality estimate
+            quality_score = estimate_transcription_quality(text)
+            results[model_size] = {
+                "text": text,
+                "time": processing_time,
+                "quality_score": quality_score,
+                "word_count": len(text.split()),
+                "chars_per_second": len(text) / max(processing_time, 0.001)
+            }
+            print(f"  Time: {processing_time:.2f}s, Quality: {quality_score:.2f}")
         except Exception as e:
+            # NOTE: transcribe_audio catches its own errors and returns "",
+            # so failures there show up as an empty text, not as "error".
+            print(f"  Error: {e}")
+            results[model_size] = {"error": str(e)}
+    return results
+def estimate_transcription_quality(text: str) -> float:
+    """
+    Ước tính chất lượng transcription dựa trên các heuristic
+    """
+    if not text:
+        return 0.0
+    score = 0.0
+    # Length-based score
+    word_count = len(text.split())
+    if word_count > 3:
+        score += 0.3
+    # Domain terms presence
+    domain_terms = ["prüfung", "hochschul", "gesetz", "ordnung", "modul"]
+    found_terms = sum(1 for term in domain_terms if term in text.lower())
+    score += min(0.3, found_terms * 0.1)
+    # Grammar/syntax indicators (German)
+    # Check for capital nouns, common sentence endings
+    if any(marker in text for marker in [". ", "? ", "! ", ", "]):
+        score += 0.2
+    # Word length consistency
+    words = text.split()
+    avg_word_len = np.mean([len(w) for w in words]) if words else 0
+    if 4 <= avg_word_len <= 10:
+        score += 0.2
+    return min(1.0, score)
+# ========================================================
+# MAIN EXPORT
+# ========================================================
+# Public API of this module; helpers not listed here (audio utilities,
+# pipeline loaders, vad_preprocess, estimate_transcription_quality) are
+# excluded from `from speech_io import *`.
+__all__ = [
+    'transcribe_audio',
+    'transcribe_with_groq',
+    'synthesize_speech',
+    'detect_voice_activity',
+    'benchmark_transcription',
+    'fix_domain_terms',
+    'TranscriptionResult',
+    'VADResult'
+]
+if __name__ == "__main__":
+    # Running the module directly only prints the active configuration;
+    # no model is loaded and no audio is processed.
+    # Test functionality
+    print("Speech IO Module - Enhanced Version")
+    print(f"Whisper Model: {WHISPER_MODEL}")
+    print(f"Groq Enabled: {USE_GROQ}")
+    print(f"VAD Enabled: {ENABLE_VAD}")