commit

- app.py +161 -659
- speech_io.py +121 -418

app.py CHANGED
@@ -1,709 +1,211 @@
- # app.py – Prüfungsrechts-Chatbot (RAG +
- #
-
- import time
- from dataclasses import dataclass, field
- from typing import Optional, Dict, Any
  import gradio as gr
  from gradio_pdf import PDF
-

- from load_documents import
  from split_documents import split_documents
  from vectorstore import build_vectorstore
  from retriever import get_retriever
  from llm import load_llm
- from rag_pipeline import answer
- from speech_io import transcribe_audio, synthesize_speech, detect_voice_activity
-
- # Environment configuration
- ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
- ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
- VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))

-
- # STATE MANAGEMENT – seamless conversation state management
- # =====================================================
- @dataclass
- class ConversationState:
-     """Manages the conversation state"""
-     messages: list = field(default_factory=list)
-     last_audio_time: float = field(default_factory=time.time)
-     is_listening: bool = False
-     vad_confidence: float = 0.0
-     conversation_context: str = ""
-     whisper_model: str = field(default_factory=lambda: os.getenv("WHISPER_MODEL", "base"))
-     language: str = field(default_factory=lambda: ASR_LANGUAGE_HINT)
-     current_audio_path: Optional[str] = None
-
-     def add_message(self, role: str, content: str):
-         """Append a message to the conversation"""
-         self.messages.append({
-             "role": role,
-             "content": content,
-             "timestamp": time.time()
-         })
-         # Cap the history length
-         if len(self.messages) > 20:
-             self.messages = self.messages[-20:]
-
-         # Update the context
-         self._update_context()
-
-     def _update_context(self):
-         """Rebuild the context from the conversation"""
-         if not self.messages:
-             self.conversation_context = ""
-             return
-
-         context_parts = []
-         for msg in self.messages[-5:]:  # keep the 5 most recent messages
-             prefix = "User" if msg["role"] == "user" else "Assistant"
-             context_parts.append(f"{prefix}: {msg['content'][:200]}")  # cap the length
-         self.conversation_context = "\n".join(context_parts)
-
-     def get_recent_context(self, num_messages: int = 3) -> str:
-         """Return the recent context"""
-         if not self.messages or num_messages <= 0:
-             return ""
-
-         recent = self.messages[-num_messages:] if len(self.messages) >= num_messages else self.messages
-         return "\n".join([f"{m['role']}: {m['content']}" for m in recent])
-
-     def reset(self):
-         """Reset the conversation state"""
-         self.messages = []
-         self.conversation_context = ""
-         self.is_listening = False
-         self.vad_confidence = 0.0
-         self.current_audio_path = None
-
- # Initialize the state
- state = ConversationState()

  # =====================================================
  # INITIALIZATION (global)
  # =====================================================

- print("
-

- print("
-

- print("
-

- print("
-

- print("
-

-
-
- hg_url = hg_meta.get("viewer_url")

  # =====================================================
- #
  # =====================================================
- def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) -> Dict[str, Any]:
-     """Handle voice-activity detection"""
-     if audio_data is None or len(audio_data) == 0:
-         return {"is_speech": False, "confidence": 0.0, "status": "No audio data"}
-
-     try:
-         vad_result = detect_voice_activity(audio_data, sample_rate, threshold=VAD_THRESHOLD)
-
-         # Update the state
-         state.is_listening = vad_result["is_speech"]
-         if vad_result["is_speech"]:
-             state.last_audio_time = time.time()
-         state.vad_confidence = vad_result["confidence"]
-
-         return {
-             "is_speech": vad_result["is_speech"],
-             "confidence": vad_result["confidence"],
-             "status": f"Speech detected: {vad_result['is_speech']} (conf: {vad_result['confidence']:.2f})"
-         }
-     except Exception as e:
-         print(f"VAD error: {e}")
-         return {"is_speech": False, "confidence": 0.0, "status": f"VAD error: {e}"}

-
-
- # =====================================================
- def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
-     if not audio_path or not os.path.exists(audio_path):
          return ""
-     return transcribe_audio(audio_path, language=language)

-
-
-
-     # Add simple context from the history
-     if history and len(history) > 0:
-         # take the 3 most recent messages from the history
-         recent_history = history[-3:] if len(history) >= 3 else history
-         context_parts = ["Previous conversation:"]
-         for msg in recent_history:
-             role = "User" if msg.get("role") == "user" else "Assistant"
-             content = msg.get("content", "")[:100]  # cap the length
-             context_parts.append(f"{role}: {content}")
-
-         context = "\n".join(context_parts)
-         return f"{context}\n\nCurrent question: {user_input}"
-
-     return user_input

-
- # Format sources – Markdown for the chat
- # =====================================================
- def format_sources(src):
-     if not src:
-         return ""

-

-
-
-         if s.get("page") is not None:
-             line += f" (Seite {s['page']})"
-         out.append(line)

-

-
-
-     print(f"DEBUG: chat_fn called - text_input: '{text_input}', audio_path: {audio_path}, history length: {len(history) if history else 0}")
-     # Normalize the history into a list of [user, assistant] pairs
-     def to_pairs(h):
-         if not h:
-             return []
-         if isinstance(h[0], dict):
-             pairs = []
-             current = [None, None]
-             for m in h:
-                 if m.get("role") == "user":
-                     if current != [None, None]:
-                         pairs.append(current)
-                     current = [m.get("content", ""), None]
-                 elif m.get("role") == "assistant":
-                     if current[0] is None:
-                         pairs.append([None, m.get("content", "")])
-                     else:
-                         current[1] = m.get("content", "")
-                         pairs.append(current)
-                         current = [None, None]
-             if current != [None, None]:
-                 pairs.append(current)
-             return pairs
-         return h
-     pairs = to_pairs(history)
-     text_to_process = ""
-     # If audio_path is missing, fall back to the last recording
-     if (not audio_path) and state.current_audio_path and os.path.exists(state.current_audio_path):
-         audio_path = state.current_audio_path
-     # Process the audio if present
-     if audio_path and os.path.exists(audio_path):
-         print(f"DEBUG: Processing audio file: {audio_path}")
-         state.current_audio_path = audio_path
-         if use_vad and ENABLE_VAD:
-             try:
-                 import soundfile as sf
-                 audio_data, sample_rate = sf.read(audio_path)
-                 vad_result = handle_voice_activity(audio_data, sample_rate)
-                 print(f"DEBUG: VAD result: {vad_result}")
-                 if vad_result.get("is_speech", True):
-                     transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
-                     if transcribed_text and transcribed_text.strip():
-                         text_to_process = transcribed_text.strip()
-                         print(f"DEBUG: Transcribed text: {text_to_process}")
-             except Exception as e:
-                 print(f"DEBUG: Error in VAD/transcription: {e}")
-                 transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
-                 if transcribed_text and transcribed_text.strip():
-                     text_to_process = transcribed_text.strip()
-         else:
-             transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
-             if transcribed_text and transcribed_text.strip():
-                 text_to_process = transcribed_text.strip()
-                 print(f"DEBUG: Transcribed text (no VAD): {text_to_process}")
-     # If the textbox has input, prefer it
-     if text_input and text_input.strip():
-         text_to_process = text_input.strip()
-         print(f"DEBUG: Using text input: {text_to_process}")
-     # No text to process
-     if not text_to_process:
-         status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
-         return pairs, "", None, status_text
-     print(f"DEBUG: Processing text: {text_to_process}")
-     enhanced_question = enhance_conversation_context(text_to_process, pairs)
-     try:
-         ans, sources = answer(enhanced_question, retriever, llm)
-         bot_msg = ans + format_sources(sources)
-         state.add_message("user", text_to_process)
-         state.add_message("assistant", ans)
-         pairs.append([text_to_process, bot_msg])
-     except Exception as e:
-         print(f"DEBUG: Error in RAG pipeline: {e}")
-         error_msg = "Entschuldigung, es gab einen Fehler bei der Verarbeitung Ihrer Anfrage. Bitte versuchen Sie es erneut."
-         pairs.append([text_to_process, error_msg])
-     status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
-     return pairs, "", None, status_text

  # =====================================================
- #
  # =====================================================
-
-
-
-     """Update the VAD indicator"""
-     if state.is_listening:
-         indicator_html = """
-         <div style="display: flex; align-items: center; gap: 8px;">
-             <div style="width: 12px; height: 12px; border-radius: 50%; background-color: #10b981; box-shadow: 0 0 10px #10b981; animation: pulse 1.5s infinite;"></div>
-             <span style="color: #10b981; font-weight: bold;">Sprache erkannt</span>
-         </div>
-         <style>
-         @keyframes pulse {
-             0% { opacity: 0.7; }
-             50% { opacity: 1; }
-             100% { opacity: 0.7; }
-         }
-         </style>
-         """
-     else:
-         indicator_html = """
-         <div style="display: flex; align-items: center; gap: 8px;">
-             <div style="width: 12px; height: 12px; border-radius: 50%; background-color: #6b7280;"></div>
-             <span>Bereit</span>
-         </div>
-         """
-
-     return indicator_html

  # =====================================================
- #
  # =====================================================
-
-
-
-             text = transcribe_audio_optimized(audio_path, language=state.language)
-             status = "Transkription (VAD aus)"
-             return text, vad_html, status
-
-     except Exception as e:
-         print(f"Error in audio stream handler: {e}")
-         return "", update_vad_indicator(), f"Fehler: {str(e)[:50]}"

  # =====================================================
- # TTS
  # =====================================================
  def read_last_answer(history):
      if not history:
-         print("DEBUG: No history for TTS")
          return None
      for msg in reversed(history):
-         if
-
-
-             content = content.split("## 📚 Quellen")[0].strip()
-             print(f"DEBUG: Synthesizing speech for: {content[:100]}...")
-             audio_result = synthesize_speech(content)
-             if audio_result:
-                 print("DEBUG: TTS successful")
-                 return audio_result
-     print("DEBUG: No assistant message found for TTS")
      return None

  # =====================================================
- # UI – GRADIO
  # =====================================================
-
-
- gr.
-
-
-
-         border-radius: 15px;
-         color: white;
-     }
-
-     .control-panel {
-         background: #f8f9fa;
-         padding: 20px;
-         border-radius: 15px;
-         margin-bottom: 20px;
-         border: 1px solid #e2e8f0;
-     }
-
-     .chat-container {
-         background: white;
-         border-radius: 15px;
-         padding: 20px;
-         box-shadow: 0 4px 20px rgba(0,0,0,0.1);
-         margin-bottom: 20px;
-     }
-
-     .input-row {
-         background: #f8fafc;
-         border-radius: 25px;
-         padding: 10px 20px;
-         border: 2px solid #e2e8f0;
-         transition: all 0.3s ease;
-         display: flex;
-         align-items: center;
-         gap: 10px;
-     }
-
-     .input-row:focus-within {
-         border-color: #667eea;
-         box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
-     }
-
-     .send-btn {
-         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-         color: white !important;
-         border: none !important;
-         border-radius: 50% !important;
-         width: 44px !important;
-         height: 44px !important;
-         display: flex !important;
-         align-items: center !important;
-         justify-content: center !important;
-         cursor: pointer !important;
-     }
-
-     .send-btn:hover {
-         transform: scale(1.05);
-         box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
-     }
-
-     .vad-indicator-container {
-         padding: 10px;
-         background: #f1f5f9;
-         border-radius: 10px;
-         margin: 10px 0;
-         display: flex;
-         align-items: center;
-         gap: 10px;
-     }
-
-     .feature-badge {
-         display: inline-block;
-         padding: 4px 12px;
-         background: #e0e7ff;
-         color: #4f46e5;
-         border-radius: 20px;
-         font-size: 12px;
-         font-weight: 500;
-         margin: 2px;
-     }
-
-     .chatbot {
-         min-height: 400px;
-         max-height: 500px;
-         overflow-y: auto;
-     }
-
-     /* Responsive design */
-     @media (max-width: 768px) {
-         .gradio-container {
-             padding: 10px;
-         }
-
-         .input-row {
-             flex-direction: column;
-             gap: 10px;
-         }
-
-         .send-btn {
-             width: 100% !important;
-             height: 44px !important;
-             border-radius: 10px !important;
-         }
-     }
-     </style>
-     """)
-
-     # Header
-     with gr.Column(elem_classes=["header"]):
-         gr.Markdown("# 🧑⚖️ Prüfungsrechts-Chatbot")
-         gr.Markdown("### Intelligent Voice Interface with Advanced Features")
-
-         # Feature badges
-         gr.HTML("""
-         <div style="text-align: center; margin: 10px 0;">
-             <span class="feature-badge">🎤 Voice Activity Detection</span>
-             <span class="feature-badge">⚡ Fast Transcription</span>
-             <span class="feature-badge">🧠 Conversational AI</span>
-             <span class="feature-badge">📚 Document RAG</span>
-         </div>
-         """)
-
-     # Control Panel
-     with gr.Column(elem_classes=["control-panel"]):
-         with gr.Row():
-             with gr.Column(scale=2):
-                 # Model Selection
-                 model_selector = gr.Dropdown(
-                     choices=["tiny", "base", "small", "medium"],
-                     value=state.whisper_model,
-                     label="Whisper Model",
-                     info="Wählen Sie das Modell für Spracherkennung"
-                 )
-
-                 # VAD Control
-                 vad_toggle = gr.Checkbox(
-                     value=ENABLE_VAD,
-                     label="Voice Activity Detection aktivieren",
-                     info="Automatische Spracherkennung"
-                 )
-
-                 # Language Selection
-                 lang_selector = gr.Dropdown(
-                     choices=["de", "en", "auto"],
-                     value=ASR_LANGUAGE_HINT,
-                     label="Spracherkennung Sprache"
-                 )
-
-             with gr.Column(scale=1):
-                 # Status Display
-                 status_display = gr.Textbox(
-                     label="System Status",
-                     value="Bereit",
-                     interactive=False
-                 )
-
-                 # Clear Conversation Button
-                 clear_btn = gr.Button("🗑️ Konversation löschen", variant="secondary", size="sm")
-
-         # VAD Indicator
-         vad_indicator = gr.HTML(value=update_vad_indicator(), label="VAD Status")
-
-     # Main Chat Interface
-     with gr.Column(elem_classes=["chat-container"]):
-         # Chatbot Display
-         chatbot = gr.Chatbot(
-             label="Konversation",
-             height=400,
-             avatar_images=(None, "🤖")
-         )
-
-         # Input row with VAD indicator
-         with gr.Row(elem_classes=["input-row"]):
-             # Text Input
-             chat_text = gr.Textbox(
-                 label=None,
-                 placeholder="Stellen Sie eine Frage oder sprechen Sie ins Mikrofon...",
-                 lines=1,
-                 max_lines=4,
-                 scale=8,
-                 container=False,
-                 show_label=False
              )
-
-             #
-
-
              )
-
-             # Send Button
-             send_btn = gr.Button("➤", variant="primary", elem_classes=["send-btn"], scale=1)
-
-         # TTS Controls
-         with gr.Row():
-             tts_btn = gr.Button("🔊 Antwort vorlesen", variant="secondary", size="sm")
-             tts_audio = gr.Audio(label="Audio Ausgabe", interactive=False, visible=False)
-             tts_status = gr.Textbox(label="TTS Status", interactive=False, visible=False)
-
-     # Documents Section
-     with gr.Accordion("📚 Quellen & Dokumente", open=False):
-         with gr.Tabs():
-             with gr.TabItem("📄 Prüfungsordnung (PDF)"):
-                 PDF(pdf_meta["pdf_url"], height=300)
-
-             with gr.TabItem("📘 Hochschulgesetz NRW"):
-                 if isinstance(hg_url, str) and hg_url.startswith("http"):
-                     gr.Markdown(f"### [Im Viewer öffnen]({hg_url})")
-                     gr.HTML(f'<iframe src="{hg_url}" width="100%" height="500px" style="border: 1px solid #ddd; border-radius: 8px;"></iframe>')
-                 else:
-                     gr.Markdown("Viewer-Link nicht verfügbar.")
-
-     # =====================================================
-     # EVENT HANDLERS
-     # =====================================================
-
-     # Model Selection
-     model_selector.change(
-         change_whisper_model,
-         inputs=[model_selector],
-         outputs=[status_display]
-     )
-
-     # VAD Toggle
-     vad_toggle.change(
-         toggle_vad,
-         inputs=[vad_toggle],
-         outputs=[status_display]
-     )
-
-     # Clear Conversation
-     clear_btn.click(
-         clear_conversation,
-         outputs=[chatbot, status_display]
-     ).then(
-         lambda: update_vad_indicator(),
-         outputs=[vad_indicator]
-     )
-
-     # Main Chat Function
-     def process_chat(text_input, audio_path, history, lang_sel, use_vad):
-         """Wrapper function for handling the chat"""
-         try:
-             return chat_fn(text_input, audio_path, history, lang_sel, use_vad)
-         except Exception as e:
-             print(f"Error in process_chat: {e}")
-             error_msg = f"Fehler: {str(e)}"
-             if history is None:
-                 history = []
-             return history, "", None, error_msg
-
-     # Send Button Click
-     send_btn.click(
-         process_chat,
-         inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
-         outputs=[chatbot, chat_text, chat_audio, status_display]
-     ).then(
-         lambda: update_vad_indicator(),
-         outputs=[vad_indicator]
-     )
-
-     # Text Submit (Enter key)
-     chat_text.submit(
-         process_chat,
-         inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
-         outputs=[chatbot, chat_text, chat_audio, status_display]
-     ).then(
-         lambda: update_vad_indicator(),
-         outputs=[vad_indicator]
-     )
-
-     # Audio Change Handler
-     def on_audio_change(audio_path, use_vad):
-         """Handle audio changes"""
-         if audio_path:
-             print(f"DEBUG: Audio changed: {audio_path}")
-             # Remember the recording path so the Send button can use it
-             state.current_audio_path = audio_path
-             # Handle streaming
-             text, vad_html, status = handle_audio_stream(audio_path, use_vad)
-             return text, vad_html, status
-         return "", update_vad_indicator(), "Bereit"
-
-     chat_audio.change(
-         on_audio_change,
-         inputs=[chat_audio, vad_toggle],
-         outputs=[chat_text, vad_indicator, status_display]
-     )
-
-     # Process immediately when user stops recording
-     def on_audio_stop(audio_path, history, lang_sel, use_vad):
-         print(f"DEBUG: stop_recording with audio_path={audio_path}")
-         state.current_audio_path = audio_path
-         return chat_fn("", audio_path, history, lang_sel, use_vad)
-
-     chat_audio.stop_recording(
-         on_audio_stop,
-         inputs=[chat_audio, chatbot, lang_selector, vad_toggle],
-         outputs=[chatbot, chat_text, chat_audio, status_display]
-     )
-
-     # Streaming handler removed; process on change after user stops recording
-
-     # TTS Button
-     def handle_tts(history):
-         """Handle TTS"""
-         audio_result = read_last_answer(history)
-         if audio_result:
-             return audio_result, "Audio wird abgespielt..."
-         return None, "Keine Antwort zum Vorlesen gefunden"
-
-     tts_btn.click(
-         handle_tts,
-         inputs=[chatbot],
-         outputs=[tts_audio, tts_status]
-     ).then(
-         lambda: gr.Audio(visible=True),
-         outputs=[tts_audio]
-     ).then(
-         lambda: gr.Textbox(visible=True),
-         outputs=[tts_status]
-     )

  if __name__ == "__main__":
-     demo.
+ # app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
+ # Version 26.11 – without modes, stable for text + voice
+
  import gradio as gr
  from gradio_pdf import PDF
+ from huggingface_hub import hf_hub_download

+ from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
  from split_documents import split_documents
  from vectorstore import build_vectorstore
  from retriever import get_retriever
  from llm import load_llm
+ from rag_pipeline import answer, PDF_BASE_URL, LAW_URL

+ from speech_io import transcribe_audio, synthesize_speech

  # =====================================================
  # INITIALIZATION (global)
  # =====================================================

+ print("🔹 Lade Dokumente ...")
+ _docs = load_documents()

+ print("🔹 Splitte Dokumente ...")
+ _chunks = split_documents(_docs)

+ print("🔹 Baue VectorStore (FAISS) ...")
+ _vs = build_vectorstore(_chunks)

+ print("🔹 Erzeuge Retriever ...")
+ _retriever = get_retriever(_vs)

+ print("🔹 Lade LLM ...")
+ _llm = load_llm()

+ print("🔹 Lade Dateien für Viewer …")
+ _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
+ _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")

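The startup block above builds the whole RAG stack once at import time: raw documents, then chunks, then a FAISS index, then a retriever, then the LLM. The bodies of build_vectorstore and get_retriever are not part of this diff; a minimal sketch of what they could look like with LangChain's FAISS wrapper (the module paths, the embedding model name and k=4 are assumptions, not taken from this repo):

    # hypothetical sketch of vectorstore.py / retriever.py, not the repo's actual code
    from langchain_community.vectorstores import FAISS
    from langchain_community.embeddings import HuggingFaceEmbeddings

    def build_vectorstore(chunks):
        # embed every chunk once and index the vectors in an in-memory FAISS store
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        return FAISS.from_documents(chunks, embeddings)

    def get_retriever(vs, k=4):
        # wrap the index as a retriever that returns the k nearest chunks per query
        return vs.as_retriever(search_kwargs={"k": k})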
  # =====================================================
+ # Format sources – Markdown for the chat
  # =====================================================

+ def format_sources_markdown(sources):
+     if not sources:
          return ""

+     lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
+     for s in sources:
+         sid = s["id"]
+         src = s["source"]
+         page = s["page"]
+         url = s["url"]
+         snippet = s["snippet"]

+         title = f"Quelle {sid} – {src}"

+         if url:
+             base = f"- [{title}]({url})"
+         else:
+             base = f"- {title}"

+         if page and "Prüfungsordnung" in src:
+             base += f", Seite {page}"

+         lines.append(base)

+         if snippet:
+             lines.append(f" > {snippet}")
+
+     return "\n".join(lines)

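format_sources_markdown expects each source to be a dict with the keys id, source, page, url and snippet, and only appends a page number for hits in the Prüfungsordnung. A worked example (all values invented for illustration):

    sources = [{
        "id": 1,
        "source": "Prüfungsordnung",
        "page": 12,
        "url": "https://example.org/po.pdf",  # hypothetical URL
        "snippet": "Eine Klausur kann einmal wiederholt werden.",
    }]
    print(format_sources_markdown(sources))
    # **📚 Quellen (genutzte Dokumentstellen):**
    # - [Quelle 1 – Prüfungsordnung](https://example.org/po.pdf), Seite 12
    #  > Eine Klausur kann einmal wiederholt werden.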
  # =====================================================
+ # TEXT CHATBOT
  # =====================================================
+
+ def chatbot_text(user_message, history):
+     if not user_message:
+         return history, ""
+
+     answer_text, sources = answer(
+         question=user_message,
+         retriever=_retriever,
+         chat_model=_llm,
+     )
+
+     quellen_block = format_sources_markdown(sources)
+
+     history = history + [
+         {"role": "user", "content": user_message},
+         {"role": "assistant", "content": answer_text + quellen_block},
+     ]
+
+     return history, ""

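chatbot_text returns the grown history plus an empty string, so one handler can both update the gr.Chatbot and clear the textbox. The history it builds is the openai-style message list that gr.Chatbot(type="messages") consumes; one turn looks roughly like this (the assistant text is illustrative):

    history, cleared = chatbot_text("Wie oft darf ich eine Klausur wiederholen?", [])
    # history == [
    #     {"role": "user", "content": "Wie oft darf ich eine Klausur wiederholen?"},
    #     {"role": "assistant", "content": "<Antwort>\n**📚 Quellen (genutzte Dokumentstellen):**\n- ..."},
    # ]
    # cleared == ""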
  # =====================================================
+ # VOICE CHATBOT
  # =====================================================
+
+ def chatbot_voice(audio_path, history):
+     # 1. Speech → Text
+     text = transcribe_audio(audio_path)
+     if not text:
+         return history, None, ""
+
+     # Save to the chat history
+     history = history + [{"role": "user", "content": text}]
+
+     # 2. RAG answer
+     answer_text, sources = answer(
+         question=text,
+         retriever=_retriever,
+         chat_model=_llm,
+     )
+     quellen_block = format_sources_markdown(sources)
+
+     bot_msg = answer_text + quellen_block
+     history = history + [{"role": "assistant", "content": bot_msg}]
+
+     # 3. Text → Speech
+     audio = synthesize_speech(bot_msg)
+
+     return history, audio, ""

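chatbot_voice chains the three stages in a single callback and bails out early when the transcription is empty, so a silent recording never produces an empty user turn. The same roundtrip outside Gradio would be (the WAV path is made up):

    text = transcribe_audio("/tmp/frage.wav")                                      # 1. speech -> text
    reply, sources = answer(question=text, retriever=_retriever, chat_model=_llm)  # 2. RAG
    sr, pcm = synthesize_speech(reply + format_sources_markdown(sources))          # 3. text -> speech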
  # =====================================================
+ # LAST ANSWER → TTS
  # =====================================================
+
  def read_last_answer(history):
      if not history:
          return None
+
      for msg in reversed(history):
+         if msg["role"] == "assistant":
+             return synthesize_speech(msg["content"])
+
      return None

  # =====================================================
+ # UI – GRADIO
  # =====================================================
+
+ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
+     gr.Markdown("# 🧑⚖️ Prüfungsrechts-Chatbot")
+     gr.Markdown(
+         "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
+         "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
+         "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot(type="messages", label="Chat", height=500)
+
+             msg = gr.Textbox(
+                 label="Frage eingeben",
+                 placeholder="Stelle deine Frage zum Prüfungsrecht …",
              )
+
+             # SEND TEXT
+             msg.submit(
+                 chatbot_text,
+                 [msg, chatbot],
+                 [chatbot, msg]
+             )
+
+             send_btn = gr.Button("Senden (Text)")
+             send_btn.click(
+                 chatbot_text,
+                 [msg, chatbot],
+                 [chatbot, msg]
+             )
+
+             # VOICE INPUT
+             gr.Markdown("### 🎙️ Spracheingabe")
+             voice_in = gr.Audio(sources=["microphone"], type="filepath")
+             voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
+
+             voice_btn = gr.Button("Sprechen & senden")
+             voice_btn.click(
+                 chatbot_voice,
+                 [voice_in, chatbot],
+                 [chatbot, voice_out, msg]
+             )
+
+             read_btn = gr.Button("🔁 Antwort erneut vorlesen")
+             read_btn.click(
+                 read_last_answer,
+                 [chatbot],
+                 [voice_out]
+             )
+
+             clear_btn = gr.Button("Chat zurücksetzen")
+             clear_btn.click(lambda: [], None, chatbot)
+
+         # =====================
+         # RIGHT COLUMN: Viewer
+         # =====================
+
+         with gr.Column(scale=1):
+             gr.Markdown("### 📄 Prüfungsordnung (PDF)")
+             PDF(_pdf_path, height=350)
+
+             gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
+             gr.HTML(
+                 f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
              )

  if __name__ == "__main__":
+     demo.launch()
+
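voice_out is declared as gr.Audio(type="numpy"), so every callback that feeds it has to return a (sample_rate, numpy array) tuple; synthesize_speech in speech_io.py below returns exactly that shape. A minimal standalone check with a sine beep instead of TTS (purely illustrative):

    import numpy as np
    import gradio as gr

    def beep():
        sr = 16000
        t = np.linspace(0, 0.5, sr // 2, endpoint=False)
        tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
        return (sr, tone)  # same (sr, int16 array) shape that gr.Audio(type="numpy") expects

    with gr.Blocks() as check:
        out = gr.Audio(type="numpy")
        gr.Button("Beep").click(beep, None, out)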
speech_io.py CHANGED
@@ -1,455 +1,158 @@
  """
- speech_io.py

- Speech-based input/output
- - Speech-to-Text (STT)
- - Text-to-Speech (TTS)
-
  """

- import
- import time
- from typing import Optional, Tuple, Dict, Any
  import numpy as np
  import soundfile as sf
- from scipy.signal import butter, filtfilt
- import
-

  # ========================================================
- #
  # ========================================================
- # Model Selection
- WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
- ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
- TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")

-
-
-

- #
-
-
- ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))

-
-
- _tts

  # ========================================================
- # AUDIO
  # ========================================================
  def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
-     """High-pass filter to remove low-frequency noise"""
-     if len(data) == 0:
-         return data
-
      nyq = 0.5 * fs
-
-     b, a = butter(order,
      return filtfilt(b, a, data)

- def apply_fade(audio, sr,
-
-
-
-     fade_in_samples = int(sr * fade_in_ms / 1000)
-     fade_out_samples = int(sr * fade_out_ms / 1000)
-
-     # Make sure there are enough samples
-     if len(audio) < fade_in_samples + fade_out_samples:
          return audio
-
-     # Fade in
-     if fade_in_samples > 0:
-         fade_in_curve = np.linspace(0, 1, fade_in_samples)
-         audio[:fade_in_samples] *= fade_in_curve
-
-     # Fade out
-     if fade_out_samples > 0:
-         fade_out_curve = np.linspace(1, 0, fade_out_samples)
-         audio[-fade_out_samples:] *= fade_out_curve
-
-     return audio

-
-
-     if len(audio_data) == 0:
-         return audio_data
-
-     # Convert to float32
-     if audio_data.dtype != np.float32:
-         audio_data = audio_data.astype(np.float32)
-
-     # Normalize
-     max_val = np.max(np.abs(audio_data))
-     if max_val > 0:
-         audio_data = audio_data / max_val
-
-     return audio_data
-
- def preprocess_audio_for_vad(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
-     """Pre-process audio for VAD"""
-     if len(audio_data) == 0:
-         return audio_data
-
-     # Convert to mono if needed
-     if len(audio_data.shape) > 1:
-         audio_data = np.mean(audio_data, axis=1)
-
-     # Normalize
-     audio_data = normalize_audio(audio_data)
-
-     # High-pass filter to remove low-frequency noise
-     try:
-         audio_data = butter_highpass_filter(audio_data, cutoff=80, fs=sample_rate)
-     except:
-         pass
-
-     return audio_data

-
-
-
-     audio_data: np.ndarray,
-     sample_rate: int,
-     threshold: float = 0.3,
-     min_duration: float = 0.1
- ) -> Dict[str, Any]:
-     """
-     Voice-activity detection – a simple version that works
-
-     Args:
-         audio_data: numpy array of audio samples
-         sample_rate: sampling rate
-         threshold: detection threshold (0-1)
-         min_duration: minimum duration to count as speech (seconds)
-
-     Returns:
-         Dict with the detection info
-     """
-     if len(audio_data) == 0:
-         return {
-             "is_speech": False,
-             "confidence": 0.0,
-             "speech_segments": [],
-             "energy": 0.0,
-             "message": "Empty audio data"
-         }
-
-     try:
-         # Pre-process the audio
-         processed_audio = preprocess_audio_for_vad(audio_data, sample_rate)
-
-         # Compute the features
-         duration = len(processed_audio) / sample_rate
-
-         # 1. RMS energy
-         rms_energy = np.sqrt(np.mean(processed_audio ** 2))
-
-         # 2. Zero-crossing rate
-         zero_crossings = np.sum(np.abs(np.diff(np.sign(processed_audio)))) / (2 * len(processed_audio))
-
-         # 3. Spectral centroid (simple)
-         # Use the FFT to compute the frequency distribution
-         if len(processed_audio) >= 256:
-             fft_size = min(2048, len(processed_audio))
-             spectrum = np.abs(np.fft.rfft(processed_audio[:fft_size]))
-             frequencies = np.fft.rfftfreq(fft_size, 1/sample_rate)
-             if np.sum(spectrum) > 0:
-                 spectral_centroid = np.sum(frequencies * spectrum) / np.sum(spectrum)
-             else:
-                 spectral_centroid = 0
-         else:
-             spectral_centroid = 0
-
-         # 4. Frame-based analysis
-         frame_length = int(sample_rate * 0.03)  # 30 ms frames
-         hop_length = int(frame_length / 2)
-
-         if len(processed_audio) > frame_length:
-             num_frames = 1 + (len(processed_audio) - frame_length) // hop_length
-             frame_energies = []
-
-             for i in range(num_frames):
-                 start = i * hop_length
-                 end = start + frame_length
-                 frame = processed_audio[start:end]
-                 frame_energy = np.sqrt(np.mean(frame ** 2))
-                 frame_energies.append(frame_energy)
-
-             # Compute the speech ratio
-             if frame_energies:
-                 energy_threshold = np.percentile(frame_energies, 30) + threshold * (np.max(frame_energies) - np.percentile(frame_energies, 30))
-                 speech_frames = sum(1 for e in frame_energies if e > energy_threshold)
-                 speech_ratio = speech_frames / len(frame_energies)
-             else:
-                 speech_ratio = 0
-         else:
-             speech_ratio = 0
-
-         # 5. Combine the features into a confidence score
-         # Speech typically has:
-         # - high RMS energy
-         # - a moderate zero-crossing rate (not as high as noise, not as low as silence)
-         # - a spectral centroid in the 100-3000 Hz range for voice
-         # - a high speech ratio
-
-         # Compute the confidence score
-         energy_score = min(1.0, rms_energy * 10)  # scale the energy
-
-         # Zero-crossing-rate score: ideally around 0.1-0.3 for speech
-         if 0.05 < zero_crossings < 0.4:
-             zcr_score = 1.0 - 2 * abs(zero_crossings - 0.2)  # peak at 0.2
-         else:
-             zcr_score = 0.0
-
-         # Spectral-centroid score: ideally 100-3000 Hz
-         if 100 < spectral_centroid < 3000:
-             centroid_score = 1.0
-         elif 50 < spectral_centroid < 5000:
-             centroid_score = 0.5
-         else:
-             centroid_score = 0.0
-
-         # Speech-ratio score
-         speech_ratio_score = speech_ratio
-
-         # Combine the scores
-         weights = [0.4, 0.2, 0.2, 0.2]  # energy, zcr, centroid, speech_ratio
-         confidence = (
-             weights[0] * energy_score +
-             weights[1] * zcr_score +
-             weights[2] * centroid_score +
-             weights[3] * speech_ratio_score
-         )
-
-         # Apply the threshold
-         is_speech = confidence > threshold
-
-         # Check the minimum duration
-         if duration < min_duration:
-             is_speech = False
-             confidence = max(0, confidence - 0.2)
-
-         # Debug info
-         debug_info = {
-             "duration": duration,
-             "rms_energy": rms_energy,
-             "zero_crossings": zero_crossings,
-             "spectral_centroid": spectral_centroid,
-             "speech_ratio": speech_ratio,
-             "energy_score": energy_score,
-             "zcr_score": zcr_score,
-             "centroid_score": centroid_score,
-             "speech_ratio_score": speech_ratio_score,
-             "final_confidence": confidence,
-             "is_speech": is_speech
-         }
-
-         print(f"VAD Debug: {debug_info}")
-
-         return {
-             "is_speech": is_speech,
-             "confidence": float(confidence),
-             "speech_segments": [[0, duration]] if is_speech else [],
-             "energy": float(rms_energy),
-             "message": f"Speech: {is_speech}, Confidence: {confidence:.3f}"
-         }
-
-     except Exception as e:
-         print(f"VAD processing error: {e}")
-         return {
-             "is_speech": False,
-             "confidence": 0.0,
-             "speech_segments": [],
-             "energy": 0.0,
-             "message": f"Error: {str(e)}"
-         }

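The removed detector blends four features with fixed weights (0.4 energy, 0.2 zero-crossing rate, 0.2 spectral centroid, 0.2 speech ratio). Worked through with invented scores: energy_score 0.8, zcr_score 0.9, centroid_score 1.0 and speech_ratio 0.6 give confidence = 0.4*0.8 + 0.2*0.9 + 0.2*1.0 + 0.2*0.6 = 0.82, well above the default threshold of 0.3, so the clip counts as speech as long as it also exceeds min_duration.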
  # ========================================================
- # SPEECH-TO-TEXT
  # ========================================================
- def transcribe_audio(
-     audio_path: str,
-     language: Optional[str] = None,
-     max_duration_s: int = ASR_MAX_DURATION_S
- ) -> str:
-     """
-     Transcribe audio with the OpenAI Whisper API
-     """
-     if not audio_path or not os.path.exists(audio_path):
-         print(">>> Kein Audio gefunden.")
-         return ""
-     if not OPENAI_API_KEY:
-         print(">>> OPENAI_API_KEY nicht gesetzt.")
-         return ""
-     try:
-         from openai import OpenAI
-         client = OpenAI(api_key=OPENAI_API_KEY)
-         with open(audio_path, "rb") as f:
-             resp = client.audio.transcriptions.create(
-                 model="whisper-1",
-                 file=f,
-                 language=language if language and language != "auto" else None,
-                 response_format="text"
-             )
-         text = resp.text if hasattr(resp, "text") else (resp.get("text", "") if isinstance(resp, dict) else str(resp))
-         text = fix_domain_terms(text.strip())
-         print(f">>> Transkription (OpenAI): {text}")
-         return text
-     except Exception as e:
-         print(f">>> Transkriptionsfehler (OpenAI): {e}")
-         return ""

- def transcribe_audio(
-     audio_path: str,
-     language: Optional[str] = None,
-     max_duration_s: int = ASR_MAX_DURATION_S
- ) -> str:
      """
-
      """
-
-
-         return ""
-
-     try:
-         # Read the audio file
-         data, sr = sf.read(audio_path, always_2d=False)
-
-         if data is None or data.size == 0:
-             print(">>> Audio leer.")
-             return ""
-
-         # Convert to mono
-         if len(data.shape) > 1:
-             data = np.mean(data, axis=1)
-
-         # Pre-processing
-         data = data.astype(np.float32)
-         max_val = np.max(np.abs(data))
-         if max_val > 0:
-             data = data / max_val
-
-         # Resample to 16 kHz if needed
-         TARGET_SR = 16000
-         if sr != TARGET_SR:
-             target_len = int(len(data) * TARGET_SR / sr)
-             data = resample(data, target_len)
-             sr = TARGET_SR
-
-         # Cap the length
-         MAX_SAMPLES = sr * max_duration_s
-         if len(data) > MAX_SAMPLES:
-             data = data[:MAX_SAMPLES]
-
-         # Get the pipeline
-         asr = get_asr_pipeline()
-
-         # Configure the language
-         lang = language
-         if not lang and ASR_DEFAULT_LANGUAGE and ASR_DEFAULT_LANGUAGE.lower() != "auto":
-             lang = ASR_DEFAULT_LANGUAGE
-         if isinstance(lang, str) and lang.lower() == "auto":
-             lang = None
-
-         # Transcribe
-         print(f">>> Transkribiere mit Whisper-{WHISPER_MODEL}...")
-         call_kwargs = {}
-
-         if lang:
-             call_kwargs["generate_kwargs"] = {
-                 "language": lang,
-                 "task": "transcribe",
-                 "max_new_tokens": 120,
-                 "temperature": 0.0,
-             }
-
-         result = asr({"array": data, "sampling_rate": sr}, **call_kwargs)
-
-         text = result.get("text", "") if isinstance(result, dict) else str(result)
-         text = text.strip()
-
-         # Fix domain terms
-         text = fix_domain_terms(text)
-
-         print(f">>> Transkription: {text}")
-         return text
-
-     except Exception as e:
-         print(f">>> Transkriptionsfehler: {e}")
          return ""

  # ========================================================
  # TEXT-TO-SPEECH (TTS)
  # ========================================================
-
-
-
-     """
-     if not text or not text.strip() or not TTS_ENABLED or not OPENAI_API_KEY:
          return None
      try:
-
-
-
-             model="tts-1",
-             voice="nova",
-             input=text[:4000],
-             response_format="wav"
-         )
-         import io
-         audio_bytes = response.content
-         with io.BytesIO(audio_bytes) as f:
-             data, sr = sf.read(f)
-         if len(data.shape) > 1:
-             data = np.mean(data, axis=1)
-         if data.dtype == np.float32 or data.dtype == np.float64:
-             data = np.clip(data * 32767, -32768, 32767).astype(np.int16)
-         return (sr, data)
-     except Exception as e:
-         print(f">>> TTS Fehler (OpenAI): {e}")
-         return None

- #
-
-
-
-         (r"\bbrieft\s*um\b", "prüfung"),
-         (r"\bbriefung\b", "prüfung"),
-         (r"\bpruefung\b", "prüfung"),
-         (r"\bhochschule\s*gesetz\b", "hochschulgesetz"),
-     ]
-
-     for pattern, replacement in correction_pairs:
-         text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
-
-     return text

- # ========================================================
- # MAIN EXPORT
- # ========================================================
- __all__ = [
-     'transcribe_audio',
-     'synthesize_speech',
-     'detect_voice_activity',
-     'normalize_audio',
-     'preprocess_audio_for_vad'
- ]
  """
+ speech_io.py

+ Speech-based input/output:
+ - Speech-to-Text (STT) with Whisper (transformers.pipeline)
+ - Text-to-Speech (TTS) with MMS-TTS German
+
+ This file runs stably on HuggingFace Spaces.
  """

+ from typing import Optional, Tuple
  import numpy as np
  import soundfile as sf
+ from scipy.signal import butter, filtfilt
+ from transformers import pipeline
+
+ # Models
+ ASR_MODEL_ID = "openai/whisper-small"
+ TTS_MODEL_ID = "facebook/mms-tts-deu"
+
+ _asr = None
+ _tts = None

  # ========================================================
+ # STT PIPELINE
  # ========================================================

+ def get_asr_pipeline():
+     global _asr
+     if _asr is None:
+         print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
+         _asr = pipeline(
+             task="automatic-speech-recognition",
+             model=ASR_MODEL_ID,
+             device="cpu",
+             return_timestamps=True,  # important
+             chunk_length_s=30        # auto-chunk for long audio
+         )
+     return _asr

+ # ========================================================
+ # TTS PIPELINE
+ # ========================================================

+ def get_tts_pipeline():
+     global _tts
+     if _tts is None:
+         print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
+         _tts = pipeline(
+             task="text-to-speech",
+             model=TTS_MODEL_ID,
+         )
+     return _tts

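Both loaders are lazy singletons: the first call pays the download and model load, every later call returns the cached pipeline object, so only the first voice interaction after a Space restart is slow. For example:

    asr = get_asr_pipeline()   # first call: loads openai/whisper-small
    asr2 = get_asr_pipeline()  # later calls: return the same cached object
    assert asr is asr2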
  # ========================================================
+ # AUDIO FILTER – Noise Reduction + Highpass
  # ========================================================
+
  def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
      nyq = 0.5 * fs
+     norm_cutoff = cutoff / nyq
+     b, a = butter(order, norm_cutoff, btype="high")
      return filtfilt(b, a, data)

+ def apply_fade(audio, sr, duration_ms=10):
+     fade_samples = int(sr * duration_ms / 1000)
+
+     if fade_samples * 2 >= len(audio):
          return audio

+     fade_in_curve = np.linspace(0, 1, fade_samples)
+     audio[:fade_samples] *= fade_in_curve

+     fade_out_curve = np.linspace(1, 0, fade_samples)
+     audio[-fade_samples:] *= fade_out_curve
+
+     return audio

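butter_highpass_filter normalizes the cutoff against the Nyquist frequency before designing the filter (60 Hz / 8000 Hz = 0.0075 at fs=16000), and apply_fade turns the 10 ms default into int(16000 * 10 / 1000) = 160 linearly ramped samples at each end. A quick sanity check on synthetic input:

    import numpy as np
    x = np.random.randn(16000)                   # 1 s of noise at 16 kHz
    y = butter_highpass_filter(x, cutoff=60, fs=16000)
    y = apply_fade(y, sr=16000, duration_ms=10)  # 160-sample fade-in/out
    assert y[0] == 0.0                           # first sample is ramped to zero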
  # ========================================================
+ # SPEECH-TO-TEXT (STT)
  # ========================================================

+ def transcribe_audio(audio_path: str) -> str:
      """
+     audio_path: path to a WAV file (from gr.Audio type="filepath")
      """
+
+     if audio_path is None:
          return ""

+     # Read the WAV (soundfile decodes the PCM correctly)
+     data, sr = sf.read(audio_path)
+
+     # always mono
+     if len(data.shape) > 1:
+         data = data[:, 0]
+
+     # avoid Whisper inputs longer than 30 s
+     MAX_SAMPLES = sr * 30
+     if len(data) > MAX_SAMPLES:
+         data = data[:MAX_SAMPLES]
+
+     asr = get_asr_pipeline()
+
+     print(">>> Transkribiere Audio...")
+     result = asr(
+         {"array": data, "sampling_rate": sr},
+     )
+
+     text = result.get("text", "").strip()
+     print("ASR:", text)
+     return text
+
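transcribe_audio hard-truncates the recording to the first 30 seconds even though the pipeline was created with chunk_length_s=30, which keeps a single CPU transcription bounded. Calling it is a one-liner (the path is a made-up example of what gr.Audio(type="filepath") hands over):

    text = transcribe_audio("/tmp/gradio/recording.wav")
    if text:
        print("User said:", text)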
  # ========================================================
  # TEXT-TO-SPEECH (TTS)
  # ========================================================
+
+ def synthesize_speech(text: str):
+     if not text or not text.strip():
          return None
+
+     tts = get_tts_pipeline()
+     out = tts(text)
+
+     # raw audio from MMS (float32 in [-1, 1])
+     audio = np.array(out["audio"], dtype=np.float32)
+     sr = out.get("sampling_rate", 16000)
+
+     # ===== fix the sample rate =====
+     if sr is None or sr <= 0 or sr > 65535:
+         sr = 16000
+
+     # ===== force mono =====
+     if audio.ndim > 1:
+         audio = audio.squeeze()
+     if audio.ndim > 1:
+         audio = audio[:, 0]
+
+     # ===== noise reduction =====
      try:
+         audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
+     except:
+         pass

+     # ===== normalize =====
+     max_val = np.max(np.abs(audio))
+     if max_val > 0:
+         audio = audio / max_val
+
+     # ===== fade to avoid pops =====
+     audio = apply_fade(audio, sr)
+
+     # ===== int16 =====
+     audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
+
+     # return value: (sr, np.int16 array)
+     return (sr, audio_int16)
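The final cast maps the normalized float signal onto the int16 range: a sample at 0.5 becomes int(0.5 * 32767) = 16383, and the clip guards the -32768/32767 edges before the dtype conversion, e.g.:

    audio = np.array([-1.0, 0.0, 0.5, 1.0], dtype=np.float32)
    np.clip(audio * 32767, -32768, 32767).astype(np.int16)  # -> [-32767, 0, 16383, 32767]

Gradio then serializes the returned (sr, int16 array) tuple directly into playable audio for the voice_out component.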