Spaces:

Nguyen5
/

chatbot1

Sleeping

App Files Files Community

Nguyen5 committed on Dec 8, 2025

Commit

ed0df67

1 Parent(s): e2535a9

commit

Browse files

Files changed (3) hide show

app.py +332 -729
realtime_server.py +145 -97
speech_io.py +74 -422

app.py CHANGED Viewed

@@ -1,15 +1,12 @@
-# app.py – Prüfungsrechts-Chatbot với OpenAI Realtime API và Voice Agents
 import os
 import time
-import json
-import asyncio
-import threading
-from dataclasses import dataclass, field
-from typing import Optional, Dict, Any, List
 import gradio as gr
 from gradio_pdf import PDF
 import numpy as np
-import queue
 from openai import OpenAI
@@ -19,69 +16,20 @@ from vectorstore import build_vectorstore
 from retriever import get_retriever
 from llm import load_llm
 from rag_pipeline import answer
 # =====================================================
 # CONFIGURATION
 # =====================================================
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    raise RuntimeError("OPENAI_API_KEY is required for Realtime API")
 # Initialize OpenAI client
 openai_client = OpenAI(api_key=OPENAI_API_KEY)
-# =====================================================
-# STATE MANAGEMENT
-# =====================================================
-@dataclass
-class ConversationState:
-    """Quản lý trạng thái hội thoại"""
-    messages: list = field(default_factory=list)
-    is_streaming: bool = False
-    realtime_session_id: Optional[str] = None
-    audio_queue: queue.Queue = field(default_factory=queue.Queue)
-    text_queue: queue.Queue = field(default_factory=queue.Queue)
-    conversation_context: str = ""
-    def add_message(self, role: str, content: str):
-        """Thêm message vào hội thoại"""
-        self.messages.append({
-            "role": role,
-            "content": content,
-            "timestamp": time.time()
-        })
-        # Giới hạn lịch sử
-        if len(self.messages) > 20:
-            self.messages = self.messages[-20:]
-        # Cập nhật context
-        self._update_context()
-    def _update_context(self):
-        """Cập nhật context từ hội thoại"""
-        if not self.messages:
-            self.conversation_context = ""
-            return
-        context_parts = []
-        for msg in self.messages[-5:]:  # Giữ 5 message gần nhất
-            prefix = "User" if msg["role"] == "user" else "Assistant"
-            context_parts.append(f"{prefix}: {msg['content'][:200]}")
-        self.conversation_context = "\n".join(context_parts)
-    def reset(self):
-        """Reset trạng thái hội thoại"""
-        self.messages = []
-        self.conversation_context = ""
-        self.is_streaming = False
-        self.realtime_session_id = None
-        while not self.audio_queue.empty():
-            self.audio_queue.get()
-        while not self.text_queue.empty():
-            self.text_queue.get()
-# Khởi tạo state
-state = ConversationState()
 # =====================================================
 # INITIALIZATION - RAG Components
@@ -107,396 +55,145 @@ hg_meta = next(d.metadata for d in docs if d.metadata.get("type") == "hg")
 hg_url = hg_meta.get("viewer_url")
 # =====================================================
-# OPENAI REALTIME API HANDLERS
 # =====================================================
-class RealtimeEventHandler:
-    """Handler cho OpenAI Realtime API events"""
-    def __init__(self, state_ref):
-        self.state = state_ref
-        self.current_text = ""
-    def on_text_delta(self, delta, snapshot=None):
-        """Xử lý text delta từ Realtime API"""
-        if delta.value:
-            self.current_text += delta.value
-            # Thêm vào text queue để hiển thị
-            self.state.text_queue.put({
-                "type": "text_delta",
-                "content": delta.value
-            })
-    def on_audio_transcript_delta(self, delta, snapshot=None):
-        """Xử lý audio transcript từ Realtime API"""
-        if delta.text:
-            # Thêm vào text queue
-            self.state.text_queue.put({
-                "type": "transcript",
-                "content": delta.text
-            })
-    def on_audio_delta(self, delta, snapshot=None):
-        """Xử lý audio data từ Realtime API"""
-        if delta.data:
-            # Thêm vào audio queue để phát
-            self.state.audio_queue.put({
-                "type": "audio",
-                "data": delta.data
-            })
-    def on_response_created(self, response=None):
-        """Khi response được tạo"""
-        print("DEBUG: Response created")
-    def on_response_done(self, response=None):
-        """Khi response hoàn thành"""
-        print(f"DEBUG: Response done, final text: {self.current_text[:100]}...")
-        if self.current_text:
-            # Thêm message vào history
-            self.state.add_message("assistant", self.current_text)
-            # Signal end of response
-            self.state.text_queue.put({
-                "type": "response_end",
-                "content": self.current_text
-            })
-        self.current_text = ""
-        self.state.is_streaming = False
-    def on_error(self, error):
-        """Xử lý lỗi"""
-        print(f"DEBUG: Realtime API error: {error}")
-        self.state.is_streaming = False
-        self.state.text_queue.put({
-            "type": "error",
-            "content": f"Error: {str(error)}"
         })
-def start_realtime_conversation():
-    """Bắt đầu cuộc hội thoại Realtime API"""
-    try:
-        # Tạo Realtime session
-        session = openai_client.realtime.sessions.create(
-            model="gpt-4o-realtime-preview",
-            voice="shimmer",  # Có thể chọn: alloy, echo, fable, onyx, nova, shimmer
-            modalities=["text", "audio"],
-            instructions="""Du bist ein juristischer Assistent für Prüfungsrecht.
-            Du hilfst Studenten mit Fragen zu Prüfungsordnung und Hochschulgesetz NRW.
-            Antworte präzise, freundlich und professionell.
-            Bei unsicheren Fragen, verweise auf die offiziellen Dokumente."""
-        )
-        state.realtime_session_id = session.id
-        state.is_streaming = True
-        print(f"DEBUG: Realtime session started: {session.id}")
-        # Bắt đầu streaming
-        with openai_client.realtime.connect(
-            session_id=session.id,
-            event_handler=RealtimeEventHandler(state)
-        ) as connection:
-            # Keep connection alive
-            while state.is_streaming:
-                time.sleep(0.1)
-    except Exception as e:
-        print(f"DEBUG: Error in realtime conversation: {e}")
-        state.is_streaming = False
-def stop_realtime_conversation():
-    """Dừng cuộc hội thoại Realtime"""
-    state.is_streaming = False
-    if state.realtime_session_id:
-        try:
-            openai_client.realtime.sessions.delete(state.realtime_session_id)
-        except:
-            pass
-        state.realtime_session_id = None
-def send_text_to_realtime(text: str):
-    """Gửi text đến Realtime API"""
-    if not state.is_streaming or not state.realtime_session_id:
-        return False
-    try:
-        # Tạo Realtime client mới để gửi message
-        with openai_client.realtime.connect(session_id=state.realtime_session_id) as connection:
-            connection.send({
-                "type": "response.create",
-                "response": {
-                    "modalities": ["text", "audio"],
-                    "instructions": f"Antworte auf: {text}"
-                }
-            })
-        return True
-    except Exception as e:
-        print(f"DEBUG: Error sending to realtime: {e}")
-        return False
 # =====================================================
-# TOOLS (FUNCTION CALLS) FOR VOICE AGENT
 # =====================================================
-def search_documents_tool(query: str) -> Dict[str, Any]:
-    """Tool để tìm kiếm tài liệu"""
     try:
-        # Sử dụng retriever để tìm tài liệu liên quan
-        docs = retriever.invoke(query)
-        if not docs:
-            return {
-                "success": False,
-                "message": "Keine relevanten Dokumente gefunden.",
-                "documents": []
-            }
-        # Format kết quả
-        results = []
-        for i, doc in enumerate(docs[:3], 1):
-            meta = doc.metadata
-            source_type = meta.get("type", "unknown")
-            if source_type == "pdf":
-                source_info = {
-                    "type": "Prüfungsordnung",
-                    "page": meta.get("page"),
-                    "url": meta.get("pdf_url")
-                }
-            elif source_type == "hg":
-                source_info = {
-                    "type": "Hochschulgesetz NRW",
-                    "paragraph": meta.get("title"),
-                    "url": meta.get("viewer_url")
-                }
-            else:
-                source_info = {"type": "unknown"}
-            results.append({
-                "id": i,
-                "content": doc.page_content[:500] + "...",
-                "source": source_info
-            })
-        return {
-            "success": True,
-            "message": f"{len(results)} Dokumente gefunden",
-            "documents": results
-        }
     except Exception as e:
-        return {
-            "success": False,
-            "message": f"Fehler bei der Suche: {str(e)}",
-            "documents": []
-        }
-def get_legal_advice_tool(question: str) -> Dict[str, Any]:
-    """Tool để nhận tư vấn pháp lý từ RAG"""
     try:
-        # Sử dụng RAG pipeline
-        ans, sources = answer(question, retriever, llm)
-        # Format sources
-        formatted_sources = []
-        for src in sources:
-            formatted_sources.append({
-                "source": src["source"],
-                "page": src.get("page"),
-                "url": src["url"]
-            })
-        return {
-            "success": True,
-            "answer": ans,
-            "sources": formatted_sources,
-            "has_relevant_info": len(sources) > 0
-        }
     except Exception as e:
-        return {
-            "success": False,
-            "answer": f"Fehler: {str(e)}",
-            "sources": [],
-            "has_relevant_info": False
-        }
-# =====================================================
-# VOICE AGENT WITH TOOLS
-# =====================================================
-class VoiceAgent:
-    """Voice Agent sử dụng OpenAI Realtime API với Tools"""
-    def __init__(self, openai_client):
-        self.client = openai_client
-        self.session_id = None
-        self.is_active = False
-    def start_session(self):
-        """Bắt đầu session với tools"""
-        try:
-            # Tạo session với tools definition
-            session = self.client.realtime.sessions.create(
-                model="gpt-4o-realtime-preview-2024-12-17",
-                voice="shimmer",
-                modalities=["text", "audio"],
-                instructions="""Du bist ein juristischer Voice Agent.
-                Du kannst:
-                1. Dokumente durchsuchen (search_documents)
-                2. Rechtliche Beratung geben (get_legal_advice)
-                Sei präzise, freundlich und hilfreich.
-                Verweise immer auf die Quellen.""",
-                tools=[
-                    {
-                        "type": "function",
-                        "name": "search_documents",
-                        "description": "Durchsucht die Prüfungsordnung und das Hochschulgesetz nach relevanten Informationen",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "query": {
-                                    "type": "string",
-                                    "description": "Suchbegriff oder Frage"
-                                }
-                            },
-                            "required": ["query"]
-                        }
-                    },
-                    {
-                        "type": "function",
-                        "name": "get_legal_advice",
-                        "description": "Gibt juristische Beratung basierend auf den Dokumenten",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "question": {
-                                    "type": "string",
-                                    "description": "Juristische Frage"
-                                }
-                            },
-                            "required": ["question"]
-                        }
-                    }
-                ],
-                tool_choice="auto"
-            )
-            self.session_id = session.id
-            self.is_active = True
-            # Start event handling thread
-            threading.Thread(target=self._handle_events, daemon=True).start()
-            return True
-        except Exception as e:
-            print(f"DEBUG: Error starting voice agent: {e}")
-            return False
-    def _handle_events(self):
-        """Xử lý events từ Realtime API"""
-        try:
-            with self.client.realtime.connect(
-                session_id=self.session_id,
-                event_handler=VoiceAgentEventHandler(self)
-            ) as connection:
-                while self.is_active:
-                    time.sleep(0.1)
-        except Exception as e:
-            print(f"DEBUG: Error in event handler: {e}")
-            self.is_active = False
-    def stop_session(self):
-        """Dừng session"""
-        self.is_active = False
-        if self.session_id:
-            try:
-                self.client.realtime.sessions.delete(self.session_id)
-            except:
-                pass
-            self.session_id = None
-    def process_tool_call(self, tool_name: str, arguments: Dict) -> Dict:
-        """Xử lý tool calls"""
-        try:
-            if tool_name == "search_documents":
-                query = arguments.get("query", "")
-                return search_documents_tool(query)
-            elif tool_name == "get_legal_advice":
-                question = arguments.get("question", "")
-                return get_legal_advice_tool(question)
-            else:
-                return {
-                    "success": False,
-                    "message": f"Unbekanntes Tool: {tool_name}"
-                }
-        except Exception as e:
-            return {
-                "success": False,
-                "message": f"Tool Fehler: {str(e)}"
-            }
-class VoiceAgentEventHandler:
-    """Event handler cho Voice Agent"""
-    def __init__(self, agent):
-        self.agent = agent
-        self.current_text = ""
-    def on_text_delta(self, delta, snapshot=None):
-        """Xử lý text delta"""
-        if delta.value:
-            self.current_text += delta.value
-            # Thêm vào state text queue
-            state.text_queue.put({
-                "type": "agent_text",
-                "content": delta.value
-            })
-    def on_function_call_arguments_delta(self, delta, snapshot=None):
-        """Xử lý function call arguments"""
-        print(f"DEBUG: Function call arguments: {delta}")
-    def on_function_call_done(self, function_call, snapshot=None):
-        """Khi function call hoàn thành"""
-        try:
-            tool_name = function_call.name
-            arguments = json.loads(function_call.arguments)
-            print(f"DEBUG: Processing tool call: {tool_name}, args: {arguments}")
-            # Process tool call
-            result = self.agent.process_tool_call(tool_name, arguments)
-            # Gửi kết quả trở lại
-            with openai_client.realtime.connect(session_id=self.agent.session_id) as conn:
-                conn.send({
-                    "type": "response.function_call_arguments",
-                    "function_call_id": function_call.id,
-                    "output": json.dumps(result)
-                })
-        except Exception as e:
-            print(f"DEBUG: Error processing function call: {e}")
-    def on_response_done(self, response=None):
-        """Khi response hoàn thành"""
-        if self.current_text:
-            state.add_message("assistant", self.current_text)
-        self.current_text = ""
-# Khởi tạo Voice Agent
-voice_agent = VoiceAgent(openai_client)
-# =====================================================
-# GRADIO UI COMPONENTS
-# =====================================================
 def format_sources(src):
     """Format sources cho display"""
     if not src:
@@ -512,386 +209,292 @@ def format_sources(src):
     return "\n".join(out)
-def update_chat_display(history, new_text=""):
-    """Cập nhật chat display với streaming text"""
     if not history:
-        history = []
-    if new_text:
-        # Nếu last message là của assistant, append text
-        if history and history[-1]["role"] == "assistant":
-            history[-1]["content"] += new_text
-        else:
-            history.append({"role": "assistant", "content": new_text})
-    return history
-def process_queue_updates():
-    """Process queue updates cho streaming"""
-    updates = []
-    # Process text queue
-    while not state.text_queue.empty():
-        try:
-            item = state.text_queue.get_nowait()
-            updates.append(("text", item.get("content", "")))
-        except queue.Empty:
-            break
-    # Process audio queue (simplified - trong thực tế cần xử lý audio)
-    while not state.audio_queue.empty():
-        try:
-            item = state.audio_queue.get_nowait()
-            # Có thể xử lý audio data ở đây
-            pass
-        except queue.Empty:
-            break
-    return updates
 # =====================================================
-# UI – GRADIO INTERFACE
 # =====================================================
-with gr.Blocks(title="Prüfungsrechts-Chatbot mit OpenAI Realtime API") as demo:
-    # CSS Styling
     gr.HTML("""
     <style>
     .gradio-container {
-        max-width: 1200px;
         margin: 0 auto;
         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
     }
     .header {
         text-align: center;
         margin-bottom: 30px;
-        padding: 25px;
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        border-radius: 15px;
         color: white;
     }
-    .control-panel {
         background: #f8f9fa;
-        padding: 20px;
-        border-radius: 15px;
-        margin-bottom: 20px;
-        border: 1px solid #e2e8f0;
-    }
-    .status-indicator {
-        padding: 10px 15px;
         border-radius: 10px;
-        font-weight: 500;
-        display: inline-flex;
         align-items: center;
-        gap: 8px;
     }
-    .status-active {
-        background: #d1fae5;
-        color: #065f46;
     }
-    .status-inactive {
-        background: #f3f4f6;
-        color: #6b7280;
     }
-    .voice-btn {
-        padding: 12px 24px;
-        border-radius: 25px;
-        font-weight: 600;
-        transition: all 0.3s;
-        border: none;
     }
-    .voice-btn-start {
-        background: linear-gradient(135deg, #10b981 0%, #059669 100%);
-        color: white;
     }
-    .voice-btn-stop {
-        background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
         color: white;
     }
-    .chat-message {
-        padding: 15px;
-        border-radius: 15px;
-        margin: 10px 0;
-        max-width: 85%;
     }
-    .user-message {
-        background: #e0e7ff;
-        margin-left: auto;
     }
-    .assistant-message {
-        background: #f3f4f6;
-        margin-right: auto;
     }
     </style>
     """)
-    # Header
     with gr.Column(elem_classes=["header"]):
-        gr.Markdown("# 🎤 Voice Agent mit OpenAI Realtime API")
-        gr.Markdown("### Live Sprachkonversation mit juristischer Beratung")
-    # Control Panel
-    with gr.Column(elem_classes=["control-panel"]):
-        with gr.Row():
-            # Status Display
-            status_display = gr.HTML(
-                value='<div class="status-indicator status-inactive">🔴 Voice Agent inaktiv</div>',
-                label="Status"
-            )
-            # Voice Controls
-            with gr.Column(scale=1):
-                start_voice_btn = gr.Button(
-                    "🎤 Start Voice Conversation",
-                    variant="primary",
-                    elem_classes=["voice-btn", "voice-btn-start"]
-                )
-                stop_voice_btn = gr.Button(
-                    "⏹️ Stop Voice Conversation",
-                    variant="secondary",
-                    elem_classes=["voice-btn", "voice-btn-stop"],
-                    visible=False
-                )
-        # Mode Selection
         with gr.Row():
             mode_selector = gr.Radio(
-                choices=["Voice Agent (Live Conversation)", "Text Chat (RAG)"],
-                value="Text Chat (RAG)",
-                label="Modus"
             )
     # Main Chat Interface
-    with gr.Column():
-        # Chatbot Display
-        chatbot = gr.Chatbot(
-            label="Konversation",
-            height=500,
-            avatar_images=(None, "🤖")
         )
-        # Input Area (cho Text Mode)
-        with gr.Row(visible=True) as text_input_row:
-            chat_input = gr.Textbox(
-                label="Ihre Frage",
-                placeholder="Stellen Sie eine juristische Frage...",
-                lines=2,
-                max_lines=4,
-                scale=8
-            )
-            send_btn = gr.Button("Senden", variant="primary", scale=1)
-        # Voice Interface (cho Voice Mode)
-        with gr.Row(visible=False) as voice_interface:
-            gr.Markdown("### 🎤 Sprechen Sie jetzt...")
-            voice_status = gr.Textbox(
-                label="Status",
-                value="Bereit für Sprachaufnahme",
-                interactive=False
-            )
-            voice_output = gr.Textbox(label="Transkription", interactive=False)
-            chat_audio = gr.Audio(
-                sources=["microphone"],
-                type="numpy",
-                streaming=True,
-                show_label=False,
-                interactive=True
-            )
-    # Documents Section
-    with gr.Accordion("📚 Dokumente & Quellen", open=False):
         with gr.Tabs():
             with gr.TabItem("📄 Prüfungsordnung"):
-                PDF(pdf_meta["pdf_url"], height=400)
             with gr.TabItem("📘 Hochschulgesetz NRW"):
                 if hg_url:
                     gr.HTML(f'''
-                    <div style="padding: 20px;">
-                        <h3>Hochschulgesetz NRW Viewer</h3>
-                        <a href="{hg_url}" target="_blank" style="display: inline-block; padding: 10px 20px; background: #3b82f6; color: white; text-decoration: none; border-radius: 5px; margin-bottom: 15px;">
-                            Im Viewer öffnen
                         </a>
-                        <iframe src="{hg_url}" width="100%" height="500px" style="border: 1px solid #ddd; border-radius: 8px;"></iframe>
                     </div>
                     ''')
     # =====================================================
     # EVENT HANDLERS
     # =====================================================
-    def toggle_mode(mode):
-        """Chuyển đổi giữa Voice và Text mode"""
-        if "Voice Agent" in mode:
-            return (
-                gr.Row(visible=False),  # text_input_row
-                gr.Row(visible=True),   # voice_interface
-                '<div class="status-indicator status-inactive">🔴 Bitte Voice Agent starten</div>'
-            )
-        else:
-            stop_voice_agent()
-            return (
-                gr.Row(visible=True),   # text_input_row
-                gr.Row(visible=False),  # voice_interface
-                '<div class="status-indicator status-inactive">🔴 Text Mode aktiv</div>'
-            )
-    def start_voice_agent():
-        """Bắt đầu Voice Agent"""
-        state.is_streaming = True
-        return (
-            gr.Button(visible=False),  # start_voice_btn
-            gr.Button(visible=True),   # stop_voice_btn
-            '<div class="status-indicator status-active">🟢 Voice Agent aktiv - Sprechen Sie jetzt</div>',
-            "Voice Agent gestartet. Sie können jetzt sprechen..."
-        )
-    def stop_voice_agent():
-        """Dừng Voice Agent"""
-        state.is_streaming = False
-        state.reset()
-        return (
-            gr.Button(visible=True),   # start_voice_btn
-            gr.Button(visible=False),  # stop_voice_btn
-            '<div class="status-indicator status-inactive">🔴 Voice Agent gestoppt</div>',
-            "Voice Agent gestoppt"
-        )
-    def process_text_chat(message, history):
-        """Xử lý text chat với RAG"""
-        if not message:
-            return history, ""
-        # Thêm user message
-        history.append({"role": "user", "content": message})
-        try:
-            # Get RAG answer
-            ans, sources = answer(message, retriever, llm)
-            full_response = ans + format_sources(sources)
-            # Add assistant message
-            history.append({"role": "assistant", "content": full_response})
-            # Add to state
-            state.add_message("user", message)
-            state.add_message("assistant", ans)
-        except Exception as e:
-            error_msg = f"Fehler: {str(e)[:100]}"
-            history.append({"role": "assistant", "content": error_msg})
-        return history, ""
-    def process_voice_chunk(audio_in, history):
-        import tempfile
-        import soundfile as sf
-        if audio_in is None:
-            return history, "", "Keine Datei erhalten"
-        temp_path = None
-        if isinstance(audio_in, str):
-            temp_path = audio_in
-        else:
-            try:
-                sr, data = audio_in
-                import numpy as np
-                dur = 0.0 if sr is None else (len(data) / float(sr))
-                rms = float(np.sqrt(np.mean((data.astype('float32')) ** 2))) if len(data) else 0.0
-                peak = float(np.max(np.abs(data))) if len(data) else 0.0
-                temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-                sf.write(temp_path, data.astype('float32'), int(sr))
-                status_pref = f"Aufnahme {dur:.2f}s · RMS {rms:.5f} · Peak {peak:.5f}"
-            except Exception:
-                temp_path = None
-                status_pref = "Aufnahme fehlgeschlagen"
-        try:
-            text = transcribe_audio_optimized(temp_path, language=ASR_LANGUAGE_HINT) if temp_path else ""
-        except Exception:
-            text = ""
-        if not text:
-            return history, "", (status_pref + " · Keine Sprache erkannt" if 'status_pref' in locals() else "Keine Sprache erkannt")
-        history.append({"role": "user", "content": text})
-        try:
-            ans, sources = answer(text, retriever, llm)
-            full_response = ans + format_sources(sources)
-            history.append({"role": "assistant", "content": full_response})
-            state.add_message("user", text)
-            state.add_message("assistant", ans)
-            status = (status_pref + " · Gesendet" if 'status_pref' in locals() else "Transkription und Antwort gesendet")
-        except Exception as e:
-            history.append({"role": "assistant", "content": f"Fehler: {str(e)[:100]}"})
-            status = "Fehler bei Verarbeitung"
-        return history, text, status
-    def update_streaming_display(history):
-        """Cập nhật display với streaming text"""
-        updates = process_queue_updates()
-        if not updates:
-            return history
-        for update_type, content in updates:
-            if update_type == "text" and content:
-                history = update_chat_display(history, content)
-        return history
     # Mode toggle
     mode_selector.change(
-        toggle_mode,
-        inputs=[mode_selector],
-        outputs=[text_input_row, voice_interface, status_display]
     )
-    # Voice Agent controls
-    start_voice_btn.click(
-        start_voice_agent,
-        outputs=[start_voice_btn, stop_voice_btn, status_display, voice_status]
     )
-    stop_voice_btn.click(
-        stop_voice_agent,
-        outputs=[start_voice_btn, stop_voice_btn, status_display, voice_status]
-    )
-    # Voice streaming: chunk → transcript → chat
-    chat_audio.stream(
-        process_voice_chunk,
-        inputs=[chat_audio, chatbot],
-        outputs=[chatbot, voice_output, voice_status]
-    )
-    chat_audio.change(
-        process_voice_chunk,
-        inputs=[chat_audio, chatbot],
-        outputs=[chatbot, voice_output, voice_status]
     )
-    # Text chat
-    send_btn.click(
-        process_text_chat,
-        inputs=[chat_input, chatbot],
-        outputs=[chatbot, chat_input]
     )
-    chat_input.submit(
-        process_text_chat,
-        inputs=[chat_input, chatbot],
-        outputs=[chatbot, chat_input]
     )
-    # Streaming updates
-    # Streaming updates disabled (no Timer available in current Gradio)
 if __name__ == "__main__":
     demo.queue().launch(ssr_mode=False, show_error=True)

+# app.py – Prüfungsrechts-Chatbot (simple, ChatGPT-like)
 import os
 import time
+import tempfile
+from typing import Optional, Dict, Any
 import gradio as gr
 from gradio_pdf import PDF
 import numpy as np
+import soundfile as sf
 from openai import OpenAI
 from retriever import get_retriever
 from llm import load_llm
 from rag_pipeline import answer
+from speech_io import transcribe_with_openai, synthesize_speech
 # =====================================================
 # CONFIGURATION
 # =====================================================
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
+    raise RuntimeError("OPENAI_API_KEY is required")
 # Initialize OpenAI client
 openai_client = OpenAI(api_key=OPENAI_API_KEY)
+# Language configuration
+ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
 # =====================================================
 # INITIALIZATION - RAG Components
 hg_url = hg_meta.get("viewer_url")
 # =====================================================
+# STATE MANAGEMENT
 # =====================================================
+class ConversationState:
+    """Simple container for the conversation state (messages, current mode, recording flag)."""
+    def __init__(self):
+        self.messages = []
+        self.current_mode = "text"  # "text" or "audio"
+        self.is_audio_recording = False
+    def add_message(self, role: str, content: str):
+        """Append a timestamped message to the conversation."""
+        self.messages.append({
+            "role": role,
+            "content": content,
+            "timestamp": time.time()
         })
+        # Cap the history at the 20 most recent messages
+        if len(self.messages) > 20:
+            self.messages = self.messages[-20:]
+    def get_chat_history(self):
+        """Convert the stored messages into [user, assistant] pairs for the Gradio Chatbot."""
+        history = []
+        for msg in self.messages:
+            if msg["role"] == "user":
+                history.append([msg["content"], None])
+            elif msg["role"] == "assistant":
+                if history and history[-1][1] is None:
+                    history[-1][1] = msg["content"]
+                else:
+                    history.append([None, msg["content"]])
+        return history
+    def reset(self):
+        """Clear the messages and the recording flag (current_mode is left unchanged)."""
+        self.messages = []
+        self.is_audio_recording = False
+# Initialize the shared conversation state
+state = ConversationState()
 # =====================================================
+# AUDIO PROCESSING FUNCTIONS
 # =====================================================
def process_audio_input(audio_data: Optional[tuple], history) -> tuple:
    """Transcribe a microphone recording and answer it through the RAG pipeline.

    Args:
        audio_data: ``(sample_rate, samples)`` pair from the audio component,
            or ``None`` when nothing was recorded.
        history: Current chat history as a list of ``[user, assistant]`` pairs.

    Returns:
        A ``(history, transcribed_text, status_message)`` tuple. On any
        failure the original ``history`` is returned unchanged together with
        an empty transcript and a short status text.
    """
    if audio_data is None:
        return history, "", "Warten auf Audioaufnahme..."
    try:
        sample_rate, audio_array = audio_data

        # Reserve a temp file name; the WAV data is written after the handle
        # is closed so the writer does not compete with an open descriptor.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            temp_path = tmp.name

        try:
            sf.write(temp_path, audio_array, int(sample_rate))
            print("DEBUG: Audio saved to temp file, transcribing...")
            transcribed_text = transcribe_with_openai(temp_path, language=ASR_LANGUAGE_HINT)
        finally:
            # Always remove the temp file, including when writing or
            # transcription raises; otherwise failed requests leak files.
            os.unlink(temp_path)

        if not transcribed_text or not transcribed_text.strip():
            return history, "", "Keine Sprache erkannt. Bitte versuchen Sie es erneut."
        print(f"DEBUG: Transcribed text: {transcribed_text}")

        # Show the transcript as the user's turn, then fill in the answer.
        new_history = history + [[transcribed_text, None]]
        ans, sources = answer(transcribed_text, retriever, llm)
        full_response = ans + format_sources(sources)
        new_history[-1][1] = full_response

        # The state log keeps the bare answer without the sources appendix.
        state.add_message("user", transcribed_text)
        state.add_message("assistant", ans)
        return new_history, transcribed_text, "Antwort generiert ✓"
    except Exception as e:
        print(f"DEBUG: Error processing audio: {e}")
        return history, "", f"Fehler: {str(e)[:50]}"
def toggle_audio_mode(mode_choice: str, history):
    """Switch the shared state and the input widgets between text and audio.

    Args:
        mode_choice: Label selected in the mode radio control.
        history: Current chat history (accepted but not used here).

    Returns:
        A 4-tuple: visibility update for the audio input, visibility update
        for the text input, visibility update for the send button, and the
        status text describing the active mode.
    """
    audio_selected = mode_choice == "Audio (Sprachmodus)"
    text_selected = mode_choice == "Text (Schreibmodus)"

    # Anything other than the audio label is treated as text mode for state.
    state.current_mode = "audio" if audio_selected else "text"
    state.is_audio_recording = audio_selected
    if audio_selected:
        mode_text = "🎤 Sprachmodus aktiv - Klicken und Sprechen"
    else:
        mode_text = "⌨️ Textmodus aktiv"

    audio_update = gr.Audio(visible=audio_selected)
    text_update = gr.Textbox(visible=text_selected)
    button_update = gr.Button(visible=text_selected)
    return audio_update, text_update, button_update, mode_text
def process_text_input(message: str, history):
    """Answer a typed question through the RAG pipeline.

    Args:
        message: Text entered by the user.
        history: Current chat history as a list of ``[user, assistant]`` pairs.

    Returns:
        ``(history, "")``: the updated history and an empty string used to
        clear the input box. Blank input returns ``history`` unchanged. If
        answering fails, the assistant slot holds an apology with the error.
    """
    if not (message and message.strip()):
        return history, ""

    # The open turn is appended first so an error can still be shown in it.
    pending_turn = [message, None]
    new_history = history + [pending_turn]
    try:
        ans, sources = answer(message, retriever, llm)
        full_response = ans + format_sources(sources)
        pending_turn[1] = full_response
        # The state log keeps the bare answer without the sources appendix.
        state.add_message("user", message)
        state.add_message("assistant", ans)
    except Exception as e:
        pending_turn[1] = f"Entschuldigung, es gab einen Fehler: {str(e)[:100]}"
    return new_history, ""
 def format_sources(src):
     """Format sources cho display"""
     if not src:
     return "\n".join(out)
def clear_conversation():
    """Reset the shared state and return an empty history plus a status text."""
    state.reset()
    empty_history = []
    status_text = "Konversation gelöscht"
    return empty_history, status_text
def speak_last_response(history):
    """Synthesize speech for the most recent assistant answer.

    Args:
        history: Chat history as a list of ``[user, assistant]`` pairs.

    Returns:
        ``((sample_rate, audio), status)`` on success, otherwise
        ``(None, status)`` with a message explaining why nothing is played.
    """
    if not history:
        return None, "Keine Antwort zum Vorlesen"

    # Walk backwards to the newest turn that actually has an answer.
    for entry in reversed(history):
        response_text = entry[1]
        if not response_text:
            continue

        # Drop the appended sources section so only the answer is read aloud.
        if "## 📚 Quellen" in response_text:
            response_text = response_text.split("## 📚 Quellen")[0].strip()

        # Cap the text length passed to the synthesizer.
        audio_result = synthesize_speech(response_text[:500])
        if audio_result:
            sr, audio_data = audio_result
            return (sr, audio_data), "Audio wird abgespielt..."

        # Synthesis failed for the latest answer: stop here rather than
        # reading an older answer (and issuing one TTS call per past turn).
        break
    return None, "Keine passende Antwort gefunden"
 # =====================================================
+# UI – GRADIO INTERFACE (Đơn giản như ChatGPT)
 # =====================================================
+with gr.Blocks(
+    title="🧑‍⚖️ Prüfungsrechts-Chatbot",
+    theme=gr.themes.Soft()
+) as demo:
+    # CSS Styling đơn giản
     gr.HTML("""
     <style>
     .gradio-container {
+        max-width: 900px;
         margin: 0 auto;
         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+        padding: 20px;
     }
     .header {
         text-align: center;
         margin-bottom: 30px;
+        padding: 20px;
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 12px;
         color: white;
     }
+    .mode-selector {
         background: #f8f9fa;
+        padding: 15px;
         border-radius: 10px;
+        margin-bottom: 20px;
+        display: flex;
         align-items: center;
+        gap: 15px;
+        border: 1px solid #e2e8f0;
     }
+    .mode-indicator {
+        padding: 8px 16px;
+        border-radius: 20px;
+        font-weight: 600;
+        background: #e0e7ff;
+        color: #4f46e5;
     }
+    .input-area {
+        background: white;
+        border-radius: 12px;
+        padding: 15px;
+        border: 2px solid #e2e8f0;
+        margin-top: 20px;
     }
+    .input-row {
+        display: flex;
+        gap: 10px;
+        align-items: center;
     }
+    .audio-visualizer {
+        padding: 10px;
+        text-align: center;
+        color: #666;
+        font-style: italic;
     }
+    .tts-btn {
+        margin-top: 10px;
+        padding: 8px 16px;
+        background: #10b981;
         color: white;
+        border: none;
+        border-radius: 8px;
+        cursor: pointer;
     }
+    .tts-btn:hover {
+        background: #059669;
     }
+    .clear-btn {
+        background: #ef4444;
+        color: white;
+        border: none;
+        border-radius: 8px;
+        padding: 8px 16px;
+        cursor: pointer;
+        margin-left: 10px;
     }
+    .clear-btn:hover {
+        background: #dc2626;
     }
     </style>
     """)
+    # Header đơn giản
     with gr.Column(elem_classes=["header"]):
+        gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
+        gr.Markdown("### Stellen Sie Fragen zu Prüfungsordnung und Hochschulgesetz NRW")
+    # Mode Selector
+    with gr.Column(elem_classes=["mode-selector"]):
         with gr.Row():
             mode_selector = gr.Radio(
+                choices=["Text (Schreibmodus)", "Audio (Sprachmodus)"],
+                value="Text (Schreibmodus)",
+                label="",
+                scale=3,
+                elem_id="mode-selector"
+            )
+            mode_indicator = gr.Textbox(
+                value="⌨️ Textmodus aktiv",
+                label="Status",
+                interactive=False,
+                scale=2
             )
+            clear_btn = gr.Button("🗑️ Löschen", elem_classes=["clear-btn"], scale=1)
     # Main Chat Interface
+    chatbot = gr.Chatbot(
+        label="Konversation",
+        height=500,
+        bubble_full_width=True,
+        show_copy_button=True,
+        avatar_images=(
+            "https://em-content.zobj.net/source/microsoft-teams/363/bust-in-silhouette_1f464.png",
+            "https://em-content.zobj.net/source/microsoft-teams/363/robot_1f916.png"
         )
+    )
+    # Input Area (thay đổi theo mode)
+    with gr.Column(elem_classes=["input-area"], visible=True) as input_area:
+        # Text Input (visible khi text mode)
+        with gr.Column(visible=True) as text_input_container:
+            with gr.Row(elem_classes=["input-row"]):
+                text_input = gr.Textbox(
+                    label="",
+                    placeholder="Stellen Sie eine juristische Frage... (Enter zum Senden)",
+                    lines=2,
+                    max_lines=4,
+                    scale=8,
+                    show_label=False,
+                    container=False
+                )
+                text_send_btn = gr.Button(
+                    "Senden",
+                    variant="primary",
+                    scale=1,
+                    min_width=80
+                )
+        # Audio Input (visible khi audio mode)
+        with gr.Column(visible=False) as audio_input_container:
+            gr.Markdown("### 🎤 Klicken und Sprechen")
+            with gr.Row():
+                audio_input = gr.Audio(
+                    sources=["microphone"],
+                    type="numpy",
+                    streaming=False,
+                    show_label=False,
+                    interactive=True,
+                    scale=8
+                )
+                audio_status = gr.Textbox(
+                    label="Status",
+                    value="Warten auf Aufnahme...",
+                    interactive=False,
+                    scale=2
+                )
+            gr.Markdown("*Drücken Sie aufnehmen, sprechen Sie Ihre Frage, dann stoppen*", elem_classes=["audio-visualizer"])
+    # TTS Controls
+    with gr.Row():
+        tts_btn = gr.Button("🔊 Letzte Antwort vorlesen", variant="secondary", size="sm")
+        tts_audio = gr.Audio(label="", interactive=False, visible=False)
+        tts_status = gr.Textbox(label="", interactive=False, visible=False)
+    # Documents Section (Collapsible)
+    with gr.Accordion("📚 Dokumente & Quellen anzeigen", open=False):
         with gr.Tabs():
             with gr.TabItem("📄 Prüfungsordnung"):
+                PDF(pdf_meta["pdf_url"], height=350)
             with gr.TabItem("📘 Hochschulgesetz NRW"):
                 if hg_url:
                     gr.HTML(f'''
+                    <div style="padding: 10px;">
+                        <h4>Hochschulgesetz NRW Viewer</h4>
+                        <a href="{hg_url}" target="_blank" style="display: inline-block; padding: 8px 16px; background: #3b82f6; color: white; text-decoration: none; border-radius: 5px; margin-bottom: 10px;">
+                            Im Viewer öffnen ↗
                         </a>
+                        <iframe src="{hg_url}" width="100%" height="400px" style="border: 1px solid #ddd; border-radius: 6px;"></iframe>
                     </div>
                     ''')
+                else:
+                    gr.Markdown("Viewer-Link nicht verfügbar.")
     # =====================================================
     # EVENT HANDLERS
     # =====================================================
     # Mode toggle
     mode_selector.change(
+        toggle_audio_mode,
+        inputs=[mode_selector, chatbot],
+        outputs=[
+            audio_input_container,
+            text_input_container,
+            text_send_btn,
+            mode_indicator
+        ]
     )
+    # Text input handling
+    text_send_btn.click(
+        process_text_input,
+        inputs=[text_input, chatbot],
+        outputs=[chatbot, text_input]
     )
+    text_input.submit(
+        process_text_input,
+        inputs=[text_input, chatbot],
+        outputs=[chatbot, text_input]
     )
+    # Audio input handling
+    def handle_audio_complete(audio_data, history):
+        """Xử lý khi audio recording hoàn tất"""
+        return process_audio_input(audio_data, history)
+    audio_input.stop_recording(
+        handle_audio_complete,
+        inputs=[audio_input, chatbot],
+        outputs=[chatbot, audio_status, audio_status]
+    ).then(
+        lambda: ("", "Warten auf neue Aufnahme..."),
+        outputs=[audio_input, audio_status]
     )
+    # Clear conversation
+    clear_btn.click(
+        clear_conversation,
+        outputs=[chatbot, mode_indicator]
     )
+    # TTS button
+    tts_btn.click(
+        speak_last_response,
+        inputs=[chatbot],
+        outputs=[tts_audio, tts_status]
+    ).then(
+        lambda: gr.Audio(visible=True),
+        outputs=[tts_audio]
+    ).then(
+        lambda: gr.Textbox(visible=True),
+        outputs=[tts_status]
+    )
 if __name__ == "__main__":
     demo.queue().launch(ssr_mode=False, show_error=True)

realtime_server.py CHANGED Viewed

@@ -1,119 +1,167 @@
 """
-realtime_server.py — v0.2 (2025-12-08)
-OpenAI Realtime WS relay for live voice conversation.
-This server accepts a WebSocket connection from the frontend at `/ws`,
-forwards audio/text frames to OpenAI Realtime WS using the official
-`response.create`, `input_audio_buffer.append`, and `input_audio_buffer.commit`
-messages, and streams assistant responses back to the client in real time.
-Compatibility: standalone, does not break existing Gradio UI. Enable by
-setting USE_REALTIME=true and pointing the frontend to ws://localhost:8000/ws.
 """
 import os
 import asyncio
 import json
 import base64
-import websockets
-import traceback
 from typing import Optional
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
-from fastapi.responses import JSONResponse
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
 app = FastAPI()
-@app.get("/health")
-async def health():
-    return JSONResponse({"status": "ok"})
-async def _connect_openai_ws():
-    url = f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}"
-    headers = {
-        "Authorization": f"Bearer {OPENAI_API_KEY}",
-        "OpenAI-Beta": "realtime=v1",
-    }
-    return await websockets.connect(url, extra_headers=headers, max_size=None)
 @app.websocket("/ws")
-async def ws_stream(ws: WebSocket):
-    """Frontend WS relay.
-    Client → Server messages (JSON):
-      - {type: "audio_chunk", data: base64_wav_string}
-      - {type: "audio_commit"}
-      - {type: "response", instructions: "..."}
-    Server forwards to OpenAI WS:
-      - input_audio_buffer.append
-      - input_audio_buffer.commit
-      - response.create
-    Server → Client messages: pass-through OpenAI event frames.
-    """
-    if not OPENAI_API_KEY:
-        await ws.accept()
-        await ws.send_text(json.dumps({"type": "error", "message": "OPENAI_API_KEY missing"}))
-        await ws.close()
-        return
-    await ws.accept()
-    openai_conn = None
     try:
-        openai_conn = await _connect_openai_ws()
-        async def forward_openai_to_client():
-            try:
-                async for event in openai_conn:
-                    await ws.send_text(event if isinstance(event, str) else json.dumps(event))
-            except Exception:
-                await ws.send_text(json.dumps({"type": "error", "message": "upstream_closed"}))
-        async def forward_client_to_openai():
-            try:
-                while True:
-                    raw = await ws.receive_text()
-                    msg = json.loads(raw)
-                    t = msg.get("type")
-                    if t == "audio_chunk":
-                        data_b64 = msg.get("data")
-                        await openai_conn.send(json.dumps({
-                            "type": "input_audio_buffer.append",
-                            "audio": {"data": data_b64, "format": "wav"}
-                        }))
-                    elif t == "audio_commit":
-                        await openai_conn.send(json.dumps({"type": "input_audio_buffer.commit"}))
-                    elif t == "response":
-                        instr = msg.get("instructions", "")
-                        await openai_conn.send(json.dumps({
-                            "type": "response.create",
-                            "response": {"modalities": ["text", "audio"], "instructions": instr}
-                        }))
-                    else:
-                        await ws.send_text(json.dumps({"type": "error", "message": "unknown_type"}))
-            except WebSocketDisconnect:
-                pass
-            except Exception:
-                await ws.send_text(json.dumps({"type": "error", "message": "client_read_error"}))
-        await asyncio.gather(forward_openai_to_client(), forward_client_to_openai())
-    except Exception:
-        await ws.send_text(json.dumps({"type": "error", "message": "relay_error", "detail": traceback.format_exc()}))
     finally:
-        try:
-            if openai_conn:
-                await openai_conn.close()
-        except Exception:
-            pass
-        try:
-            await ws.close()
-        except Exception:
-            pass

 """
+realtime_server.py - Optional WebSocket server for real-time audio streaming
+Run separately: uvicorn realtime_server:app --host 0.0.0.0 --port 8000
 """
 import os
 import asyncio
 import json
 import base64
 from typing import Optional
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+from fastapi.responses import HTMLResponse
+import websockets
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
 app = FastAPI()
+# Simple HTML test page
+html = """
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Realtime Audio Test</title>
+</head>
+<body>
+    <h1>Realtime Audio Test</h1>
+    <button id="startBtn">Start Recording</button>
+    <button id="stopBtn" disabled>Stop Recording</button>
+    <div id="status">Status: Ready</div>
+    <div id="transcript"></div>
+    <script>
+        let mediaRecorder;
+        let audioChunks = [];
+        document.getElementById('startBtn').onclick = async () => {
+            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+            mediaRecorder = new MediaRecorder(stream);
+            mediaRecorder.ondataavailable = (event) => {
+                audioChunks.push(event.data);
+            };
+            mediaRecorder.onstop = async () => {
+                const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+                audioChunks = [];
+                // Convert to base64
+                const reader = new FileReader();
+                reader.readAsDataURL(audioBlob);
+                reader.onloadend = () => {
+                    const base64data = reader.result.split(',')[1];
+                    // Send to server
+                    fetch('/process-audio', {
+                        method: 'POST',
+                        headers: { 'Content-Type': 'application/json' },
+                        body: JSON.stringify({ audio: base64data })
+                    })
+                    .then(response => response.json())
+                    .then(data => {
+                        document.getElementById('transcript').innerHTML =
+                            `<strong>Transkription:</strong> ${data.transcript}`;
+                    });
+                };
+            };
+            mediaRecorder.start();
+            document.getElementById('startBtn').disabled = true;
+            document.getElementById('stopBtn').disabled = false;
+            document.getElementById('status').textContent = 'Status: Recording...';
+        };
+        document.getElementById('stopBtn').onclick = () => {
+            mediaRecorder.stop();
+            document.getElementById('startBtn').disabled = false;
+            document.getElementById('stopBtn').disabled = true;
+            document.getElementById('status').textContent = 'Status: Processing...';
+        };
+    </script>
+</body>
+</html>
+"""
@app.get("/")
async def get():
    """Serve the built-in HTML test page."""
    page = HTMLResponse(html)
    return page
@app.post("/process-audio")
async def process_audio(request: dict):
    """Transcribe a base64-encoded audio clip posted by the test page.

    Args:
        request: Parsed JSON body; the clip is read from its ``"audio"`` key
            as a base64 string.

    Returns:
        ``{"success": True, "transcript": <text>}`` on success, otherwise
        ``{"success": False, "error": <message>}``.
    """
    temp_path = None
    try:
        import tempfile
        from openai import OpenAI

        audio_data = base64.b64decode(request.get("audio", ""))
        if not audio_data:
            # Nothing to transcribe; avoid a pointless upstream request.
            return {"success": False, "error": "No audio data received"}

        # Persist the clip so it can be passed to the API as a file object.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            f.write(audio_data)
            temp_path = f.name

        def _transcribe() -> str:
            """Blocking transcription call, executed off the event loop."""
            client = OpenAI(api_key=OPENAI_API_KEY)
            with open(temp_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language="de"
                )
            return transcript.text

        # The client call is synchronous; run it in the default executor so
        # this coroutine does not stall other requests while it waits.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, _transcribe)
        return {"success": True, "transcript": text}
    except Exception as e:
        return {"success": False, "error": str(e)}
    finally:
        # Remove the temp file on every path, including failures.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
 @app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    """WebSocket endpoint for real-time audio streaming"""
+    await websocket.accept()
     try:
+        # Connect to OpenAI Realtime API
+        headers = {
+            "Authorization": f"Bearer {OPENAI_API_KEY}",
+            "OpenAI-Beta": "realtime=v1",
+        }
+        async with websockets.connect(
+            f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}",
+            extra_headers=headers
+        ) as openai_ws:
+            # Forward messages in both directions
+            async def forward_to_openai():
+                try:
+                    while True:
+                        data = await websocket.receive_text()
+                        await openai_ws.send(data)
+                except WebSocketDisconnect:
+                    pass
+            async def forward_to_client():
+                try:
+                    async for message in openai_ws:
+                        await websocket.send_text(message)
+                except:
+                    pass
+            await asyncio.gather(
+                forward_to_openai(),
+                forward_to_client()
+            )
+    except Exception as e:
+        print(f"WebSocket error: {e}")
     finally:
+        await websocket.close()
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

speech_io.py CHANGED Viewed

@@ -1,493 +1,145 @@
 """
-speech_io.py - Enhanced Version with working VAD
-Sprachbasierte Ein-/Ausgabe với:
-- Speech-to-Text (STT) với Whisper
-- Text-to-Speech (TTS)
-- Voice Activity Detection (VAD) hoạt động
 """
 import os
-import time
-from typing import Optional, Tuple, Dict, Any
 import numpy as np
 import soundfile as sf
-from scipy.signal import butter, filtfilt, resample
-import re
-import difflib
 # ========================================================
 # CẤU HÌNH
 # ========================================================
-# Model Selection
-WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
-ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
-TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
-# VAD Configuration
-ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
-VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
-VAD_MIN_DURATION = float(os.getenv("VAD_MIN_DURATION", "0.1"))
-# Other Configs
-ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de")
 TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
-ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
-# Cache for models
-_asr = None
-_tts = None
 # ========================================================
-# AUDIO PROCESSING UTILITIES
 # ========================================================
-def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
-    """Highpass filter để loại bỏ noise tần số thấp"""
-    if len(data) == 0:
-        return data
-    nyq = 0.5 * fs
-    normal_cutoff = cutoff / nyq
-    b, a = butter(order, normal_cutoff, btype='high', analog=False)
-    return filtfilt(b, a, data)
-def apply_fade(audio, sr, fade_in_ms=10, fade_out_ms=10):
-    """Áp dụng fade in/out để tránh pop"""
-    if len(audio) == 0:
-        return audio
-    fade_in_samples = int(sr * fade_in_ms / 1000)
-    fade_out_samples = int(sr * fade_out_ms / 1000)
-    # Đảm bảo có đủ samples
-    if len(audio) < fade_in_samples + fade_out_samples:
-        return audio
-    # Fade in
-    if fade_in_samples > 0:
-        fade_in_curve = np.linspace(0, 1, fade_in_samples)
-        audio[:fade_in_samples] *= fade_in_curve
-    # Fade out
-    if fade_out_samples > 0:
-        fade_out_curve = np.linspace(1, 0, fade_out_samples)
-        audio[-fade_out_samples:] *= fade_out_curve
-    return audio
-def normalize_audio(audio_data: np.ndarray) -> np.ndarray:
-    """Chuẩn hóa audio về [-1, 1]"""
-    if len(audio_data) == 0:
-        return audio_data
-    # Chuyển đổi sang float32
-    if audio_data.dtype != np.float32:
-        audio_data = audio_data.astype(np.float32)
-    # Normalize
-    max_val = np.max(np.abs(audio_data))
-    if max_val > 0:
-        audio_data = audio_data / max_val
-    return audio_data
-def preprocess_audio_for_vad(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
-    """Tiền xử lý audio cho VAD"""
-    if len(audio_data) == 0:
-        return audio_data
-    # Chuyển sang mono nếu cần
-    if len(audio_data.shape) > 1:
-        audio_data = np.mean(audio_data, axis=1)
-    # Normalize
-    audio_data = normalize_audio(audio_data)
-    # Highpass filter để loại bỏ noise tần số thấp
-    try:
-        audio_data = butter_highpass_filter(audio_data, cutoff=80, fs=sample_rate)
-    except:
-        pass
-    return audio_data
-# ========================================================
-# VOICE ACTIVITY DETECTION (VAD) - FIXED VERSION
-# ========================================================
-def detect_voice_activity(
-    audio_data: np.ndarray,
-    sample_rate: int,
-    threshold: float = 0.3,
-    min_duration: float = 0.1
-) -> Dict[str, Any]:
-    """
-    Phát hiện hoạt động giọng nói - Phiên bản đơn giản và hoạt động
-    Args:
-        audio_data: Mảng numpy chứa audio samples
-        sample_rate: Tần số lấy mẫu
-        threshold: Ngưỡng phát hiện (0-1)
-        min_duration: Thời gian tối thiểu để xác định là speech (giây)
-    Returns:
-        Dict với thông tin phát hiện
-    """
-    if len(audio_data) == 0:
-        return {
-            "is_speech": False,
-            "confidence": 0.0,
-            "speech_segments": [],
-            "energy": 0.0,
-            "message": "Empty audio data"
-        }
-    try:
-        # Tiền xử lý audio
-        processed_audio = preprocess_audio_for_vad(audio_data, sample_rate)
-        # Tính toán các đặc trưng
-        duration = len(processed_audio) / sample_rate
-        # 1. Tính RMS energy
-        rms_energy = np.sqrt(np.mean(processed_audio ** 2))
-        # 2. Tính zero-crossing rate
-        zero_crossings = np.sum(np.abs(np.diff(np.sign(processed_audio)))) / (2 * len(processed_audio))
-        # 3. Tính spectral centroid (đơn giản)
-        # Sử dụng FFT để tính phân bố tần số
-        if len(processed_audio) >= 256:
-            fft_size = min(2048, len(processed_audio))
-            spectrum = np.abs(np.fft.rfft(processed_audio[:fft_size]))
-            frequencies = np.fft.rfftfreq(fft_size, 1/sample_rate)
-            if np.sum(spectrum) > 0:
-                spectral_centroid = np.sum(frequencies * spectrum) / np.sum(spectrum)
-            else:
-                spectral_centroid = 0
-        else:
-            spectral_centroid = 0
-        # 4. Frame-based analysis
-        frame_length = int(sample_rate * 0.03)  # 30ms frame
-        hop_length = int(frame_length / 2)
-        if len(processed_audio) > frame_length:
-            num_frames = 1 + (len(processed_audio) - frame_length) // hop_length
-            frame_energies = []
-            for i in range(num_frames):
-                start = i * hop_length
-                end = start + frame_length
-                frame = processed_audio[start:end]
-                frame_energy = np.sqrt(np.mean(frame ** 2))
-                frame_energies.append(frame_energy)
-            # Tính speech ratio
-            if frame_energies:
-                energy_threshold = np.percentile(frame_energies, 30) + threshold * (np.max(frame_energies) - np.percentile(frame_energies, 30))
-                speech_frames = sum(1 for e in frame_energies if e > energy_threshold)
-                speech_ratio = speech_frames / len(frame_energies)
-            else:
-                speech_ratio = 0
-        else:
-            speech_ratio = 0
-        # 5. Kết hợp các đặc trưng để tính confidence
-        # Speech thường có:
-        # - RMS energy cao
-        # - Zero-crossing rate trung bình (không quá cao như noise, không quá thấp như silence)
-        # - Spectral centroid trong khoảng 100-3000 Hz cho giọng nói
-        # - Speech ratio cao
-        # Tính confidence score
-        energy_score = min(1.0, rms_energy * 10)  # Scale energy
-        # Zero-crossing rate score: lý tưởng khoảng 0.1-0.3 cho speech
-        if 0.05 < zero_crossings < 0.4:
-            zcr_score = 1.0 - 2 * abs(zero_crossings - 0.2)  # Peak ở 0.2
-        else:
-            zcr_score = 0.0
-        # Spectral centroid score: lý tưởng 100-3000 Hz
-        if 100 < spectral_centroid < 3000:
-            centroid_score = 1.0
-        elif 50 < spectral_centroid < 5000:
-            centroid_score = 0.5
-        else:
-            centroid_score = 0.0
-        # Speech ratio score
-        speech_ratio_score = speech_ratio
-        # Kết hợp các score
-        weights = [0.4, 0.2, 0.2, 0.2]  # energy, zcr, centroid, speech_ratio
-        confidence = (
-            weights[0] * energy_score +
-            weights[1] * zcr_score +
-            weights[2] * centroid_score +
-            weights[3] * speech_ratio_score
-        )
-        # Áp dụng ngưỡng
-        is_speech = confidence > threshold
-        # Kiểm tra duration tối thiểu
-        if duration < min_duration:
-            is_speech = False
-            confidence = max(0, confidence - 0.2)
-        # Debug info
-        debug_info = {
-            "duration": duration,
-            "rms_energy": rms_energy,
-            "zero_crossings": zero_crossings,
-            "spectral_centroid": spectral_centroid,
-            "speech_ratio": speech_ratio,
-            "energy_score": energy_score,
-            "zcr_score": zcr_score,
-            "centroid_score": centroid_score,
-            "speech_ratio_score": speech_ratio_score,
-            "final_confidence": confidence,
-            "is_speech": is_speech
-        }
-        print(f"VAD Debug: {debug_info}")
-        return {
-            "is_speech": is_speech,
-            "confidence": float(confidence),
-            "speech_segments": [[0, duration]] if is_speech else [],
-            "energy": float(rms_energy),
-            "message": f"Speech: {is_speech}, Confidence: {confidence:.3f}"
-        }
-    except Exception as e:
-        print(f"VAD processing error: {e}")
-        return {
-            "is_speech": False,
-            "confidence": 0.0,
-            "speech_segments": [],
-            "energy": 0.0,
-            "message": f"Error: {str(e)}"
-        }
-# ========================================================
-# SPEECH-TO-TEXT FUNCTIONS
-# ========================================================
-def get_asr_pipeline():
-    """Lấy ASR pipeline"""
-    global _asr
-    if _asr is None:
-        print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
-        from transformers import pipeline
-        _asr = pipeline(
-            task="automatic-speech-recognition",
-            model=ASR_MODEL_ID,
-            device="cpu",
-            return_timestamps=False,
-            chunk_length_s=8,
-            stride_length_s=(1, 1),
-        )
-    return _asr
 def transcribe_with_openai(audio_path: str, language: Optional[str] = None) -> str:
-    """Transcribe an audio file with the OpenAI ``whisper-1`` model.
-
-    Args:
-        audio_path: Path of the audio file to upload.
-        language: Language passed to the API; ``None``, an empty string or
-            ``"auto"`` sends no language.
-
-    Returns:
-        The stripped transcript. If ``OPENAI_API_KEY`` is not set or the
-        request raises, the result of ``transcribe_audio`` is returned.
-    """
     if not OPENAI_API_KEY:
-        return transcribe_audio(audio_path, language)
     try:
         from openai import OpenAI
         client = OpenAI(api_key=OPENAI_API_KEY)
         with open(audio_path, "rb") as f:
             resp = client.audio.transcriptions.create(
                 model="whisper-1",
                 file=f,
                 language=language if language and language != "auto" else None,
             )
-        # Accept either an object with a ``text`` attribute or a dict.
-        txt = getattr(resp, "text", "") or (resp.get("text") if isinstance(resp, dict) else "")
-        return (txt or "").strip()
-    except Exception as e:
-        print(f">>> OpenAI Fehler: {e}")
-        return transcribe_audio(audio_path, language)
-def transcribe_audio(
-    audio_path: str,
-    language: Optional[str] = None,
-    max_duration_s: int = ASR_MAX_DURATION_S
-) -> str:
-    """
-    Transcribe audio with the local Whisper pipeline.
-
-    Args:
-        audio_path: Path of the audio file to read.
-        language: Language for decoding; when empty, ``ASR_DEFAULT_LANGUAGE``
-            is used, and "auto" means no explicit language is passed.
-        max_duration_s: Audio beyond this many seconds is cut off.
-
-    Returns:
-        The transcript, or "" if the file is missing or empty or any step
-        raises.
-    """
-    if not audio_path or not os.path.exists(audio_path):
-        print(">>> Kein Audio gefunden.")
-        return ""
-    try:
-        # Read the audio file
-        data, sr = sf.read(audio_path, always_2d=False)
-        if data is None or data.size == 0:
-            print(">>> Audio leer.")
-            return ""
-        # Downmix to mono
-        if len(data.shape) > 1:
-            data = np.mean(data, axis=1)
-        # Preprocessing: cast to float32 and peak-normalize
-        data = data.astype(np.float32)
-        max_val = np.max(np.abs(data))
-        if max_val > 0:
-            data = data / max_val
-        # Resample to 16 kHz if needed
-        TARGET_SR = 16000
-        if sr != TARGET_SR:
-            target_len = int(len(data) * TARGET_SR / sr)
-            data = resample(data, target_len)
-            sr = TARGET_SR
-        # Limit the length
-        MAX_SAMPLES = sr * max_duration_s
-        if len(data) > MAX_SAMPLES:
-            data = data[:MAX_SAMPLES]
-        # Get the pipeline
-        asr = get_asr_pipeline()
-        # Configure the language
-        lang = language
-        if not lang and ASR_DEFAULT_LANGUAGE and ASR_DEFAULT_LANGUAGE.lower() != "auto":
-            lang = ASR_DEFAULT_LANGUAGE
-        if isinstance(lang, str) and lang.lower() == "auto":
-            lang = None
-        # Transcribe
-        print(f">>> Transkribiere mit Whisper-{WHISPER_MODEL}...")
-        call_kwargs = {}
-        if lang:
-            call_kwargs["generate_kwargs"] = {
-                "language": lang,
-                "task": "transcribe",
-                "max_new_tokens": 120,
-                "temperature": 0.0,
-            }
-        result = asr({"array": data, "sampling_rate": sr}, **call_kwargs)
-        text = result.get("text", "") if isinstance(result, dict) else str(result)
         text = text.strip()
-        # Fix mis-transcribed domain terms
-        text = fix_domain_terms(text)
-        print(f">>> Transkription: {text}")
         return text
     except Exception as e:
-        print(f">>> Transkriptionsfehler: {e}")
         return ""
 # ========================================================
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
 def get_tts_pipeline():
-    """Return the shared TTS pipeline, building it on first use."""
     global _tts
     if _tts is None:
-        print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
-        # Deferred import: transformers is only imported when the pipeline
-        # is actually built.
-        from transformers import pipeline
-        _tts = pipeline(
-            task="text-to-speech",
-            model=TTS_MODEL_ID,
-        )
     return _tts
 def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
     """
-    Convert text to speech with the local TTS pipeline.
-
-    Returns ``(sample_rate, int16_samples)``, or ``None`` when the text is
-    blank, ``TTS_ENABLED`` is false, or synthesis raises.
     """
-    if not text or not text.strip() or not TTS_ENABLED:
         return None
     try:
-        tts = get_tts_pipeline()
-        out = tts(text)
-        audio = np.array(out["audio"], dtype=np.float32)
-        sr = out.get("sampling_rate", 16000)
-        # Ensure valid sample rate
-        if sr is None or sr <= 0:
-            sr = 16000
-        # Ensure mono
-        if audio.ndim > 1:
-            audio = audio.squeeze()
-        if audio.ndim > 1:
-            audio = audio[:, 0]
-        # Apply processing (best effort: the unfiltered audio is kept if the
-        # filter raises).
-        # NOTE(review): the bare ``except:`` below also swallows
-        # KeyboardInterrupt/SystemExit; ``except Exception:`` would be safer.
-        try:
-            audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
-        except:
-            pass
-        # Normalize
-        max_val = np.max(np.abs(audio))
-        if max_val > 0:
-            audio = audio / max_val
-        # Apply fade
-        audio = apply_fade(audio, sr)
         # Convert to int16
-        audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
-        return (sr, audio_int16)
     except Exception as e:
         print(f">>> TTS Fehler: {e}")
         return None
 # ========================================================
-# DOMAIN-SPECIFIC TEXT PROCESSING
 # ========================================================
-def fix_domain_terms(text: str) -> str:
-    """
-    Fix commonly mis-transcribed domain terms.
-
-    Each regex in ``correction_pairs`` is applied case-insensitively, in
-    order, and the corrected text is returned. Falsy input is returned
-    unchanged.
-    """
-    if not text:
-        return text
-    # Common mis-transcriptions
-    correction_pairs = [
-        (r"\bbriefe\s*um\b", "prüfung"),
-        (r"\bbrieft\s*um\b", "prüfung"),
-        (r"\bbriefung\b", "prüfung"),
-        (r"\bpruefung\b", "prüfung"),
-        (r"\bhochschule\s*gesetz\b", "hochschulgesetz"),
-    ]
-    for pattern, replacement in correction_pairs:
-        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
-    return text
 # ========================================================
 # MAIN EXPORT
 # ========================================================
 __all__ = [
-    # Names made available by a star-import of this module.
-    'transcribe_audio',
     'transcribe_with_openai',
     'synthesize_speech',
-    'detect_voice_activity',
-    'normalize_audio',
-    'preprocess_audio_for_vad'
-]

 """
+speech_io.py - Simplified version for ChatGPT-like interface
 """
 import os
+from typing import Optional, Tuple
 import numpy as np
 import soundfile as sf
+from scipy.signal import butter, filtfilt
 # ========================================================
 # CẤU HÌNH
 # ========================================================
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+# TTS is on unless the TTS_ENABLED env var is "0", "false" or "no" (any case).
 TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
 # ========================================================
+# SPEECH-TO-TEXT WITH OPENAI
 # ========================================================
 def transcribe_with_openai(audio_path: str, language: Optional[str] = None) -> str:
+    """Transcribe an audio file with the OpenAI ``whisper-1`` model.
+
+    Args:
+        audio_path: Path of the audio file to upload.
+        language: Language passed to the API; ``None``, an empty string or
+            ``"auto"`` sends no language.
+
+    Returns:
+        The stripped transcript, or ``""`` if opening the file, the request,
+        or the response handling raises.
+
+    Raises:
+        RuntimeError: If ``OPENAI_API_KEY`` is not set.
+    """
     if not OPENAI_API_KEY:
+        raise RuntimeError("OPENAI_API_KEY is required for transcription")
     try:
         from openai import OpenAI
         client = OpenAI(api_key=OPENAI_API_KEY)
         with open(audio_path, "rb") as f:
             resp = client.audio.transcriptions.create(
                 model="whisper-1",
                 file=f,
                 language=language if language and language != "auto" else None,
+                response_format="text"
             )
+        # Extract the text from the response
+        # NOTE(review): with response_format="text" the SDK presumably returns
+        # a plain string, so the final ``str(resp)`` branch is the one taken -
+        # confirm against the installed openai version.
+        if hasattr(resp, 'text'):
+            text = resp.text
+        elif isinstance(resp, dict):
+            text = resp.get('text', '')
+        else:
+            text = str(resp)
         text = text.strip()
+        print(f">>> Transkription: {text[:100]}...")
         return text
     except Exception as e:
+        print(f">>> OpenAI Transkriptionsfehler: {e}")
+        # Fallback: return an empty string
         return ""
 # ========================================================
 # TEXT-TO-SPEECH (TTS)
 # ========================================================
+# Cache for the OpenAI client created by get_tts_pipeline().
+_tts = None
 def get_tts_pipeline():
+    """Return the cached OpenAI client used for TTS, creating it on first use."""
     global _tts
     if _tts is None:
+        print(">>> Initialisiere OpenAI TTS Client")
+        from openai import OpenAI
+        _tts = OpenAI(api_key=OPENAI_API_KEY)
     return _tts
 def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
     """
+    Convert text to speech using OpenAI TTS.
+
+    Returns ``(sample_rate, samples)`` decoded from the WAV response, or
+    ``None`` when the text is blank, ``TTS_ENABLED`` is false,
+    ``OPENAI_API_KEY`` is empty, or the request or decoding raises.
     """
+    if not text or not text.strip() or not TTS_ENABLED or not OPENAI_API_KEY:
         return None
     try:
+        client = get_tts_pipeline()
+        # Call the OpenAI TTS API
+        response = client.audio.speech.create(
+            model="tts-1",
+            voice="nova",  # Options: alloy, echo, fable, onyx, nova, shimmer
+            input=text[:4000],  # Limit the length
+            response_format="wav"
+        )
+        # Keep the audio in a buffer
+        # NOTE(review): this ``import io`` is redundant with the aliased
+        # import below; only ``io_module`` is used.
+        import io
+        audio_bytes = response.content
+        # Read WAV from bytes
+        import io as io_module
+        with io_module.BytesIO(audio_bytes) as f:
+            data, sr = sf.read(f)
+        # Convert to mono if needed
+        if len(data.shape) > 1:
+            data = np.mean(data, axis=1)
         # Convert to int16
+        # Only float32/float64 data is rescaled; other dtypes pass through
+        # unchanged.
+        if data.dtype == np.float32 or data.dtype == np.float64:
+            data = np.clip(data * 32767, -32768, 32767).astype(np.int16)
+        return (sr, data)
     except Exception as e:
         print(f">>> TTS Fehler: {e}")
         return None
 # ========================================================
+# AUDIO PROCESSING UTILITIES
 # ========================================================
+def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
+    """High-pass filter that removes low-frequency noise.
+
+    Args:
+        data: Samples to filter; an empty input is returned unchanged.
+        cutoff: Cutoff frequency, in the same unit as ``fs``.
+        fs: Sampling rate of ``data``.
+        order: Order passed to ``butter``.
+
+    Returns:
+        The filtered samples returned by ``filtfilt``.
+    """
+    if len(data) == 0:
+        return data
+    # The cutoff is handed to butter() as a fraction of the Nyquist frequency.
+    nyq = 0.5 * fs
+    normal_cutoff = cutoff / nyq
+    b, a = butter(order, normal_cutoff, btype='high', analog=False)
+    # NOTE(review): filtfilt presumably raises ValueError when ``data`` is
+    # shorter than its default padding length - confirm, and guard here if
+    # very short clips can reach this function.
+    return filtfilt(b, a, data)
+def normalize_audio(audio_data: np.ndarray) -> np.ndarray:
+    """Normalize audio to [-1, 1].
+
+    The samples are converted to float32 and divided by their peak absolute
+    value. Empty input is returned as-is; all-zero input is converted but
+    not rescaled.
+    """
+    if len(audio_data) == 0:
+        return audio_data
+    # Convert to float32
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+    # Normalize
+    max_val = np.max(np.abs(audio_data))
+    if max_val > 0:
+        audio_data = audio_data / max_val
+    return audio_data
 # ========================================================
 # MAIN EXPORT
 # ========================================================
 __all__ = [
+    # Public names exported by a star-import of this module.
     'transcribe_with_openai',
     'synthesize_speech',
+    'normalize_audio'
+]