Nguyen5 commited on
Commit
f2e421a
·
1 Parent(s): fd831e8
Files changed (2) hide show
  1. app.py +647 -518
  2. realtime_server.py +0 -97
app.py CHANGED
@@ -1,12 +1,18 @@
1
- # app.py – Prüfungsrechts-Chatbot (RAG + Sprache, UI kiểu ChatGPT) với các tính năng nâng cao
2
- #
3
  import os
4
  import time
 
 
 
5
  from dataclasses import dataclass, field
6
- from typing import Optional, Dict, Any
7
  import gradio as gr
8
  from gradio_pdf import PDF
9
  import numpy as np
 
 
 
 
10
 
11
  from load_documents import load_all_documents
12
  from split_documents import split_documents
@@ -14,30 +20,29 @@ from vectorstore import build_vectorstore
14
  from retriever import get_retriever
15
  from llm import load_llm
16
  from rag_pipeline import answer
17
- from speech_io import transcribe_audio, synthesize_speech, transcribe_with_openai, detect_voice_activity
18
 
19
- # Cấu hình môi trường
20
- ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
21
- USE_OPENAI = os.getenv("USE_OPENAI", "false").lower() == "true"
22
- USE_REALTIME = os.getenv("USE_REALTIME", "false").lower() == "true"
23
- REALTIME_SERVER_URL = os.getenv("REALTIME_SERVER_URL", "ws://localhost:8000/ws")
24
- ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
25
- VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
 
 
26
 
27
  # =====================================================
28
- # STATE MANAGEMENT - Quản lý trạng thái hội thoại liền mạch
29
  # =====================================================
30
  @dataclass
31
  class ConversationState:
32
  """Quản lý trạng thái hội thoại"""
33
  messages: list = field(default_factory=list)
34
- last_audio_time: float = field(default_factory=time.time)
35
- is_listening: bool = False
36
- vad_confidence: float = 0.0
 
37
  conversation_context: str = ""
38
- whisper_model: str = field(default_factory=lambda: os.getenv("WHISPER_MODEL", "base"))
39
- language: str = field(default_factory=lambda: ASR_LANGUAGE_HINT)
40
- current_audio_path: Optional[str] = None
41
 
42
  def add_message(self, role: str, content: str):
43
  """Thêm message vào hội thoại"""
@@ -62,32 +67,26 @@ class ConversationState:
62
  context_parts = []
63
  for msg in self.messages[-5:]: # Giữ 5 message gần nhất
64
  prefix = "User" if msg["role"] == "user" else "Assistant"
65
- context_parts.append(f"{prefix}: {msg['content'][:200]}") # Giới hạn độ dài
66
  self.conversation_context = "\n".join(context_parts)
67
 
68
- def get_recent_context(self, num_messages: int = 3) -> str:
69
- """Lấy context gần đây"""
70
- if not self.messages or num_messages <= 0:
71
- return ""
72
-
73
- recent = self.messages[-num_messages:] if len(self.messages) >= num_messages else self.messages
74
- return "\n".join([f"{m['role']}: {m['content']}" for m in recent])
75
-
76
  def reset(self):
77
  """Reset trạng thái hội thoại"""
78
  self.messages = []
79
  self.conversation_context = ""
80
- self.is_listening = False
81
- self.vad_confidence = 0.0
82
- self.current_audio_path = None
 
 
 
83
 
84
  # Khởi tạo state
85
  state = ConversationState()
86
 
87
  # =====================================================
88
- # INITIALISIERUNG (global)
89
  # =====================================================
90
-
91
  print("📚 Lade Dokumente…")
92
  docs = load_all_documents()
93
 
@@ -109,70 +108,400 @@ hg_meta = next(d.metadata for d in docs if d.metadata.get("type") == "hg")
109
  hg_url = hg_meta.get("viewer_url")
110
 
111
  # =====================================================
112
- # VOICE ACTIVITY DETECTION
113
  # =====================================================
114
- def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) -> Dict[str, Any]:
115
- """Xử phát hiện hoạt động giọng nói"""
116
- if audio_data is None or len(audio_data) == 0:
117
- return {"is_speech": False, "confidence": 0.0, "status": "No audio data"}
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  try:
120
- vad_result = detect_voice_activity(audio_data, sample_rate, threshold=VAD_THRESHOLD)
 
 
 
 
 
 
 
 
 
121
 
122
- # Cập nhật state
123
- state.is_listening = vad_result["is_speech"]
124
- if vad_result["is_speech"]:
125
- state.last_audio_time = time.time()
126
- state.vad_confidence = vad_result["confidence"]
127
 
128
- return {
129
- "is_speech": vad_result["is_speech"],
130
- "confidence": vad_result["confidence"],
131
- "status": f"Speech detected: {vad_result['is_speech']} (conf: {vad_result['confidence']:.2f})"
132
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  except Exception as e:
134
- print(f"VAD error: {e}")
135
- return {"is_speech": False, "confidence": 0.0, "status": f"VAD error: {e}"}
136
 
137
  # =====================================================
138
- # TRANSCRIBE WITH OPTIMIZED PIPELINE
139
  # =====================================================
140
- def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
141
- """Transcribe audio với pipeline tối ưu"""
142
- if not audio_path or not os.path.exists(audio_path):
143
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
- if USE_OPENAI:
146
- return transcribe_with_openai(audio_path, language=language)
147
- return transcribe_audio(audio_path, language=language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  # =====================================================
150
- # CONVERSATIONAL INTELLIGENCE
151
  # =====================================================
152
- def enhance_conversation_context(user_input: str, history: list) -> str:
153
- """Tăng cường context hội thoại"""
154
- if not user_input:
155
- return user_input
156
-
157
- # Thêm context đơn giản từ history
158
- if history and len(history) > 0:
159
- # Lấy 3 tin nhắn gần nhất từ history
160
- recent_history = history[-3:] if len(history) >= 3 else history
161
- context_parts = ["Previous conversation:"]
162
- for msg in recent_history:
163
- role = "User" if msg.get("role") == "user" else "Assistant"
164
- content = msg.get("content", "")[:100] # Giới hạn độ dài
165
- context_parts.append(f"{role}: {content}")
166
 
167
- context = "\n".join(context_parts)
168
- return f"{context}\n\nCurrent question: {user_input}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
- return user_input
 
 
 
 
 
 
 
171
 
172
  # =====================================================
173
- # Quellen formatieren – Markdown für Chat
174
  # =====================================================
175
  def format_sources(src):
 
176
  if not src:
177
  return ""
178
 
@@ -186,232 +515,60 @@ def format_sources(src):
186
 
187
  return "\n".join(out)
188
 
189
- # =====================================================
190
- # CORE CHAT-FUNKTION với tất cả tính năng mới
191
- # =====================================================
192
- def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
193
- """
194
- Main chat function với xử lý VAD và transcription
195
- """
196
- print(f"DEBUG: chat_fn called - text_input: '{text_input}', audio_path: {audio_path}, history length: {len(history) if history else 0}")
197
-
198
- # Khởi tạo history nếu None
199
- if history is None:
200
  history = []
201
 
202
- # Biến để lưu text cần xử lý
203
- text_to_process = ""
204
-
205
- # Xử lý audio nếu có
206
- if audio_path and os.path.exists(audio_path):
207
- print(f"DEBUG: Processing audio file: {audio_path}")
208
-
209
- # Lưu đường dẫn audio vào state
210
- state.current_audio_path = audio_path
211
-
212
- # Kiểm tra VAD nếu được bật
213
- if use_vad and ENABLE_VAD:
214
- try:
215
- import soundfile as sf
216
- audio_data, sample_rate = sf.read(audio_path)
217
- print(f"DEBUG: Audio loaded - shape: {audio_data.shape}, sample_rate: {sample_rate}")
218
-
219
- vad_result = handle_voice_activity(audio_data, sample_rate)
220
- print(f"DEBUG: VAD result: {vad_result}")
221
-
222
- # Nếu VAD phát hiện có giọng nói, hoặc nếu VAD không bật, tiến hành transcribe
223
- if vad_result.get("is_speech", True):
224
- # Transcribe audio
225
- transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
226
- if transcribed_text and transcribed_text.strip():
227
- text_to_process = transcribed_text.strip()
228
- print(f"DEBUG: Transcribed text: {text_to_process}")
229
- else:
230
- print("DEBUG: VAD detected no speech, skipping transcription")
231
- except Exception as e:
232
- print(f"DEBUG: Error in VAD/transcription: {e}")
233
- # Fallback: transcribe ngay cả khi có lỗi
234
- transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
235
- if transcribed_text and transcribed_text.strip():
236
- text_to_process = transcribed_text.strip()
237
  else:
238
- # Nếu VAD không bật, transcribe trực tiếp
239
- transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
240
- if transcribed_text and transcribed_text.strip():
241
- text_to_process = transcribed_text.strip()
242
- print(f"DEBUG: Transcribed text (no VAD): {text_to_process}")
243
-
244
- # Nếu có text input từ textbox, ưu tiên sử dụng nó
245
- if text_input and text_input.strip():
246
- text_to_process = text_input.strip()
247
- print(f"DEBUG: Using text input: {text_to_process}")
248
-
249
- # Nếu không có gì để xử lý
250
- if not text_to_process:
251
- print("DEBUG: No text to process")
252
- # Trả về history hiện tại và status
253
- status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: {state.whisper_model}"
254
- if history is None:
255
- history = []
256
- return history, "", None, status_text
257
-
258
- print(f"DEBUG: Processing text: {text_to_process}")
259
-
260
- # Tăng cường context cho câu hỏi
261
- enhanced_question = enhance_conversation_context(text_to_process, history)
262
 
263
- try:
264
- # RAG-Antwort berechnen
265
- ans, sources = answer(enhanced_question, retriever, llm)
266
- bot_msg = ans + format_sources(sources)
267
-
268
- # Thêm vào state
269
- state.add_message("user", text_to_process)
270
- state.add_message("assistant", ans)
271
-
272
- # History aktualisieren (ChatGPT-Style)
273
- history.append({"role": "user", "content": text_to_process})
274
- history.append({"role": "assistant", "content": bot_msg})
275
-
276
- print(f"DEBUG: Answer generated, history length: {len(history)}")
277
-
278
- except Exception as e:
279
- print(f"DEBUG: Error in RAG pipeline: {e}")
280
- # Fallback response
281
- error_msg = "Entschuldigung, es gab einen Fehler bei der Verarbeitung Ihrer Anfrage. Bitte versuchen Sie es erneut."
282
- history.append({"role": "user", "content": text_to_process})
283
- history.append({"role": "assistant", "content": error_msg})
284
-
285
- status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: {state.whisper_model}"
286
- return history, "", None, status_text
287
-
288
- # =====================================================
289
- # FUNCTIONS FOR UI CONTROLS
290
- # =====================================================
291
- def toggle_vad(use_vad):
292
- """Toggle Voice Activity Detection"""
293
- global ENABLE_VAD
294
- ENABLE_VAD = use_vad
295
- status = "EIN" if use_vad else "AUS"
296
- return f"Voice Activity Detection: {status} | Model: {state.whisper_model}"
297
-
298
- def change_whisper_model(model_size):
299
- """Đổi Whisper model"""
300
- state.whisper_model = model_size
301
- os.environ["WHISPER_MODEL"] = model_size
302
- return f"Whisper Model: {model_size} | VAD: {'On' if ENABLE_VAD else 'Off'}"
303
-
304
- def clear_conversation():
305
- """Xóa hội thoại"""
306
- state.reset()
307
- return [], "Konversation gelöscht | Bereit"
308
-
309
- def update_vad_indicator():
310
- """Cập nhật VAD indicator"""
311
- if state.is_listening:
312
- indicator_html = """
313
- <div style="display: flex; align-items: center; gap: 8px;">
314
- <div style="width: 12px; height: 12px; border-radius: 50%; background-color: #10b981; box-shadow: 0 0 10px #10b981; animation: pulse 1.5s infinite;"></div>
315
- <span style="color: #10b981; font-weight: bold;">Sprache erkannt</span>
316
- </div>
317
- <style>
318
- @keyframes pulse {
319
- 0% { opacity: 0.7; }
320
- 50% { opacity: 1; }
321
- 100% { opacity: 0.7; }
322
- }
323
- </style>
324
- """
325
- else:
326
- indicator_html = """
327
- <div style="display: flex; align-items: center; gap: 8px;">
328
- <div style="width: 12px; height: 12px; border-radius: 50%; background-color: #6b7280;"></div>
329
- <span>Bereit</span>
330
- </div>
331
- """
332
-
333
- return indicator_html
334
 
335
- # =====================================================
336
- # AUDIO STREAMING HANDLER
337
- # =====================================================
338
- def handle_audio_stream(audio_path, use_vad):
339
- """Xử lý audio streaming real-time"""
340
- if not audio_path or not os.path.exists(audio_path):
341
- return "", update_vad_indicator(), "Keine Audiodatei"
342
 
343
- try:
344
- import soundfile as sf
345
- audio_data, sample_rate = sf.read(audio_path)
346
-
347
- # Cập nhật VAD indicator
348
- vad_html = update_vad_indicator()
349
-
350
- if use_vad and ENABLE_VAD:
351
- vad_result = handle_voice_activity(audio_data, sample_rate)
352
-
353
- if vad_result.get("is_speech", False):
354
- # Nếu phát hiện giọng nói, transcribe
355
- text = transcribe_audio_optimized(audio_path, language=state.language)
356
- status = f"Sprache erkannt ({vad_result.get('confidence', 0):.2f})"
357
- return text, vad_html, status
358
- else:
359
- status = "Keine Sprache erkannt"
360
- return "", vad_html, status
361
- else:
362
- # Nếu VAD không bật, vẫn transcribe nhưng hiển thị trạng thái khác
363
- text = transcribe_audio_optimized(audio_path, language=state.language)
364
- status = "Transkription (VAD aus)"
365
- return text, vad_html, status
366
-
367
- except Exception as e:
368
- print(f"Error in audio stream handler: {e}")
369
- return "", update_vad_indicator(), f"Fehler: {str(e)[:50]}"
370
-
371
- # =====================================================
372
- # TTS FUNCTION
373
- # =====================================================
374
- def read_last_answer(history):
375
- """Đọc câu trả lời cuối cùng"""
376
- if not history:
377
- print("DEBUG: No history for TTS")
378
- return None
379
-
380
- # Tìm câu trả lời cuối cùng của assistant
381
- for msg in reversed(history):
382
- if isinstance(msg, dict) and msg.get("role") == "assistant":
383
- content = msg.get("content", "")
384
- # Loại bỏ phần sources từ câu trả lời
385
- if "## 📚 Quellen" in content:
386
- content = content.split("## 📚 Quellen")[0].strip()
387
-
388
- print(f"DEBUG: Synthesizing speech for: {content[:100]}...")
389
- audio_result = synthesize_speech(content)
390
- if audio_result:
391
- print("DEBUG: TTS successful")
392
- return audio_result
393
-
394
- print("DEBUG: No assistant message found for TTS")
395
- return None
396
 
397
  # =====================================================
398
- # UI – GRADIO với tất cả tính năng mới
399
  # =====================================================
400
- with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as demo:
401
- # CSS Styling nâng cao
402
  gr.HTML("""
403
  <style>
404
  .gradio-container {
405
  max-width: 1200px;
406
  margin: 0 auto;
407
- padding: 20px;
408
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
409
  }
410
 
411
  .header {
412
  text-align: center;
413
  margin-bottom: 30px;
414
- padding: 20px;
415
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
416
  border-radius: 15px;
417
  color: white;
@@ -425,306 +582,278 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
425
  border: 1px solid #e2e8f0;
426
  }
427
 
428
- .chat-container {
429
- background: white;
430
- border-radius: 15px;
431
- padding: 20px;
432
- box-shadow: 0 4px 20px rgba(0,0,0,0.1);
433
- margin-bottom: 20px;
 
434
  }
435
 
436
- .input-row {
437
- background: #f8fafc;
438
- border-radius: 25px;
439
- padding: 10px 20px;
440
- border: 2px solid #e2e8f0;
441
- transition: all 0.3s ease;
442
- display: flex;
443
- align-items: center;
444
- gap: 10px;
445
  }
446
 
447
- .input-row:focus-within {
448
- border-color: #667eea;
449
- box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
450
  }
451
 
452
- .send-btn {
453
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
454
- color: white !important;
455
- border: none !important;
456
- border-radius: 50% !important;
457
- width: 44px !important;
458
- height: 44px !important;
459
- display: flex !important;
460
- align-items: center !important;
461
- justify-content: center !important;
462
- cursor: pointer !important;
463
  }
464
 
465
- .send-btn:hover {
466
- transform: scale(1.05);
467
- box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
468
  }
469
 
470
- .vad-indicator-container {
471
- padding: 10px;
472
- background: #f1f5f9;
473
- border-radius: 10px;
474
- margin: 10px 0;
475
- display: flex;
476
- align-items: center;
477
- gap: 10px;
478
  }
479
 
480
- .feature-badge {
481
- display: inline-block;
482
- padding: 4px 12px;
483
- background: #e0e7ff;
484
- color: #4f46e5;
485
- border-radius: 20px;
486
- font-size: 12px;
487
- font-weight: 500;
488
- margin: 2px;
489
  }
490
 
491
- .chatbot {
492
- min-height: 400px;
493
- max-height: 500px;
494
- overflow-y: auto;
495
  }
496
 
497
- /* Responsive design */
498
- @media (max-width: 768px) {
499
- .gradio-container {
500
- padding: 10px;
501
- }
502
-
503
- .input-row {
504
- flex-direction: column;
505
- gap: 10px;
506
- }
507
-
508
- .send-btn {
509
- width: 100% !important;
510
- height: 44px !important;
511
- border-radius: 10px !important;
512
- }
513
  }
514
  </style>
515
  """)
516
 
517
  # Header
518
  with gr.Column(elem_classes=["header"]):
519
- gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
520
- gr.Markdown("### Intelligent Voice Interface with Advanced Features")
521
-
522
- # Feature badges
523
- gr.HTML("""
524
- <div style="text-align: center; margin: 10px 0;">
525
- <span class="feature-badge">🎤 Voice Activity Detection</span>
526
- <span class="feature-badge">⚡ Fast Transcription</span>
527
- <span class="feature-badge">🧠 Conversational AI</span>
528
- <span class="feature-badge">📚 Document RAG</span>
529
- </div>
530
- """)
531
 
532
  # Control Panel
533
  with gr.Column(elem_classes=["control-panel"]):
534
  with gr.Row():
535
- with gr.Column(scale=2):
536
- # Model Selection
537
- model_selector = gr.Dropdown(
538
- choices=["tiny", "base", "small", "medium"],
539
- value=state.whisper_model,
540
- label="Whisper Model",
541
- info="Wählen Sie das Modell für Spracherkennung"
542
- )
543
-
544
- # VAD Control
545
- vad_toggle = gr.Checkbox(
546
- value=ENABLE_VAD,
547
- label="Voice Activity Detection aktivieren",
548
- info="Automatische Spracherkennung"
549
- )
550
-
551
- # Language Selection
552
- lang_selector = gr.Dropdown(
553
- choices=["de", "en", "auto"],
554
- value=ASR_LANGUAGE_HINT,
555
- label="Spracherkennung Sprache"
556
- )
557
 
 
558
  with gr.Column(scale=1):
559
- # Status Display
560
- status_display = gr.Textbox(
561
- label="System Status",
562
- value="Bereit",
563
- interactive=False
564
  )
565
-
566
- # Clear Conversation Button
567
- clear_btn = gr.Button("🗑️ Konversation löschen", variant="secondary", size="sm")
568
-
569
- # VAD Indicator
570
- vad_indicator = gr.HTML(value=update_vad_indicator(), label="VAD Status")
 
 
 
 
 
 
 
 
571
 
572
  # Main Chat Interface
573
- with gr.Column(elem_classes=["chat-container"]):
574
  # Chatbot Display
575
  chatbot = gr.Chatbot(
576
  label="Konversation",
577
- height=400,
578
- avatar_images=(None, "🤖")
 
 
579
  )
580
 
581
- # Input Row với VAD Indicator
582
- with gr.Row(elem_classes=["input-row"]):
583
- # Text Input
584
- chat_text = gr.Textbox(
585
- label=None,
586
- placeholder="Stellen Sie eine Frage oder sprechen Sie ins Mikrofon...",
587
- lines=1,
588
  max_lines=4,
589
- scale=8,
590
- container=False,
591
- show_label=False
592
  )
593
-
594
- # Audio Input
595
- chat_audio = gr.Audio(
596
- sources=["microphone"],
597
- type="filepath",
598
- format="wav",
599
- streaming=True,
600
- interactive=True,
601
- show_label=False,
602
- scale=1,
603
- elem_id="audio-input"
604
- )
605
-
606
- # Send Button
607
- send_btn = gr.Button("➤", variant="primary", elem_classes=["send-btn"], scale=1)
608
 
609
- # TTS Controls
610
- with gr.Row():
611
- tts_btn = gr.Button("🔊 Antwort vorlesen", variant="secondary", size="sm")
612
- tts_audio = gr.Audio(label="Audio Ausgabe", interactive=False, visible=False)
613
- tts_status = gr.Textbox(label="TTS Status", interactive=False, visible=False)
 
 
 
 
614
 
615
  # Documents Section
616
- with gr.Accordion("📚 Quellen & Dokumente", open=False):
617
  with gr.Tabs():
618
- with gr.TabItem("📄 Prüfungsordnung (PDF)"):
619
- PDF(pdf_meta["pdf_url"], height=300)
620
 
621
  with gr.TabItem("📘 Hochschulgesetz NRW"):
622
- if isinstance(hg_url, str) and hg_url.startswith("http"):
623
- gr.Markdown(f"### [Im Viewer öffnen]({hg_url})")
624
- gr.HTML(f'<iframe src="{hg_url}" width="100%" height="500px" style="border: 1px solid #ddd; border-radius: 8px;"></iframe>')
625
- else:
626
- gr.Markdown("Viewer-Link nicht verfügbar.")
 
 
 
 
 
 
 
 
627
 
628
  # =====================================================
629
  # EVENT HANDLERS
630
  # =====================================================
631
 
632
- # Model Selection
633
- model_selector.change(
634
- change_whisper_model,
635
- inputs=[model_selector],
636
- outputs=[status_display]
637
- )
 
 
 
 
 
 
 
 
 
638
 
639
- # VAD Toggle
640
- vad_toggle.change(
641
- toggle_vad,
642
- inputs=[vad_toggle],
643
- outputs=[status_display]
644
- )
 
 
 
 
 
 
 
 
 
 
 
 
645
 
646
- # Clear Conversation
647
- clear_btn.click(
648
- clear_conversation,
649
- outputs=[chatbot, status_display]
650
- ).then(
651
- lambda: update_vad_indicator(),
652
- outputs=[vad_indicator]
653
- )
 
 
 
654
 
655
- # Main Chat Function
656
- def process_chat(text_input, audio_path, history, lang_sel, use_vad):
657
- """Wrapper function để xử lý chat"""
 
 
 
 
 
658
  try:
659
- return chat_fn(text_input, audio_path, history, lang_sel, use_vad)
 
 
 
 
 
 
 
 
 
 
660
  except Exception as e:
661
- print(f"Error in process_chat: {e}")
662
- error_msg = f"Fehler: {str(e)}"
663
- if history is None:
664
- history = []
665
- return history, "", None, error_msg
666
 
667
- # Send Button Click
668
- send_btn.click(
669
- process_chat,
670
- inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
671
- outputs=[chatbot, chat_text, chat_audio, status_display]
672
- ).then(
673
- lambda: update_vad_indicator(),
674
- outputs=[vad_indicator]
 
 
 
 
 
 
 
 
 
 
675
  )
676
 
677
- # Text Submit (Enter key)
678
- chat_text.submit(
679
- process_chat,
680
- inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
681
- outputs=[chatbot, chat_text, chat_audio, status_display]
682
- ).then(
683
- lambda: update_vad_indicator(),
684
- outputs=[vad_indicator]
685
  )
686
 
687
- # Audio Change Handler
688
- def on_audio_change(audio_path, use_vad):
689
- """Xử khi audio thay đổi"""
690
- if audio_path:
691
- print(f"DEBUG: Audio changed: {audio_path}")
692
- # Xử lý streaming
693
- text, vad_html, status = handle_audio_stream(audio_path, use_vad)
694
- return text, vad_html, status
695
- return "", update_vad_indicator(), "Bereit"
696
-
697
- chat_audio.change(
698
- on_audio_change,
699
- inputs=[chat_audio, vad_toggle],
700
- outputs=[chat_text, vad_indicator, status_display]
701
  )
702
 
703
- # Audio Streaming
704
- chat_audio.stream(
705
- on_audio_change,
706
- inputs=[chat_audio, vad_toggle],
707
- outputs=[chat_text, vad_indicator, status_display]
708
  )
709
 
710
- # TTS Button
711
- def handle_tts(history):
712
- """Xử lý TTS"""
713
- audio_result = read_last_answer(history)
714
- if audio_result:
715
- return audio_result, "Audio wird abgespielt..."
716
- return None, "Keine Antwort zum Vorlesen gefunden"
717
 
718
- tts_btn.click(
719
- handle_tts,
 
720
  inputs=[chatbot],
721
- outputs=[tts_audio, tts_status]
722
- ).then(
723
- lambda: gr.Audio(visible=True),
724
- outputs=[tts_audio]
725
- ).then(
726
- lambda: gr.Textbox(visible=True),
727
- outputs=[tts_status]
728
  )
729
 
730
  if __name__ == "__main__":
 
1
+ # app.py – Prüfungsrechts-Chatbot với OpenAI Realtime API Voice Agents
 
2
  import os
3
  import time
4
+ import json
5
+ import asyncio
6
+ import threading
7
  from dataclasses import dataclass, field
8
+ from typing import Optional, Dict, Any, List
9
  import gradio as gr
10
  from gradio_pdf import PDF
11
  import numpy as np
12
+ import queue
13
+
14
+ from openai import OpenAI
15
+ from openai._streaming import AssistantEventHandler
16
 
17
  from load_documents import load_all_documents
18
  from split_documents import split_documents
 
20
  from retriever import get_retriever
21
  from llm import load_llm
22
  from rag_pipeline import answer
 
23
 
24
+ # =====================================================
25
+ # CONFIGURATION
26
+ # =====================================================
27
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
28
+ if not OPENAI_API_KEY:
29
+ raise RuntimeError("OPENAI_API_KEY is required for Realtime API")
30
+
31
+ # Initialize OpenAI client
32
+ openai_client = OpenAI(api_key=OPENAI_API_KEY)
33
 
34
  # =====================================================
35
+ # STATE MANAGEMENT
36
  # =====================================================
37
  @dataclass
38
  class ConversationState:
39
  """Quản lý trạng thái hội thoại"""
40
  messages: list = field(default_factory=list)
41
+ is_streaming: bool = False
42
+ realtime_session_id: Optional[str] = None
43
+ audio_queue: queue.Queue = field(default_factory=queue.Queue)
44
+ text_queue: queue.Queue = field(default_factory=queue.Queue)
45
  conversation_context: str = ""
 
 
 
46
 
47
  def add_message(self, role: str, content: str):
48
  """Thêm message vào hội thoại"""
 
67
  context_parts = []
68
  for msg in self.messages[-5:]: # Giữ 5 message gần nhất
69
  prefix = "User" if msg["role"] == "user" else "Assistant"
70
+ context_parts.append(f"{prefix}: {msg['content'][:200]}")
71
  self.conversation_context = "\n".join(context_parts)
72
 
 
 
 
 
 
 
 
 
73
  def reset(self):
74
  """Reset trạng thái hội thoại"""
75
  self.messages = []
76
  self.conversation_context = ""
77
+ self.is_streaming = False
78
+ self.realtime_session_id = None
79
+ while not self.audio_queue.empty():
80
+ self.audio_queue.get()
81
+ while not self.text_queue.empty():
82
+ self.text_queue.get()
83
 
84
  # Khởi tạo state
85
  state = ConversationState()
86
 
87
  # =====================================================
88
+ # INITIALIZATION - RAG Components
89
  # =====================================================
 
90
  print("📚 Lade Dokumente…")
91
  docs = load_all_documents()
92
 
 
108
  hg_url = hg_meta.get("viewer_url")
109
 
110
  # =====================================================
111
+ # OPENAI REALTIME API HANDLERS
112
  # =====================================================
113
+ class RealtimeEventHandler(AssistantEventHandler):
114
+ """Handler cho OpenAI Realtime API events"""
 
 
115
 
116
+ def __init__(self, state_ref):
117
+ super().__init__()
118
+ self.state = state_ref
119
+ self.current_text = ""
120
+
121
+ def on_text_delta(self, delta, snapshot):
122
+ """Xử lý text delta từ Realtime API"""
123
+ if delta.value:
124
+ self.current_text += delta.value
125
+ # Thêm vào text queue để hiển thị
126
+ self.state.text_queue.put({
127
+ "type": "text_delta",
128
+ "content": delta.value
129
+ })
130
+
131
+ def on_audio_transcript_delta(self, delta, snapshot):
132
+ """Xử lý audio transcript từ Realtime API"""
133
+ if delta.text:
134
+ # Thêm vào text queue
135
+ self.state.text_queue.put({
136
+ "type": "transcript",
137
+ "content": delta.text
138
+ })
139
+
140
+ def on_audio_delta(self, delta, snapshot):
141
+ """Xử lý audio data từ Realtime API"""
142
+ if delta.data:
143
+ # Thêm vào audio queue để phát
144
+ self.state.audio_queue.put({
145
+ "type": "audio",
146
+ "data": delta.data
147
+ })
148
+
149
+ def on_response_created(self, response):
150
+ """Khi response được tạo"""
151
+ print("DEBUG: Response created")
152
+
153
+ def on_response_done(self, response):
154
+ """Khi response hoàn thành"""
155
+ print(f"DEBUG: Response done, final text: {self.current_text[:100]}...")
156
+ if self.current_text:
157
+ # Thêm message vào history
158
+ self.state.add_message("assistant", self.current_text)
159
+
160
+ # Signal end of response
161
+ self.state.text_queue.put({
162
+ "type": "response_end",
163
+ "content": self.current_text
164
+ })
165
+
166
+ self.current_text = ""
167
+ self.state.is_streaming = False
168
+
169
+ def on_error(self, error):
170
+ """Xử lý lỗi"""
171
+ print(f"DEBUG: Realtime API error: {error}")
172
+ self.state.is_streaming = False
173
+ self.state.text_queue.put({
174
+ "type": "error",
175
+ "content": f"Error: {str(error)}"
176
+ })
177
+
178
+ def start_realtime_conversation():
179
+ """Bắt đầu cuộc hội thoại Realtime API"""
180
  try:
181
+ # Tạo Realtime session
182
+ session = openai_client.realtime.sessions.create(
183
+ model="gpt-4o-realtime-preview",
184
+ voice="shimmer", # Có thể chọn: alloy, echo, fable, onyx, nova, shimmer
185
+ modalities=["text", "audio"],
186
+ instructions="""Du bist ein juristischer Assistent für Prüfungsrecht.
187
+ Du hilfst Studenten mit Fragen zu Prüfungsordnung und Hochschulgesetz NRW.
188
+ Antworte präzise, freundlich und professionell.
189
+ Bei unsicheren Fragen, verweise auf die offiziellen Dokumente."""
190
+ )
191
 
192
+ state.realtime_session_id = session.id
193
+ state.is_streaming = True
 
 
 
194
 
195
+ print(f"DEBUG: Realtime session started: {session.id}")
196
+
197
+ # Bắt đầu streaming
198
+ with openai_client.realtime.connect(
199
+ session_id=session.id,
200
+ event_handler=RealtimeEventHandler(state)
201
+ ) as connection:
202
+ # Keep connection alive
203
+ while state.is_streaming:
204
+ time.sleep(0.1)
205
+
206
+ except Exception as e:
207
+ print(f"DEBUG: Error in realtime conversation: {e}")
208
+ state.is_streaming = False
209
+
210
+ def stop_realtime_conversation():
211
+ """Dừng cuộc hội thoại Realtime"""
212
+ state.is_streaming = False
213
+ if state.realtime_session_id:
214
+ try:
215
+ openai_client.realtime.sessions.delete(state.realtime_session_id)
216
+ except:
217
+ pass
218
+ state.realtime_session_id = None
219
+
220
+ def send_text_to_realtime(text: str):
221
+ """Gửi text đến Realtime API"""
222
+ if not state.is_streaming or not state.realtime_session_id:
223
+ return False
224
+
225
+ try:
226
+ # Tạo Realtime client mới để gửi message
227
+ with openai_client.realtime.connect(session_id=state.realtime_session_id) as connection:
228
+ connection.send({
229
+ "type": "response.create",
230
+ "response": {
231
+ "modalities": ["text", "audio"],
232
+ "instructions": f"Antworte auf: {text}"
233
+ }
234
+ })
235
+ return True
236
  except Exception as e:
237
+ print(f"DEBUG: Error sending to realtime: {e}")
238
+ return False
239
 
240
  # =====================================================
241
+ # TOOLS (FUNCTION CALLS) FOR VOICE AGENT
242
  # =====================================================
243
def search_documents_tool(query: str) -> Dict[str, Any]:
    """Tool backend: search the Prüfungsordnung / Hochschulgesetz corpus.

    Args:
        query: Free-text search term or question.

    Returns:
        Dict with `success`, a human-readable German `message` (it may be
        spoken back to the user), and up to three `documents`, each with a
        content excerpt and structured source metadata.
    """
    try:
        docs = retriever.invoke(query)

        if not docs:
            return {
                "success": False,
                "message": "Keine relevanten Dokumente gefunden.",
                "documents": [],
            }

        results = []
        for i, doc in enumerate(docs[:3], 1):
            meta = doc.metadata
            source_type = meta.get("type", "unknown")

            if source_type == "pdf":
                source_info = {
                    "type": "Prüfungsordnung",
                    "page": meta.get("page"),
                    "url": meta.get("pdf_url"),
                }
            elif source_type == "hg":
                source_info = {
                    "type": "Hochschulgesetz NRW",
                    "paragraph": meta.get("title"),
                    "url": meta.get("viewer_url"),
                }
            else:
                source_info = {"type": "unknown"}

            # Fix: only mark the excerpt as truncated when it actually is;
            # the original appended "..." unconditionally.
            content = doc.page_content
            excerpt = content[:500] + "..." if len(content) > 500 else content

            results.append({
                "id": i,
                "content": excerpt,
                "source": source_info,
            })

        return {
            "success": True,
            "message": f"{len(results)} Dokumente gefunden",
            "documents": results,
        }

    except Exception as e:
        return {
            "success": False,
            "message": f"Fehler bei der Suche: {str(e)}",
            "documents": [],
        }
295
+
296
def get_legal_advice_tool(question: str) -> Dict[str, Any]:
    """Tool backend: answer a legal question via the RAG pipeline.

    Returns:
        Dict with `success`, the generated `answer`, the cited `sources`
        (source/page/url per entry), and `has_relevant_info` indicating
        whether any source document was retrieved.
    """
    try:
        ans, sources = answer(question, retriever, llm)

        # Reduce each retrieved source record to the fields the agent needs.
        formatted_sources = [
            {
                "source": src["source"],
                "page": src.get("page"),
                "url": src["url"],
            }
            for src in sources
        ]

        return {
            "success": True,
            "answer": ans,
            "sources": formatted_sources,
            "has_relevant_info": bool(sources),
        }

    except Exception as e:
        return {
            "success": False,
            "answer": f"Fehler: {str(e)}",
            "sources": [],
            "has_relevant_info": False,
        }
325
 
326
  # =====================================================
327
+ # VOICE AGENT WITH TOOLS
328
  # =====================================================
329
class VoiceAgent:
    """Voice agent backed by the OpenAI Realtime API with function-calling tools.

    Owns at most one Realtime session at a time; incoming events are consumed
    on a background daemon thread so the Gradio UI thread stays responsive.
    """

    def __init__(self, openai_client):
        self.client = openai_client
        self.session_id = None   # id of the active Realtime session, or None
        self.is_active = False   # cleared to make the event thread exit

    def start_session(self):
        """Create a Realtime session (with tool definitions) and start the
        background event loop.

        Returns:
            bool: True when the session was created, False on any error.
        """
        try:
            session = self.client.realtime.sessions.create(
                model="gpt-4o-realtime-preview-2024-12-17",
                voice="shimmer",
                modalities=["text", "audio"],
                instructions="""Du bist ein juristischer Voice Agent.
                Du kannst:
                1. Dokumente durchsuchen (search_documents)
                2. Rechtliche Beratung geben (get_legal_advice)

                Sei präzise, freundlich und hilfreich.
                Verweise immer auf die Quellen.""",
                tools=[
                    {
                        "type": "function",
                        "name": "search_documents",
                        "description": "Durchsucht die Prüfungsordnung und das Hochschulgesetz nach relevanten Informationen",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "query": {
                                    "type": "string",
                                    "description": "Suchbegriff oder Frage"
                                }
                            },
                            "required": ["query"]
                        }
                    },
                    {
                        "type": "function",
                        "name": "get_legal_advice",
                        "description": "Gibt juristische Beratung basierend auf den Dokumenten",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "question": {
                                    "type": "string",
                                    "description": "Juristische Frage"
                                }
                            },
                            "required": ["question"]
                        }
                    }
                ],
                tool_choice="auto"
            )

            self.session_id = session.id
            self.is_active = True

            # Daemon thread: dies with the process and never blocks shutdown.
            threading.Thread(target=self._handle_events, daemon=True).start()

            return True

        except Exception as e:
            print(f"DEBUG: Error starting voice agent: {e}")
            return False

    def _handle_events(self):
        """Background loop: hold the Realtime connection open and let the
        event handler process incoming events until `is_active` is cleared."""
        try:
            with self.client.realtime.connect(
                session_id=self.session_id,
                event_handler=VoiceAgentEventHandler(self)
            ) as connection:
                while self.is_active:
                    time.sleep(0.1)

        except Exception as e:
            print(f"DEBUG: Error in event handler: {e}")
            self.is_active = False

    def stop_session(self):
        """Tear down the session; always leaves the agent in the idle state."""
        self.is_active = False
        if self.session_id:
            try:
                self.client.realtime.sessions.delete(self.session_id)
            except Exception:
                # Fix for bare `except:`: best-effort cleanup, but do not
                # swallow SystemExit/KeyboardInterrupt.
                pass
        self.session_id = None

    def process_tool_call(self, tool_name: str, arguments: Dict) -> Dict:
        """Dispatch a model-issued function call to the local tool backends.

        Unknown tool names and tool crashes yield a `{"success": False, ...}`
        payload instead of raising, so the event loop keeps running.
        """
        try:
            if tool_name == "search_documents":
                return search_documents_tool(arguments.get("query", ""))

            if tool_name == "get_legal_advice":
                return get_legal_advice_tool(arguments.get("question", ""))

            return {
                "success": False,
                "message": f"Unbekanntes Tool: {tool_name}"
            }

        except Exception as e:
            return {
                "success": False,
                "message": f"Tool Fehler: {str(e)}"
            }
+ }
446
+
447
class VoiceAgentEventHandler(AssistantEventHandler):
    """Event handler for the VoiceAgent: accumulates streamed text deltas,
    executes tool calls locally, and records finished responses.

    NOTE(review): this subclasses `AssistantEventHandler` (Assistants API);
    confirm these callback names match the events the Realtime stream
    actually emits.
    """

    def __init__(self, agent):
        super().__init__()
        self.agent = agent        # owning VoiceAgent (session id + tool dispatch)
        self.current_text = ""    # text accumulated for the in-flight response

    def on_text_delta(self, delta, snapshot):
        """Append a streamed text fragment and forward it to the UI queue."""
        if delta.value:
            self.current_text += delta.value
            # Push the fragment so the Gradio polling loop can render it live.
            state.text_queue.put({
                "type": "agent_text",
                "content": delta.value
            })

    def on_function_call_arguments_delta(self, delta, snapshot):
        """Partial tool-call arguments; only logged, handled on completion."""
        print(f"DEBUG: Function call arguments: {delta}")

    def on_function_call_done(self, function_call, snapshot):
        """Run the completed tool call and send its result back to the session."""
        try:
            tool_name = function_call.name
            # Arguments arrive as a JSON string from the model.
            arguments = json.loads(function_call.arguments)

            print(f"DEBUG: Processing tool call: {tool_name}, args: {arguments}")

            # Execute the tool locally via the agent's dispatcher.
            result = self.agent.process_tool_call(tool_name, arguments)

            # Return the tool output over a fresh connection to the same
            # session. NOTE(review): the event type
            # "response.function_call_arguments" looks unusual for submitting
            # tool output — verify against the Realtime API reference.
            with openai_client.realtime.connect(session_id=self.agent.session_id) as conn:
                conn.send({
                    "type": "response.function_call_arguments",
                    "function_call_id": function_call.id,
                    "output": json.dumps(result)
                })

        except Exception as e:
            print(f"DEBUG: Error processing function call: {e}")

    def on_response_done(self, response):
        """Persist the accumulated assistant text once the response finishes."""
        if self.current_text:
            state.add_message("assistant", self.current_text)
            self.current_text = ""
495
+ self.current_text = ""
496
+
497
# Module-level singleton: one VoiceAgent shared by all Gradio callbacks.
voice_agent = VoiceAgent(openai_client)
499
 
500
  # =====================================================
501
+ # GRADIO UI COMPONENTS
502
  # =====================================================
503
  def format_sources(src):
504
+ """Format sources cho display"""
505
  if not src:
506
  return ""
507
 
 
515
 
516
  return "\n".join(out)
517
 
518
def update_chat_display(history, new_text=""):
    """Merge a streamed text fragment into the conversation history.

    A non-empty `new_text` is appended to the trailing assistant message when
    one exists; otherwise a fresh assistant entry is created. The (possibly
    newly created) history list is returned.
    """
    history = history or []

    if not new_text:
        return history

    ends_with_assistant = bool(history) and history[-1]["role"] == "assistant"
    if ends_with_assistant:
        # Continue the assistant's current message in place.
        history[-1]["content"] += new_text
    else:
        history.append({"role": "assistant", "content": new_text})

    return history
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
 
532
def process_queue_updates():
    """Drain pending streaming items from the shared queues.

    Returns a list of ("text", content) tuples for the UI to render. Audio
    items are drained but discarded (audio handling is a placeholder).
    """
    updates = []

    # Pull all queued text fragments without blocking.
    while True:
        try:
            item = state.text_queue.get_nowait()
        except queue.Empty:
            break
        updates.append(("text", item.get("content", "")))

    # Drain audio items too; actual audio processing is not implemented yet.
    while True:
        try:
            state.audio_queue.get_nowait()
        except queue.Empty:
            break

    return updates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
  # =====================================================
556
+ # UI – GRADIO INTERFACE
557
  # =====================================================
558
+ with gr.Blocks(title="Prüfungsrechts-Chatbot mit OpenAI Realtime API", theme=gr.themes.Soft()) as demo:
559
+ # CSS Styling
560
  gr.HTML("""
561
  <style>
562
  .gradio-container {
563
  max-width: 1200px;
564
  margin: 0 auto;
 
565
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
566
  }
567
 
568
  .header {
569
  text-align: center;
570
  margin-bottom: 30px;
571
+ padding: 25px;
572
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
573
  border-radius: 15px;
574
  color: white;
 
582
  border: 1px solid #e2e8f0;
583
  }
584
 
585
+ .status-indicator {
586
+ padding: 10px 15px;
587
+ border-radius: 10px;
588
+ font-weight: 500;
589
+ display: inline-flex;
590
+ align-items: center;
591
+ gap: 8px;
592
  }
593
 
594
+ .status-active {
595
+ background: #d1fae5;
596
+ color: #065f46;
 
 
 
 
 
 
597
  }
598
 
599
+ .status-inactive {
600
+ background: #f3f4f6;
601
+ color: #6b7280;
602
  }
603
 
604
+ .voice-btn {
605
+ padding: 12px 24px;
606
+ border-radius: 25px;
607
+ font-weight: 600;
608
+ transition: all 0.3s;
609
+ border: none;
 
 
 
 
 
610
  }
611
 
612
+ .voice-btn-start {
613
+ background: linear-gradient(135deg, #10b981 0%, #059669 100%);
614
+ color: white;
615
  }
616
 
617
+ .voice-btn-stop {
618
+ background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
619
+ color: white;
 
 
 
 
 
620
  }
621
 
622
+ .chat-message {
623
+ padding: 15px;
624
+ border-radius: 15px;
625
+ margin: 10px 0;
626
+ max-width: 85%;
 
 
 
 
627
  }
628
 
629
+ .user-message {
630
+ background: #e0e7ff;
631
+ margin-left: auto;
 
632
  }
633
 
634
+ .assistant-message {
635
+ background: #f3f4f6;
636
+ margin-right: auto;
 
 
 
 
 
 
 
 
 
 
 
 
 
637
  }
638
  </style>
639
  """)
640
 
641
  # Header
642
  with gr.Column(elem_classes=["header"]):
643
+ gr.Markdown("# 🎤 Voice Agent mit OpenAI Realtime API")
644
+ gr.Markdown("### Live Sprachkonversation mit juristischer Beratung")
 
 
 
 
 
 
 
 
 
 
645
 
646
  # Control Panel
647
  with gr.Column(elem_classes=["control-panel"]):
648
  with gr.Row():
649
+ # Status Display
650
+ status_display = gr.HTML(
651
+ value='<div class="status-indicator status-inactive">🔴 Voice Agent inaktiv</div>',
652
+ label="Status"
653
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
+ # Voice Controls
656
  with gr.Column(scale=1):
657
+ start_voice_btn = gr.Button(
658
+ "🎤 Start Voice Conversation",
659
+ variant="primary",
660
+ elem_classes=["voice-btn", "voice-btn-start"]
 
661
  )
662
+ stop_voice_btn = gr.Button(
663
+ "⏹️ Stop Voice Conversation",
664
+ variant="secondary",
665
+ elem_classes=["voice-btn", "voice-btn-stop"],
666
+ visible=False
667
+ )
668
+
669
+ # Mode Selection
670
+ with gr.Row():
671
+ mode_selector = gr.Radio(
672
+ choices=["Voice Agent (Live Conversation)", "Text Chat (RAG)"],
673
+ value="Text Chat (RAG)",
674
+ label="Modus"
675
+ )
676
 
677
  # Main Chat Interface
678
+ with gr.Column():
679
  # Chatbot Display
680
  chatbot = gr.Chatbot(
681
  label="Konversation",
682
+ height=500,
683
+ avatar_images=(None, "🤖"),
684
+ bubble_full_width=False,
685
+ show_copy_button=True
686
  )
687
 
688
+ # Input Area (cho Text Mode)
689
+ with gr.Row(visible=True) as text_input_row:
690
+ chat_input = gr.Textbox(
691
+ label="Ihre Frage",
692
+ placeholder="Stellen Sie eine juristische Frage...",
693
+ lines=2,
 
694
  max_lines=4,
695
+ scale=8
 
 
696
  )
697
+ send_btn = gr.Button("Senden", variant="primary", scale=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
 
699
+ # Voice Interface (cho Voice Mode)
700
+ with gr.Row(visible=False) as voice_interface:
701
+ gr.Markdown("### 🎤 Sprechen Sie jetzt...")
702
+ voice_status = gr.Textbox(
703
+ label="Status",
704
+ value="Bereit für Sprachaufnahme",
705
+ interactive=False
706
+ )
707
+ voice_output = gr.Textbox(label="Transkription", interactive=False)
708
 
709
  # Documents Section
710
+ with gr.Accordion("📚 Dokumente & Quellen", open=False):
711
  with gr.Tabs():
712
+ with gr.TabItem("📄 Prüfungsordnung"):
713
+ PDF(pdf_meta["pdf_url"], height=400)
714
 
715
  with gr.TabItem("📘 Hochschulgesetz NRW"):
716
+ if hg_url:
717
+ gr.HTML(f'''
718
+ <div style="padding: 20px;">
719
+ <h3>Hochschulgesetz NRW Viewer</h3>
720
+ <a href="{hg_url}" target="_blank" style="display: inline-block; padding: 10px 20px; background: #3b82f6; color: white; text-decoration: none; border-radius: 5px; margin-bottom: 15px;">
721
+ Im Viewer öffnen
722
+ </a>
723
+ <iframe src="{hg_url}" width="100%" height="500px" style="border: 1px solid #ddd; border-radius: 8px;"></iframe>
724
+ </div>
725
+ ''')
726
+
727
+ # Queue Processing (cho streaming updates)
728
+ interval_component = gr.Interval(value=100, interactive=False)
729
 
730
  # =====================================================
731
  # EVENT HANDLERS
732
  # =====================================================
733
 
734
+ def toggle_mode(mode):
735
+ """Chuyển đổi giữa Voice và Text mode"""
736
+ if "Voice Agent" in mode:
737
+ return (
738
+ gr.Row(visible=False), # text_input_row
739
+ gr.Row(visible=True), # voice_interface
740
+ '<div class="status-indicator status-inactive">🔴 Bitte Voice Agent starten</div>'
741
+ )
742
+ else:
743
+ stop_voice_agent()
744
+ return (
745
+ gr.Row(visible=True), # text_input_row
746
+ gr.Row(visible=False), # voice_interface
747
+ '<div class="status-indicator status-inactive">🔴 Text Mode aktiv</div>'
748
+ )
749
 
750
+ def start_voice_agent():
751
+ """Bắt đầu Voice Agent"""
752
+ success = voice_agent.start_session()
753
+ if success:
754
+ state.is_streaming = True
755
+ return (
756
+ gr.Button(visible=False), # start_voice_btn
757
+ gr.Button(visible=True), # stop_voice_btn
758
+ '<div class="status-indicator status-active">🟢 Voice Agent aktiv - Sprechen Sie jetzt</div>',
759
+ "Voice Agent gestartet. Sie können jetzt sprechen..."
760
+ )
761
+ else:
762
+ return (
763
+ gr.Button(visible=True),
764
+ gr.Button(visible=False),
765
+ '<div class="status-indicator status-inactive">🔴 Fehler beim Starten</div>',
766
+ "Fehler beim Starten des Voice Agents"
767
+ )
768
 
769
    def stop_voice_agent():
        """Stop the Realtime session, reset shared state, and restore the UI."""
        voice_agent.stop_session()
        state.is_streaming = False
        state.reset()  # reset shared conversation state before the next session
        return (
            gr.Button(visible=True),  # start_voice_btn
            gr.Button(visible=False),  # stop_voice_btn
            '<div class="status-indicator status-inactive">🔴 Voice Agent gestoppt</div>',
            "Voice Agent gestoppt"
        )
780
 
781
+ def process_text_chat(message, history):
782
+ """Xử text chat với RAG"""
783
+ if not message:
784
+ return history, ""
785
+
786
+ # Thêm user message
787
+ history.append({"role": "user", "content": message})
788
+
789
  try:
790
+ # Get RAG answer
791
+ ans, sources = answer(message, retriever, llm)
792
+ full_response = ans + format_sources(sources)
793
+
794
+ # Add assistant message
795
+ history.append({"role": "assistant", "content": full_response})
796
+
797
+ # Add to state
798
+ state.add_message("user", message)
799
+ state.add_message("assistant", ans)
800
+
801
  except Exception as e:
802
+ error_msg = f"Fehler: {str(e)[:100]}"
803
+ history.append({"role": "assistant", "content": error_msg})
804
+
805
+ return history, ""
 
806
 
807
+ def update_streaming_display(history):
808
+ """Cập nhật display với streaming text"""
809
+ updates = process_queue_updates()
810
+
811
+ if not updates:
812
+ return history
813
+
814
+ for update_type, content in updates:
815
+ if update_type == "text" and content:
816
+ history = update_chat_display(history, content)
817
+
818
+ return history
819
+
820
    # ---- Event wiring ----

    # Switch between voice and text UI when the radio selection changes.
    mode_selector.change(
        toggle_mode,
        inputs=[mode_selector],
        outputs=[text_input_row, voice_interface, status_display]
    )

    # Start / stop the Realtime voice agent.
    start_voice_btn.click(
        start_voice_agent,
        outputs=[start_voice_btn, stop_voice_btn, status_display, voice_status]
    )

    stop_voice_btn.click(
        stop_voice_agent,
        outputs=[start_voice_btn, stop_voice_btn, status_display, voice_status]
    )

    # Text chat: both the send button and Enter submit the question.
    send_btn.click(
        process_text_chat,
        inputs=[chat_input, chatbot],
        outputs=[chatbot, chat_input]
    )

    chat_input.submit(
        process_text_chat,
        inputs=[chat_input, chatbot],
        outputs=[chatbot, chat_input]
    )

    # Periodic poll that flushes streamed agent text into the chatbot.
    # NOTE(review): `gr.Interval(...).timer(...)` is not a documented Gradio
    # API (recent versions use `gr.Timer(...)` with `.tick(...)` or an
    # `every=` event argument) — verify this runs on the pinned Gradio
    # version.
    interval_component.timer(
        update_streaming_display,
        inputs=[chatbot],
        outputs=[chatbot],
        every=0.1
    )
858
 
859
  if __name__ == "__main__":
realtime_server.py DELETED
@@ -1,97 +0,0 @@
1
- """
2
- realtime_server.py — v0.1 (2025-12-08)
3
-
4
- Realtime signaling & streaming server (WebSocket-based) for live audio chat.
5
- This module is optional and preserves backward compatibility with existing
6
- Gradio UI. When enabled, clients can stream microphone audio chunks to
7
- `/ws` and receive live transcripts (OpenAI Whisper API) and bot replies.
8
-
9
- NOTE: A full WebRTC peer-to-peer relay with SDP/ICE is scaffolded via
10
- `/webrtc/offer` but returns 501 until the upstream Realtime API is wired.
11
- """
12
-
13
- import os
14
- import asyncio
15
- import json
16
- from typing import Optional
17
- from fastapi import FastAPI, WebSocket, WebSocketDisconnect
18
- from fastapi.responses import JSONResponse
19
-
20
- # Minimal import guard for OpenAI
21
- try:
22
- from openai import OpenAI
23
- OPENAI_AVAILABLE = True
24
- except Exception:
25
- OPENAI_AVAILABLE = False
26
-
27
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
28
-
29
- app = FastAPI()
30
-
31
-
32
- def _openai_transcribe_file(path: str, language: Optional[str] = None) -> str:
33
- """Transcribe a local WAV chunk via OpenAI Whisper-1.
34
- Returns empty string on failure to keep the stream resilient."""
35
- if not (OPENAI_AVAILABLE and OPENAI_API_KEY and path and os.path.exists(path)):
36
- return ""
37
- try:
38
- client = OpenAI(api_key=OPENAI_API_KEY)
39
- with open(path, "rb") as f:
40
- resp = client.audio.transcriptions.create(
41
- model="whisper-1",
42
- file=f,
43
- language=language if language and language != "auto" else None,
44
- )
45
- txt = getattr(resp, "text", "") or (resp.get("text") if isinstance(resp, dict) else "")
46
- return (txt or "").strip()
47
- except Exception:
48
- return ""
49
-
50
-
51
- @app.get("/health")
52
- async def health():
53
- """Basic health endpoint."""
54
- return JSONResponse({"status": "ok"})
55
-
56
-
57
- @app.post("/webrtc/offer")
58
- async def webrtc_offer(body: dict):
59
- """SDP offer scaffold (not fully implemented).
60
- Returns 501 until Realtime API relay is wired (to keep backward compatibility)."""
61
- return JSONResponse({"error": "not_implemented"}, status_code=501)
62
-
63
-
64
- @app.websocket("/ws")
65
- async def ws_stream(ws: WebSocket):
66
- """WebSocket bidirectional streaming.
67
- Client sends JSON frames:
68
- {"type":"audio_chunk","path":"/tmp/chunk.wav","lang":"de"}
69
- Server responds with transcript frames:
70
- {"type":"transcript","text":"..."}
71
- and bot reply frames (if desired in future).
72
- """
73
- await ws.accept()
74
- try:
75
- while True:
76
- raw = await ws.receive_text()
77
- try:
78
- msg = json.loads(raw)
79
- except Exception:
80
- await ws.send_text(json.dumps({"type": "error", "message": "invalid_json"}))
81
- continue
82
-
83
- if msg.get("type") == "audio_chunk":
84
- path = msg.get("path")
85
- lang = msg.get("lang")
86
- text = _openai_transcribe_file(path, language=lang)
87
- await ws.send_text(json.dumps({"type": "transcript", "text": text}))
88
- else:
89
- await ws.send_text(json.dumps({"type": "error", "message": "unknown_type"}))
90
- except WebSocketDisconnect:
91
- pass
92
- except Exception:
93
- try:
94
- await ws.close()
95
- except Exception:
96
- pass
97
-