Nguyen5 commited on
Commit
3a9ed51
·
1 Parent(s): 6b33840
Files changed (2) hide show
  1. app.py +161 -659
  2. speech_io.py +121 -418
app.py CHANGED
@@ -1,709 +1,211 @@
1
- # app.py – Prüfungsrechts-Chatbot (RAG + Sprache, UI kiểu ChatGPT) với các tính năng nâng cao
2
- #
3
- import os
4
- import time
5
- from dataclasses import dataclass, field
6
- from typing import Optional, Dict, Any
7
  import gradio as gr
8
  from gradio_pdf import PDF
9
- import numpy as np
10
 
11
- from load_documents import load_all_documents
12
  from split_documents import split_documents
13
  from vectorstore import build_vectorstore
14
  from retriever import get_retriever
15
  from llm import load_llm
16
- from rag_pipeline import answer
17
- from speech_io import transcribe_audio, synthesize_speech, detect_voice_activity
18
-
19
- # Cấu hình môi trường
20
- ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
21
- ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
22
- VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
23
 
24
- # =====================================================
25
- # STATE MANAGEMENT - Quản lý trạng thái hội thoại liền mạch
26
- # =====================================================
27
@dataclass
class ConversationState:
    """Holds the mutable per-session conversation state.

    Tracks the rolling message history (capped at 20 entries), a cached
    context string built from the newest turns, VAD flags, and the
    currently selected ASR model/language.
    """

    messages: list = field(default_factory=list)
    last_audio_time: float = field(default_factory=time.time)
    is_listening: bool = False
    vad_confidence: float = 0.0
    conversation_context: str = ""
    whisper_model: str = field(default_factory=lambda: os.getenv("WHISPER_MODEL", "base"))
    language: str = field(default_factory=lambda: ASR_LANGUAGE_HINT)
    current_audio_path: Optional[str] = None

    def add_message(self, role: str, content: str):
        """Append one chat message, cap the history, refresh the context."""
        entry = {
            "role": role,
            "content": content,
            "timestamp": time.time(),
        }
        self.messages.append(entry)
        # Keep only the 20 newest messages.
        if len(self.messages) > 20:
            self.messages = self.messages[-20:]
        self._update_context()

    def _update_context(self):
        """Rebuild `conversation_context` from the five newest messages."""
        if not self.messages:
            self.conversation_context = ""
            return
        parts = [
            f"{'User' if m['role'] == 'user' else 'Assistant'}: {m['content'][:200]}"
            for m in self.messages[-5:]
        ]
        self.conversation_context = "\n".join(parts)

    def get_recent_context(self, num_messages: int = 3) -> str:
        """Return the newest `num_messages` messages as "role: content" lines."""
        if num_messages <= 0 or not self.messages:
            return ""
        tail = self.messages[-num_messages:]
        return "\n".join(f"{m['role']}: {m['content']}" for m in tail)

    def reset(self):
        """Drop all messages and return the VAD/audio fields to defaults."""
        self.messages = []
        self.conversation_context = ""
        self.is_listening = False
        self.vad_confidence = 0.0
        self.current_audio_path = None
80
-
81
- # Khởi tạo state
82
- state = ConversationState()
83
 
84
  # =====================================================
85
  # INITIALISIERUNG (global)
86
  # =====================================================
87
 
88
- print("📚 Lade Dokumente")
89
- docs = load_all_documents()
90
 
91
- print("🔪 Splitte Dokumente")
92
- chunks = split_documents(docs)
93
 
94
- print("🔍 Erstelle VectorStore")
95
- vs = build_vectorstore(chunks)
96
 
97
- print("🔎 Erzeuge Retriever")
98
- retriever = get_retriever(vs)
99
 
100
- print("🤖 Lade LLM")
101
- llm = load_llm()
102
 
103
- # Dokument-Metadaten für UI
104
- pdf_meta = next(d.metadata for d in docs if d.metadata.get("type") == "pdf")
105
- hg_meta = next(d.metadata for d in docs if d.metadata.get("type") == "hg")
106
- hg_url = hg_meta.get("viewer_url")
107
 
108
  # =====================================================
109
- # VOICE ACTIVITY DETECTION
110
  # =====================================================
111
- def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) -> Dict[str, Any]:
112
- """Xử lý phát hiện hoạt động giọng nói"""
113
- if audio_data is None or len(audio_data) == 0:
114
- return {"is_speech": False, "confidence": 0.0, "status": "No audio data"}
115
-
116
- try:
117
- vad_result = detect_voice_activity(audio_data, sample_rate, threshold=VAD_THRESHOLD)
118
-
119
- # Cập nhật state
120
- state.is_listening = vad_result["is_speech"]
121
- if vad_result["is_speech"]:
122
- state.last_audio_time = time.time()
123
- state.vad_confidence = vad_result["confidence"]
124
-
125
- return {
126
- "is_speech": vad_result["is_speech"],
127
- "confidence": vad_result["confidence"],
128
- "status": f"Speech detected: {vad_result['is_speech']} (conf: {vad_result['confidence']:.2f})"
129
- }
130
- except Exception as e:
131
- print(f"VAD error: {e}")
132
- return {"is_speech": False, "confidence": 0.0, "status": f"VAD error: {e}"}
133
 
134
- # =====================================================
135
- # TRANSCRIBE WITH OPTIMIZED PIPELINE
136
- # =====================================================
137
def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe `audio_path` via speech_io; return "" when the file is absent.

    Guards against an empty/None path and a missing file before delegating
    to `transcribe_audio`, so callers never have to pre-check the path.
    """
    if not (audio_path and os.path.exists(audio_path)):
        return ""
    return transcribe_audio(audio_path, language=language)
141
 
142
- # =====================================================
143
- # CONVERSATIONAL INTELLIGENCE
144
- # =====================================================
145
def enhance_conversation_context(user_input: str, history: list) -> str:
    """Prefix the question with up to three recent turns of conversation.

    Fixes two defects in the original:
    - `chat_fn` passes pair-style `[user, assistant]` lists, but the
      original unconditionally called `msg.get(...)` and raised
      AttributeError on any non-empty history. Both dict-style
      (`{'role', 'content'}`) and pair-style entries are now accepted.
    - A `content` of None no longer raises TypeError on slicing.

    Returns `user_input` unchanged when it is empty or there is no history;
    output for well-formed dict histories is identical to before.
    """
    if not user_input:
        return user_input
    if not history:
        return user_input

    context_parts = ["Previous conversation:"]
    for msg in history[-3:]:
        if isinstance(msg, dict):
            role = "User" if msg.get("role") == "user" else "Assistant"
            content = (msg.get("content") or "")[:100]  # cap context length
            context_parts.append(f"{role}: {content}")
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            # Pair-style [user_text, assistant_text] entry from chat_fn.
            user_part, bot_part = msg
            if user_part:
                context_parts.append(f"User: {str(user_part)[:100]}")
            if bot_part:
                context_parts.append(f"Assistant: {str(bot_part)[:100]}")

    context = "\n".join(context_parts)
    return f"{context}\n\nCurrent question: {user_input}"
164
 
165
- # =====================================================
166
- # Quellen formatieren – Markdown für Chat
167
- # =====================================================
168
def format_sources(src):
    """Render retrieved source records as a Markdown "Quellen" section.

    Each record must provide 'source' and 'url'; an optional non-None
    'page' is rendered as "(Seite N)". Returns "" for empty/None input.
    """
    if not src:
        return ""

    rendered = ["", "## 📚 Quellen"]
    for record in src:
        entry = f"- [{record['source']}]({record['url']})"
        page = record.get("page")
        if page is not None:
            entry += f" (Seite {page})"
        rendered.append(entry)
    return "\n".join(rendered)
181
 
182
- # =====================================================
183
- # CORE CHAT-FUNKTION với tất cả tính năng mới
184
- # =====================================================
185
def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
    """Core chat handler: merge text/audio input, run the RAG pipeline.

    Returns (history pairs, cleared textbox value, cleared audio value,
    status string). Audio is transcribed (optionally gated by VAD); an
    explicit text input always takes precedence over the transcription.
    """
    print(f"DEBUG: chat_fn called - text_input: '{text_input}', audio_path: {audio_path}, history length: {len(history) if history else 0}")
    # Normalize history to a list of [user, assistant] pairs.
    # Gradio may deliver either role/content dicts or pair-style lists.
    def to_pairs(h):
        if not h:
            return []
        if isinstance(h[0], dict):
            pairs = []
            current = [None, None]
            for m in h:
                if m.get("role") == "user":
                    # Flush a pending pair before starting a new user turn.
                    if current != [None, None]:
                        pairs.append(current)
                    current = [m.get("content", ""), None]
                elif m.get("role") == "assistant":
                    if current[0] is None:
                        # Assistant message without a preceding user turn.
                        pairs.append([None, m.get("content", "")])
                    else:
                        current[1] = m.get("content", "")
                        pairs.append(current)
                        current = [None, None]
            if current != [None, None]:
                pairs.append(current)
            return pairs
        return h
    pairs = to_pairs(history)
    text_to_process = ""
    # Fall back to the last recorded clip when no audio was passed in.
    if (not audio_path) and state.current_audio_path and os.path.exists(state.current_audio_path):
        audio_path = state.current_audio_path
    # Transcribe audio input, optionally gated by voice-activity detection.
    if audio_path and os.path.exists(audio_path):
        print(f"DEBUG: Processing audio file: {audio_path}")
        state.current_audio_path = audio_path
        if use_vad and ENABLE_VAD:
            try:
                import soundfile as sf
                audio_data, sample_rate = sf.read(audio_path)
                vad_result = handle_voice_activity(audio_data, sample_rate)
                print(f"DEBUG: VAD result: {vad_result}")
                # Default True: when VAD gives no verdict, still transcribe.
                if vad_result.get("is_speech", True):
                    transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
                    if transcribed_text and transcribed_text.strip():
                        text_to_process = transcribed_text.strip()
                        print(f"DEBUG: Transcribed text: {text_to_process}")
            except Exception as e:
                # VAD path failed; fall back to plain transcription.
                print(f"DEBUG: Error in VAD/transcription: {e}")
                transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
                if transcribed_text and transcribed_text.strip():
                    text_to_process = transcribed_text.strip()
        else:
            transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
            if transcribed_text and transcribed_text.strip():
                text_to_process = transcribed_text.strip()
                print(f"DEBUG: Transcribed text (no VAD): {text_to_process}")
    # Explicit textbox input takes precedence over any transcription.
    if text_input and text_input.strip():
        text_to_process = text_input.strip()
        print(f"DEBUG: Using text input: {text_to_process}")
    # Nothing usable: return the unchanged history with a ready status.
    if not text_to_process:
        status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
        return pairs, "", None, status_text
    print(f"DEBUG: Processing text: {text_to_process}")
    # NOTE(review): enhance_conversation_context calls msg.get(...) on its
    # history entries, but `pairs` holds [user, assistant] lists — this
    # presumably raises AttributeError once history is non-empty; confirm.
    enhanced_question = enhance_conversation_context(text_to_process, pairs)
    try:
        ans, sources = answer(enhanced_question, retriever, llm)
        bot_msg = ans + format_sources(sources)
        state.add_message("user", text_to_process)
        state.add_message("assistant", ans)
        pairs.append([text_to_process, bot_msg])
    except Exception as e:
        # Keep the UI alive on pipeline errors; show a generic apology.
        print(f"DEBUG: Error in RAG pipeline: {e}")
        error_msg = "Entschuldigung, es gab einen Fehler bei der Verarbeitung Ihrer Anfrage. Bitte versuchen Sie es erneut."
        pairs.append([text_to_process, error_msg])
    status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
    return pairs, "", None, status_text
262
 
263
  # =====================================================
264
- # FUNCTIONS FOR UI CONTROLS
265
  # =====================================================
266
def toggle_vad(use_vad):
    """Enable/disable VAD globally and return a status line for the UI."""
    global ENABLE_VAD
    ENABLE_VAD = use_vad
    label = "EIN" if use_vad else "AUS"
    return f"Voice Activity Detection: {label} | Model: OpenAI whisper-1"
272
-
273
def change_whisper_model(model_size):
    """Record the selected Whisper model in the state and the environment."""
    state.whisper_model = model_size
    os.environ["WHISPER_MODEL"] = model_size
    vad_label = "On" if ENABLE_VAD else "Off"
    return f"Whisper Model: OpenAI whisper-1 | VAD: {vad_label}"
278
-
279
def clear_conversation():
    """Clear the conversation: reset the global state and empty the chat UI."""
    state.reset()
    # Returns (empty chat history, status string) for the two UI outputs.
    return [], "Konversation gelöscht | Bereit"
283
-
284
- def update_vad_indicator():
285
- """Cập nhật VAD indicator"""
286
- if state.is_listening:
287
- indicator_html = """
288
- <div style="display: flex; align-items: center; gap: 8px;">
289
- <div style="width: 12px; height: 12px; border-radius: 50%; background-color: #10b981; box-shadow: 0 0 10px #10b981; animation: pulse 1.5s infinite;"></div>
290
- <span style="color: #10b981; font-weight: bold;">Sprache erkannt</span>
291
- </div>
292
- <style>
293
- @keyframes pulse {
294
- 0% { opacity: 0.7; }
295
- 50% { opacity: 1; }
296
- 100% { opacity: 0.7; }
297
- }
298
- </style>
299
- """
300
- else:
301
- indicator_html = """
302
- <div style="display: flex; align-items: center; gap: 8px;">
303
- <div style="width: 12px; height: 12px; border-radius: 50%; background-color: #6b7280;"></div>
304
- <span>Bereit</span>
305
- </div>
306
- """
307
-
308
- return indicator_html
309
 
310
  # =====================================================
311
- # AUDIO STREAMING HANDLER
312
  # =====================================================
313
- def handle_audio_stream(audio_path, use_vad):
314
- """Xử audio streaming real-time"""
315
- if not audio_path or not os.path.exists(audio_path):
316
- return "", update_vad_indicator(), "Keine Audiodatei"
317
-
318
- try:
319
- import soundfile as sf
320
- audio_data, sample_rate = sf.read(audio_path)
321
-
322
- # Cập nhật VAD indicator
323
- vad_html = update_vad_indicator()
324
-
325
- if use_vad and ENABLE_VAD:
326
- vad_result = handle_voice_activity(audio_data, sample_rate)
327
-
328
- if vad_result.get("is_speech", False):
329
- # Nếu phát hiện giọng nói, transcribe
330
- text = transcribe_audio_optimized(audio_path, language=state.language)
331
- status = f"Sprache erkannt ({vad_result.get('confidence', 0):.2f})"
332
- return text, vad_html, status
333
- else:
334
- status = "Keine Sprache erkannt"
335
- return "", vad_html, status
336
- else:
337
- # Nếu VAD không bật, vẫn transcribe nhưng hiển thị trạng thái khác
338
- text = transcribe_audio_optimized(audio_path, language=state.language)
339
- status = "Transkription (VAD aus)"
340
- return text, vad_html, status
341
-
342
- except Exception as e:
343
- print(f"Error in audio stream handler: {e}")
344
- return "", update_vad_indicator(), f"Fehler: {str(e)[:50]}"
345
 
346
  # =====================================================
347
- # TTS FUNCTION
348
  # =====================================================
 
349
def read_last_answer(history):
    """Speak the most recent bot reply from a pair-style history.

    Walks the [user, bot] pairs from newest to oldest, strips the
    appended sources section from the bot text, and returns the first
    successful TTS result — or None when nothing speakable is found.
    """
    if not history:
        print("DEBUG: No history for TTS")
        return None

    for entry in reversed(history):
        is_pair = isinstance(entry, (list, tuple)) and len(entry) == 2
        if not (is_pair and entry[1]):
            continue
        content = entry[1]
        # Drop the sources block before synthesis — it reads badly aloud.
        if "## 📚 Quellen" in content:
            content = content.split("## 📚 Quellen")[0].strip()
        print(f"DEBUG: Synthesizing speech for: {content[:100]}...")
        audio_result = synthesize_speech(content)
        if audio_result:
            print("DEBUG: TTS successful")
            return audio_result
    print("DEBUG: No assistant message found for TTS")
    return None
365
 
366
  # =====================================================
367
- # UI – GRADIO với tất cả tính năng mới
368
  # =====================================================
369
- with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as demo:
370
- # CSS Styling nâng cao
371
- gr.HTML("""
372
- <style>
373
- .gradio-container {
374
- max-width: 1200px;
375
- margin: 0 auto;
376
- padding: 20px;
377
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
378
- }
379
-
380
- .header {
381
- text-align: center;
382
- margin-bottom: 30px;
383
- padding: 20px;
384
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
385
- border-radius: 15px;
386
- color: white;
387
- }
388
-
389
- .control-panel {
390
- background: #f8f9fa;
391
- padding: 20px;
392
- border-radius: 15px;
393
- margin-bottom: 20px;
394
- border: 1px solid #e2e8f0;
395
- }
396
-
397
- .chat-container {
398
- background: white;
399
- border-radius: 15px;
400
- padding: 20px;
401
- box-shadow: 0 4px 20px rgba(0,0,0,0.1);
402
- margin-bottom: 20px;
403
- }
404
-
405
- .input-row {
406
- background: #f8fafc;
407
- border-radius: 25px;
408
- padding: 10px 20px;
409
- border: 2px solid #e2e8f0;
410
- transition: all 0.3s ease;
411
- display: flex;
412
- align-items: center;
413
- gap: 10px;
414
- }
415
-
416
- .input-row:focus-within {
417
- border-color: #667eea;
418
- box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
419
- }
420
-
421
- .send-btn {
422
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
423
- color: white !important;
424
- border: none !important;
425
- border-radius: 50% !important;
426
- width: 44px !important;
427
- height: 44px !important;
428
- display: flex !important;
429
- align-items: center !important;
430
- justify-content: center !important;
431
- cursor: pointer !important;
432
- }
433
-
434
- .send-btn:hover {
435
- transform: scale(1.05);
436
- box-shadow: 0 4px 15px rgba(102, 126, 234, 0.4) !important;
437
- }
438
-
439
- .vad-indicator-container {
440
- padding: 10px;
441
- background: #f1f5f9;
442
- border-radius: 10px;
443
- margin: 10px 0;
444
- display: flex;
445
- align-items: center;
446
- gap: 10px;
447
- }
448
-
449
- .feature-badge {
450
- display: inline-block;
451
- padding: 4px 12px;
452
- background: #e0e7ff;
453
- color: #4f46e5;
454
- border-radius: 20px;
455
- font-size: 12px;
456
- font-weight: 500;
457
- margin: 2px;
458
- }
459
-
460
- .chatbot {
461
- min-height: 400px;
462
- max-height: 500px;
463
- overflow-y: auto;
464
- }
465
-
466
- /* Responsive design */
467
- @media (max-width: 768px) {
468
- .gradio-container {
469
- padding: 10px;
470
- }
471
-
472
- .input-row {
473
- flex-direction: column;
474
- gap: 10px;
475
- }
476
-
477
- .send-btn {
478
- width: 100% !important;
479
- height: 44px !important;
480
- border-radius: 10px !important;
481
- }
482
- }
483
- </style>
484
- """)
485
-
486
- # Header
487
- with gr.Column(elem_classes=["header"]):
488
- gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
489
- gr.Markdown("### Intelligent Voice Interface with Advanced Features")
490
-
491
- # Feature badges
492
- gr.HTML("""
493
- <div style="text-align: center; margin: 10px 0;">
494
- <span class="feature-badge">🎤 Voice Activity Detection</span>
495
- <span class="feature-badge">⚡ Fast Transcription</span>
496
- <span class="feature-badge">🧠 Conversational AI</span>
497
- <span class="feature-badge">📚 Document RAG</span>
498
- </div>
499
- """)
500
-
501
- # Control Panel
502
- with gr.Column(elem_classes=["control-panel"]):
503
- with gr.Row():
504
- with gr.Column(scale=2):
505
- # Model Selection
506
- model_selector = gr.Dropdown(
507
- choices=["tiny", "base", "small", "medium"],
508
- value=state.whisper_model,
509
- label="Whisper Model",
510
- info="Wählen Sie das Modell für Spracherkennung"
511
- )
512
-
513
- # VAD Control
514
- vad_toggle = gr.Checkbox(
515
- value=ENABLE_VAD,
516
- label="Voice Activity Detection aktivieren",
517
- info="Automatische Spracherkennung"
518
- )
519
-
520
- # Language Selection
521
- lang_selector = gr.Dropdown(
522
- choices=["de", "en", "auto"],
523
- value=ASR_LANGUAGE_HINT,
524
- label="Spracherkennung Sprache"
525
- )
526
-
527
- with gr.Column(scale=1):
528
- # Status Display
529
- status_display = gr.Textbox(
530
- label="System Status",
531
- value="Bereit",
532
- interactive=False
533
- )
534
-
535
- # Clear Conversation Button
536
- clear_btn = gr.Button("🗑️ Konversation löschen", variant="secondary", size="sm")
537
-
538
- # VAD Indicator
539
- vad_indicator = gr.HTML(value=update_vad_indicator(), label="VAD Status")
540
-
541
- # Main Chat Interface
542
- with gr.Column(elem_classes=["chat-container"]):
543
- # Chatbot Display
544
- chatbot = gr.Chatbot(
545
- label="Konversation",
546
- height=400,
547
- avatar_images=(None, "🤖")
548
- )
549
-
550
- # Input Row với VAD Indicator
551
- with gr.Row(elem_classes=["input-row"]):
552
- # Text Input
553
- chat_text = gr.Textbox(
554
- label=None,
555
- placeholder="Stellen Sie eine Frage oder sprechen Sie ins Mikrofon...",
556
- lines=1,
557
- max_lines=4,
558
- scale=8,
559
- container=False,
560
- show_label=False
561
  )
562
-
563
- # Audio Input
564
- chat_audio = gr.Audio(
565
- sources=["microphone"],
566
- type="filepath",
567
- format="wav",
568
- streaming=False,
569
- interactive=True,
570
- show_label=False,
571
- scale=1,
572
- elem_id="audio-input"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  )
574
-
575
- # Send Button
576
- send_btn = gr.Button("➤", variant="primary", elem_classes=["send-btn"], scale=1)
577
-
578
- # TTS Controls
579
- with gr.Row():
580
- tts_btn = gr.Button("🔊 Antwort vorlesen", variant="secondary", size="sm")
581
- tts_audio = gr.Audio(label="Audio Ausgabe", interactive=False, visible=False)
582
- tts_status = gr.Textbox(label="TTS Status", interactive=False, visible=False)
583
-
584
- # Documents Section
585
- with gr.Accordion("📚 Quellen & Dokumente", open=False):
586
- with gr.Tabs():
587
- with gr.TabItem("📄 Prüfungsordnung (PDF)"):
588
- PDF(pdf_meta["pdf_url"], height=300)
589
-
590
- with gr.TabItem("📘 Hochschulgesetz NRW"):
591
- if isinstance(hg_url, str) and hg_url.startswith("http"):
592
- gr.Markdown(f"### [Im Viewer öffnen]({hg_url})")
593
- gr.HTML(f'<iframe src="{hg_url}" width="100%" height="500px" style="border: 1px solid #ddd; border-radius: 8px;"></iframe>')
594
- else:
595
- gr.Markdown("Viewer-Link nicht verfügbar.")
596
-
597
- # =====================================================
598
- # EVENT HANDLERS
599
- # =====================================================
600
-
601
- # Model Selection
602
- model_selector.change(
603
- change_whisper_model,
604
- inputs=[model_selector],
605
- outputs=[status_display]
606
- )
607
-
608
- # VAD Toggle
609
- vad_toggle.change(
610
- toggle_vad,
611
- inputs=[vad_toggle],
612
- outputs=[status_display]
613
- )
614
-
615
- # Clear Conversation
616
- clear_btn.click(
617
- clear_conversation,
618
- outputs=[chatbot, status_display]
619
- ).then(
620
- lambda: update_vad_indicator(),
621
- outputs=[vad_indicator]
622
- )
623
-
624
- # Main Chat Function
625
- def process_chat(text_input, audio_path, history, lang_sel, use_vad):
626
- """Wrapper function để xử lý chat"""
627
- try:
628
- return chat_fn(text_input, audio_path, history, lang_sel, use_vad)
629
- except Exception as e:
630
- print(f"Error in process_chat: {e}")
631
- error_msg = f"Fehler: {str(e)}"
632
- if history is None:
633
- history = []
634
- return history, "", None, error_msg
635
-
636
- # Send Button Click
637
- send_btn.click(
638
- process_chat,
639
- inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
640
- outputs=[chatbot, chat_text, chat_audio, status_display]
641
- ).then(
642
- lambda: update_vad_indicator(),
643
- outputs=[vad_indicator]
644
- )
645
-
646
- # Text Submit (Enter key)
647
- chat_text.submit(
648
- process_chat,
649
- inputs=[chat_text, chat_audio, chatbot, lang_selector, vad_toggle],
650
- outputs=[chatbot, chat_text, chat_audio, status_display]
651
- ).then(
652
- lambda: update_vad_indicator(),
653
- outputs=[vad_indicator]
654
- )
655
-
656
- # Audio Change Handler
657
- def on_audio_change(audio_path, use_vad):
658
- """Xử lý khi audio thay đổi"""
659
- if audio_path:
660
- print(f"DEBUG: Audio changed: {audio_path}")
661
- # Lưu lại đường dẫn bản ghi để nút Gửi có thể dùng
662
- state.current_audio_path = audio_path
663
- # Xử lý streaming
664
- text, vad_html, status = handle_audio_stream(audio_path, use_vad)
665
- return text, vad_html, status
666
- return "", update_vad_indicator(), "Bereit"
667
-
668
- chat_audio.change(
669
- on_audio_change,
670
- inputs=[chat_audio, vad_toggle],
671
- outputs=[chat_text, vad_indicator, status_display]
672
- )
673
-
674
- # Process immediately when user stops recording
675
- def on_audio_stop(audio_path, history, lang_sel, use_vad):
676
- print(f"DEBUG: stop_recording with audio_path={audio_path}")
677
- state.current_audio_path = audio_path
678
- return chat_fn("", audio_path, history, lang_sel, use_vad)
679
-
680
- chat_audio.stop_recording(
681
- on_audio_stop,
682
- inputs=[chat_audio, chatbot, lang_selector, vad_toggle],
683
- outputs=[chatbot, chat_text, chat_audio, status_display]
684
- )
685
-
686
- # Streaming handler removed; process on change after user stops recording
687
-
688
- # TTS Button
689
- def handle_tts(history):
690
- """Xử lý TTS"""
691
- audio_result = read_last_answer(history)
692
- if audio_result:
693
- return audio_result, "Audio wird abgespielt..."
694
- return None, "Keine Antwort zum Vorlesen gefunden"
695
-
696
- tts_btn.click(
697
- handle_tts,
698
- inputs=[chatbot],
699
- outputs=[tts_audio, tts_status]
700
- ).then(
701
- lambda: gr.Audio(visible=True),
702
- outputs=[tts_audio]
703
- ).then(
704
- lambda: gr.Textbox(visible=True),
705
- outputs=[tts_status]
706
- )
707
 
708
  if __name__ == "__main__":
709
- demo.queue().launch(show_error=True)
 
 
1
+ # app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
2
+ # Version 26.11 – ohne Modi, stabil für Text + Voice
3
+
 
 
 
4
  import gradio as gr
5
  from gradio_pdf import PDF
6
+ from huggingface_hub import hf_hub_download
7
 
8
+ from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
9
  from split_documents import split_documents
10
  from vectorstore import build_vectorstore
11
  from retriever import get_retriever
12
  from llm import load_llm
13
+ from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
 
 
 
 
 
 
14
 
15
+ from speech_io import transcribe_audio, synthesize_speech
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # =====================================================
18
  # INITIALISIERUNG (global)
19
  # =====================================================
20
 
21
+ print("🔹 Lade Dokumente ...")
22
+ _docs = load_documents()
23
 
24
+ print("🔹 Splitte Dokumente ...")
25
+ _chunks = split_documents(_docs)
26
 
27
+ print("🔹 Baue VectorStore (FAISS) ...")
28
+ _vs = build_vectorstore(_chunks)
29
 
30
+ print("🔹 Erzeuge Retriever ...")
31
+ _retriever = get_retriever(_vs)
32
 
33
+ print("🔹 Lade LLM ...")
34
+ _llm = load_llm()
35
 
36
+ print("🔹 Lade Dateien für Viewer …")
37
+ _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
38
+ _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
 
39
 
40
  # =====================================================
41
+ # Quellen formatieren – Markdown für Chat
42
  # =====================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
def format_sources_markdown(sources):
    """Render RAG source records as a Markdown block for the chat answer.

    Each record may carry: id, source, page, url, snippet. Fix over the
    original: missing keys are tolerated via `.get` instead of raising
    KeyError, so one partially-filled source dict can no longer break the
    whole answer. Output for well-formed records is unchanged.

    Returns "" for an empty/None source list.
    """
    if not sources:
        return ""

    lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
    for s in sources:
        sid = s.get("id")
        src = s.get("source", "")
        page = s.get("page")
        url = s.get("url")
        snippet = s.get("snippet")

        title = f"Quelle {sid} – {src}"

        # Link the title only when a URL is available.
        if url:
            base = f"- [{title}]({url})"
        else:
            base = f"- {title}"

        # Page numbers are only meaningful for the PDF ("Prüfungsordnung").
        if page and "Prüfungsordnung" in src:
            base += f", Seite {page}"

        lines.append(base)

        # Quote the snippet underneath its source entry.
        if snippet:
            lines.append(f" > {snippet}")

    return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # =====================================================
74
+ # TEXT CHATBOT
75
  # =====================================================
76
+
77
def chatbot_text(user_message, history):
    """Handle one text turn: run the RAG pipeline and append both messages.

    Returns the extended messages-format history plus an empty string that
    clears the input textbox. An empty question is a no-op.
    """
    if not user_message:
        return history, ""

    # Ask the RAG pipeline; it returns the answer text and source metadata.
    reply, retrieved_sources = answer(
        question=user_message,
        retriever=_retriever,
        chat_model=_llm,
    )
    sources_md = format_sources_markdown(retrieved_sources)

    turn = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": reply + sources_md},
    ]
    return history + turn, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  # =====================================================
97
+ # VOICE CHATBOT
98
  # =====================================================
99
+
100
def chatbot_voice(audio_path, history):
    """Handle one voice turn: STT -> RAG answer -> TTS.

    Returns (updated history, synthesized answer audio, "" to clear the
    textbox). When transcription yields nothing, the history is returned
    unchanged and no audio is produced.
    """
    # 1. Speech -> text; bail out when nothing was recognized.
    question = transcribe_audio(audio_path)
    if not question:
        return history, None, ""

    # 2. Run the RAG pipeline and append both turns to the history.
    reply, retrieved_sources = answer(
        question=question,
        retriever=_retriever,
        chat_model=_llm,
    )
    assistant_text = reply + format_sources_markdown(retrieved_sources)

    updated = history + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": assistant_text},
    ]

    # 3. Text -> speech for the full assistant message (incl. sources).
    return updated, synthesize_speech(assistant_text), ""
 
 
 
 
 
 
 
124
 
125
  # =====================================================
126
+ # LAST ANSWER → TTS
127
  # =====================================================
128
+
129
def read_last_answer(history):
    """Synthesize speech for the most recent assistant message.

    Scans the messages-format history from newest to oldest and returns
    the TTS audio for the first assistant turn found, or None when the
    history is empty or contains no assistant message.
    """
    if not history:
        return None

    latest_reply = next(
        (entry["content"] for entry in reversed(history) if entry["role"] == "assistant"),
        None,
    )
    if latest_reply is None:
        return None
    return synthesize_speech(latest_reply)
138
 
139
  # =====================================================
140
+ # UI – GRADIO
141
  # =====================================================
142
+
143
+ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
144
+ gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
145
+ gr.Markdown(
146
+ "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
147
+ "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
148
+ "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
149
+ )
150
+
151
+ with gr.Row():
152
+ with gr.Column(scale=2):
153
+ chatbot = gr.Chatbot(type="messages", label="Chat", height=500)
154
+
155
+ msg = gr.Textbox(
156
+ label="Frage eingeben",
157
+ placeholder="Stelle deine Frage zum Prüfungsrecht …",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  )
159
+
160
+ # TEXT SENDEN
161
+ msg.submit(
162
+ chatbot_text,
163
+ [msg, chatbot],
164
+ [chatbot, msg]
165
+ )
166
+
167
+ send_btn = gr.Button("Senden (Text)")
168
+ send_btn.click(
169
+ chatbot_text,
170
+ [msg, chatbot],
171
+ [chatbot, msg]
172
+ )
173
+
174
+ # SPRACHEINGABE
175
+ gr.Markdown("### 🎙️ Spracheingabe")
176
+ voice_in = gr.Audio(sources=["microphone"], type="filepath")
177
+ voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
178
+
179
+ voice_btn = gr.Button("Sprechen & senden")
180
+ voice_btn.click(
181
+ chatbot_voice,
182
+ [voice_in, chatbot],
183
+ [chatbot, voice_out, msg]
184
+ )
185
+
186
+ read_btn = gr.Button("🔁 Antwort erneut vorlesen")
187
+ read_btn.click(
188
+ read_last_answer,
189
+ [chatbot],
190
+ [voice_out]
191
+ )
192
+
193
+ clear_btn = gr.Button("Chat zurücksetzen")
194
+ clear_btn.click(lambda: [], None, chatbot)
195
+
196
+ # =====================
197
+ # RECHTE SPALTE: Viewer
198
+ # =====================
199
+
200
+ with gr.Column(scale=1):
201
+ gr.Markdown("### 📄 Prüfungsordnung (PDF)")
202
+ PDF(_pdf_path, height=350)
203
+
204
+ gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
205
+ gr.HTML(
206
+ f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
207
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  if __name__ == "__main__":
210
+ demo.launch()
211
+
speech_io.py CHANGED
@@ -1,455 +1,158 @@
1
  """
2
- speech_io.py - Enhanced Version with working VAD
3
 
4
- Sprachbasierte Ein-/Ausgabe với:
5
- - Speech-to-Text (STT) với Whisper
6
- - Text-to-Speech (TTS)
7
- - Voice Activity Detection (VAD) hoạt động
 
8
  """
9
 
10
- import os
11
- import time
12
- from typing import Optional, Tuple, Dict, Any
13
  import numpy as np
14
  import soundfile as sf
15
- from scipy.signal import butter, filtfilt, resample
16
- import re
17
- import difflib
 
 
 
 
 
 
18
 
19
  # ========================================================
20
- # CẤU HÌNH
21
  # ========================================================
22
- # Model Selection
23
- WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
24
- ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
25
- TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
26
 
27
- # OpenAI Configuration
28
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
29
-
30
- # VAD Configuration
31
- ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
32
- VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
33
- VAD_MIN_DURATION = float(os.getenv("VAD_MIN_DURATION", "0.1"))
 
 
 
 
 
34
 
35
- # Other Configs
36
- ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de")
37
- TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
38
- ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
39
 
40
- # Cache for models
41
- _asr = None
42
- _tts = None
 
 
 
 
 
 
43
 
44
  # ========================================================
45
- # AUDIO PROCESSING UTILITIES
46
  # ========================================================
 
47
  def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
48
- """Highpass filter để loại bỏ noise tần số thấp"""
49
- if len(data) == 0:
50
- return data
51
-
52
  nyq = 0.5 * fs
53
- normal_cutoff = cutoff / nyq
54
- b, a = butter(order, normal_cutoff, btype='high', analog=False)
55
  return filtfilt(b, a, data)
56
 
57
- def apply_fade(audio, sr, fade_in_ms=10, fade_out_ms=10):
58
- """Áp dụng fade in/out để tránh pop"""
59
- if len(audio) == 0:
60
- return audio
61
-
62
- fade_in_samples = int(sr * fade_in_ms / 1000)
63
- fade_out_samples = int(sr * fade_out_ms / 1000)
64
-
65
- # Đảm bảo có đủ samples
66
- if len(audio) < fade_in_samples + fade_out_samples:
67
  return audio
68
-
69
- # Fade in
70
- if fade_in_samples > 0:
71
- fade_in_curve = np.linspace(0, 1, fade_in_samples)
72
- audio[:fade_in_samples] *= fade_in_curve
73
-
74
- # Fade out
75
- if fade_out_samples > 0:
76
- fade_out_curve = np.linspace(1, 0, fade_out_samples)
77
- audio[-fade_out_samples:] *= fade_out_curve
78
-
79
- return audio
80
 
81
- def normalize_audio(audio_data: np.ndarray) -> np.ndarray:
82
- """Chuẩn hóa audio về [-1, 1]"""
83
- if len(audio_data) == 0:
84
- return audio_data
85
-
86
- # Chuyển đổi sang float32
87
- if audio_data.dtype != np.float32:
88
- audio_data = audio_data.astype(np.float32)
89
-
90
- # Normalize
91
- max_val = np.max(np.abs(audio_data))
92
- if max_val > 0:
93
- audio_data = audio_data / max_val
94
-
95
- return audio_data
96
-
97
- def preprocess_audio_for_vad(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
98
- """Tiền xử lý audio cho VAD"""
99
- if len(audio_data) == 0:
100
- return audio_data
101
-
102
- # Chuyển sang mono nếu cần
103
- if len(audio_data.shape) > 1:
104
- audio_data = np.mean(audio_data, axis=1)
105
-
106
- # Normalize
107
- audio_data = normalize_audio(audio_data)
108
-
109
- # Highpass filter để loại bỏ noise tần số thấp
110
- try:
111
- audio_data = butter_highpass_filter(audio_data, cutoff=80, fs=sample_rate)
112
- except:
113
- pass
114
-
115
- return audio_data
116
 
117
- # ========================================================
118
- # VOICE ACTIVITY DETECTION (VAD) - FIXED VERSION
119
- # ========================================================
120
- def detect_voice_activity(
121
- audio_data: np.ndarray,
122
- sample_rate: int,
123
- threshold: float = 0.3,
124
- min_duration: float = 0.1
125
- ) -> Dict[str, Any]:
126
- """
127
- Phát hiện hoạt động giọng nói - Phiên bản đơn giản và hoạt động
128
-
129
- Args:
130
- audio_data: Mảng numpy chứa audio samples
131
- sample_rate: Tần số lấy mẫu
132
- threshold: Ngưỡng phát hiện (0-1)
133
- min_duration: Thời gian tối thiểu để xác định là speech (giây)
134
-
135
- Returns:
136
- Dict với thông tin phát hiện
137
- """
138
- if len(audio_data) == 0:
139
- return {
140
- "is_speech": False,
141
- "confidence": 0.0,
142
- "speech_segments": [],
143
- "energy": 0.0,
144
- "message": "Empty audio data"
145
- }
146
-
147
- try:
148
- # Tiền xử lý audio
149
- processed_audio = preprocess_audio_for_vad(audio_data, sample_rate)
150
-
151
- # Tính toán các đặc trưng
152
- duration = len(processed_audio) / sample_rate
153
-
154
- # 1. Tính RMS energy
155
- rms_energy = np.sqrt(np.mean(processed_audio ** 2))
156
-
157
- # 2. Tính zero-crossing rate
158
- zero_crossings = np.sum(np.abs(np.diff(np.sign(processed_audio)))) / (2 * len(processed_audio))
159
-
160
- # 3. Tính spectral centroid (đơn giản)
161
- # Sử dụng FFT để tính phân bố tần số
162
- if len(processed_audio) >= 256:
163
- fft_size = min(2048, len(processed_audio))
164
- spectrum = np.abs(np.fft.rfft(processed_audio[:fft_size]))
165
- frequencies = np.fft.rfftfreq(fft_size, 1/sample_rate)
166
- if np.sum(spectrum) > 0:
167
- spectral_centroid = np.sum(frequencies * spectrum) / np.sum(spectrum)
168
- else:
169
- spectral_centroid = 0
170
- else:
171
- spectral_centroid = 0
172
-
173
- # 4. Frame-based analysis
174
- frame_length = int(sample_rate * 0.03) # 30ms frame
175
- hop_length = int(frame_length / 2)
176
-
177
- if len(processed_audio) > frame_length:
178
- num_frames = 1 + (len(processed_audio) - frame_length) // hop_length
179
- frame_energies = []
180
-
181
- for i in range(num_frames):
182
- start = i * hop_length
183
- end = start + frame_length
184
- frame = processed_audio[start:end]
185
- frame_energy = np.sqrt(np.mean(frame ** 2))
186
- frame_energies.append(frame_energy)
187
-
188
- # Tính speech ratio
189
- if frame_energies:
190
- energy_threshold = np.percentile(frame_energies, 30) + threshold * (np.max(frame_energies) - np.percentile(frame_energies, 30))
191
- speech_frames = sum(1 for e in frame_energies if e > energy_threshold)
192
- speech_ratio = speech_frames / len(frame_energies)
193
- else:
194
- speech_ratio = 0
195
- else:
196
- speech_ratio = 0
197
-
198
- # 5. Kết hợp các đặc trưng để tính confidence
199
- # Speech thường có:
200
- # - RMS energy cao
201
- # - Zero-crossing rate trung bình (không quá cao như noise, không quá thấp như silence)
202
- # - Spectral centroid trong khoảng 100-3000 Hz cho giọng nói
203
- # - Speech ratio cao
204
-
205
- # Tính confidence score
206
- energy_score = min(1.0, rms_energy * 10) # Scale energy
207
-
208
- # Zero-crossing rate score: lý tưởng khoảng 0.1-0.3 cho speech
209
- if 0.05 < zero_crossings < 0.4:
210
- zcr_score = 1.0 - 2 * abs(zero_crossings - 0.2) # Peak ở 0.2
211
- else:
212
- zcr_score = 0.0
213
-
214
- # Spectral centroid score: lý tưởng 100-3000 Hz
215
- if 100 < spectral_centroid < 3000:
216
- centroid_score = 1.0
217
- elif 50 < spectral_centroid < 5000:
218
- centroid_score = 0.5
219
- else:
220
- centroid_score = 0.0
221
-
222
- # Speech ratio score
223
- speech_ratio_score = speech_ratio
224
-
225
- # Kết hợp các score
226
- weights = [0.4, 0.2, 0.2, 0.2] # energy, zcr, centroid, speech_ratio
227
- confidence = (
228
- weights[0] * energy_score +
229
- weights[1] * zcr_score +
230
- weights[2] * centroid_score +
231
- weights[3] * speech_ratio_score
232
- )
233
-
234
- # Áp dụng ngưỡng
235
- is_speech = confidence > threshold
236
-
237
- # Kiểm tra duration tối thiểu
238
- if duration < min_duration:
239
- is_speech = False
240
- confidence = max(0, confidence - 0.2)
241
-
242
- # Debug info
243
- debug_info = {
244
- "duration": duration,
245
- "rms_energy": rms_energy,
246
- "zero_crossings": zero_crossings,
247
- "spectral_centroid": spectral_centroid,
248
- "speech_ratio": speech_ratio,
249
- "energy_score": energy_score,
250
- "zcr_score": zcr_score,
251
- "centroid_score": centroid_score,
252
- "speech_ratio_score": speech_ratio_score,
253
- "final_confidence": confidence,
254
- "is_speech": is_speech
255
- }
256
-
257
- print(f"VAD Debug: {debug_info}")
258
-
259
- return {
260
- "is_speech": is_speech,
261
- "confidence": float(confidence),
262
- "speech_segments": [[0, duration]] if is_speech else [],
263
- "energy": float(rms_energy),
264
- "message": f"Speech: {is_speech}, Confidence: {confidence:.3f}"
265
- }
266
-
267
- except Exception as e:
268
- print(f"VAD processing error: {e}")
269
- return {
270
- "is_speech": False,
271
- "confidence": 0.0,
272
- "speech_segments": [],
273
- "energy": 0.0,
274
- "message": f"Error: {str(e)}"
275
- }
276
 
277
  # ========================================================
278
- # SPEECH-TO-TEXT FUNCTIONS
279
  # ========================================================
280
- def transcribe_audio(
281
- audio_path: str,
282
- language: Optional[str] = None,
283
- max_duration_s: int = ASR_MAX_DURATION_S
284
- ) -> str:
285
- """
286
- Transcribe audio bằng OpenAI Whisper API
287
- """
288
- if not audio_path or not os.path.exists(audio_path):
289
- print(">>> Kein Audio gefunden.")
290
- return ""
291
- if not OPENAI_API_KEY:
292
- print(">>> OPENAI_API_KEY nicht gesetzt.")
293
- return ""
294
- try:
295
- from openai import OpenAI
296
- client = OpenAI(api_key=OPENAI_API_KEY)
297
- with open(audio_path, "rb") as f:
298
- resp = client.audio.transcriptions.create(
299
- model="whisper-1",
300
- file=f,
301
- language=language if language and language != "auto" else None,
302
- response_format="text"
303
- )
304
- text = resp.text if hasattr(resp, "text") else (resp.get("text", "") if isinstance(resp, dict) else str(resp))
305
- text = fix_domain_terms(text.strip())
306
- print(f">>> Transkription (OpenAI): {text}")
307
- return text
308
- except Exception as e:
309
- print(f">>> Transkriptionsfehler (OpenAI): {e}")
310
- return ""
311
 
312
- def transcribe_audio(
313
- audio_path: str,
314
- language: Optional[str] = None,
315
- max_duration_s: int = ASR_MAX_DURATION_S
316
- ) -> str:
317
  """
318
- Transcribe audio với Whisper local
319
  """
320
- if not audio_path or not os.path.exists(audio_path):
321
- print(">>> Kein Audio gefunden.")
322
- return ""
323
-
324
- try:
325
- # Đọc audio file
326
- data, sr = sf.read(audio_path, always_2d=False)
327
-
328
- if data is None or data.size == 0:
329
- print(">>> Audio leer.")
330
- return ""
331
-
332
- # Chuyển sang mono
333
- if len(data.shape) > 1:
334
- data = np.mean(data, axis=1)
335
-
336
- # Tiền xử lý
337
- data = data.astype(np.float32)
338
- max_val = np.max(np.abs(data))
339
- if max_val > 0:
340
- data = data / max_val
341
-
342
- # Resample về 16kHz nếu cần
343
- TARGET_SR = 16000
344
- if sr != TARGET_SR:
345
- target_len = int(len(data) * TARGET_SR / sr)
346
- data = resample(data, target_len)
347
- sr = TARGET_SR
348
-
349
- # Giới hạn độ dài
350
- MAX_SAMPLES = sr * max_duration_s
351
- if len(data) > MAX_SAMPLES:
352
- data = data[:MAX_SAMPLES]
353
-
354
- # Lấy pipeline
355
- asr = get_asr_pipeline()
356
-
357
- # Cấu hình language
358
- lang = language
359
- if not lang and ASR_DEFAULT_LANGUAGE and ASR_DEFAULT_LANGUAGE.lower() != "auto":
360
- lang = ASR_DEFAULT_LANGUAGE
361
- if isinstance(lang, str) and lang.lower() == "auto":
362
- lang = None
363
-
364
- # Transcribe
365
- print(f">>> Transkribiere mit Whisper-{WHISPER_MODEL}...")
366
- call_kwargs = {}
367
-
368
- if lang:
369
- call_kwargs["generate_kwargs"] = {
370
- "language": lang,
371
- "task": "transcribe",
372
- "max_new_tokens": 120,
373
- "temperature": 0.0,
374
- }
375
-
376
- result = asr({"array": data, "sampling_rate": sr}, **call_kwargs)
377
-
378
- text = result.get("text", "") if isinstance(result, dict) else str(result)
379
- text = text.strip()
380
-
381
- # Sửa lỗi domain terms
382
- text = fix_domain_terms(text)
383
-
384
- print(f">>> Transkription: {text}")
385
- return text
386
-
387
- except Exception as e:
388
- print(f">>> Transkriptionsfehler: {e}")
389
  return ""
390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  # ========================================================
392
  # TEXT-TO-SPEECH (TTS)
393
  # ========================================================
394
- def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
395
- """
396
- Chuyển text sang speech bằng OpenAI TTS
397
- """
398
- if not text or not text.strip() or not TTS_ENABLED or not OPENAI_API_KEY:
399
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  try:
401
- from openai import OpenAI
402
- client = OpenAI(api_key=OPENAI_API_KEY)
403
- response = client.audio.speech.create(
404
- model="tts-1",
405
- voice="nova",
406
- input=text[:4000],
407
- response_format="wav"
408
- )
409
- import io
410
- audio_bytes = response.content
411
- with io.BytesIO(audio_bytes) as f:
412
- data, sr = sf.read(f)
413
- if len(data.shape) > 1:
414
- data = np.mean(data, axis=1)
415
- if data.dtype == np.float32 or data.dtype == np.float64:
416
- data = np.clip(data * 32767, -32768, 32767).astype(np.int16)
417
- return (sr, data)
418
- except Exception as e:
419
- print(f">>> TTS Fehler (OpenAI): {e}")
420
- return None
421
 
422
- # ========================================================
423
- # DOMAIN-SPECIFIC TEXT PROCESSING
424
- # ========================================================
425
- def fix_domain_terms(text: str) -> str:
426
- """
427
- Sửa lỗi các thuật ngữ chuyên ngành
428
- """
429
- if not text:
430
- return text
431
-
432
- # Common mis-transcriptions
433
- correction_pairs = [
434
- (r"\bbriefe\s*um\b", "prüfung"),
435
- (r"\bbrieft\s*um\b", "prüfung"),
436
- (r"\bbriefung\b", "prüfung"),
437
- (r"\bpruefung\b", "prüfung"),
438
- (r"\bhochschule\s*gesetz\b", "hochschulgesetz"),
439
- ]
440
-
441
- for pattern, replacement in correction_pairs:
442
- text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
443
-
444
- return text
445
 
446
- # ========================================================
447
- # MAIN EXPORT
448
- # ========================================================
449
- __all__ = [
450
- 'transcribe_audio',
451
- 'synthesize_speech',
452
- 'detect_voice_activity',
453
- 'normalize_audio',
454
- 'preprocess_audio_for_vad'
455
- ]
 
1
  """
2
+ speech_io.py
3
 
4
+ Sprachbasierte Ein-/Ausgabe:
5
+ - Speech-to-Text (STT) mit Whisper (transformers.pipeline)
6
+ - Text-to-Speech (TTS) mit MMS-TTS Deutsch
7
+
8
+ Dieses File ist 100% stabil für HuggingFace Spaces.
9
  """
10
 
11
+ from typing import Optional, Tuple
 
 
12
  import numpy as np
13
  import soundfile as sf
14
+ from scipy.signal import butter, filtfilt
15
+ from transformers import pipeline
16
+
17
+ # Modelle
18
+ ASR_MODEL_ID = "openai/whisper-small"
19
+ TTS_MODEL_ID = "facebook/mms-tts-deu"
20
+
21
+ _asr = None
22
+ _tts = None
23
 
24
  # ========================================================
25
+ # STT PIPELINE
26
  # ========================================================
 
 
 
 
27
 
28
+ def get_asr_pipeline():
29
+ global _asr
30
+ if _asr is None:
31
+ print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
32
+ _asr = pipeline(
33
+ task="automatic-speech-recognition",
34
+ model=ASR_MODEL_ID,
35
+ device="cpu",
36
+ return_timestamps=True, # wichtig
37
+ chunk_length_s=30 # auto-chunk für lange audio
38
+ )
39
+ return _asr
40
 
41
+ # ========================================================
42
+ # TTS PIPELINE
43
+ # ========================================================
 
44
 
45
+ def get_tts_pipeline():
46
+ global _tts
47
+ if _tts is None:
48
+ print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
49
+ _tts = pipeline(
50
+ task="text-to-speech",
51
+ model=TTS_MODEL_ID,
52
+ )
53
+ return _tts
54
 
55
  # ========================================================
56
+ # AUDIO FILTER – Noise Reduction + Highpass
57
  # ========================================================
58
+
59
  def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
 
 
 
 
60
  nyq = 0.5 * fs
61
+ norm_cutoff = cutoff / nyq
62
+ b, a = butter(order, norm_cutoff, btype="high")
63
  return filtfilt(b, a, data)
64
 
65
+ def apply_fade(audio, sr, duration_ms=10):
66
+ fade_samples = int(sr * duration_ms / 1000)
67
+
68
+ if fade_samples * 2 >= len(audio):
 
 
 
 
 
 
69
  return audio
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
+ fade_in_curve = np.linspace(0, 1, fade_samples)
72
+ audio[:fade_samples] *= fade_in_curve
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ fade_out_curve = np.linspace(1, 0, fade_samples)
75
+ audio[-fade_samples:] *= fade_out_curve
76
+
77
+ return audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # ========================================================
80
+ # SPEECH-TO-TEXT (STT)
81
  # ========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ def transcribe_audio(audio_path: str) -> str:
 
 
 
 
84
  """
85
+ audio_path: path zu WAV-Datei (von gr.Audio type="filepath")
86
  """
87
+
88
+ if audio_path is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  return ""
90
 
91
+ # WAV einlesen (soundfile garantiert PCM korrekt)
92
+ data, sr = sf.read(audio_path)
93
+
94
+ # immer Mono
95
+ if len(data.shape) > 1:
96
+ data = data[:, 0]
97
+
98
+ # Whisper >30s vermeiden
99
+ MAX_SAMPLES = sr * 30
100
+ if len(data) > MAX_SAMPLES:
101
+ data = data[:MAX_SAMPLES]
102
+
103
+ asr = get_asr_pipeline()
104
+
105
+ print(">>> Transkribiere Audio...")
106
+ result = asr(
107
+ {"array": data, "sampling_rate": sr},
108
+ )
109
+
110
+ text = result.get("text", "").strip()
111
+ print("ASR:", text)
112
+ return text
113
+
114
  # ========================================================
115
  # TEXT-TO-SPEECH (TTS)
116
  # ========================================================
117
+
118
+ def synthesize_speech(text: str):
119
+ if not text or not text.strip():
 
 
120
  return None
121
+
122
+ tts = get_tts_pipeline()
123
+ out = tts(text)
124
+
125
+ # rohes Audio from MMS (float32 [-1, 1])
126
+ audio = np.array(out["audio"], dtype=np.float32)
127
+ sr = out.get("sampling_rate", 16000)
128
+
129
+ # ===== FIX sample_rate =====
130
+ if sr is None or sr <= 0 or sr > 65535:
131
+ sr = 16000
132
+
133
+ # ===== Mono erzwingen =====
134
+ if audio.ndim > 1:
135
+ audio = audio.squeeze()
136
+ if audio.ndim > 1:
137
+ audio = audio[:, 0]
138
+
139
+ # ===== Noise reduction =====
140
  try:
141
+ audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
142
+ except:
143
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ # ===== Normalize =====
146
+ max_val = np.max(np.abs(audio))
147
+ if max_val > 0:
148
+ audio = audio / max_val
149
+
150
+ # ===== Fade gegen pop =====
151
+ audio = apply_fade(audio, sr)
152
+
153
+ # ===== int16 =====
154
+ audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
155
+
156
+ # Rückgabe: (sr, np.int16 array)
157
+ return (sr, audio_int16)
 
 
 
 
 
 
 
 
 
 
158