Nguyen5 committed on
Commit
ed0df67
·
1 Parent(s): e2535a9
Files changed (3) hide show
  1. app.py +332 -729
  2. realtime_server.py +145 -97
  3. speech_io.py +74 -422
app.py CHANGED
@@ -1,15 +1,12 @@
1
- # app.py – Prüfungsrechts-Chatbot với OpenAI Realtime API và Voice Agents
2
  import os
3
  import time
4
- import json
5
- import asyncio
6
- import threading
7
- from dataclasses import dataclass, field
8
- from typing import Optional, Dict, Any, List
9
  import gradio as gr
10
  from gradio_pdf import PDF
11
  import numpy as np
12
- import queue
13
 
14
  from openai import OpenAI
15
 
@@ -19,69 +16,20 @@ from vectorstore import build_vectorstore
19
  from retriever import get_retriever
20
  from llm import load_llm
21
  from rag_pipeline import answer
 
22
 
23
  # =====================================================
24
  # CONFIGURATION
25
  # =====================================================
26
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
27
  if not OPENAI_API_KEY:
28
- raise RuntimeError("OPENAI_API_KEY is required for Realtime API")
29
 
30
  # Initialize OpenAI client
31
  openai_client = OpenAI(api_key=OPENAI_API_KEY)
32
 
33
- # =====================================================
34
- # STATE MANAGEMENT
35
- # =====================================================
36
- @dataclass
37
- class ConversationState:
38
- """Quản lý trạng thái hội thoại"""
39
- messages: list = field(default_factory=list)
40
- is_streaming: bool = False
41
- realtime_session_id: Optional[str] = None
42
- audio_queue: queue.Queue = field(default_factory=queue.Queue)
43
- text_queue: queue.Queue = field(default_factory=queue.Queue)
44
- conversation_context: str = ""
45
-
46
- def add_message(self, role: str, content: str):
47
- """Thêm message vào hội thoại"""
48
- self.messages.append({
49
- "role": role,
50
- "content": content,
51
- "timestamp": time.time()
52
- })
53
- # Giới hạn lịch sử
54
- if len(self.messages) > 20:
55
- self.messages = self.messages[-20:]
56
-
57
- # Cập nhật context
58
- self._update_context()
59
-
60
- def _update_context(self):
61
- """Cập nhật context từ hội thoại"""
62
- if not self.messages:
63
- self.conversation_context = ""
64
- return
65
-
66
- context_parts = []
67
- for msg in self.messages[-5:]: # Giữ 5 message gần nhất
68
- prefix = "User" if msg["role"] == "user" else "Assistant"
69
- context_parts.append(f"{prefix}: {msg['content'][:200]}")
70
- self.conversation_context = "\n".join(context_parts)
71
-
72
- def reset(self):
73
- """Reset trạng thái hội thoại"""
74
- self.messages = []
75
- self.conversation_context = ""
76
- self.is_streaming = False
77
- self.realtime_session_id = None
78
- while not self.audio_queue.empty():
79
- self.audio_queue.get()
80
- while not self.text_queue.empty():
81
- self.text_queue.get()
82
-
83
- # Khởi tạo state
84
- state = ConversationState()
85
 
86
  # =====================================================
87
  # INITIALIZATION - RAG Components
@@ -107,396 +55,145 @@ hg_meta = next(d.metadata for d in docs if d.metadata.get("type") == "hg")
107
  hg_url = hg_meta.get("viewer_url")
108
 
109
  # =====================================================
110
- # OPENAI REALTIME API HANDLERS
111
  # =====================================================
112
- class RealtimeEventHandler:
113
- """Handler cho OpenAI Realtime API events"""
114
 
115
- def __init__(self, state_ref):
116
- self.state = state_ref
117
- self.current_text = ""
118
-
119
- def on_text_delta(self, delta, snapshot=None):
120
- """Xử lý text delta từ Realtime API"""
121
- if delta.value:
122
- self.current_text += delta.value
123
- # Thêm vào text queue để hiển thị
124
- self.state.text_queue.put({
125
- "type": "text_delta",
126
- "content": delta.value
127
- })
128
-
129
- def on_audio_transcript_delta(self, delta, snapshot=None):
130
- """Xử lý audio transcript từ Realtime API"""
131
- if delta.text:
132
- # Thêm vào text queue
133
- self.state.text_queue.put({
134
- "type": "transcript",
135
- "content": delta.text
136
- })
137
-
138
- def on_audio_delta(self, delta, snapshot=None):
139
- """Xử lý audio data từ Realtime API"""
140
- if delta.data:
141
- # Thêm vào audio queue để phát
142
- self.state.audio_queue.put({
143
- "type": "audio",
144
- "data": delta.data
145
- })
146
-
147
- def on_response_created(self, response=None):
148
- """Khi response được tạo"""
149
- print("DEBUG: Response created")
150
-
151
- def on_response_done(self, response=None):
152
- """Khi response hoàn thành"""
153
- print(f"DEBUG: Response done, final text: {self.current_text[:100]}...")
154
- if self.current_text:
155
- # Thêm message vào history
156
- self.state.add_message("assistant", self.current_text)
157
-
158
- # Signal end of response
159
- self.state.text_queue.put({
160
- "type": "response_end",
161
- "content": self.current_text
162
- })
163
 
164
- self.current_text = ""
165
- self.state.is_streaming = False
166
-
167
- def on_error(self, error):
168
- """Xử lý lỗi"""
169
- print(f"DEBUG: Realtime API error: {error}")
170
- self.state.is_streaming = False
171
- self.state.text_queue.put({
172
- "type": "error",
173
- "content": f"Error: {str(error)}"
174
  })
175
-
176
- def start_realtime_conversation():
177
- """Bắt đầu cuộc hội thoại Realtime API"""
178
- try:
179
- # Tạo Realtime session
180
- session = openai_client.realtime.sessions.create(
181
- model="gpt-4o-realtime-preview",
182
- voice="shimmer", # Có thể chọn: alloy, echo, fable, onyx, nova, shimmer
183
- modalities=["text", "audio"],
184
- instructions="""Du bist ein juristischer Assistent für Prüfungsrecht.
185
- Du hilfst Studenten mit Fragen zu Prüfungsordnung und Hochschulgesetz NRW.
186
- Antworte präzise, freundlich und professionell.
187
- Bei unsicheren Fragen, verweise auf die offiziellen Dokumente."""
188
- )
189
-
190
- state.realtime_session_id = session.id
191
- state.is_streaming = True
192
-
193
- print(f"DEBUG: Realtime session started: {session.id}")
194
-
195
- # Bắt đầu streaming
196
- with openai_client.realtime.connect(
197
- session_id=session.id,
198
- event_handler=RealtimeEventHandler(state)
199
- ) as connection:
200
- # Keep connection alive
201
- while state.is_streaming:
202
- time.sleep(0.1)
203
-
204
- except Exception as e:
205
- print(f"DEBUG: Error in realtime conversation: {e}")
206
- state.is_streaming = False
207
-
208
- def stop_realtime_conversation():
209
- """Dừng cuộc hội thoại Realtime"""
210
- state.is_streaming = False
211
- if state.realtime_session_id:
212
- try:
213
- openai_client.realtime.sessions.delete(state.realtime_session_id)
214
- except:
215
- pass
216
- state.realtime_session_id = None
217
-
218
- def send_text_to_realtime(text: str):
219
- """Gửi text đến Realtime API"""
220
- if not state.is_streaming or not state.realtime_session_id:
221
- return False
222
 
223
- try:
224
- # Tạo Realtime client mới để gửi message
225
- with openai_client.realtime.connect(session_id=state.realtime_session_id) as connection:
226
- connection.send({
227
- "type": "response.create",
228
- "response": {
229
- "modalities": ["text", "audio"],
230
- "instructions": f"Antworte auf: {text}"
231
- }
232
- })
233
- return True
234
- except Exception as e:
235
- print(f"DEBUG: Error sending to realtime: {e}")
236
- return False
 
 
 
 
 
 
237
 
238
  # =====================================================
239
- # TOOLS (FUNCTION CALLS) FOR VOICE AGENT
240
  # =====================================================
241
- def search_documents_tool(query: str) -> Dict[str, Any]:
242
- """Tool để tìm kiếm tài liệu"""
 
 
 
 
 
243
  try:
244
- # Sử dụng retriever để tìm tài liệu liên quan
245
- docs = retriever.invoke(query)
246
 
247
- if not docs:
248
- return {
249
- "success": False,
250
- "message": "Keine relevanten Dokumente gefunden.",
251
- "documents": []
252
- }
253
 
254
- # Format kết quả
255
- results = []
256
- for i, doc in enumerate(docs[:3], 1):
257
- meta = doc.metadata
258
- source_type = meta.get("type", "unknown")
259
-
260
- if source_type == "pdf":
261
- source_info = {
262
- "type": "Prüfungsordnung",
263
- "page": meta.get("page"),
264
- "url": meta.get("pdf_url")
265
- }
266
- elif source_type == "hg":
267
- source_info = {
268
- "type": "Hochschulgesetz NRW",
269
- "paragraph": meta.get("title"),
270
- "url": meta.get("viewer_url")
271
- }
272
- else:
273
- source_info = {"type": "unknown"}
274
-
275
- results.append({
276
- "id": i,
277
- "content": doc.page_content[:500] + "...",
278
- "source": source_info
279
- })
280
 
281
- return {
282
- "success": True,
283
- "message": f"{len(results)} Dokumente gefunden",
284
- "documents": results
285
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  except Exception as e:
288
- return {
289
- "success": False,
290
- "message": f"Fehler bei der Suche: {str(e)}",
291
- "documents": []
292
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
- def get_legal_advice_tool(question: str) -> Dict[str, Any]:
295
- """Tool để nhận tư vấn pháp từ RAG"""
 
 
 
 
 
 
296
  try:
297
- # Sử dụng RAG pipeline
298
- ans, sources = answer(question, retriever, llm)
 
299
 
300
- # Format sources
301
- formatted_sources = []
302
- for src in sources:
303
- formatted_sources.append({
304
- "source": src["source"],
305
- "page": src.get("page"),
306
- "url": src["url"]
307
- })
308
 
309
- return {
310
- "success": True,
311
- "answer": ans,
312
- "sources": formatted_sources,
313
- "has_relevant_info": len(sources) > 0
314
- }
315
 
316
  except Exception as e:
317
- return {
318
- "success": False,
319
- "answer": f"Fehler: {str(e)}",
320
- "sources": [],
321
- "has_relevant_info": False
322
- }
323
-
324
- # =====================================================
325
- # VOICE AGENT WITH TOOLS
326
- # =====================================================
327
- class VoiceAgent:
328
- """Voice Agent sử dụng OpenAI Realtime API với Tools"""
329
 
330
- def __init__(self, openai_client):
331
- self.client = openai_client
332
- self.session_id = None
333
- self.is_active = False
334
-
335
- def start_session(self):
336
- """Bắt đầu session với tools"""
337
- try:
338
- # Tạo session với tools definition
339
- session = self.client.realtime.sessions.create(
340
- model="gpt-4o-realtime-preview-2024-12-17",
341
- voice="shimmer",
342
- modalities=["text", "audio"],
343
- instructions="""Du bist ein juristischer Voice Agent.
344
- Du kannst:
345
- 1. Dokumente durchsuchen (search_documents)
346
- 2. Rechtliche Beratung geben (get_legal_advice)
347
-
348
- Sei präzise, freundlich und hilfreich.
349
- Verweise immer auf die Quellen.""",
350
- tools=[
351
- {
352
- "type": "function",
353
- "name": "search_documents",
354
- "description": "Durchsucht die Prüfungsordnung und das Hochschulgesetz nach relevanten Informationen",
355
- "parameters": {
356
- "type": "object",
357
- "properties": {
358
- "query": {
359
- "type": "string",
360
- "description": "Suchbegriff oder Frage"
361
- }
362
- },
363
- "required": ["query"]
364
- }
365
- },
366
- {
367
- "type": "function",
368
- "name": "get_legal_advice",
369
- "description": "Gibt juristische Beratung basierend auf den Dokumenten",
370
- "parameters": {
371
- "type": "object",
372
- "properties": {
373
- "question": {
374
- "type": "string",
375
- "description": "Juristische Frage"
376
- }
377
- },
378
- "required": ["question"]
379
- }
380
- }
381
- ],
382
- tool_choice="auto"
383
- )
384
-
385
- self.session_id = session.id
386
- self.is_active = True
387
-
388
- # Start event handling thread
389
- threading.Thread(target=self._handle_events, daemon=True).start()
390
-
391
- return True
392
-
393
- except Exception as e:
394
- print(f"DEBUG: Error starting voice agent: {e}")
395
- return False
396
-
397
- def _handle_events(self):
398
- """Xử lý events từ Realtime API"""
399
- try:
400
- with self.client.realtime.connect(
401
- session_id=self.session_id,
402
- event_handler=VoiceAgentEventHandler(self)
403
- ) as connection:
404
-
405
- while self.is_active:
406
- time.sleep(0.1)
407
-
408
- except Exception as e:
409
- print(f"DEBUG: Error in event handler: {e}")
410
- self.is_active = False
411
-
412
- def stop_session(self):
413
- """Dừng session"""
414
- self.is_active = False
415
- if self.session_id:
416
- try:
417
- self.client.realtime.sessions.delete(self.session_id)
418
- except:
419
- pass
420
- self.session_id = None
421
-
422
- def process_tool_call(self, tool_name: str, arguments: Dict) -> Dict:
423
- """Xử lý tool calls"""
424
- try:
425
- if tool_name == "search_documents":
426
- query = arguments.get("query", "")
427
- return search_documents_tool(query)
428
-
429
- elif tool_name == "get_legal_advice":
430
- question = arguments.get("question", "")
431
- return get_legal_advice_tool(question)
432
-
433
- else:
434
- return {
435
- "success": False,
436
- "message": f"Unbekanntes Tool: {tool_name}"
437
- }
438
-
439
- except Exception as e:
440
- return {
441
- "success": False,
442
- "message": f"Tool Fehler: {str(e)}"
443
- }
444
 
445
- class VoiceAgentEventHandler:
446
- """Event handler cho Voice Agent"""
447
-
448
- def __init__(self, agent):
449
- self.agent = agent
450
- self.current_text = ""
451
-
452
- def on_text_delta(self, delta, snapshot=None):
453
- """Xử lý text delta"""
454
- if delta.value:
455
- self.current_text += delta.value
456
- # Thêm vào state text queue
457
- state.text_queue.put({
458
- "type": "agent_text",
459
- "content": delta.value
460
- })
461
-
462
- def on_function_call_arguments_delta(self, delta, snapshot=None):
463
- """Xử lý function call arguments"""
464
- print(f"DEBUG: Function call arguments: {delta}")
465
-
466
- def on_function_call_done(self, function_call, snapshot=None):
467
- """Khi function call hoàn thành"""
468
- try:
469
- tool_name = function_call.name
470
- arguments = json.loads(function_call.arguments)
471
-
472
- print(f"DEBUG: Processing tool call: {tool_name}, args: {arguments}")
473
-
474
- # Process tool call
475
- result = self.agent.process_tool_call(tool_name, arguments)
476
-
477
- # Gửi kết quả trở lại
478
- with openai_client.realtime.connect(session_id=self.agent.session_id) as conn:
479
- conn.send({
480
- "type": "response.function_call_arguments",
481
- "function_call_id": function_call.id,
482
- "output": json.dumps(result)
483
- })
484
-
485
- except Exception as e:
486
- print(f"DEBUG: Error processing function call: {e}")
487
-
488
- def on_response_done(self, response=None):
489
- """Khi response hoàn thành"""
490
- if self.current_text:
491
- state.add_message("assistant", self.current_text)
492
- self.current_text = ""
493
-
494
- # Khởi tạo Voice Agent
495
- voice_agent = VoiceAgent(openai_client)
496
-
497
- # =====================================================
498
- # GRADIO UI COMPONENTS
499
- # =====================================================
500
  def format_sources(src):
501
  """Format sources cho display"""
502
  if not src:
@@ -512,386 +209,292 @@ def format_sources(src):
512
 
513
  return "\n".join(out)
514
 
515
- def update_chat_display(history, new_text=""):
516
- """Cập nhật chat display với streaming text"""
 
 
 
 
 
517
  if not history:
518
- history = []
519
-
520
- if new_text:
521
- # Nếu last message là của assistant, append text
522
- if history and history[-1]["role"] == "assistant":
523
- history[-1]["content"] += new_text
524
- else:
525
- history.append({"role": "assistant", "content": new_text})
 
 
 
 
 
 
 
526
 
527
- return history
528
-
529
- def process_queue_updates():
530
- """Process queue updates cho streaming"""
531
- updates = []
532
-
533
- # Process text queue
534
- while not state.text_queue.empty():
535
- try:
536
- item = state.text_queue.get_nowait()
537
- updates.append(("text", item.get("content", "")))
538
- except queue.Empty:
539
- break
540
-
541
- # Process audio queue (simplified - trong thực tế cần xử lý audio)
542
- while not state.audio_queue.empty():
543
- try:
544
- item = state.audio_queue.get_nowait()
545
- # Có thể xử lý audio data ở đây
546
- pass
547
- except queue.Empty:
548
- break
549
-
550
- return updates
551
 
552
  # =====================================================
553
- # UI – GRADIO INTERFACE
554
  # =====================================================
555
- with gr.Blocks(title="Prüfungsrechts-Chatbot mit OpenAI Realtime API") as demo:
556
- # CSS Styling
 
 
 
 
557
  gr.HTML("""
558
  <style>
559
  .gradio-container {
560
- max-width: 1200px;
561
  margin: 0 auto;
562
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
 
563
  }
564
 
565
  .header {
566
  text-align: center;
567
  margin-bottom: 30px;
568
- padding: 25px;
569
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
570
- border-radius: 15px;
571
  color: white;
572
  }
573
 
574
- .control-panel {
575
  background: #f8f9fa;
576
- padding: 20px;
577
- border-radius: 15px;
578
- margin-bottom: 20px;
579
- border: 1px solid #e2e8f0;
580
- }
581
-
582
- .status-indicator {
583
- padding: 10px 15px;
584
  border-radius: 10px;
585
- font-weight: 500;
586
- display: inline-flex;
587
  align-items: center;
588
- gap: 8px;
 
589
  }
590
 
591
- .status-active {
592
- background: #d1fae5;
593
- color: #065f46;
 
 
 
594
  }
595
 
596
- .status-inactive {
597
- background: #f3f4f6;
598
- color: #6b7280;
 
 
 
599
  }
600
 
601
- .voice-btn {
602
- padding: 12px 24px;
603
- border-radius: 25px;
604
- font-weight: 600;
605
- transition: all 0.3s;
606
- border: none;
607
  }
608
 
609
- .voice-btn-start {
610
- background: linear-gradient(135deg, #10b981 0%, #059669 100%);
611
- color: white;
 
 
612
  }
613
 
614
- .voice-btn-stop {
615
- background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
 
 
616
  color: white;
 
 
 
617
  }
618
 
619
- .chat-message {
620
- padding: 15px;
621
- border-radius: 15px;
622
- margin: 10px 0;
623
- max-width: 85%;
624
  }
625
 
626
- .user-message {
627
- background: #e0e7ff;
628
- margin-left: auto;
 
 
 
 
 
629
  }
630
 
631
- .assistant-message {
632
- background: #f3f4f6;
633
- margin-right: auto;
634
  }
635
  </style>
636
  """)
637
 
638
- # Header
639
  with gr.Column(elem_classes=["header"]):
640
- gr.Markdown("# 🎤 Voice Agent mit OpenAI Realtime API")
641
- gr.Markdown("### Live Sprachkonversation mit juristischer Beratung")
642
 
643
- # Control Panel
644
- with gr.Column(elem_classes=["control-panel"]):
645
- with gr.Row():
646
- # Status Display
647
- status_display = gr.HTML(
648
- value='<div class="status-indicator status-inactive">🔴 Voice Agent inaktiv</div>',
649
- label="Status"
650
- )
651
-
652
- # Voice Controls
653
- with gr.Column(scale=1):
654
- start_voice_btn = gr.Button(
655
- "🎤 Start Voice Conversation",
656
- variant="primary",
657
- elem_classes=["voice-btn", "voice-btn-start"]
658
- )
659
- stop_voice_btn = gr.Button(
660
- "⏹️ Stop Voice Conversation",
661
- variant="secondary",
662
- elem_classes=["voice-btn", "voice-btn-stop"],
663
- visible=False
664
- )
665
-
666
- # Mode Selection
667
  with gr.Row():
668
  mode_selector = gr.Radio(
669
- choices=["Voice Agent (Live Conversation)", "Text Chat (RAG)"],
670
- value="Text Chat (RAG)",
671
- label="Modus"
 
 
 
 
 
 
 
 
672
  )
 
673
 
674
  # Main Chat Interface
675
- with gr.Column():
676
- # Chatbot Display
677
- chatbot = gr.Chatbot(
678
- label="Konversation",
679
- height=500,
680
- avatar_images=(None, "🤖")
 
 
681
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
 
683
- # Input Area (cho Text Mode)
684
- with gr.Row(visible=True) as text_input_row:
685
- chat_input = gr.Textbox(
686
- label="Ihre Frage",
687
- placeholder="Stellen Sie eine juristische Frage...",
688
- lines=2,
689
- max_lines=4,
690
- scale=8
691
- )
692
- send_btn = gr.Button("Senden", variant="primary", scale=1)
693
-
694
- # Voice Interface (cho Voice Mode)
695
- with gr.Row(visible=False) as voice_interface:
696
- gr.Markdown("### 🎤 Sprechen Sie jetzt...")
697
- voice_status = gr.Textbox(
698
- label="Status",
699
- value="Bereit für Sprachaufnahme",
700
- interactive=False
701
- )
702
- voice_output = gr.Textbox(label="Transkription", interactive=False)
703
- chat_audio = gr.Audio(
704
- sources=["microphone"],
705
- type="numpy",
706
- streaming=True,
707
- show_label=False,
708
- interactive=True
709
- )
710
 
711
- # Documents Section
712
- with gr.Accordion("📚 Dokumente & Quellen", open=False):
713
  with gr.Tabs():
714
  with gr.TabItem("📄 Prüfungsordnung"):
715
- PDF(pdf_meta["pdf_url"], height=400)
716
 
717
  with gr.TabItem("📘 Hochschulgesetz NRW"):
718
  if hg_url:
719
  gr.HTML(f'''
720
- <div style="padding: 20px;">
721
- <h3>Hochschulgesetz NRW Viewer</h3>
722
- <a href="{hg_url}" target="_blank" style="display: inline-block; padding: 10px 20px; background: #3b82f6; color: white; text-decoration: none; border-radius: 5px; margin-bottom: 15px;">
723
- Im Viewer öffnen
724
  </a>
725
- <iframe src="{hg_url}" width="100%" height="500px" style="border: 1px solid #ddd; border-radius: 8px;"></iframe>
726
  </div>
727
  ''')
728
-
 
729
 
730
  # =====================================================
731
  # EVENT HANDLERS
732
  # =====================================================
733
 
734
- def toggle_mode(mode):
735
- """Chuyển đổi giữa Voice và Text mode"""
736
- if "Voice Agent" in mode:
737
- return (
738
- gr.Row(visible=False), # text_input_row
739
- gr.Row(visible=True), # voice_interface
740
- '<div class="status-indicator status-inactive">🔴 Bitte Voice Agent starten</div>'
741
- )
742
- else:
743
- stop_voice_agent()
744
- return (
745
- gr.Row(visible=True), # text_input_row
746
- gr.Row(visible=False), # voice_interface
747
- '<div class="status-indicator status-inactive">🔴 Text Mode aktiv</div>'
748
- )
749
-
750
- def start_voice_agent():
751
- """Bắt đầu Voice Agent"""
752
- state.is_streaming = True
753
- return (
754
- gr.Button(visible=False), # start_voice_btn
755
- gr.Button(visible=True), # stop_voice_btn
756
- '<div class="status-indicator status-active">🟢 Voice Agent aktiv - Sprechen Sie jetzt</div>',
757
- "Voice Agent gestartet. Sie können jetzt sprechen..."
758
- )
759
-
760
- def stop_voice_agent():
761
- """Dừng Voice Agent"""
762
- state.is_streaming = False
763
- state.reset()
764
- return (
765
- gr.Button(visible=True), # start_voice_btn
766
- gr.Button(visible=False), # stop_voice_btn
767
- '<div class="status-indicator status-inactive">🔴 Voice Agent gestoppt</div>',
768
- "Voice Agent gestoppt"
769
- )
770
-
771
- def process_text_chat(message, history):
772
- """Xử lý text chat với RAG"""
773
- if not message:
774
- return history, ""
775
-
776
- # Thêm user message
777
- history.append({"role": "user", "content": message})
778
-
779
- try:
780
- # Get RAG answer
781
- ans, sources = answer(message, retriever, llm)
782
- full_response = ans + format_sources(sources)
783
-
784
- # Add assistant message
785
- history.append({"role": "assistant", "content": full_response})
786
-
787
- # Add to state
788
- state.add_message("user", message)
789
- state.add_message("assistant", ans)
790
-
791
- except Exception as e:
792
- error_msg = f"Fehler: {str(e)[:100]}"
793
- history.append({"role": "assistant", "content": error_msg})
794
-
795
- return history, ""
796
-
797
- def process_voice_chunk(audio_in, history):
798
- import tempfile
799
- import soundfile as sf
800
- if audio_in is None:
801
- return history, "", "Keine Datei erhalten"
802
- temp_path = None
803
- if isinstance(audio_in, str):
804
- temp_path = audio_in
805
- else:
806
- try:
807
- sr, data = audio_in
808
- import numpy as np
809
- dur = 0.0 if sr is None else (len(data) / float(sr))
810
- rms = float(np.sqrt(np.mean((data.astype('float32')) ** 2))) if len(data) else 0.0
811
- peak = float(np.max(np.abs(data))) if len(data) else 0.0
812
- temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
813
- sf.write(temp_path, data.astype('float32'), int(sr))
814
- status_pref = f"Aufnahme {dur:.2f}s · RMS {rms:.5f} · Peak {peak:.5f}"
815
- except Exception:
816
- temp_path = None
817
- status_pref = "Aufnahme fehlgeschlagen"
818
- try:
819
- text = transcribe_audio_optimized(temp_path, language=ASR_LANGUAGE_HINT) if temp_path else ""
820
- except Exception:
821
- text = ""
822
- if not text:
823
- return history, "", (status_pref + " · Keine Sprache erkannt" if 'status_pref' in locals() else "Keine Sprache erkannt")
824
- history.append({"role": "user", "content": text})
825
- try:
826
- ans, sources = answer(text, retriever, llm)
827
- full_response = ans + format_sources(sources)
828
- history.append({"role": "assistant", "content": full_response})
829
- state.add_message("user", text)
830
- state.add_message("assistant", ans)
831
- status = (status_pref + " · Gesendet" if 'status_pref' in locals() else "Transkription und Antwort gesendet")
832
- except Exception as e:
833
- history.append({"role": "assistant", "content": f"Fehler: {str(e)[:100]}"})
834
- status = "Fehler bei Verarbeitung"
835
- return history, text, status
836
-
837
- def update_streaming_display(history):
838
- """Cập nhật display với streaming text"""
839
- updates = process_queue_updates()
840
-
841
- if not updates:
842
- return history
843
-
844
- for update_type, content in updates:
845
- if update_type == "text" and content:
846
- history = update_chat_display(history, content)
847
-
848
- return history
849
-
850
  # Mode toggle
851
  mode_selector.change(
852
- toggle_mode,
853
- inputs=[mode_selector],
854
- outputs=[text_input_row, voice_interface, status_display]
 
 
 
 
 
855
  )
856
 
857
- # Voice Agent controls
858
- start_voice_btn.click(
859
- start_voice_agent,
860
- outputs=[start_voice_btn, stop_voice_btn, status_display, voice_status]
 
861
  )
862
 
863
- stop_voice_btn.click(
864
- stop_voice_agent,
865
- outputs=[start_voice_btn, stop_voice_btn, status_display, voice_status]
866
- )
867
-
868
- # Voice streaming: chunk → transcript → chat
869
- chat_audio.stream(
870
- process_voice_chunk,
871
- inputs=[chat_audio, chatbot],
872
- outputs=[chatbot, voice_output, voice_status]
873
- )
874
- chat_audio.change(
875
- process_voice_chunk,
876
- inputs=[chat_audio, chatbot],
877
- outputs=[chatbot, voice_output, voice_status]
878
  )
879
 
880
- # Text chat
881
- send_btn.click(
882
- process_text_chat,
883
- inputs=[chat_input, chatbot],
884
- outputs=[chatbot, chat_input]
 
 
 
 
 
 
 
885
  )
886
 
887
- chat_input.submit(
888
- process_text_chat,
889
- inputs=[chat_input, chatbot],
890
- outputs=[chatbot, chat_input]
891
  )
892
 
893
- # Streaming updates
894
- # Streaming updates disabled (no Timer available in current Gradio)
 
 
 
 
 
 
 
 
 
 
895
 
896
  if __name__ == "__main__":
897
  demo.queue().launch(ssr_mode=False, show_error=True)
 
1
+ # app.py – Prüfungsrechts-Chatbot (Đơn giản như ChatGPT)
2
  import os
3
  import time
4
+ import tempfile
5
+ from typing import Optional, Dict, Any
 
 
 
6
  import gradio as gr
7
  from gradio_pdf import PDF
8
  import numpy as np
9
+ import soundfile as sf
10
 
11
  from openai import OpenAI
12
 
 
16
  from retriever import get_retriever
17
  from llm import load_llm
18
  from rag_pipeline import answer
19
+ from speech_io import transcribe_with_openai, synthesize_speech
20
 
21
  # =====================================================
22
  # CONFIGURATION
23
  # =====================================================
24
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
25
  if not OPENAI_API_KEY:
26
+ raise RuntimeError("OPENAI_API_KEY is required")
27
 
28
  # Initialize OpenAI client
29
  openai_client = OpenAI(api_key=OPENAI_API_KEY)
30
 
31
+ # Language configuration
32
+ ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # =====================================================
35
  # INITIALIZATION - RAG Components
 
55
  hg_url = hg_meta.get("viewer_url")
56
 
57
  # =====================================================
58
+ # STATE MANAGEMENT
59
  # =====================================================
60
class ConversationState:
    """Minimal in-memory conversation state shared by the UI callbacks.

    Attributes:
        messages: List of dicts with the keys ``"role"``, ``"content"`` and
            ``"timestamp"`` (seconds since the epoch, from ``time.time()``).
        current_mode: Either ``"text"`` or ``"audio"``.
        is_audio_recording: Flag toggled together with the audio mode.
        max_messages: Upper bound on the number of stored messages.
    """

    # Historical cap on stored messages; kept as the default for callers
    # that construct the state without arguments.
    DEFAULT_MAX_MESSAGES = 20

    def __init__(self, max_messages: int = DEFAULT_MAX_MESSAGES):
        """Create an empty conversation in text mode.

        Args:
            max_messages: Maximum number of messages to retain; older
                messages are dropped first. Must be at least 1.

        Raises:
            ValueError: If ``max_messages`` is smaller than 1.
        """
        if max_messages < 1:
            raise ValueError("max_messages must be at least 1")
        self.max_messages = max_messages
        self.messages = []
        self.current_mode = "text"  # "text" or "audio"
        self.is_audio_recording = False

    def add_message(self, role: str, content: str):
        """Append a message and trim the history to ``max_messages``."""
        self.messages.append({
            "role": role,
            "content": content,
            "timestamp": time.time(),
        })
        # Bound the history; only the most recent messages are kept.
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]

    def get_chat_history(self):
        """Return the messages as ``[user_text, assistant_text]`` pairs.

        A user message opens a new pair; the following assistant message
        fills the open pair. An assistant message without an open pair
        (for example after trimming) becomes ``[None, assistant_text]``.
        Messages with any other role are ignored.
        """
        history = []
        for msg in self.messages:
            if msg["role"] == "user":
                history.append([msg["content"], None])
            elif msg["role"] == "assistant":
                if history and history[-1][1] is None:
                    history[-1][1] = msg["content"]
                else:
                    history.append([None, msg["content"]])
        return history

    def reset(self):
        """Drop all messages and clear the recording flag.

        ``current_mode`` is intentionally left untouched, matching the
        previous behaviour.
        """
        self.messages = []
        self.is_audio_recording = False
96
+
97
+ # Khởi tạo state
98
+ state = ConversationState()
99
 
100
  # =====================================================
101
+ # AUDIO PROCESSING FUNCTIONS
102
  # =====================================================
103
def _transcribe_audio_array(sample_rate, audio_array):
    """Write the samples to a temporary WAV file, transcribe it, then delete it.

    The temporary file is removed even when writing or transcription fails.
    """
    # Only reserve a path here and close the handle before writing: per the
    # tempfile documentation, a file that is still open cannot reliably be
    # opened a second time by name on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        temp_path = tmp.name
    try:
        sf.write(temp_path, audio_array, int(sample_rate))
        print("DEBUG: Audio saved to temp file, transcribing...")
        return transcribe_with_openai(temp_path, language=ASR_LANGUAGE_HINT)
    finally:
        try:
            os.unlink(temp_path)
        except OSError:
            # Best-effort cleanup; a failed delete must not mask the result
            # or the original exception.
            pass


def process_audio_input(audio_data: Optional[tuple], history) -> tuple:
    """Transcribe a microphone recording and answer it through the RAG pipeline.

    Args:
        audio_data: ``(sample_rate, samples)`` tuple, or ``None`` when no
            recording is available yet.
        history: Chat history as a list of ``[user_text, assistant_text]``
            pairs. ``None`` is treated as an empty history once a
            transcription is available.

    Returns:
        A ``(history, transcribed_text, status_message)`` tuple. When nothing
        was recorded, nothing was recognised, or an error occurred, the
        incoming ``history`` object is returned unchanged and the
        transcription is ``""``.
    """
    if audio_data is None:
        return history, "", "Warten auf Audioaufnahme..."

    try:
        sample_rate, audio_array = audio_data
        transcribed_text = _transcribe_audio_array(sample_rate, audio_array)

        if not transcribed_text or not transcribed_text.strip():
            return history, "", "Keine Sprache erkannt. Bitte versuchen Sie es erneut."

        print(f"DEBUG: Transcribed text: {transcribed_text}")

        # Build a new list so the caller's history is left untouched if the
        # RAG call below fails.
        new_history = (history or []) + [[transcribed_text, None]]

        ans, sources = answer(transcribed_text, retriever, llm)
        new_history[-1][1] = ans + format_sources(sources)

        # The shared state stores the bare answer, without the sources block.
        state.add_message("user", transcribed_text)
        state.add_message("assistant", ans)

        return new_history, transcribed_text, "Antwort generiert ✓"

    except Exception as e:
        # UI callback boundary: report the failure as a status text instead
        # of propagating the exception.
        print(f"DEBUG: Error processing audio: {e}")
        return history, "", f"Fehler: {str(e)[:50]}"
152
+
153
def toggle_audio_mode(mode_choice: str, history):
    """Switch the shared state and the input widgets between text and audio.

    Args:
        mode_choice: Label selected in the mode radio; the audio mode is
            "Audio (Sprachmodus)", the text mode is "Text (Schreibmodus)".
        history: Unused; accepted so the callback signature stays unchanged.

    Returns:
        A 4-tuple of ``gr.Audio``, ``gr.Textbox`` and ``gr.Button`` objects
        carrying the new visibility, followed by the status text.
    """
    audio_selected = mode_choice == "Audio (Sprachmodus)"
    text_selected = mode_choice == "Text (Schreibmodus)"

    # Any label other than the audio one falls back to text mode in the state.
    state.current_mode = "audio" if audio_selected else "text"
    state.is_audio_recording = audio_selected

    if audio_selected:
        mode_text = "🎤 Sprachmodus aktiv - Klicken und Sprechen"
    else:
        mode_text = "⌨️ Textmodus aktiv"

    audio_widget = gr.Audio(visible=audio_selected)
    text_widget = gr.Textbox(visible=text_selected)
    send_widget = gr.Button(visible=text_selected)
    return audio_widget, text_widget, send_widget, mode_text
170
 
171
def process_text_input(message: str, history):
    """Answer a typed question through the RAG pipeline.

    Args:
        message: Text entered by the user.
        history: Chat history as a list of ``[user_text, assistant_text]``
            pairs. ``None`` is treated as an empty history when a message
            is processed.

    Returns:
        A ``(history, "")`` tuple; the empty string clears the input box.
        For an empty or whitespace-only message the incoming ``history``
        is returned unchanged.
    """
    if not message or not message.strip():
        return history, ""

    # Build a new list (never mutate the caller's history) and tolerate a
    # missing history instead of raising TypeError on ``None + list``.
    new_history = (history or []) + [[message, None]]

    try:
        ans, sources = answer(message, retriever, llm)
        new_history[-1][1] = ans + format_sources(sources)

        # The shared state stores the bare answer, without the sources block.
        state.add_message("user", message)
        state.add_message("assistant", ans)

    except Exception as e:
        # UI callback boundary: show the failure in the chat instead of
        # propagating the exception.
        error_msg = f"Entschuldigung, es gab einen Fehler: {str(e)[:100]}"
        new_history[-1][1] = error_msg

    return new_history, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  def format_sources(src):
198
  """Format sources cho display"""
199
  if not src:
 
209
 
210
  return "\n".join(out)
211
 
212
def clear_conversation():
    """Reset the shared conversation state and empty the chat window.

    Returns:
        A pair (empty chat history, German status message).
    """
    state.reset()
    cleared_history = []
    status_message = "Konversation gelöscht"
    return cleared_history, status_message
216
+
217
def speak_last_response(history):
    """Synthesize speech for the most recent assistant answer.

    Args:
        history: Chat history as a list of [user, assistant] pairs.

    Returns:
        A pair (audio, status) where audio is (sample_rate, samples) on
        success and None otherwise; status is a German status message.
    """
    if not history:
        return None, "Keine Antwort zum Vorlesen"

    # Newest turn that actually has an assistant answer.
    last_answer = next((turn[1] for turn in reversed(history) if turn[1]), None)
    if last_answer is None:
        return None, "Keine passende Antwort gefunden"

    # Do not read the appended sources section aloud.
    head, marker, _ = last_answer.partition("## 📚 Quellen")
    spoken_text = head.strip() if marker else last_answer

    # Only the latest answer is attempted: falling back to older answers
    # would read the wrong text and trigger one TTS call per past turn.
    audio_result = synthesize_speech(spoken_text[:500])  # cap the spoken length
    if not audio_result:
        return None, "Keine passende Antwort gefunden"

    sample_rate, audio_data = audio_result
    return (sample_rate, audio_data), "Audio wird abgespielt..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  # =====================================================
239
+ # UI – GRADIO INTERFACE (Đơn giản như ChatGPT)
240
  # =====================================================
241
+ with gr.Blocks(
242
+ title="🧑‍⚖️ Prüfungsrechts-Chatbot",
243
+ theme=gr.themes.Soft()
244
+ ) as demo:
245
+
246
+ # CSS Styling đơn giản
247
  gr.HTML("""
248
  <style>
249
  .gradio-container {
250
+ max-width: 900px;
251
  margin: 0 auto;
252
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
253
+ padding: 20px;
254
  }
255
 
256
  .header {
257
  text-align: center;
258
  margin-bottom: 30px;
259
+ padding: 20px;
260
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
261
+ border-radius: 12px;
262
  color: white;
263
  }
264
 
265
+ .mode-selector {
266
  background: #f8f9fa;
267
+ padding: 15px;
 
 
 
 
 
 
 
268
  border-radius: 10px;
269
+ margin-bottom: 20px;
270
+ display: flex;
271
  align-items: center;
272
+ gap: 15px;
273
+ border: 1px solid #e2e8f0;
274
  }
275
 
276
+ .mode-indicator {
277
+ padding: 8px 16px;
278
+ border-radius: 20px;
279
+ font-weight: 600;
280
+ background: #e0e7ff;
281
+ color: #4f46e5;
282
  }
283
 
284
+ .input-area {
285
+ background: white;
286
+ border-radius: 12px;
287
+ padding: 15px;
288
+ border: 2px solid #e2e8f0;
289
+ margin-top: 20px;
290
  }
291
 
292
+ .input-row {
293
+ display: flex;
294
+ gap: 10px;
295
+ align-items: center;
 
 
296
  }
297
 
298
+ .audio-visualizer {
299
+ padding: 10px;
300
+ text-align: center;
301
+ color: #666;
302
+ font-style: italic;
303
  }
304
 
305
+ .tts-btn {
306
+ margin-top: 10px;
307
+ padding: 8px 16px;
308
+ background: #10b981;
309
  color: white;
310
+ border: none;
311
+ border-radius: 8px;
312
+ cursor: pointer;
313
  }
314
 
315
+ .tts-btn:hover {
316
+ background: #059669;
 
 
 
317
  }
318
 
319
+ .clear-btn {
320
+ background: #ef4444;
321
+ color: white;
322
+ border: none;
323
+ border-radius: 8px;
324
+ padding: 8px 16px;
325
+ cursor: pointer;
326
+ margin-left: 10px;
327
  }
328
 
329
+ .clear-btn:hover {
330
+ background: #dc2626;
 
331
  }
332
  </style>
333
  """)
334
 
335
+ # Header đơn giản
336
  with gr.Column(elem_classes=["header"]):
337
+ gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
338
+ gr.Markdown("### Stellen Sie Fragen zu Prüfungsordnung und Hochschulgesetz NRW")
339
 
340
+ # Mode Selector
341
+ with gr.Column(elem_classes=["mode-selector"]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  with gr.Row():
343
  mode_selector = gr.Radio(
344
+ choices=["Text (Schreibmodus)", "Audio (Sprachmodus)"],
345
+ value="Text (Schreibmodus)",
346
+ label="",
347
+ scale=3,
348
+ elem_id="mode-selector"
349
+ )
350
+ mode_indicator = gr.Textbox(
351
+ value="⌨️ Textmodus aktiv",
352
+ label="Status",
353
+ interactive=False,
354
+ scale=2
355
  )
356
+ clear_btn = gr.Button("🗑️ Löschen", elem_classes=["clear-btn"], scale=1)
357
 
358
  # Main Chat Interface
359
+ chatbot = gr.Chatbot(
360
+ label="Konversation",
361
+ height=500,
362
+ bubble_full_width=True,
363
+ show_copy_button=True,
364
+ avatar_images=(
365
+ "https://em-content.zobj.net/source/microsoft-teams/363/bust-in-silhouette_1f464.png",
366
+ "https://em-content.zobj.net/source/microsoft-teams/363/robot_1f916.png"
367
  )
368
+ )
369
+
370
+ # Input Area (thay đổi theo mode)
371
+ with gr.Column(elem_classes=["input-area"], visible=True) as input_area:
372
+ # Text Input (visible khi text mode)
373
+ with gr.Column(visible=True) as text_input_container:
374
+ with gr.Row(elem_classes=["input-row"]):
375
+ text_input = gr.Textbox(
376
+ label="",
377
+ placeholder="Stellen Sie eine juristische Frage... (Enter zum Senden)",
378
+ lines=2,
379
+ max_lines=4,
380
+ scale=8,
381
+ show_label=False,
382
+ container=False
383
+ )
384
+ text_send_btn = gr.Button(
385
+ "Senden",
386
+ variant="primary",
387
+ scale=1,
388
+ min_width=80
389
+ )
390
 
391
+ # Audio Input (visible khi audio mode)
392
+ with gr.Column(visible=False) as audio_input_container:
393
+ gr.Markdown("### 🎤 Klicken und Sprechen")
394
+ with gr.Row():
395
+ audio_input = gr.Audio(
396
+ sources=["microphone"],
397
+ type="numpy",
398
+ streaming=False,
399
+ show_label=False,
400
+ interactive=True,
401
+ scale=8
402
+ )
403
+ audio_status = gr.Textbox(
404
+ label="Status",
405
+ value="Warten auf Aufnahme...",
406
+ interactive=False,
407
+ scale=2
408
+ )
409
+ gr.Markdown("*Drücken Sie aufnehmen, sprechen Sie Ihre Frage, dann stoppen*", elem_classes=["audio-visualizer"])
410
+
411
+ # TTS Controls
412
+ with gr.Row():
413
+ tts_btn = gr.Button("🔊 Letzte Antwort vorlesen", variant="secondary", size="sm")
414
+ tts_audio = gr.Audio(label="", interactive=False, visible=False)
415
+ tts_status = gr.Textbox(label="", interactive=False, visible=False)
 
 
416
 
417
+ # Documents Section (Collapsible)
418
+ with gr.Accordion("📚 Dokumente & Quellen anzeigen", open=False):
419
  with gr.Tabs():
420
  with gr.TabItem("📄 Prüfungsordnung"):
421
+ PDF(pdf_meta["pdf_url"], height=350)
422
 
423
  with gr.TabItem("📘 Hochschulgesetz NRW"):
424
  if hg_url:
425
  gr.HTML(f'''
426
+ <div style="padding: 10px;">
427
+ <h4>Hochschulgesetz NRW Viewer</h4>
428
+ <a href="{hg_url}" target="_blank" style="display: inline-block; padding: 8px 16px; background: #3b82f6; color: white; text-decoration: none; border-radius: 5px; margin-bottom: 10px;">
429
+ Im Viewer öffnen
430
  </a>
431
+ <iframe src="{hg_url}" width="100%" height="400px" style="border: 1px solid #ddd; border-radius: 6px;"></iframe>
432
  </div>
433
  ''')
434
+ else:
435
+ gr.Markdown("Viewer-Link nicht verfügbar.")
436
 
437
  # =====================================================
438
  # EVENT HANDLERS
439
  # =====================================================
440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  # Mode toggle
442
  mode_selector.change(
443
+ toggle_audio_mode,
444
+ inputs=[mode_selector, chatbot],
445
+ outputs=[
446
+ audio_input_container,
447
+ text_input_container,
448
+ text_send_btn,
449
+ mode_indicator
450
+ ]
451
  )
452
 
453
+ # Text input handling
454
+ text_send_btn.click(
455
+ process_text_input,
456
+ inputs=[text_input, chatbot],
457
+ outputs=[chatbot, text_input]
458
  )
459
 
460
+ text_input.submit(
461
+ process_text_input,
462
+ inputs=[text_input, chatbot],
463
+ outputs=[chatbot, text_input]
 
 
 
 
 
 
 
 
 
 
 
464
  )
465
 
466
+ # Audio input handling
467
def handle_audio_complete(audio_data, history):
    """Process a finished recording and return what the UI displays.

    process_audio_input returns (history, transcribed_text, status). Only the
    history and the status have a matching output component, so the
    transcript is dropped here instead of being written to the status box.

    Args:
        audio_data: Value of the microphone component after recording stops.
        history: Current chat history.

    Returns:
        A pair (updated history, status message).
    """
    new_history, _transcript, status = process_audio_input(audio_data, history)
    return new_history, status


# NOTE: like the other event wiring, this must stay inside the
# `with gr.Blocks(...) as demo:` context.
audio_input.stop_recording(
    handle_audio_complete,
    inputs=[audio_input, chatbot],
    # One value per output component; listing audio_status twice made the
    # transcript and the status overwrite each other.
    outputs=[chatbot, audio_status],
).then(
    # Reset the recorder with None (an empty string is not a valid audio
    # value) and leave the status untouched so the result or error message
    # of the last recording stays visible.
    lambda: None,
    outputs=[audio_input],
)
479
 
480
+ # Clear conversation
481
+ clear_btn.click(
482
+ clear_conversation,
483
+ outputs=[chatbot, mode_indicator]
484
  )
485
 
486
+ # TTS button
487
+ tts_btn.click(
488
+ speak_last_response,
489
+ inputs=[chatbot],
490
+ outputs=[tts_audio, tts_status]
491
+ ).then(
492
+ lambda: gr.Audio(visible=True),
493
+ outputs=[tts_audio]
494
+ ).then(
495
+ lambda: gr.Textbox(visible=True),
496
+ outputs=[tts_status]
497
+ )
498
 
499
  if __name__ == "__main__":
500
  demo.queue().launch(ssr_mode=False, show_error=True)
realtime_server.py CHANGED
@@ -1,119 +1,167 @@
1
  """
2
- realtime_server.py v0.2 (2025-12-08)
3
-
4
- OpenAI Realtime WS relay for live voice conversation.
5
- This server accepts a WebSocket connection from the frontend at `/ws`,
6
- forwards audio/text frames to OpenAI Realtime WS using the official
7
- `response.create`, `input_audio_buffer.append`, and `input_audio_buffer.commit`
8
- messages, and streams assistant responses back to the client in real time.
9
-
10
- Compatibility: standalone, does not break existing Gradio UI. Enable by
11
- setting USE_REALTIME=true and pointing the frontend to ws://localhost:8000/ws.
12
  """
13
 
14
  import os
15
  import asyncio
16
  import json
17
  import base64
18
- import websockets
19
- import traceback
20
  from typing import Optional
21
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
22
- from fastapi.responses import JSONResponse
 
23
 
24
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
25
  OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
26
 
27
  app = FastAPI()
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- @app.get("/health")
31
- async def health():
32
- return JSONResponse({"status": "ok"})
33
-
34
-
35
- async def _connect_openai_ws():
36
- url = f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}"
37
- headers = {
38
- "Authorization": f"Bearer {OPENAI_API_KEY}",
39
- "OpenAI-Beta": "realtime=v1",
40
- }
41
- return await websockets.connect(url, extra_headers=headers, max_size=None)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  @app.websocket("/ws")
45
- async def ws_stream(ws: WebSocket):
46
- """Frontend WS relay.
47
-
48
- Client → Server messages (JSON):
49
- - {type: "audio_chunk", data: base64_wav_string}
50
- - {type: "audio_commit"}
51
- - {type: "response", instructions: "..."}
52
-
53
- Server forwards to OpenAI WS:
54
- - input_audio_buffer.append
55
- - input_audio_buffer.commit
56
- - response.create
57
-
58
- Server → Client messages: pass-through OpenAI event frames.
59
- """
60
- if not OPENAI_API_KEY:
61
- await ws.accept()
62
- await ws.send_text(json.dumps({"type": "error", "message": "OPENAI_API_KEY missing"}))
63
- await ws.close()
64
- return
65
-
66
- await ws.accept()
67
- openai_conn = None
68
  try:
69
- openai_conn = await _connect_openai_ws()
70
-
71
- async def forward_openai_to_client():
72
- try:
73
- async for event in openai_conn:
74
- await ws.send_text(event if isinstance(event, str) else json.dumps(event))
75
- except Exception:
76
- await ws.send_text(json.dumps({"type": "error", "message": "upstream_closed"}))
77
-
78
- async def forward_client_to_openai():
79
- try:
80
- while True:
81
- raw = await ws.receive_text()
82
- msg = json.loads(raw)
83
- t = msg.get("type")
84
- if t == "audio_chunk":
85
- data_b64 = msg.get("data")
86
- await openai_conn.send(json.dumps({
87
- "type": "input_audio_buffer.append",
88
- "audio": {"data": data_b64, "format": "wav"}
89
- }))
90
- elif t == "audio_commit":
91
- await openai_conn.send(json.dumps({"type": "input_audio_buffer.commit"}))
92
- elif t == "response":
93
- instr = msg.get("instructions", "")
94
- await openai_conn.send(json.dumps({
95
- "type": "response.create",
96
- "response": {"modalities": ["text", "audio"], "instructions": instr}
97
- }))
98
- else:
99
- await ws.send_text(json.dumps({"type": "error", "message": "unknown_type"}))
100
- except WebSocketDisconnect:
101
- pass
102
- except Exception:
103
- await ws.send_text(json.dumps({"type": "error", "message": "client_read_error"}))
104
-
105
- await asyncio.gather(forward_openai_to_client(), forward_client_to_openai())
106
-
107
- except Exception:
108
- await ws.send_text(json.dumps({"type": "error", "message": "relay_error", "detail": traceback.format_exc()}))
109
  finally:
110
- try:
111
- if openai_conn:
112
- await openai_conn.close()
113
- except Exception:
114
- pass
115
- try:
116
- await ws.close()
117
- except Exception:
118
- pass
119
 
 
 
 
 
1
  """
2
+ realtime_server.py - Optional WebSocket server for real-time audio streaming
3
+ Chạy riêng biệt: uvicorn realtime_server:app --host 0.0.0.0 --port 8000
 
 
 
 
 
 
 
 
4
  """
5
 
6
  import os
7
  import asyncio
8
  import json
9
  import base64
 
 
10
  from typing import Optional
11
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
12
+ from fastapi.responses import HTMLResponse
13
+ import websockets
14
 
15
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
16
  OPENAI_REALTIME_MODEL = os.getenv("OPENAI_REALTIME_MODEL", "gpt-4o-realtime-preview")
17
 
18
  app = FastAPI()
19
 
20
+ # Simple HTML test page
21
+ html = """
22
+ <!DOCTYPE html>
23
+ <html>
24
+ <head>
25
+ <title>Realtime Audio Test</title>
26
+ </head>
27
+ <body>
28
+ <h1>Realtime Audio Test</h1>
29
+ <button id="startBtn">Start Recording</button>
30
+ <button id="stopBtn" disabled>Stop Recording</button>
31
+ <div id="status">Status: Ready</div>
32
+ <div id="transcript"></div>
33
+
34
+ <script>
35
+ let mediaRecorder;
36
+ let audioChunks = [];
37
+
38
+ document.getElementById('startBtn').onclick = async () => {
39
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
40
+ mediaRecorder = new MediaRecorder(stream);
41
+
42
+ mediaRecorder.ondataavailable = (event) => {
43
+ audioChunks.push(event.data);
44
+ };
45
+
46
+ mediaRecorder.onstop = async () => {
47
+ const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
48
+ audioChunks = [];
49
+
50
+ // Convert to base64
51
+ const reader = new FileReader();
52
+ reader.readAsDataURL(audioBlob);
53
+ reader.onloadend = () => {
54
+ const base64data = reader.result.split(',')[1];
55
+ // Send to server
56
+ fetch('/process-audio', {
57
+ method: 'POST',
58
+ headers: { 'Content-Type': 'application/json' },
59
+ body: JSON.stringify({ audio: base64data })
60
+ })
61
+ .then(response => response.json())
62
+ .then(data => {
63
+ document.getElementById('transcript').innerHTML =
64
+ `<strong>Transkription:</strong> ${data.transcript}`;
65
+ });
66
+ };
67
+ };
68
+
69
+ mediaRecorder.start();
70
+ document.getElementById('startBtn').disabled = true;
71
+ document.getElementById('stopBtn').disabled = false;
72
+ document.getElementById('status').textContent = 'Status: Recording...';
73
+ };
74
+
75
+ document.getElementById('stopBtn').onclick = () => {
76
+ mediaRecorder.stop();
77
+ document.getElementById('startBtn').disabled = false;
78
+ document.getElementById('stopBtn').disabled = true;
79
+ document.getElementById('status').textContent = 'Status: Processing...';
80
+ };
81
+ </script>
82
+ </body>
83
+ </html>
84
+ """
85
 
86
+ @app.get("/")
87
+ async def get():
88
+ return HTMLResponse(html)
 
 
 
 
 
 
 
 
 
89
 
90
+ @app.post("/process-audio")
91
async def process_audio(request: dict):
    """Transcribe a base64-encoded recording posted by the test page.

    Args:
        request: JSON body; the "audio" key holds the base64-encoded recording.

    Returns:
        {"success": True, "transcript": <text>} on success, otherwise
        {"success": False, "error": <message>}.
    """
    import tempfile

    temp_path = None
    try:
        encoded_audio = request.get("audio", "")
        if not encoded_audio:
            # Nothing to transcribe; avoid a pointless temp file and API call.
            return {"success": False, "error": "No audio data provided"}

        audio_data = base64.b64decode(encoded_audio)

        # Save to a temp file; remember the path first so cleanup also
        # happens when the write itself fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            temp_path = f.name
            f.write(audio_data)

        def _transcribe() -> str:
            """Run the blocking OpenAI transcription request."""
            from openai import OpenAI

            client = OpenAI(api_key=OPENAI_API_KEY)
            with open(temp_path, "rb") as audio_file:
                transcript = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    language="de",
                )
            return transcript.text

        # The OpenAI client is synchronous; run it in a worker thread so the
        # event loop keeps serving other requests meanwhile.
        transcript_text = await asyncio.to_thread(_transcribe)
        return {"success": True, "transcript": transcript_text}

    except Exception as e:
        # Endpoint boundary: report the failure to the page instead of a 500.
        return {"success": False, "error": str(e)}
    finally:
        # Always remove the temp file, including on errors.
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
121
 
122
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Relay frames between the browser and the OpenAI Realtime API.

    Frames received from the client are forwarded unchanged to the upstream
    OpenAI WebSocket and upstream frames are forwarded back to the client.
    The relay ends as soon as either side stops.

    Args:
        websocket: The accepted client connection.
    """
    await websocket.accept()

    if not OPENAI_API_KEY:
        # Fail fast with a readable message instead of an upstream auth error.
        await websocket.send_text(
            json.dumps({"type": "error", "message": "OPENAI_API_KEY missing"})
        )
        await websocket.close()
        return

    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "OpenAI-Beta": "realtime=v1",
    }
    url = f"wss://api.openai.com/v1/realtime?model={OPENAI_REALTIME_MODEL}"

    async def forward_to_openai(openai_ws):
        """Client -> OpenAI; returns when the client disconnects."""
        try:
            while True:
                data = await websocket.receive_text()
                await openai_ws.send(data)
        except WebSocketDisconnect:
            pass  # normal end of session; teardown happens below

    async def forward_to_client(openai_ws):
        """OpenAI -> client; returns when the upstream connection closes."""
        async for message in openai_ws:
            if isinstance(message, bytes):
                await websocket.send_bytes(message)
            else:
                await websocket.send_text(message)

    try:
        # NOTE(review): newer `websockets` releases renamed `extra_headers`
        # to `additional_headers` - confirm against the pinned version.
        async with websockets.connect(url, extra_headers=headers) as openai_ws:
            relay_tasks = [
                asyncio.create_task(forward_to_openai(openai_ws)),
                asyncio.create_task(forward_to_client(openai_ws)),
            ]
            try:
                # Stop when EITHER direction ends; waiting for both would
                # keep the upstream connection open after the client left.
                await asyncio.wait(relay_tasks, return_when=asyncio.FIRST_COMPLETED)
            finally:
                for task in relay_tasks:
                    task.cancel()  # no-op for tasks that already finished
                results = await asyncio.gather(*relay_tasks, return_exceptions=True)

            for result in results:
                # Cancellation is not an Exception subclass, so only real
                # relay failures are reported here.
                if isinstance(result, Exception):
                    print(f"WebSocket relay error: {result}")

    except Exception as e:
        print(f"WebSocket error: {e}")
    finally:
        try:
            await websocket.close()
        except (RuntimeError, WebSocketDisconnect):
            pass  # connection was already closed by the client
 
 
 
 
 
 
 
 
164
 
165
+ if __name__ == "__main__":
166
+ import uvicorn
167
+ uvicorn.run(app, host="0.0.0.0", port=8000)
speech_io.py CHANGED
@@ -1,493 +1,145 @@
1
  """
2
- speech_io.py - Enhanced Version with working VAD
3
-
4
- Sprachbasierte Ein-/Ausgabe với:
5
- - Speech-to-Text (STT) với Whisper
6
- - Text-to-Speech (TTS)
7
- - Voice Activity Detection (VAD) hoạt động
8
  """
9
 
10
  import os
11
- import time
12
- from typing import Optional, Tuple, Dict, Any
13
  import numpy as np
14
  import soundfile as sf
15
- from scipy.signal import butter, filtfilt, resample
16
- import re
17
- import difflib
18
 
19
  # ========================================================
20
  # CẤU HÌNH
21
  # ========================================================
22
- # Model Selection
23
- WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
24
- ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
25
- TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
26
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
27
-
28
- # VAD Configuration
29
- ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
30
- VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
31
- VAD_MIN_DURATION = float(os.getenv("VAD_MIN_DURATION", "0.1"))
32
-
33
- # Other Configs
34
- ASR_DEFAULT_LANGUAGE = os.getenv("ASR_LANGUAGE", "de")
35
  TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
36
- ASR_MAX_DURATION_S = int(os.getenv("ASR_MAX_DURATION_S", "30"))
37
-
38
- # Cache for models
39
- _asr = None
40
- _tts = None
41
 
42
  # ========================================================
43
- # AUDIO PROCESSING UTILITIES
44
  # ========================================================
45
def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Apply a zero-phase Butterworth high-pass filter to remove low-frequency noise.

    Args:
        data: 1-D sequence of audio samples.
        cutoff: Cut-off frequency in Hz.
        fs: Sampling rate in Hz.
        order: Filter order.

    Returns:
        The filtered samples. Inputs that are empty or too short to be
        filtered are returned unchanged.
    """
    # filtfilt pads each end with 3 * max(len(a), len(b)) samples by default
    # and raises ValueError unless the input is longer than that padding.
    min_length = 3 * (order + 1)
    if len(data) <= min_length:
        return data

    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='high', analog=False)
    return filtfilt(b, a, data)
54
-
55
def apply_fade(audio, sr, fade_in_ms=10, fade_out_ms=10):
    """Apply a linear fade-in and fade-out to avoid clicks at the edges.

    Args:
        audio: Audio samples; the first axis is time.
        sr: Sampling rate in Hz.
        fade_in_ms: Fade-in length in milliseconds.
        fade_out_ms: Fade-out length in milliseconds.

    Returns:
        A faded copy of the audio. Floating-point input keeps its dtype and
        other dtypes are converted to float32. Input that is empty or
        shorter than both fades combined is returned unchanged.
    """
    if len(audio) == 0:
        return audio

    fade_in_samples = int(sr * fade_in_ms / 1000)
    fade_out_samples = int(sr * fade_out_ms / 1000)

    # Not enough samples to hold both ramps.
    if len(audio) < fade_in_samples + fade_out_samples:
        return audio

    # Work on a float copy: the caller's array stays untouched, and integer
    # input no longer fails on the in-place multiplication by a float ramp.
    samples = np.asarray(audio)
    target_dtype = samples.dtype if np.issubdtype(samples.dtype, np.floating) else np.float32
    faded = samples.astype(target_dtype, copy=True)

    # Ramp shape that broadcasts over any trailing (channel) axes.
    ramp_shape = (-1,) + (1,) * (faded.ndim - 1)

    if fade_in_samples > 0:
        fade_in_curve = np.linspace(0, 1, fade_in_samples).reshape(ramp_shape)
        faded[:fade_in_samples] *= fade_in_curve

    if fade_out_samples > 0:
        fade_out_curve = np.linspace(1, 0, fade_out_samples).reshape(ramp_shape)
        faded[-fade_out_samples:] *= fade_out_curve

    return faded
78
-
79
- def normalize_audio(audio_data: np.ndarray) -> np.ndarray:
80
- """Chuẩn hóa audio về [-1, 1]"""
81
- if len(audio_data) == 0:
82
- return audio_data
83
-
84
- # Chuyển đổi sang float32
85
- if audio_data.dtype != np.float32:
86
- audio_data = audio_data.astype(np.float32)
87
-
88
- # Normalize
89
- max_val = np.max(np.abs(audio_data))
90
- if max_val > 0:
91
- audio_data = audio_data / max_val
92
-
93
- return audio_data
94
-
95
- def preprocess_audio_for_vad(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
96
- """Tiền xử lý audio cho VAD"""
97
- if len(audio_data) == 0:
98
- return audio_data
99
-
100
- # Chuyển sang mono nếu cần
101
- if len(audio_data.shape) > 1:
102
- audio_data = np.mean(audio_data, axis=1)
103
-
104
- # Normalize
105
- audio_data = normalize_audio(audio_data)
106
-
107
- # Highpass filter để loại bỏ noise tần số thấp
108
- try:
109
- audio_data = butter_highpass_filter(audio_data, cutoff=80, fs=sample_rate)
110
- except:
111
- pass
112
-
113
- return audio_data
114
-
115
- # ========================================================
116
- # VOICE ACTIVITY DETECTION (VAD) - FIXED VERSION
117
- # ========================================================
118
- def detect_voice_activity(
119
- audio_data: np.ndarray,
120
- sample_rate: int,
121
- threshold: float = 0.3,
122
- min_duration: float = 0.1
123
- ) -> Dict[str, Any]:
124
- """
125
- Phát hiện hoạt động giọng nói - Phiên bản đơn giản và hoạt động
126
-
127
- Args:
128
- audio_data: Mảng numpy chứa audio samples
129
- sample_rate: Tần số lấy mẫu
130
- threshold: Ngưỡng phát hiện (0-1)
131
- min_duration: Thời gian tối thiểu để xác định là speech (giây)
132
-
133
- Returns:
134
- Dict với thông tin phát hiện
135
- """
136
- if len(audio_data) == 0:
137
- return {
138
- "is_speech": False,
139
- "confidence": 0.0,
140
- "speech_segments": [],
141
- "energy": 0.0,
142
- "message": "Empty audio data"
143
- }
144
-
145
- try:
146
- # Tiền xử lý audio
147
- processed_audio = preprocess_audio_for_vad(audio_data, sample_rate)
148
-
149
- # Tính toán các đặc trưng
150
- duration = len(processed_audio) / sample_rate
151
-
152
- # 1. Tính RMS energy
153
- rms_energy = np.sqrt(np.mean(processed_audio ** 2))
154
-
155
- # 2. Tính zero-crossing rate
156
- zero_crossings = np.sum(np.abs(np.diff(np.sign(processed_audio)))) / (2 * len(processed_audio))
157
-
158
- # 3. Tính spectral centroid (đơn giản)
159
- # Sử dụng FFT để tính phân bố tần số
160
- if len(processed_audio) >= 256:
161
- fft_size = min(2048, len(processed_audio))
162
- spectrum = np.abs(np.fft.rfft(processed_audio[:fft_size]))
163
- frequencies = np.fft.rfftfreq(fft_size, 1/sample_rate)
164
- if np.sum(spectrum) > 0:
165
- spectral_centroid = np.sum(frequencies * spectrum) / np.sum(spectrum)
166
- else:
167
- spectral_centroid = 0
168
- else:
169
- spectral_centroid = 0
170
-
171
- # 4. Frame-based analysis
172
- frame_length = int(sample_rate * 0.03) # 30ms frame
173
- hop_length = int(frame_length / 2)
174
-
175
- if len(processed_audio) > frame_length:
176
- num_frames = 1 + (len(processed_audio) - frame_length) // hop_length
177
- frame_energies = []
178
-
179
- for i in range(num_frames):
180
- start = i * hop_length
181
- end = start + frame_length
182
- frame = processed_audio[start:end]
183
- frame_energy = np.sqrt(np.mean(frame ** 2))
184
- frame_energies.append(frame_energy)
185
-
186
- # Tính speech ratio
187
- if frame_energies:
188
- energy_threshold = np.percentile(frame_energies, 30) + threshold * (np.max(frame_energies) - np.percentile(frame_energies, 30))
189
- speech_frames = sum(1 for e in frame_energies if e > energy_threshold)
190
- speech_ratio = speech_frames / len(frame_energies)
191
- else:
192
- speech_ratio = 0
193
- else:
194
- speech_ratio = 0
195
-
196
- # 5. Kết hợp các đặc trưng để tính confidence
197
- # Speech thường có:
198
- # - RMS energy cao
199
- # - Zero-crossing rate trung bình (không quá cao như noise, không quá thấp như silence)
200
- # - Spectral centroid trong khoảng 100-3000 Hz cho giọng nói
201
- # - Speech ratio cao
202
-
203
- # Tính confidence score
204
- energy_score = min(1.0, rms_energy * 10) # Scale energy
205
-
206
- # Zero-crossing rate score: lý tưởng khoảng 0.1-0.3 cho speech
207
- if 0.05 < zero_crossings < 0.4:
208
- zcr_score = 1.0 - 2 * abs(zero_crossings - 0.2) # Peak ở 0.2
209
- else:
210
- zcr_score = 0.0
211
-
212
- # Spectral centroid score: lý tưởng 100-3000 Hz
213
- if 100 < spectral_centroid < 3000:
214
- centroid_score = 1.0
215
- elif 50 < spectral_centroid < 5000:
216
- centroid_score = 0.5
217
- else:
218
- centroid_score = 0.0
219
-
220
- # Speech ratio score
221
- speech_ratio_score = speech_ratio
222
-
223
- # Kết hợp các score
224
- weights = [0.4, 0.2, 0.2, 0.2] # energy, zcr, centroid, speech_ratio
225
- confidence = (
226
- weights[0] * energy_score +
227
- weights[1] * zcr_score +
228
- weights[2] * centroid_score +
229
- weights[3] * speech_ratio_score
230
- )
231
-
232
- # Áp dụng ngưỡng
233
- is_speech = confidence > threshold
234
-
235
- # Kiểm tra duration tối thiểu
236
- if duration < min_duration:
237
- is_speech = False
238
- confidence = max(0, confidence - 0.2)
239
-
240
- # Debug info
241
- debug_info = {
242
- "duration": duration,
243
- "rms_energy": rms_energy,
244
- "zero_crossings": zero_crossings,
245
- "spectral_centroid": spectral_centroid,
246
- "speech_ratio": speech_ratio,
247
- "energy_score": energy_score,
248
- "zcr_score": zcr_score,
249
- "centroid_score": centroid_score,
250
- "speech_ratio_score": speech_ratio_score,
251
- "final_confidence": confidence,
252
- "is_speech": is_speech
253
- }
254
-
255
- print(f"VAD Debug: {debug_info}")
256
-
257
- return {
258
- "is_speech": is_speech,
259
- "confidence": float(confidence),
260
- "speech_segments": [[0, duration]] if is_speech else [],
261
- "energy": float(rms_energy),
262
- "message": f"Speech: {is_speech}, Confidence: {confidence:.3f}"
263
- }
264
-
265
- except Exception as e:
266
- print(f"VAD processing error: {e}")
267
- return {
268
- "is_speech": False,
269
- "confidence": 0.0,
270
- "speech_segments": [],
271
- "energy": 0.0,
272
- "message": f"Error: {str(e)}"
273
- }
274
-
275
- # ========================================================
276
- # SPEECH-TO-TEXT FUNCTIONS
277
- # ========================================================
278
- def get_asr_pipeline():
279
- """Lấy ASR pipeline"""
280
- global _asr
281
- if _asr is None:
282
- print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
283
-
284
- from transformers import pipeline
285
-
286
- _asr = pipeline(
287
- task="automatic-speech-recognition",
288
- model=ASR_MODEL_ID,
289
- device="cpu",
290
- return_timestamps=False,
291
- chunk_length_s=8,
292
- stride_length_s=(1, 1),
293
- )
294
- return _asr
295
-
296
  def transcribe_with_openai(audio_path: str, language: Optional[str] = None) -> str:
297
- """Transcribe audio using OpenAI Whisper-1.
298
- Falls back to local transcription on error. """
299
  if not OPENAI_API_KEY:
300
- return transcribe_audio(audio_path, language)
 
301
  try:
302
  from openai import OpenAI
303
  client = OpenAI(api_key=OPENAI_API_KEY)
 
304
  with open(audio_path, "rb") as f:
305
  resp = client.audio.transcriptions.create(
306
  model="whisper-1",
307
  file=f,
308
  language=language if language and language != "auto" else None,
 
309
  )
310
- txt = getattr(resp, "text", "") or (resp.get("text") if isinstance(resp, dict) else "")
311
- return (txt or "").strip()
312
- except Exception as e:
313
- print(f">>> OpenAI Fehler: {e}")
314
- return transcribe_audio(audio_path, language)
315
-
316
- def transcribe_audio(
317
- audio_path: str,
318
- language: Optional[str] = None,
319
- max_duration_s: int = ASR_MAX_DURATION_S
320
- ) -> str:
321
- """
322
- Transcribe audio với Whisper local
323
- """
324
- if not audio_path or not os.path.exists(audio_path):
325
- print(">>> Kein Audio gefunden.")
326
- return ""
327
-
328
- try:
329
- # Đọc audio file
330
- data, sr = sf.read(audio_path, always_2d=False)
331
-
332
- if data is None or data.size == 0:
333
- print(">>> Audio leer.")
334
- return ""
335
-
336
- # Chuyển sang mono
337
- if len(data.shape) > 1:
338
- data = np.mean(data, axis=1)
339
-
340
- # Tiền xử lý
341
- data = data.astype(np.float32)
342
- max_val = np.max(np.abs(data))
343
- if max_val > 0:
344
- data = data / max_val
345
-
346
- # Resample về 16kHz nếu cần
347
- TARGET_SR = 16000
348
- if sr != TARGET_SR:
349
- target_len = int(len(data) * TARGET_SR / sr)
350
- data = resample(data, target_len)
351
- sr = TARGET_SR
352
 
353
- # Giới hạn độ dài
354
- MAX_SAMPLES = sr * max_duration_s
355
- if len(data) > MAX_SAMPLES:
356
- data = data[:MAX_SAMPLES]
357
-
358
- # Lấy pipeline
359
- asr = get_asr_pipeline()
360
-
361
- # Cấu hình language
362
- lang = language
363
- if not lang and ASR_DEFAULT_LANGUAGE and ASR_DEFAULT_LANGUAGE.lower() != "auto":
364
- lang = ASR_DEFAULT_LANGUAGE
365
- if isinstance(lang, str) and lang.lower() == "auto":
366
- lang = None
367
-
368
- # Transcribe
369
- print(f">>> Transkribiere mit Whisper-{WHISPER_MODEL}...")
370
- call_kwargs = {}
371
-
372
- if lang:
373
- call_kwargs["generate_kwargs"] = {
374
- "language": lang,
375
- "task": "transcribe",
376
- "max_new_tokens": 120,
377
- "temperature": 0.0,
378
- }
379
-
380
- result = asr({"array": data, "sampling_rate": sr}, **call_kwargs)
381
 
382
- text = result.get("text", "") if isinstance(result, dict) else str(result)
383
  text = text.strip()
384
-
385
- # Sửa lỗi domain terms
386
- text = fix_domain_terms(text)
387
-
388
- print(f">>> Transkription: {text}")
389
  return text
390
 
391
  except Exception as e:
392
- print(f">>> Transkriptionsfehler: {e}")
 
393
  return ""
394
 
395
  # ========================================================
396
  # TEXT-TO-SPEECH (TTS)
397
  # ========================================================
 
 
398
  def get_tts_pipeline():
399
- """Lấy TTS pipeline"""
400
  global _tts
401
  if _tts is None:
402
- print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
403
-
404
- from transformers import pipeline
405
-
406
- _tts = pipeline(
407
- task="text-to-speech",
408
- model=TTS_MODEL_ID,
409
- )
410
  return _tts
411
 
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Convert text to speech with the local TTS pipeline.

    Args:
        text: Text to speak.

    Returns:
        ``(sample_rate, samples)`` with ``samples`` as an int16 array, or
        ``None`` when the text is empty or whitespace, TTS is disabled, or
        synthesis fails.
    """
    if not text or not text.strip() or not TTS_ENABLED:
        return None

    try:
        tts = get_tts_pipeline()
        out = tts(text)

        audio = np.array(out["audio"], dtype=np.float32)
        sr = out.get("sampling_rate", 16000)

        # Fall back to 16 kHz when the pipeline reports no usable rate.
        if sr is None or sr <= 0:
            sr = 16000

        # Reduce to one channel: drop singleton axes, then keep column 0.
        if audio.ndim > 1:
            audio = audio.squeeze()
            if audio.ndim > 1:
                audio = audio[:, 0]

        # Best-effort high-pass: on failure continue with the unfiltered
        # audio, but do not swallow KeyboardInterrupt/SystemExit.
        try:
            audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
        except Exception as filter_err:
            print(f">>> TTS Filter übersprungen: {filter_err}")

        # Scale so the loudest sample has magnitude 1 (skip pure silence).
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val

        audio = apply_fade(audio, sr)

        # Convert to int16
        audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

        return (sr, audio_int16)

    except Exception as e:
        print(f">>> TTS Fehler: {e}")
        return None
458
 
459
  # ========================================================
460
- # DOMAIN-SPECIFIC TEXT PROCESSING
461
  # ========================================================
def fix_domain_terms(text: str) -> str:
    """Replace known speech-recognition mis-hearings with the intended terms.

    Matching is case-insensitive and the replacement terms are inserted in
    lower case. Empty or ``None`` input is returned unchanged.
    """
    if not text:
        return text

    # (pattern, intended term) pairs, applied in this order.
    fixes = (
        (r"\bbriefe\s*um\b", "prüfung"),
        (r"\bbrieft\s*um\b", "prüfung"),
        (r"\bbriefung\b", "prüfung"),
        (r"\bpruefung\b", "prüfung"),
        (r"\bhochschule\s*gesetz\b", "hochschulgesetz"),
    )

    corrected = text
    for wrong, right in fixes:
        corrected = re.compile(wrong, re.IGNORECASE).sub(right, corrected)
    return corrected
 
 
 
 
 
482
 
483
  # ========================================================
484
  # MAIN EXPORT
485
  # ========================================================
# Names exported by a star-import of this module.
__all__ = [
    'transcribe_audio',
    'transcribe_with_openai',
    'synthesize_speech',
    'detect_voice_activity',
    'normalize_audio',
    'preprocess_audio_for_vad'
]
 
1
  """
2
+ speech_io.py - Simplified version for ChatGPT-like interface
 
 
 
 
 
3
  """
4
 
5
  import os
6
+ from typing import Optional, Tuple
 
7
  import numpy as np
8
  import soundfile as sf
9
+ from scipy.signal import butter, filtfilt
 
 
10
 
11
  # ========================================================
12
  # CONFIGURATION
13
  # ========================================================
 
 
 
 
# API key read from the environment; an empty string when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# TTS stays enabled unless TTS_ENABLED is "0", "false" or "no" (case-insensitive).
TTS_ENABLED = os.getenv("TTS_ENABLED", "1").lower() not in ("0", "false", "no")
 
 
 
 
 
16
 
17
  # ========================================================
18
+ # SPEECH-TO-TEXT WITH OPENAI
19
  # ========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def transcribe_with_openai(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe an audio file with OpenAI's ``whisper-1`` model.

    Args:
        audio_path: Path of the audio file to upload.
        language: Language code passed to the API. ``None``, an empty string
            or ``"auto"`` (any case, surrounding whitespace ignored) leaves
            the language unset.

    Returns:
        The stripped transcript, or ``""`` when reading the file or the
        request fails.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not configured.
    """
    if not OPENAI_API_KEY:
        raise RuntimeError("OPENAI_API_KEY is required for transcription")

    try:
        from openai import OpenAI

        client = OpenAI(api_key=OPENAI_API_KEY)

        # Only send a language when the caller actually chose one; "auto"
        # is matched case-insensitively so "Auto" is not sent as a language.
        request_kwargs = {"model": "whisper-1", "response_format": "text"}
        lang = (language or "").strip()
        if lang and lang.lower() != "auto":
            request_kwargs["language"] = lang

        with open(audio_path, "rb") as f:
            resp = client.audio.transcriptions.create(file=f, **request_kwargs)

        # Accept a plain string, an object with ``.text`` or a dict.
        if isinstance(resp, str):
            text = resp
        elif hasattr(resp, "text"):
            text = resp.text
        elif isinstance(resp, dict):
            text = resp.get("text", "")
        else:
            text = str(resp)

        # A missing transcript is treated as empty rather than as an error.
        text = (text or "").strip()
        print(f">>> Transkription: {text[:100]}...")
        return text

    except Exception as e:
        print(f">>> OpenAI Transkriptionsfehler: {e}")
        # Fallback: report failure to the caller as an empty transcript.
        return ""
53
 
54
  # ========================================================
55
  # TEXT-TO-SPEECH (TTS)
56
  # ========================================================
# Module-level cache for the client returned by get_tts_pipeline().
_tts = None


def get_tts_pipeline():
    """Return the shared OpenAI client used for TTS, creating it on first call."""
    global _tts
    if _tts is not None:
        return _tts

    print(">>> Initialisiere OpenAI TTS Client")
    from openai import OpenAI as OpenAIClient

    _tts = OpenAIClient(api_key=OPENAI_API_KEY)
    return _tts
67
 
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Convert text to speech with the OpenAI TTS API.

    Args:
        text: Text to speak; only the first 4000 characters are sent.

    Returns:
        ``(sample_rate, samples)`` with mono samples, converted to int16 when
        they were decoded as floating point. ``None`` when the text is empty
        or whitespace, TTS is disabled, no API key is configured, or the
        request or decoding fails.
    """
    if not text or not text.strip() or not TTS_ENABLED or not OPENAI_API_KEY:
        return None

    try:
        import io

        client = get_tts_pipeline()

        response = client.audio.speech.create(
            model="tts-1",
            voice="nova",  # alternatives: alloy, echo, fable, onyx, shimmer
            input=text[:4000],  # cap the request length
            response_format="wav",
        )

        # Decode the WAV payload straight from memory.
        with io.BytesIO(response.content) as wav_buffer:
            data, sr = sf.read(wav_buffer)

        # Down-mix to mono by averaging the channels.
        if data.ndim > 1:
            data = np.mean(data, axis=1)

        # Convert to int16
        if np.issubdtype(data.dtype, np.floating):
            data = np.clip(data * 32767, -32768, 32767).astype(np.int16)

        return (sr, data)

    except Exception as e:
        print(f">>> TTS Fehler: {e}")
        return None
108
 
109
  # ========================================================
110
+ # AUDIO PROCESSING UTILITIES
111
  # ========================================================
def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
    """Remove low-frequency noise with a Butterworth high-pass filter.

    The filter is applied with ``filtfilt`` along the last axis.

    Args:
        data: Samples to filter.
        cutoff: Cutoff frequency, in the same unit as ``fs``.
        fs: Sampling rate.
        order: Butterworth filter order.

    Returns:
        The filtered samples; empty input is returned unchanged.
    """
    if len(data) == 0:
        return data

    nyq = 0.5 * fs
    b, a = butter(order, cutoff / nyq, btype='high', analog=False)

    # filtfilt pads each end with 3 * max(len(a), len(b)) samples by default
    # and raises ValueError when the signal is not longer than that. Clamp
    # the padding so short clips are filtered instead of crashing; longer
    # signals keep the default padding.
    n_samples = np.shape(data)[-1]
    padlen = min(3 * max(len(a), len(b)), max(n_samples - 1, 0))
    return filtfilt(b, a, data, padlen=padlen)
121
+
def normalize_audio(audio_data: np.ndarray) -> np.ndarray:
    """Scale audio so its largest absolute sample is 1, as float32.

    Empty input is returned untouched. Input that is all zeros is converted
    to float32 but not scaled.
    """
    if len(audio_data) == 0:
        return audio_data

    # Work in float32; inputs that already are float32 are used as they are.
    is_float32 = audio_data.dtype == np.float32
    samples = audio_data if is_float32 else audio_data.astype(np.float32)

    peak = np.abs(samples).max()
    return samples / peak if peak > 0 else samples
137
 
138
  # ========================================================
139
  # MAIN EXPORT
140
  # ========================================================
# Names exported by a star-import of this module.
__all__ = [
    'transcribe_with_openai',
    'synthesize_speech',
    'normalize_audio'
]