Nguyen5 committed on
Commit
4e44ffc
·
1 Parent(s): 5724c84
Files changed (2) hide show
  1. app.py +9 -24
  2. speech_io.py +48 -112
app.py CHANGED
@@ -14,12 +14,10 @@ from vectorstore import build_vectorstore
14
  from retriever import get_retriever
15
  from llm import load_llm
16
  from rag_pipeline import answer
17
- from speech_io import transcribe_audio, synthesize_speech, transcribe_with_groq, detect_voice_activity
18
 
19
  # Cấu hình môi trường
20
  ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
21
- USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
22
- GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
23
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
24
  VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
25
 
@@ -137,15 +135,9 @@ def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) ->
137
  # TRANSCRIBE WITH OPTIMIZED PIPELINE
138
  # =====================================================
139
  def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
140
- """Transcribe audio với pipeline tối ưu"""
141
  if not audio_path or not os.path.exists(audio_path):
142
  return ""
143
-
144
- if USE_GROQ and GROQ_MODEL:
145
- print("Using Groq for transcription...")
146
- return transcribe_with_groq(audio_path, language=language)
147
- else:
148
- return transcribe_audio(audio_path, language=language)
149
 
150
  # =====================================================
151
  # CONVERSATIONAL INTELLIGENCE
@@ -251,7 +243,7 @@ def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
251
  if not text_to_process:
252
  print("DEBUG: No text to process")
253
  # Trả về history hiện tại và status
254
- status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: {state.whisper_model}"
255
  if history is None:
256
  history = []
257
  return history, "", None, status_text
@@ -283,7 +275,7 @@ def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
283
  history.append({"role": "user", "content": text_to_process})
284
  history.append({"role": "assistant", "content": error_msg})
285
 
286
- status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: {state.whisper_model}"
287
  return history, "", None, status_text
288
 
289
  # =====================================================
@@ -294,13 +286,13 @@ def toggle_vad(use_vad):
294
  global ENABLE_VAD
295
  ENABLE_VAD = use_vad
296
  status = "EIN" if use_vad else "AUS"
297
- return f"Voice Activity Detection: {status} | Model: {state.whisper_model}"
298
 
299
  def change_whisper_model(model_size):
300
  """Đổi Whisper model"""
301
  state.whisper_model = model_size
302
  os.environ["WHISPER_MODEL"] = model_size
303
- return f"Whisper Model: {model_size} | VAD: {'On' if ENABLE_VAD else 'Off'}"
304
 
305
  def clear_conversation():
306
  """Xóa hội thoại"""
@@ -597,7 +589,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
597
  sources=["microphone"],
598
  type="filepath",
599
  format="wav",
600
- streaming=True,
601
  interactive=True,
602
  show_label=False,
603
  scale=1,
@@ -701,12 +693,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
701
  outputs=[chat_text, vad_indicator, status_display]
702
  )
703
 
704
- # Audio Streaming
705
- chat_audio.stream(
706
- on_audio_change,
707
- inputs=[chat_audio, vad_toggle],
708
- outputs=[chat_text, vad_indicator, status_display]
709
- )
710
 
711
  # TTS Button
712
  def handle_tts(history):
@@ -729,6 +716,4 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
729
  )
730
 
731
  if __name__ == "__main__":
732
- demo.queue().launch(ssr_mode=False, show_error=True)
733
-
734
-
 
14
  from retriever import get_retriever
15
  from llm import load_llm
16
  from rag_pipeline import answer
17
+ from speech_io import transcribe_audio, synthesize_speech, detect_voice_activity
18
 
19
  # Cấu hình môi trường
20
  ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
 
 
21
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
22
  VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
23
 
 
135
  # TRANSCRIBE WITH OPTIMIZED PIPELINE
136
  # =====================================================
137
def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe an audio file with the optimized pipeline.

    Args:
        audio_path: Path to the recorded audio file.
        language: Optional ISO language hint, forwarded to the ASR backend.

    Returns:
        The transcribed text, or "" when no readable file was provided.
    """
    # Guard: nothing to transcribe without an existing file on disk.
    if not audio_path or not os.path.exists(audio_path):
        return ""
    return transcribe_audio(audio_path, language=language)
 
 
 
 
 
141
 
142
  # =====================================================
143
  # CONVERSATIONAL INTELLIGENCE
 
243
  if not text_to_process:
244
  print("DEBUG: No text to process")
245
  # Trả về history hiện tại và status
246
+ status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
247
  if history is None:
248
  history = []
249
  return history, "", None, status_text
 
275
  history.append({"role": "user", "content": text_to_process})
276
  history.append({"role": "assistant", "content": error_msg})
277
 
278
+ status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
279
  return history, "", None, status_text
280
 
281
  # =====================================================
 
286
  global ENABLE_VAD
287
  ENABLE_VAD = use_vad
288
  status = "EIN" if use_vad else "AUS"
289
+ return f"Voice Activity Detection: {status} | Model: OpenAI whisper-1"
290
 
291
def change_whisper_model(model_size):
    """Record the Whisper model size selected in the UI.

    NOTE(review): transcription now always goes through the OpenAI
    "whisper-1" API, so `model_size` only updates local state and the
    WHISPER_MODEL env var and has no effect on the actual ASR backend —
    confirm whether this selector should be removed from the UI.

    Args:
        model_size: Whisper model size chosen by the user (e.g. "base").

    Returns:
        Status line for the UI footer.
    """
    state.whisper_model = model_size
    os.environ["WHISPER_MODEL"] = model_size
    return f"Whisper Model: OpenAI whisper-1 | VAD: {'On' if ENABLE_VAD else 'Off'}"
296
 
297
  def clear_conversation():
298
  """Xóa hội thoại"""
 
589
  sources=["microphone"],
590
  type="filepath",
591
  format="wav",
592
+ streaming=False,
593
  interactive=True,
594
  show_label=False,
595
  scale=1,
 
693
  outputs=[chat_text, vad_indicator, status_display]
694
  )
695
 
696
+ # Streaming handler removed; process on change after user stops recording
 
 
 
 
 
697
 
698
  # TTS Button
699
  def handle_tts(history):
 
716
  )
717
 
718
  if __name__ == "__main__":
719
+ demo.queue().launch(show_error=True)
 
 
speech_io.py CHANGED
@@ -24,10 +24,8 @@ WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
24
  ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
25
  TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
26
 
27
- # Groq Configuration
28
- USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
29
- GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
30
- GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
31
 
32
  # VAD Configuration
33
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
@@ -279,65 +277,37 @@ def detect_voice_activity(
279
  # ========================================================
280
  # SPEECH-TO-TEXT FUNCTIONS
281
  # ========================================================
282
- def get_asr_pipeline():
283
- """Lấy ASR pipeline"""
284
- global _asr
285
- if _asr is None:
286
- print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
287
-
288
- from transformers import pipeline
289
-
290
- _asr = pipeline(
291
- task="automatic-speech-recognition",
292
- model=ASR_MODEL_ID,
293
- device="cpu",
294
- return_timestamps=False,
295
- chunk_length_s=8,
296
- stride_length_s=(1, 1),
297
- )
298
- return _asr
299
-
300
- def transcribe_with_groq(audio_path: str, language: Optional[str] = None) -> str:
301
  """
302
- Transcribe audio sử dụng Groq Cloud API
303
  """
304
- if not GROQ_API_KEY:
305
- print(">>> Groq API key nicht gefunden. Verwende lokales Modell.")
306
- return transcribe_audio(audio_path, language)
307
-
 
 
308
  try:
309
- import requests
310
-
311
- with open(audio_path, 'rb') as audio_file:
312
- files = {'file': audio_file}
313
- data = {'model': GROQ_MODEL}
314
-
315
- if language and language != 'auto':
316
- data['language'] = language
317
-
318
- headers = {'Authorization': f'Bearer {GROQ_API_KEY}'}
319
-
320
- print(f">>> Sende Anfrage an Groq API...")
321
- response = requests.post(
322
- "https://api.groq.com/openai/v1/audio/transcriptions",
323
- headers=headers,
324
- files=files,
325
- data=data,
326
- timeout=30
327
  )
328
-
329
- if response.status_code == 200:
330
- result = response.json()
331
- text = result.get('text', '').strip()
332
- print(f">>> Groq Transkription: {text}")
333
- return text
334
- else:
335
- print(f">>> Groq Fehler {response.status_code}")
336
- return transcribe_audio(audio_path, language)
337
-
338
  except Exception as e:
339
- print(f">>> Groq Fehler: {e}")
340
- return transcribe_audio(audio_path, language)
341
 
342
  def transcribe_audio(
343
  audio_path: str,
@@ -421,65 +391,32 @@ def transcribe_audio(
421
  # ========================================================
422
  # TEXT-TO-SPEECH (TTS)
423
  # ========================================================
424
- def get_tts_pipeline():
425
- """Lấy TTS pipeline"""
426
- global _tts
427
- if _tts is None:
428
- print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
429
-
430
- from transformers import pipeline
431
-
432
- _tts = pipeline(
433
- task="text-to-speech",
434
- model=TTS_MODEL_ID,
435
- )
436
- return _tts
437
-
438
  def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
439
  """
440
- Chuyển text sang speech
441
  """
442
- if not text or not text.strip() or not TTS_ENABLED:
443
  return None
444
-
445
  try:
446
- tts = get_tts_pipeline()
447
- out = tts(text)
448
-
449
- audio = np.array(out["audio"], dtype=np.float32)
450
- sr = out.get("sampling_rate", 16000)
451
-
452
- # Ensure valid sample rate
453
- if sr is None or sr <= 0:
454
- sr = 16000
455
-
456
- # Ensure mono
457
- if audio.ndim > 1:
458
- audio = audio.squeeze()
459
- if audio.ndim > 1:
460
- audio = audio[:, 0]
461
-
462
- # Apply processing
463
- try:
464
- audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
465
- except:
466
- pass
467
-
468
- # Normalize
469
- max_val = np.max(np.abs(audio))
470
- if max_val > 0:
471
- audio = audio / max_val
472
-
473
- # Apply fade
474
- audio = apply_fade(audio, sr)
475
-
476
- # Convert to int16
477
- audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
478
-
479
- return (sr, audio_int16)
480
-
481
  except Exception as e:
482
- print(f">>> TTS Fehler: {e}")
483
  return None
484
 
485
  # ========================================================
@@ -511,7 +448,6 @@ def fix_domain_terms(text: str) -> str:
511
  # ========================================================
512
  __all__ = [
513
  'transcribe_audio',
514
- 'transcribe_with_groq',
515
  'synthesize_speech',
516
  'detect_voice_activity',
517
  'normalize_audio',
 
24
  ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
25
  TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
26
 
27
+ # OpenAI Configuration
28
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 
 
29
 
30
  # VAD Configuration
31
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
 
277
  # ========================================================
278
  # SPEECH-TO-TEXT FUNCTIONS
279
  # ========================================================
280
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    max_duration_s: int = ASR_MAX_DURATION_S
) -> str:
    """Transcribe an audio file with the OpenAI Whisper API ("whisper-1").

    NOTE(review): a second `def transcribe_audio` (the local Whisper
    pipeline) appears later in this module and shadows this definition at
    import time — one of the two must be removed or renamed.

    Args:
        audio_path: Path to the audio file to transcribe.
        language: Optional ISO-639-1 language hint; None or "auto" lets
            the API detect the language.
        max_duration_s: Kept for interface compatibility; not enforced
            here since the API handles long inputs itself.

    Returns:
        The domain-term-corrected transcript, or "" on any failure.
    """
    if not audio_path or not os.path.exists(audio_path):
        print(">>> Kein Audio gefunden.")
        return ""
    if not OPENAI_API_KEY:
        print(">>> OPENAI_API_KEY nicht gesetzt.")
        return ""
    try:
        from openai import OpenAI

        client = OpenAI(api_key=OPENAI_API_KEY)
        # Omit `language` entirely for auto-detection: the OpenAI SDK
        # expects optional parameters to be absent, not None.
        kwargs = {"model": "whisper-1", "response_format": "text"}
        if language and language != "auto":
            kwargs["language"] = language
        with open(audio_path, "rb") as f:
            resp = client.audio.transcriptions.create(file=f, **kwargs)
        # With response_format="text" current SDK versions return a plain
        # string; keep fallbacks for object/dict-shaped responses.
        if isinstance(resp, str):
            text = resp
        elif hasattr(resp, "text"):
            text = resp.text
        elif isinstance(resp, dict):
            text = resp.get("text", "")
        else:
            text = str(resp)
        text = fix_domain_terms(text.strip())
        print(f">>> Transkription (OpenAI): {text}")
        return text
    except Exception as e:
        print(f">>> Transkriptionsfehler (OpenAI): {e}")
        return ""
311
 
312
  def transcribe_audio(
313
  audio_path: str,
 
391
  # ========================================================
392
  # TEXT-TO-SPEECH (TTS)
393
  # ========================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesize speech for `text` via the OpenAI TTS API.

    Args:
        text: Text to speak; truncated to 4000 characters (the API
            accepts at most 4096).

    Returns:
        A ``(sample_rate, int16 mono samples)`` tuple suitable for a
        Gradio audio component, or None when TTS is disabled, the API
        key is missing, the text is empty, or the request fails.
    """
    if not text or not text.strip() or not TTS_ENABLED or not OPENAI_API_KEY:
        return None
    try:
        import io

        from openai import OpenAI

        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.audio.speech.create(
            model="tts-1",
            voice="nova",
            input=text[:4000],  # stay under the API's 4096-char limit
            response_format="wav",
        )
        with io.BytesIO(response.content) as buf:
            data, sr = sf.read(buf)
        # Downmix to mono for playback.
        if data.ndim > 1:
            data = np.mean(data, axis=1)
        # soundfile returns float samples by default; scale to int16 PCM.
        if data.dtype in (np.float32, np.float64):
            data = np.clip(data * 32767, -32768, 32767).astype(np.int16)
        return (sr, data)
    except Exception as e:
        print(f">>> TTS Fehler (OpenAI): {e}")
        return None
421
 
422
  # ========================================================
 
448
  # ========================================================
449
  __all__ = [
450
  'transcribe_audio',
 
451
  'synthesize_speech',
452
  'detect_voice_activity',
453
  'normalize_audio',