Nguyen5 committed on
Commit
fd831e8
·
1 Parent(s): bca3e7a
Files changed (3) hide show
  1. app.py +7 -9
  2. requirements.txt +0 -2
  3. speech_io.py +16 -42
app.py CHANGED
@@ -14,12 +14,13 @@ from vectorstore import build_vectorstore
14
  from retriever import get_retriever
15
  from llm import load_llm
16
  from rag_pipeline import answer
17
- from speech_io import transcribe_audio, synthesize_speech, transcribe_with_groq, detect_voice_activity
18
 
19
  # Cấu hình môi trường
20
  ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
21
- USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
22
- GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
 
23
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
24
  VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
25
 
@@ -141,11 +142,9 @@ def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None)
141
  if not audio_path or not os.path.exists(audio_path):
142
  return ""
143
 
144
- if USE_GROQ and GROQ_MODEL:
145
- print("Using Groq for transcription...")
146
- return transcribe_with_groq(audio_path, language=language)
147
- else:
148
- return transcribe_audio(audio_path, language=language)
149
 
150
  # =====================================================
151
  # CONVERSATIONAL INTELLIGENCE
@@ -730,4 +729,3 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
730
 
731
  if __name__ == "__main__":
732
  demo.queue().launch(ssr_mode=False, show_error=True)
733
-
 
14
  from retriever import get_retriever
15
  from llm import load_llm
16
  from rag_pipeline import answer
17
+ from speech_io import transcribe_audio, synthesize_speech, transcribe_with_openai, detect_voice_activity
18
 
19
  # Cấu hình môi trường
20
  ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
21
+ USE_OPENAI = os.getenv("USE_OPENAI", "false").lower() == "true"
22
+ USE_REALTIME = os.getenv("USE_REALTIME", "false").lower() == "true"
23
+ REALTIME_SERVER_URL = os.getenv("REALTIME_SERVER_URL", "ws://localhost:8000/ws")
24
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
25
  VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
26
 
 
142
  if not audio_path or not os.path.exists(audio_path):
143
  return ""
144
 
145
+ if USE_OPENAI:
146
+ return transcribe_with_openai(audio_path, language=language)
147
+ return transcribe_audio(audio_path, language=language)
 
 
148
 
149
  # =====================================================
150
  # CONVERSATIONAL INTELLIGENCE
 
729
 
730
  if __name__ == "__main__":
731
  demo.queue().launch(ssr_mode=False, show_error=True)
 
requirements.txt CHANGED
@@ -14,8 +14,6 @@ langchain-community
14
  langchain-text-splitters
15
  langchain-openai
16
  huggingface-hub
17
- groq
18
- google-generativeai
19
  fastapi
20
  uvicorn
21
  websockets
 
14
  langchain-text-splitters
15
  langchain-openai
16
  huggingface-hub
 
 
17
  fastapi
18
  uvicorn
19
  websockets
speech_io.py CHANGED
@@ -23,11 +23,7 @@ import difflib
23
  WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
24
  ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
25
  TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
26
-
27
- # Groq Configuration
28
- USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
29
- GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
30
- GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
31
 
32
  # VAD Configuration
33
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
@@ -297,46 +293,24 @@ def get_asr_pipeline():
297
  )
298
  return _asr
299
 
300
- def transcribe_with_groq(audio_path: str, language: Optional[str] = None) -> str:
301
- """
302
- Transcribe audio sử dụng Groq Cloud API
303
- """
304
- if not GROQ_API_KEY:
305
- print(">>> Groq API key nicht gefunden. Verwende lokales Modell.")
306
  return transcribe_audio(audio_path, language)
307
-
308
  try:
309
- import requests
310
-
311
- with open(audio_path, 'rb') as audio_file:
312
- files = {'file': audio_file}
313
- data = {'model': GROQ_MODEL}
314
-
315
- if language and language != 'auto':
316
- data['language'] = language
317
-
318
- headers = {'Authorization': f'Bearer {GROQ_API_KEY}'}
319
-
320
- print(f">>> Sende Anfrage an Groq API...")
321
- response = requests.post(
322
- "https://api.groq.com/openai/v1/audio/transcriptions",
323
- headers=headers,
324
- files=files,
325
- data=data,
326
- timeout=30
327
  )
328
-
329
- if response.status_code == 200:
330
- result = response.json()
331
- text = result.get('text', '').strip()
332
- print(f">>> Groq Transkription: {text}")
333
- return text
334
- else:
335
- print(f">>> Groq Fehler {response.status_code}")
336
- return transcribe_audio(audio_path, language)
337
-
338
  except Exception as e:
339
- print(f">>> Groq Fehler: {e}")
340
  return transcribe_audio(audio_path, language)
341
 
342
  def transcribe_audio(
@@ -511,7 +485,7 @@ def fix_domain_terms(text: str) -> str:
511
  # ========================================================
512
  __all__ = [
513
  'transcribe_audio',
514
- 'transcribe_with_groq',
515
  'synthesize_speech',
516
  'detect_voice_activity',
517
  'normalize_audio',
 
23
  WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
24
  ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
25
  TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
26
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
 
 
 
 
27
 
28
  # VAD Configuration
29
  ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
 
293
  )
294
  return _asr
295
 
296
def transcribe_with_openai(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe an audio file with OpenAI's hosted Whisper-1 model.

    Falls back to the local ``transcribe_audio`` pipeline when no API key is
    configured or when the remote call fails for any reason (best-effort).

    Args:
        audio_path: Path to the audio file to transcribe.
        language: Optional ISO language hint; ``None`` or ``"auto"`` lets the
            API auto-detect the language.

    Returns:
        The transcribed text (whitespace-stripped), or the local fallback's
        result on missing credentials / remote failure.
    """
    if not OPENAI_API_KEY:
        # No credentials configured -> use the local Whisper pipeline.
        return transcribe_audio(audio_path, language)

    try:
        # Imported lazily so the openai package stays an optional dependency.
        from openai import OpenAI

        client = OpenAI(api_key=OPENAI_API_KEY)

        # Only send `language` when the caller supplied a concrete hint.
        # Passing language=None explicitly serializes a null value in the
        # request; omitting the argument entirely (SDK NOT_GIVEN sentinel)
        # is what triggers server-side language auto-detection.
        extra = {}
        if language and language != "auto":
            extra["language"] = language

        with open(audio_path, "rb") as f:
            resp = client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                **extra,
            )

        # The SDK normally returns an object exposing `.text`; be defensive
        # in case a plain dict comes back (older SDK versions / test doubles).
        txt = getattr(resp, "text", "") or (resp.get("text") if isinstance(resp, dict) else "")
        return (txt or "").strip()
    except Exception as e:
        # Best-effort: any network/SDK error degrades to local transcription.
        print(f">>> OpenAI Fehler: {e}")
        return transcribe_audio(audio_path, language)
315
 
316
  def transcribe_audio(
 
485
  # ========================================================
486
  __all__ = [
487
  'transcribe_audio',
488
+ 'transcribe_with_openai',
489
  'synthesize_speech',
490
  'detect_voice_activity',
491
  'normalize_audio',