commit
Browse files- app.py +7 -9
- requirements.txt +0 -2
- speech_io.py +16 -42
app.py
CHANGED
|
@@ -14,12 +14,13 @@ from vectorstore import build_vectorstore
|
|
| 14 |
from retriever import get_retriever
|
| 15 |
from llm import load_llm
|
| 16 |
from rag_pipeline import answer
|
| 17 |
-
from speech_io import transcribe_audio, synthesize_speech,
|
| 18 |
|
| 19 |
# Cấu hình môi trường
|
| 20 |
ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
| 24 |
VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
|
| 25 |
|
|
@@ -141,11 +142,9 @@ def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None)
|
|
| 141 |
if not audio_path or not os.path.exists(audio_path):
|
| 142 |
return ""
|
| 143 |
|
| 144 |
-
if
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
else:
|
| 148 |
-
return transcribe_audio(audio_path, language=language)
|
| 149 |
|
| 150 |
# =====================================================
|
| 151 |
# CONVERSATIONAL INTELLIGENCE
|
|
@@ -730,4 +729,3 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
|
|
| 730 |
|
| 731 |
if __name__ == "__main__":
|
| 732 |
demo.queue().launch(ssr_mode=False, show_error=True)
|
| 733 |
-
|
|
|
|
| 14 |
from retriever import get_retriever
|
| 15 |
from llm import load_llm
|
| 16 |
from rag_pipeline import answer
|
| 17 |
+
from speech_io import transcribe_audio, synthesize_speech, transcribe_with_openai, detect_voice_activity
|
| 18 |
|
| 19 |
# Cấu hình môi trường
|
| 20 |
ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
|
| 21 |
+
USE_OPENAI = os.getenv("USE_OPENAI", "false").lower() == "true"
|
| 22 |
+
USE_REALTIME = os.getenv("USE_REALTIME", "false").lower() == "true"
|
| 23 |
+
REALTIME_SERVER_URL = os.getenv("REALTIME_SERVER_URL", "ws://localhost:8000/ws")
|
| 24 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
| 25 |
VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
|
| 26 |
|
|
|
|
| 142 |
if not audio_path or not os.path.exists(audio_path):
|
| 143 |
return ""
|
| 144 |
|
| 145 |
+
if USE_OPENAI:
|
| 146 |
+
return transcribe_with_openai(audio_path, language=language)
|
| 147 |
+
return transcribe_audio(audio_path, language=language)
|
|
|
|
|
|
|
| 148 |
|
| 149 |
# =====================================================
|
| 150 |
# CONVERSATIONAL INTELLIGENCE
|
|
|
|
| 729 |
|
| 730 |
if __name__ == "__main__":
|
| 731 |
demo.queue().launch(ssr_mode=False, show_error=True)
|
|
|
requirements.txt
CHANGED
|
@@ -14,8 +14,6 @@ langchain-community
|
|
| 14 |
langchain-text-splitters
|
| 15 |
langchain-openai
|
| 16 |
huggingface-hub
|
| 17 |
-
groq
|
| 18 |
-
google-generativeai
|
| 19 |
fastapi
|
| 20 |
uvicorn
|
| 21 |
websockets
|
|
|
|
| 14 |
langchain-text-splitters
|
| 15 |
langchain-openai
|
| 16 |
huggingface-hub
|
|
|
|
|
|
|
| 17 |
fastapi
|
| 18 |
uvicorn
|
| 19 |
websockets
|
speech_io.py
CHANGED
|
@@ -23,11 +23,7 @@ import difflib
|
|
| 23 |
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
|
| 24 |
ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
|
| 25 |
TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
|
| 26 |
-
|
| 27 |
-
# Groq Configuration
|
| 28 |
-
USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
|
| 29 |
-
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
|
| 30 |
-
GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
|
| 31 |
|
| 32 |
# VAD Configuration
|
| 33 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
|
@@ -297,46 +293,24 @@ def get_asr_pipeline():
|
|
| 297 |
)
|
| 298 |
return _asr
|
| 299 |
|
| 300 |
-
def
|
| 301 |
-
"""
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
if not GROQ_API_KEY:
|
| 305 |
-
print(">>> Groq API key nicht gefunden. Verwende lokales Modell.")
|
| 306 |
return transcribe_audio(audio_path, language)
|
| 307 |
-
|
| 308 |
try:
|
| 309 |
-
import
|
| 310 |
-
|
| 311 |
-
with open(audio_path,
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
data['language'] = language
|
| 317 |
-
|
| 318 |
-
headers = {'Authorization': f'Bearer {GROQ_API_KEY}'}
|
| 319 |
-
|
| 320 |
-
print(f">>> Sende Anfrage an Groq API...")
|
| 321 |
-
response = requests.post(
|
| 322 |
-
"https://api.groq.com/openai/v1/audio/transcriptions",
|
| 323 |
-
headers=headers,
|
| 324 |
-
files=files,
|
| 325 |
-
data=data,
|
| 326 |
-
timeout=30
|
| 327 |
)
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
result = response.json()
|
| 331 |
-
text = result.get('text', '').strip()
|
| 332 |
-
print(f">>> Groq Transkription: {text}")
|
| 333 |
-
return text
|
| 334 |
-
else:
|
| 335 |
-
print(f">>> Groq Fehler {response.status_code}")
|
| 336 |
-
return transcribe_audio(audio_path, language)
|
| 337 |
-
|
| 338 |
except Exception as e:
|
| 339 |
-
print(f">>>
|
| 340 |
return transcribe_audio(audio_path, language)
|
| 341 |
|
| 342 |
def transcribe_audio(
|
|
@@ -511,7 +485,7 @@ def fix_domain_terms(text: str) -> str:
|
|
| 511 |
# ========================================================
|
| 512 |
__all__ = [
|
| 513 |
'transcribe_audio',
|
| 514 |
-
'
|
| 515 |
'synthesize_speech',
|
| 516 |
'detect_voice_activity',
|
| 517 |
'normalize_audio',
|
|
|
|
| 23 |
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
|
| 24 |
ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
|
| 25 |
TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
|
| 26 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# VAD Configuration
|
| 29 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
|
|
|
| 293 |
)
|
| 294 |
return _asr
|
| 295 |
|
| 296 |
+
def transcribe_with_openai(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe an audio file with OpenAI's hosted ``whisper-1`` model.

    Falls back to the local ``transcribe_audio`` function when
    ``OPENAI_API_KEY`` is empty or when anything in the OpenAI code path
    raises (import failure, unreadable file, request error).

    Args:
        audio_path: Path of the audio file to upload.
        language: Optional language hint. ``None``, an empty string and
            ``"auto"`` mean that no language hint is sent to OpenAI. The
            original value is still forwarded unchanged to the local
            fallback.

    Returns:
        The transcription text with surrounding whitespace stripped
        (possibly empty), or the result of ``transcribe_audio`` when the
        fallback is used.
    """
    if not OPENAI_API_KEY:
        # Surface the misconfiguration instead of falling back silently.
        print(">>> OpenAI API key nicht gefunden. Verwende lokales Modell.")
        return transcribe_audio(audio_path, language)

    try:
        # Imported inside the try block so that a failing import is handled
        # by the same fallback as a failing request.
        from openai import OpenAI

        client = OpenAI(api_key=OPENAI_API_KEY)

        # Only pass a language hint when there is one: the argument is
        # optional, so it is omitted rather than sent as an explicit None.
        request_kwargs = {"model": "whisper-1"}
        if language and language != "auto":
            request_kwargs["language"] = language

        with open(audio_path, "rb") as audio_file:
            resp = client.audio.transcriptions.create(file=audio_file, **request_kwargs)

        # Accept both an object exposing ``.text`` and a plain dict response.
        txt = getattr(resp, "text", "") or (resp.get("text") if isinstance(resp, dict) else "")
        return (txt or "").strip()
    except Exception as e:
        # Deliberate best-effort boundary: any failure degrades to the local model.
        print(f">>> OpenAI Fehler: {e}")
        return transcribe_audio(audio_path, language)
|
| 315 |
|
| 316 |
def transcribe_audio(
|
|
|
|
| 485 |
# ========================================================
|
| 486 |
__all__ = [
|
| 487 |
'transcribe_audio',
|
| 488 |
+
'transcribe_with_openai',
|
| 489 |
'synthesize_speech',
|
| 490 |
'detect_voice_activity',
|
| 491 |
'normalize_audio',
|