commit
Browse files- app.py +9 -24
- speech_io.py +48 -112
app.py
CHANGED
|
@@ -14,12 +14,10 @@ from vectorstore import build_vectorstore
|
|
| 14 |
from retriever import get_retriever
|
| 15 |
from llm import load_llm
|
| 16 |
from rag_pipeline import answer
|
| 17 |
-
from speech_io import transcribe_audio, synthesize_speech,
|
| 18 |
|
| 19 |
# Cấu hình môi trường
|
| 20 |
ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
|
| 21 |
-
USE_GROQ = os.getenv("USE_GROQ", "false").lower() == "true"
|
| 22 |
-
GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
|
| 23 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
| 24 |
VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
|
| 25 |
|
|
@@ -137,15 +135,9 @@ def handle_voice_activity(audio_data: Optional[np.ndarray], sample_rate: int) ->
|
|
| 137 |
# TRANSCRIBE WITH OPTIMIZED PIPELINE
|
| 138 |
# =====================================================
|
| 139 |
def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
|
| 140 |
-
"""Transcribe audio với pipeline tối ưu"""
|
| 141 |
if not audio_path or not os.path.exists(audio_path):
|
| 142 |
return ""
|
| 143 |
-
|
| 144 |
-
if USE_GROQ and GROQ_MODEL:
|
| 145 |
-
print("Using Groq for transcription...")
|
| 146 |
-
return transcribe_with_groq(audio_path, language=language)
|
| 147 |
-
else:
|
| 148 |
-
return transcribe_audio(audio_path, language=language)
|
| 149 |
|
| 150 |
# =====================================================
|
| 151 |
# CONVERSATIONAL INTELLIGENCE
|
|
@@ -251,7 +243,7 @@ def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
|
|
| 251 |
if not text_to_process:
|
| 252 |
print("DEBUG: No text to process")
|
| 253 |
# Trả về history hiện tại và status
|
| 254 |
-
status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model:
|
| 255 |
if history is None:
|
| 256 |
history = []
|
| 257 |
return history, "", None, status_text
|
|
@@ -283,7 +275,7 @@ def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
|
|
| 283 |
history.append({"role": "user", "content": text_to_process})
|
| 284 |
history.append({"role": "assistant", "content": error_msg})
|
| 285 |
|
| 286 |
-
status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model:
|
| 287 |
return history, "", None, status_text
|
| 288 |
|
| 289 |
# =====================================================
|
|
@@ -294,13 +286,13 @@ def toggle_vad(use_vad):
|
|
| 294 |
global ENABLE_VAD
|
| 295 |
ENABLE_VAD = use_vad
|
| 296 |
status = "EIN" if use_vad else "AUS"
|
| 297 |
-
return f"Voice Activity Detection: {status} | Model:
|
| 298 |
|
| 299 |
def change_whisper_model(model_size):
|
| 300 |
"""Đổi Whisper model"""
|
| 301 |
state.whisper_model = model_size
|
| 302 |
os.environ["WHISPER_MODEL"] = model_size
|
| 303 |
-
return f"Whisper Model:
|
| 304 |
|
| 305 |
def clear_conversation():
|
| 306 |
"""Xóa hội thoại"""
|
|
@@ -597,7 +589,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
|
|
| 597 |
sources=["microphone"],
|
| 598 |
type="filepath",
|
| 599 |
format="wav",
|
| 600 |
-
streaming=
|
| 601 |
interactive=True,
|
| 602 |
show_label=False,
|
| 603 |
scale=1,
|
|
@@ -701,12 +693,7 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
|
|
| 701 |
outputs=[chat_text, vad_indicator, status_display]
|
| 702 |
)
|
| 703 |
|
| 704 |
-
#
|
| 705 |
-
chat_audio.stream(
|
| 706 |
-
on_audio_change,
|
| 707 |
-
inputs=[chat_audio, vad_toggle],
|
| 708 |
-
outputs=[chat_text, vad_indicator, status_display]
|
| 709 |
-
)
|
| 710 |
|
| 711 |
# TTS Button
|
| 712 |
def handle_tts(history):
|
|
@@ -729,6 +716,4 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache) - Enhanced") as de
|
|
| 729 |
)
|
| 730 |
|
| 731 |
if __name__ == "__main__":
|
| 732 |
-
demo.queue().launch(
|
| 733 |
-
|
| 734 |
-
|
|
|
|
| 14 |
from retriever import get_retriever
|
| 15 |
from llm import load_llm
|
| 16 |
from rag_pipeline import answer
|
| 17 |
+
from speech_io import transcribe_audio, synthesize_speech, detect_voice_activity
|
| 18 |
|
| 19 |
# Cấu hình môi trường
|
| 20 |
ASR_LANGUAGE_HINT = os.getenv("ASR_LANGUAGE", "de")
|
|
|
|
|
|
|
| 21 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
| 22 |
VAD_THRESHOLD = float(os.getenv("VAD_THRESHOLD", "0.3"))
|
| 23 |
|
|
|
|
| 135 |
# TRANSCRIBE WITH OPTIMIZED PIPELINE
|
| 136 |
# =====================================================
|
| 137 |
def transcribe_audio_optimized(audio_path: str, language: Optional[str] = None) -> str:
    """Transcribe the recording at ``audio_path`` by delegating to ``transcribe_audio``.

    Args:
        audio_path: Filesystem path of the recorded audio.
        language: Optional language hint, forwarded unchanged.

    Returns:
        The result of ``transcribe_audio``, or ``""`` when ``audio_path`` is
        empty/None or does not exist on disk.
    """
    # Short-circuits on a falsy path so os.path.exists never receives None.
    path_is_usable = bool(audio_path) and os.path.exists(audio_path)
    if path_is_usable:
        return transcribe_audio(audio_path, language=language)
    return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
# =====================================================
|
| 143 |
# CONVERSATIONAL INTELLIGENCE
|
|
|
|
| 243 |
if not text_to_process:
|
| 244 |
print("DEBUG: No text to process")
|
| 245 |
# Trả về history hiện tại và status
|
| 246 |
+
status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
|
| 247 |
if history is None:
|
| 248 |
history = []
|
| 249 |
return history, "", None, status_text
|
|
|
|
| 275 |
history.append({"role": "user", "content": text_to_process})
|
| 276 |
history.append({"role": "assistant", "content": error_msg})
|
| 277 |
|
| 278 |
+
status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
|
| 279 |
return history, "", None, status_text
|
| 280 |
|
| 281 |
# =====================================================
|
|
|
|
| 286 |
global ENABLE_VAD
|
| 287 |
ENABLE_VAD = use_vad
|
| 288 |
status = "EIN" if use_vad else "AUS"
|
| 289 |
+
return f"Voice Activity Detection: {status} | Model: OpenAI whisper-1"
|
| 290 |
|
| 291 |
def change_whisper_model(model_size):
    """Record the selected Whisper model size and return a status line.

    The value is stored on ``state.whisper_model`` and exported through the
    ``WHISPER_MODEL`` environment variable. The returned text always names
    "OpenAI whisper-1" and does not include ``model_size``.

    Args:
        model_size: Model size identifier chosen by the caller.

    Returns:
        Status string with the model label and the current VAD state.
    """
    # Keep this order: state first, then the environment variable.
    state.whisper_model = model_size
    os.environ["WHISPER_MODEL"] = model_size

    vad_label = "On" if ENABLE_VAD else "Off"
    return f"Whisper Model: OpenAI whisper-1 | VAD: {vad_label}"
|
| 296 |
|
| 297 |
def clear_conversation():
|
| 298 |
"""Xóa hội thoại"""
|
|
|
|
| 589 |
sources=["microphone"],
|
| 590 |
type="filepath",
|
| 591 |
format="wav",
|
| 592 |
+
streaming=False,
|
| 593 |
interactive=True,
|
| 594 |
show_label=False,
|
| 595 |
scale=1,
|
|
|
|
| 693 |
outputs=[chat_text, vad_indicator, status_display]
|
| 694 |
)
|
| 695 |
|
| 696 |
+
# Streaming handler removed; process on change after user stops recording
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
|
| 698 |
# TTS Button
|
| 699 |
def handle_tts(history):
|
|
|
|
| 716 |
)
|
| 717 |
|
| 718 |
if __name__ == "__main__":
|
| 719 |
+
demo.queue().launch(show_error=True)
|
|
|
|
|
|
speech_io.py
CHANGED
|
@@ -24,10 +24,8 @@ WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")
|
|
| 24 |
ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
|
| 25 |
TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
|
| 26 |
|
| 27 |
-
#
|
| 28 |
-
|
| 29 |
-
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
|
| 30 |
-
GROQ_MODEL = os.getenv("GROQ_MODEL", "whisper-large-v3-turbo")
|
| 31 |
|
| 32 |
# VAD Configuration
|
| 33 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
|
@@ -279,65 +277,37 @@ def detect_voice_activity(
|
|
| 279 |
# ========================================================
|
| 280 |
# SPEECH-TO-TEXT FUNCTIONS
|
| 281 |
# ========================================================
|
| 282 |
-
def
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
from transformers import pipeline
|
| 289 |
-
|
| 290 |
-
_asr = pipeline(
|
| 291 |
-
task="automatic-speech-recognition",
|
| 292 |
-
model=ASR_MODEL_ID,
|
| 293 |
-
device="cpu",
|
| 294 |
-
return_timestamps=False,
|
| 295 |
-
chunk_length_s=8,
|
| 296 |
-
stride_length_s=(1, 1),
|
| 297 |
-
)
|
| 298 |
-
return _asr
|
| 299 |
-
|
| 300 |
-
def transcribe_with_groq(audio_path: str, language: Optional[str] = None) -> str:
|
| 301 |
"""
|
| 302 |
-
Transcribe audio
|
| 303 |
"""
|
| 304 |
-
if not
|
| 305 |
-
print(">>>
|
| 306 |
-
return
|
| 307 |
-
|
|
|
|
|
|
|
| 308 |
try:
|
| 309 |
-
import
|
| 310 |
-
|
| 311 |
-
with open(audio_path,
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
headers = {'Authorization': f'Bearer {GROQ_API_KEY}'}
|
| 319 |
-
|
| 320 |
-
print(f">>> Sende Anfrage an Groq API...")
|
| 321 |
-
response = requests.post(
|
| 322 |
-
"https://api.groq.com/openai/v1/audio/transcriptions",
|
| 323 |
-
headers=headers,
|
| 324 |
-
files=files,
|
| 325 |
-
data=data,
|
| 326 |
-
timeout=30
|
| 327 |
)
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
print(f">>> Groq Transkription: {text}")
|
| 333 |
-
return text
|
| 334 |
-
else:
|
| 335 |
-
print(f">>> Groq Fehler {response.status_code}")
|
| 336 |
-
return transcribe_audio(audio_path, language)
|
| 337 |
-
|
| 338 |
except Exception as e:
|
| 339 |
-
print(f">>>
|
| 340 |
-
return
|
| 341 |
|
| 342 |
def transcribe_audio(
|
| 343 |
audio_path: str,
|
|
@@ -421,65 +391,32 @@ def transcribe_audio(
|
|
| 421 |
# ========================================================
|
| 422 |
# TEXT-TO-SPEECH (TTS)
|
| 423 |
# ========================================================
|
| 424 |
-
def get_tts_pipeline():
|
| 425 |
-
"""Lấy TTS pipeline"""
|
| 426 |
-
global _tts
|
| 427 |
-
if _tts is None:
|
| 428 |
-
print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
|
| 429 |
-
|
| 430 |
-
from transformers import pipeline
|
| 431 |
-
|
| 432 |
-
_tts = pipeline(
|
| 433 |
-
task="text-to-speech",
|
| 434 |
-
model=TTS_MODEL_ID,
|
| 435 |
-
)
|
| 436 |
-
return _tts
|
| 437 |
-
|
| 438 |
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
|
| 439 |
"""
|
| 440 |
-
Chuyển text sang speech
|
| 441 |
"""
|
| 442 |
-
if not text or not text.strip() or not TTS_ENABLED:
|
| 443 |
return None
|
| 444 |
-
|
| 445 |
try:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
try:
|
| 464 |
-
audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
|
| 465 |
-
except:
|
| 466 |
-
pass
|
| 467 |
-
|
| 468 |
-
# Normalize
|
| 469 |
-
max_val = np.max(np.abs(audio))
|
| 470 |
-
if max_val > 0:
|
| 471 |
-
audio = audio / max_val
|
| 472 |
-
|
| 473 |
-
# Apply fade
|
| 474 |
-
audio = apply_fade(audio, sr)
|
| 475 |
-
|
| 476 |
-
# Convert to int16
|
| 477 |
-
audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
|
| 478 |
-
|
| 479 |
-
return (sr, audio_int16)
|
| 480 |
-
|
| 481 |
except Exception as e:
|
| 482 |
-
print(f">>> TTS Fehler: {e}")
|
| 483 |
return None
|
| 484 |
|
| 485 |
# ========================================================
|
|
@@ -511,7 +448,6 @@ def fix_domain_terms(text: str) -> str:
|
|
| 511 |
# ========================================================
|
| 512 |
__all__ = [
|
| 513 |
'transcribe_audio',
|
| 514 |
-
'transcribe_with_groq',
|
| 515 |
'synthesize_speech',
|
| 516 |
'detect_voice_activity',
|
| 517 |
'normalize_audio',
|
|
|
|
| 24 |
ASR_MODEL_ID = f"openai/whisper-{WHISPER_MODEL}"
|
| 25 |
TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "facebook/mms-tts-deu")
|
| 26 |
|
| 27 |
+
# OpenAI Configuration
|
| 28 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# VAD Configuration
|
| 31 |
ENABLE_VAD = os.getenv("ENABLE_VAD", "true").lower() == "true"
|
|
|
|
| 277 |
# ========================================================
|
| 278 |
# SPEECH-TO-TEXT FUNCTIONS
|
| 279 |
# ========================================================
|
| 280 |
+
def transcribe_audio(
    audio_path: str,
    language: Optional[str] = None,
    max_duration_s: int = ASR_MAX_DURATION_S
) -> str:
    """
    Transcribe an audio file with the OpenAI transcription API (model "whisper-1").

    Args:
        audio_path: Path of the audio file to upload.
        language: Optional language hint; an empty value or "auto" sends no hint.
        max_duration_s: Accepted for interface compatibility; not used in this body.

    Returns:
        The transcript after ``strip()`` and ``fix_domain_terms``, or "" when the
        file is missing, ``OPENAI_API_KEY`` is empty, or any exception occurs.

    NOTE(review): a second ``def transcribe_audio(`` begins right after this
    function in the same module. If both definitions remain, the later one
    replaces this one at import time - confirm which is intended.
    """
    has_file = bool(audio_path) and os.path.exists(audio_path)
    if not has_file:
        print(">>> Kein Audio gefunden.")
        return ""

    if not OPENAI_API_KEY:
        print(">>> OPENAI_API_KEY nicht gesetzt.")
        return ""

    try:
        from openai import OpenAI

        # Only forward a concrete language code; "auto" means no hint.
        language_hint = language if language and language != "auto" else None

        client = OpenAI(api_key=OPENAI_API_KEY)
        with open(audio_path, "rb") as audio_file:
            resp = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                language=language_hint,
                response_format="text",
            )

        # The response may be an object with .text, a dict, or a plain value.
        if hasattr(resp, "text"):
            raw_text = resp.text
        elif isinstance(resp, dict):
            raw_text = resp.get("text", "")
        else:
            raw_text = str(resp)

        text = fix_domain_terms(raw_text.strip())
        print(f">>> Transkription (OpenAI): {text}")
        return text
    except Exception as e:
        # Best-effort: report the failure and return an empty transcript.
        print(f">>> Transkriptionsfehler (OpenAI): {e}")
        return ""
|
| 311 |
|
| 312 |
def transcribe_audio(
|
| 313 |
audio_path: str,
|
|
|
|
| 391 |
# ========================================================
|
| 392 |
# TEXT-TO-SPEECH (TTS)
|
| 393 |
# ========================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
def synthesize_speech(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """
    Convert text to speech with the OpenAI speech API (model "tts-1", voice "nova").

    Args:
        text: Text to speak; only the first 4000 characters are sent.

    Returns:
        ``(sample_rate, samples)``, or ``None`` when the text is empty or
        whitespace, ``TTS_ENABLED`` or ``OPENAI_API_KEY`` is falsy, or any
        exception occurs. Multi-channel audio is averaged to one channel, and
        float32/float64 samples are scaled and clipped to int16.
    """
    ready = text and text.strip() and TTS_ENABLED and OPENAI_API_KEY
    if not ready:
        return None

    try:
        from openai import OpenAI

        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.audio.speech.create(
            model="tts-1",
            voice="nova",
            input=text[:4000],
            response_format="wav",
        )

        import io

        # Decode the WAV payload in memory.
        wav_bytes = response.content
        with io.BytesIO(wav_bytes) as buffer:
            samples, sample_rate = sf.read(buffer)

        # Down-mix multi-channel audio by averaging across channels.
        if samples.ndim > 1:
            samples = np.mean(samples, axis=1)

        # Float samples are scaled to the int16 range and clipped.
        if samples.dtype in (np.float32, np.float64):
            scaled = np.clip(samples * 32767, -32768, 32767)
            samples = scaled.astype(np.int16)

        return (sample_rate, samples)
    except Exception as e:
        # Best-effort: log the failure and return no audio.
        print(f">>> TTS Fehler (OpenAI): {e}")
        return None
|
| 421 |
|
| 422 |
# ========================================================
|
|
|
|
| 448 |
# ========================================================
|
| 449 |
__all__ = [
|
| 450 |
'transcribe_audio',
|
|
|
|
| 451 |
'synthesize_speech',
|
| 452 |
'detect_voice_activity',
|
| 453 |
'normalize_audio',
|