commit
Browse files
app.py
CHANGED
|
@@ -183,100 +183,82 @@ def format_sources(src):
|
|
| 183 |
# CORE CHAT-FUNKTION với tất cả tính năng mới
|
| 184 |
# =====================================================
|
| 185 |
def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
|
| 186 |
-
"""
|
| 187 |
-
Main chat function với xử lý VAD và transcription
|
| 188 |
-
"""
|
| 189 |
print(f"DEBUG: chat_fn called - text_input: '{text_input}', audio_path: {audio_path}, history length: {len(history) if history else 0}")
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
text_to_process = ""
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
# Xử lý audio nếu có
|
| 199 |
if audio_path and os.path.exists(audio_path):
|
| 200 |
print(f"DEBUG: Processing audio file: {audio_path}")
|
| 201 |
-
|
| 202 |
-
# Lưu đường dẫn audio vào state
|
| 203 |
state.current_audio_path = audio_path
|
| 204 |
-
|
| 205 |
-
# Kiểm tra VAD nếu được bật
|
| 206 |
if use_vad and ENABLE_VAD:
|
| 207 |
try:
|
| 208 |
import soundfile as sf
|
| 209 |
audio_data, sample_rate = sf.read(audio_path)
|
| 210 |
-
print(f"DEBUG: Audio loaded - shape: {audio_data.shape}, sample_rate: {sample_rate}")
|
| 211 |
-
|
| 212 |
vad_result = handle_voice_activity(audio_data, sample_rate)
|
| 213 |
print(f"DEBUG: VAD result: {vad_result}")
|
| 214 |
-
|
| 215 |
-
# Nếu VAD phát hiện có giọng nói, hoặc nếu VAD không bật, tiến hành transcribe
|
| 216 |
if vad_result.get("is_speech", True):
|
| 217 |
-
# Transcribe audio
|
| 218 |
transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
|
| 219 |
if transcribed_text and transcribed_text.strip():
|
| 220 |
text_to_process = transcribed_text.strip()
|
| 221 |
print(f"DEBUG: Transcribed text: {text_to_process}")
|
| 222 |
-
else:
|
| 223 |
-
print("DEBUG: VAD detected no speech, skipping transcription")
|
| 224 |
except Exception as e:
|
| 225 |
print(f"DEBUG: Error in VAD/transcription: {e}")
|
| 226 |
-
# Fallback: transcribe ngay cả khi có lỗi
|
| 227 |
transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
|
| 228 |
if transcribed_text and transcribed_text.strip():
|
| 229 |
text_to_process = transcribed_text.strip()
|
| 230 |
else:
|
| 231 |
-
# Nếu VAD không bật, transcribe trực tiếp
|
| 232 |
transcribed_text = transcribe_audio_optimized(audio_path, language=lang_sel)
|
| 233 |
if transcribed_text and transcribed_text.strip():
|
| 234 |
text_to_process = transcribed_text.strip()
|
| 235 |
print(f"DEBUG: Transcribed text (no VAD): {text_to_process}")
|
| 236 |
-
|
| 237 |
# Nếu có text input từ textbox, ưu tiên sử dụng nó
|
| 238 |
if text_input and text_input.strip():
|
| 239 |
text_to_process = text_input.strip()
|
| 240 |
print(f"DEBUG: Using text input: {text_to_process}")
|
| 241 |
-
|
| 242 |
-
# Nếu không có gì để xử lý
|
| 243 |
if not text_to_process:
|
| 244 |
-
print("DEBUG: No text to process")
|
| 245 |
-
# Trả về history hiện tại và status
|
| 246 |
status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
|
| 247 |
-
|
| 248 |
-
history = []
|
| 249 |
-
return history, "", None, status_text
|
| 250 |
-
|
| 251 |
print(f"DEBUG: Processing text: {text_to_process}")
|
| 252 |
-
|
| 253 |
-
# Tăng cường context cho câu hỏi
|
| 254 |
-
enhanced_question = enhance_conversation_context(text_to_process, history)
|
| 255 |
-
|
| 256 |
try:
|
| 257 |
-
# RAG-Antwort berechnen
|
| 258 |
ans, sources = answer(enhanced_question, retriever, llm)
|
| 259 |
bot_msg = ans + format_sources(sources)
|
| 260 |
-
|
| 261 |
-
# Thêm vào state
|
| 262 |
state.add_message("user", text_to_process)
|
| 263 |
state.add_message("assistant", ans)
|
| 264 |
-
|
| 265 |
-
# History aktualisieren (ChatGPT-Style)
|
| 266 |
-
history.append({"role": "user", "content": text_to_process})
|
| 267 |
-
history.append({"role": "assistant", "content": bot_msg})
|
| 268 |
-
|
| 269 |
-
print(f"DEBUG: Answer generated, history length: {len(history)}")
|
| 270 |
-
|
| 271 |
except Exception as e:
|
| 272 |
print(f"DEBUG: Error in RAG pipeline: {e}")
|
| 273 |
-
# Fallback response
|
| 274 |
error_msg = "Entschuldigung, es gab einen Fehler bei der Verarbeitung Ihrer Anfrage. Bitte versuchen Sie es erneut."
|
| 275 |
-
|
| 276 |
-
history.append({"role": "assistant", "content": error_msg})
|
| 277 |
-
|
| 278 |
status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"
|
| 279 |
-
return
|
| 280 |
|
| 281 |
# =====================================================
|
| 282 |
# FUNCTIONS FOR UI CONTROLS
|
|
@@ -365,25 +347,19 @@ def handle_audio_stream(audio_path, use_vad):
|
|
| 365 |
# TTS FUNCTION
|
| 366 |
# =====================================================
|
| 367 |
def read_last_answer(history):
|
| 368 |
-
"""Đọc câu trả lời cuối cùng"""
|
| 369 |
if not history:
|
| 370 |
print("DEBUG: No history for TTS")
|
| 371 |
return None
|
| 372 |
-
|
| 373 |
-
# Tìm câu trả lời cuối cùng của assistant
|
| 374 |
for msg in reversed(history):
|
| 375 |
-
if isinstance(msg,
|
| 376 |
-
content = msg
|
| 377 |
-
# Loại bỏ phần sources từ câu trả lời
|
| 378 |
if "## 📚 Quellen" in content:
|
| 379 |
content = content.split("## 📚 Quellen")[0].strip()
|
| 380 |
-
|
| 381 |
print(f"DEBUG: Synthesizing speech for: {content[:100]}...")
|
| 382 |
audio_result = synthesize_speech(content)
|
| 383 |
if audio_result:
|
| 384 |
print("DEBUG: TTS successful")
|
| 385 |
return audio_result
|
| 386 |
-
|
| 387 |
print("DEBUG: No assistant message found for TTS")
|
| 388 |
return None
|
| 389 |
|
|
|
|
# CORE CHAT FUNCTION with all new features
# =====================================================
def chat_fn(text_input, audio_path, history, lang_sel, use_vad):
    """Main chat function handling VAD, transcription and RAG answering.

    Args:
        text_input: text typed into the textbox (takes precedence over audio).
        audio_path: path to a recorded audio file, or None.
        history: chat history, either as [user, assistant] pairs or as
            OpenAI-style {"role": ..., "content": ...} dicts.
        lang_sel: language code passed to the transcriber.
        use_vad: whether to run voice-activity detection before transcribing.

    Returns:
        (pairs_history, cleared_textbox, cleared_audio, status_text)
    """
    print(f"DEBUG: chat_fn called - text_input: '{text_input}', audio_path: {audio_path}, history length: {len(history) if history else 0}")

    # Normalize history to a list of [user, assistant] pairs
    def to_pairs(h):
        if not h:
            return []
        if isinstance(h[0], dict):
            pairs = []
            current = [None, None]
            for m in h:
                if m.get("role") == "user":
                    if current != [None, None]:
                        pairs.append(current)
                    current = [m.get("content", ""), None]
                elif m.get("role") == "assistant":
                    if current[0] is None:
                        # Orphan assistant message with no preceding user turn
                        pairs.append([None, m.get("content", "")])
                    else:
                        current[1] = m.get("content", "")
                        pairs.append(current)
                        current = [None, None]
            if current != [None, None]:
                pairs.append(current)
            return pairs
        return h

    pairs = to_pairs(history)
    text_to_process = ""

    # If no new audio was supplied, fall back to the last recorded file
    if (not audio_path) and state.current_audio_path and os.path.exists(state.current_audio_path):
        audio_path = state.current_audio_path

    # Single transcription helper — was duplicated three times before.
    # Returns the stripped transcript, or "" when nothing usable came back.
    def _transcribe():
        t = transcribe_audio_optimized(audio_path, language=lang_sel)
        return t.strip() if t and t.strip() else ""

    # Process audio if present
    if audio_path and os.path.exists(audio_path):
        print(f"DEBUG: Processing audio file: {audio_path}")
        state.current_audio_path = audio_path
        if use_vad and ENABLE_VAD:
            try:
                import soundfile as sf
                audio_data, sample_rate = sf.read(audio_path)
                vad_result = handle_voice_activity(audio_data, sample_rate)
                print(f"DEBUG: VAD result: {vad_result}")
                # Transcribe only when VAD reports speech (default to True)
                if vad_result.get("is_speech", True):
                    text_to_process = _transcribe()
                    if text_to_process:
                        print(f"DEBUG: Transcribed text: {text_to_process}")
            except Exception as e:
                print(f"DEBUG: Error in VAD/transcription: {e}")
                # Fallback: transcribe even when VAD itself failed
                text_to_process = _transcribe()
        else:
            # VAD disabled — transcribe directly
            text_to_process = _transcribe()
            if text_to_process:
                print(f"DEBUG: Transcribed text (no VAD): {text_to_process}")

    # Text typed into the textbox takes precedence over transcription
    if text_input and text_input.strip():
        text_to_process = text_input.strip()
        print(f"DEBUG: Using text input: {text_to_process}")

    # Status line is identical on every exit path — compute it once
    status_text = f"Bereit | VAD: {'On' if use_vad and ENABLE_VAD else 'Off'} | Model: OpenAI whisper-1"

    # Nothing to process — return the (normalized) history unchanged
    if not text_to_process:
        print("DEBUG: No text to process")
        return pairs, "", None, status_text

    print(f"DEBUG: Processing text: {text_to_process}")
    enhanced_question = enhance_conversation_context(text_to_process, pairs)

    try:
        # Compute the RAG answer
        ans, sources = answer(enhanced_question, retriever, llm)
        bot_msg = ans + format_sources(sources)
        # Record the turn in application state and in the UI history
        state.add_message("user", text_to_process)
        state.add_message("assistant", ans)
        pairs.append([text_to_process, bot_msg])
        print(f"DEBUG: Answer generated, history length: {len(pairs)}")
    except Exception as e:
        print(f"DEBUG: Error in RAG pipeline: {e}")
        # Fallback response shown to the user
        error_msg = "Entschuldigung, es gab einen Fehler bei der Verarbeitung Ihrer Anfrage. Bitte versuchen Sie es erneut."
        pairs.append([text_to_process, error_msg])

    return pairs, "", None, status_text
| 262 |
|
| 263 |
# =====================================================
|
| 264 |
# FUNCTIONS FOR UI CONTROLS
|
|
|
|
# TTS FUNCTION
# =====================================================
def read_last_answer(history):
    """Synthesize speech (TTS) for the most recent assistant answer.

    Accepts chat history either as [user, assistant] pairs or as
    OpenAI-style {"role": ..., "content": ...} dicts, so it works with
    both history formats the app produces.

    Returns:
        The TTS audio result, or None when there is no assistant answer
        or synthesis fails for every candidate.
    """
    if not history:
        print("DEBUG: No history for TTS")
        return None

    # Walk backwards to find the most recent assistant message
    for msg in reversed(history):
        # Extract the assistant text regardless of history format
        content = None
        if isinstance(msg, dict):
            if msg.get("role") == "assistant":
                content = msg.get("content")
        elif isinstance(msg, (list, tuple)) and len(msg) == 2:
            content = msg[1]
        if not content:
            continue

        # Strip the appended sources section before reading aloud
        if "## 📚 Quellen" in content:
            content = content.split("## 📚 Quellen")[0].strip()

        print(f"DEBUG: Synthesizing speech for: {content[:100]}...")
        audio_result = synthesize_speech(content)
        if audio_result:
            print("DEBUG: TTS successful")
            return audio_result

    print("DEBUG: No assistant message found for TTS")
    return None