Spaces:
Sleeping
Sleeping
Commit
·
2b377e5
1
Parent(s):
ecd279c
Add 89999999999999999999999999
Browse files- enhanced_websocket_handler.py +106 -15
- voice_service.py +3 -6
enhanced_websocket_handler.py
CHANGED
|
@@ -334,6 +334,86 @@ def select_voice_for_language(user_language: str, preferred_voice: str = None) -
|
|
| 334 |
|
| 335 |
return voice_map.get(lang_lower, 'en-US-AriaNeural')
|
| 336 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
async def handle_enhanced_websocket_connection(websocket: WebSocket):
|
| 338 |
"""Enhanced WebSocket handler with hybrid LLM and voice features"""
|
| 339 |
await websocket.accept()
|
|
@@ -696,44 +776,55 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
|
|
| 696 |
})
|
| 697 |
return
|
| 698 |
|
| 699 |
-
# Validate transcription
|
| 700 |
transcription_quality = validate_transcription_quality(transcribed_text, normalized_language)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text} | Quality: {transcription_quality['score']:.2f}")
|
|
|
|
|
|
|
| 702 |
|
| 703 |
# Send transcription with quality info
|
| 704 |
await websocket.send_json({
|
| 705 |
"type": "transcription",
|
| 706 |
-
"text":
|
|
|
|
| 707 |
"language": user_language or "auto-detected",
|
| 708 |
-
"confidence":
|
| 709 |
-
"quality_score":
|
| 710 |
-
"suggestions":
|
|
|
|
| 711 |
})
|
| 712 |
|
| 713 |
# Handle low-quality transcription with detailed feedback
|
| 714 |
-
if
|
| 715 |
await websocket.send_json({
|
| 716 |
"type": "transcription_error",
|
| 717 |
-
"message": f"Could not understand the audio clearly. Transcribed: '{
|
| 718 |
-
"suggestions":
|
| 719 |
"quality_details": {
|
| 720 |
-
"score":
|
| 721 |
-
"garbled_ratio":
|
| 722 |
-
"word_count":
|
| 723 |
}
|
| 724 |
})
|
| 725 |
return
|
| 726 |
-
elif
|
| 727 |
# Continue processing but warn user
|
|
|
|
| 728 |
await websocket.send_json({
|
| 729 |
"type": "transcription_warning",
|
| 730 |
-
"message": f"Audio quality is low (Score: {
|
| 731 |
-
"suggestions":
|
| 732 |
})
|
| 733 |
|
| 734 |
# Add comprehensive language context to the prompt for better responses
|
| 735 |
language_context = create_language_context(user_language, normalized_language)
|
| 736 |
-
enhanced_message =
|
| 737 |
|
| 738 |
# Process as text message with language context
|
| 739 |
if use_hybrid:
|
|
|
|
| 334 |
|
| 335 |
return voice_map.get(lang_lower, 'en-US-AriaNeural')
|
| 336 |
|
| 337 |
+
# Punctuation trimmed from both ends of a word before the correction lookup.
_CORRECTION_PUNCTUATION = '.,!?;:'

# Misheard word (lowercase) -> intended word.
# NOTE(review): several keys ('tension', 'mention', 'cruise', 'were', 'no',
# 'bruised', ...) are valid English words, so a low-quality transcript that
# genuinely contains them will be rewritten. The mapping is kept as-is for
# backward compatibility; confirm these entries are really wanted.
_TRANSCRIPTION_CORRECTIONS = {
    # Pension-related corrections
    'tension': 'pension',
    'penshun': 'pension',
    'penshan': 'pension',
    'mention': 'pension',
    'bruised': 'rules',
    'bruce': 'rules',
    'brews': 'rules',
    'cruise': 'rules',

    # Policy-related corrections
    'polity': 'policy',
    'polly': 'policy',

    # Government-related corrections
    'goverment': 'government',
    'govermint': 'government',

    # Allowance corrections
    'allowens': 'allowance',
    'alowance': 'allowance',

    # Benefits corrections
    'benifits': 'benefits',
    'benefets': 'benefits',

    # Common words
    'wat': 'what',
    'wot': 'what',
    'wen': 'when',
    'were': 'where',
    'haw': 'how',
    'no': 'know',
    'noe': 'know',
}


def attempt_transcription_correction(text: str, quality_info: dict) -> str:
    """Attempt to correct common transcription errors, especially for government terms.

    Each whitespace-separated word is looked up (case-insensitively, ignoring
    surrounding ``.,!?;:``) in a fixed table of known mis-hearings. Matching
    words are replaced while keeping the original capitalization pattern and
    any leading/trailing punctuation.

    Args:
        text: The raw transcription.
        quality_info: Quality report for ``text``; only the ``'score'`` key is
            read. A missing score is treated as 1 (good quality).

    Returns:
        ``text`` unchanged when it is empty, when the score is above 0.6, or
        when no word matched the table. Otherwise the corrected sentence, with
        words re-joined by single spaces.
    """
    if not text or quality_info.get('score', 1) > 0.6:
        return text  # Don't correct if quality is already good

    corrected_words = []
    changed = False

    for word in text.split():
        # Split the word into leading punctuation, core and trailing
        # punctuation so both sides survive the replacement.
        without_leading = word.lstrip(_CORRECTION_PUNCTUATION)
        core = without_leading.rstrip(_CORRECTION_PUNCTUATION)

        replacement = _TRANSCRIPTION_CORRECTIONS.get(core.lower())
        if replacement is None:
            corrected_words.append(word)
            continue

        leading = word[:len(word) - len(without_leading)]
        trailing = without_leading[len(core):]

        # Preserve original capitalization pattern
        if core.isupper():
            replacement = replacement.upper()
        elif core.istitle():
            replacement = replacement.capitalize()

        corrected_words.append(leading + replacement + trailing)
        changed = True

    # Return the original object untouched when nothing was replaced, so that
    # whitespace normalization alone is never reported as a correction.
    if not changed:
        return text

    return ' '.join(corrected_words)
|
| 416 |
+
|
| 417 |
async def handle_enhanced_websocket_connection(websocket: WebSocket):
|
| 418 |
"""Enhanced WebSocket handler with hybrid LLM and voice features"""
|
| 419 |
await websocket.accept()
|
|
|
|
| 776 |
})
|
| 777 |
return
|
| 778 |
|
| 779 |
+
# Validate and potentially correct transcription
|
| 780 |
transcription_quality = validate_transcription_quality(transcribed_text, normalized_language)
|
| 781 |
+
corrected_text = attempt_transcription_correction(transcribed_text, transcription_quality)
|
| 782 |
+
|
| 783 |
+
# Use corrected text if available and quality improved
|
| 784 |
+
final_text = corrected_text if corrected_text != transcribed_text else transcribed_text
|
| 785 |
+
final_quality = validate_transcription_quality(final_text, normalized_language) if corrected_text != transcribed_text else transcription_quality
|
| 786 |
+
|
| 787 |
logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text} | Quality: {transcription_quality['score']:.2f}")
|
| 788 |
+
if corrected_text != transcribed_text:
|
| 789 |
+
logger.info(f"🔧 Corrected to: {final_text} | New Quality: {final_quality['score']:.2f}")
|
| 790 |
|
| 791 |
# Send transcription with quality info
|
| 792 |
await websocket.send_json({
|
| 793 |
"type": "transcription",
|
| 794 |
+
"text": final_text,
|
| 795 |
+
"original_text": transcribed_text if corrected_text != transcribed_text else None,
|
| 796 |
"language": user_language or "auto-detected",
|
| 797 |
+
"confidence": final_quality['level'],
|
| 798 |
+
"quality_score": final_quality['score'],
|
| 799 |
+
"suggestions": final_quality['suggestions'],
|
| 800 |
+
"was_corrected": corrected_text != transcribed_text
|
| 801 |
})
|
| 802 |
|
| 803 |
# Handle low-quality transcription with detailed feedback
|
| 804 |
+
if final_quality['score'] < 0.2:
|
| 805 |
await websocket.send_json({
|
| 806 |
"type": "transcription_error",
|
| 807 |
+
"message": f"Could not understand the audio clearly. Transcribed: '{final_text}'. Please try again with clearer speech.",
|
| 808 |
+
"suggestions": final_quality['suggestions'],
|
| 809 |
"quality_details": {
|
| 810 |
+
"score": final_quality['score'],
|
| 811 |
+
"garbled_ratio": final_quality.get('garbled_ratio', 0),
|
| 812 |
+
"word_count": final_quality.get('word_count', 0)
|
| 813 |
}
|
| 814 |
})
|
| 815 |
return
|
| 816 |
+
elif final_quality['score'] < 0.4:
|
| 817 |
# Continue processing but warn user
|
| 818 |
+
correction_note = f" (Auto-corrected from: '{transcribed_text}')" if corrected_text != transcribed_text else ""
|
| 819 |
await websocket.send_json({
|
| 820 |
"type": "transcription_warning",
|
| 821 |
+
"message": f"Audio quality is low (Score: {final_quality['score']:.2f}). I heard: '{final_text}'{correction_note}. Is this correct?",
|
| 822 |
+
"suggestions": final_quality['suggestions'] + ["Try speaking more slowly", "Ensure microphone is close to your mouth", "Reduce background noise"]
|
| 823 |
})
|
| 824 |
|
| 825 |
# Add comprehensive language context to the prompt for better responses
|
| 826 |
language_context = create_language_context(user_language, normalized_language)
|
| 827 |
+
enhanced_message = final_text + language_context
|
| 828 |
|
| 829 |
# Process as text message with language context
|
| 830 |
if use_hybrid:
|
voice_service.py
CHANGED
|
@@ -231,16 +231,13 @@ class VoiceService:
|
|
| 231 |
|
| 232 |
# Use enhanced transcription options for better accuracy
|
| 233 |
transcribe_options = {
|
| 234 |
-
"fp16": False, # Use
|
| 235 |
"temperature": 0.0, # Deterministic output for consistency
|
| 236 |
-
"beam_size": 5, # Better beam search for accuracy
|
| 237 |
-
"best_of": 5, # Generate 5 candidates and pick best one
|
| 238 |
-
"patience": 2.0, # More patience for better results
|
| 239 |
-
"suppress_tokens": [-1], # Suppress silence tokens
|
| 240 |
"condition_on_previous_text": False, # Don't use previous context to avoid errors
|
| 241 |
-
"no_speech_threshold": 0.
|
| 242 |
"logprob_threshold": -1.0, # Lower threshold for better detection
|
| 243 |
"compression_ratio_threshold": 2.4, # Reasonable compression ratio
|
|
|
|
| 244 |
}
|
| 245 |
|
| 246 |
if language_code and language_code != 'en':
|
|
|
|
| 231 |
|
| 232 |
# Use enhanced transcription options for better accuracy
|
| 233 |
transcribe_options = {
|
| 234 |
+
"fp16": False, # Use float32 for better stability
|
| 235 |
"temperature": 0.0, # Deterministic output for consistency
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
"condition_on_previous_text": False, # Don't use previous context to avoid errors
|
| 237 |
+
"no_speech_threshold": 0.5, # Lower threshold to catch more speech
|
| 238 |
"logprob_threshold": -1.0, # Lower threshold for better detection
|
| 239 |
"compression_ratio_threshold": 2.4, # Reasonable compression ratio
|
| 240 |
+
"initial_prompt": "Government pension rules, policy, benefits, allowance, procurement, finance, administration.", # Context for better recognition
|
| 241 |
}
|
| 242 |
|
| 243 |
if language_code and language_code != 'en':
|