Spaces:
Runtime error
Runtime error
Commit ·
9341027
1
Parent(s): dad5387
Add 8999999999999999999999999
Browse files
- enhanced_websocket_handler.py +76 -21
- voice_service.py +12 -7
enhanced_websocket_handler.py
CHANGED
|
@@ -116,24 +116,44 @@ def validate_transcription_quality(text: str, language: str) -> dict:
|
|
| 116 |
}
|
| 117 |
|
| 118 |
text_clean = text.strip()
|
|
|
|
| 119 |
|
| 120 |
# Quality indicators
|
| 121 |
-
word_count = len(
|
| 122 |
-
avg_word_length = sum(len(word) for word in
|
| 123 |
-
has_meaningful_words = any(len(word) > 2 for word in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
# Language-specific checks
|
| 126 |
if language in ['en', 'hi-en']:
|
| 127 |
# Check for common English/Hinglish patterns
|
| 128 |
-
common_words = ['the', 'and', 'is', 'in', 'to', 'of', 'for', 'with', 'on', '
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
else:
|
| 131 |
has_common_words = True # Assume valid for other languages
|
|
|
|
| 132 |
|
| 133 |
# Calculate quality score
|
| 134 |
score = 0.0
|
| 135 |
if word_count > 0:
|
| 136 |
-
score += 0.
|
| 137 |
if word_count >= 3:
|
| 138 |
score += 0.2
|
| 139 |
if avg_word_length > 2:
|
|
@@ -141,30 +161,40 @@ def validate_transcription_quality(text: str, language: str) -> dict:
|
|
| 141 |
if has_meaningful_words:
|
| 142 |
score += 0.2
|
| 143 |
if has_common_words:
|
| 144 |
-
score += 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
-
# Penalize very short or nonsensical text
|
| 147 |
if word_count < 2 or avg_word_length < 2:
|
| 148 |
score *= 0.5
|
| 149 |
|
| 150 |
-
# Determine quality level
|
| 151 |
-
if score >= 0.
|
| 152 |
level = "high"
|
| 153 |
suggestions = []
|
| 154 |
-
elif score >= 0.
|
| 155 |
level = "medium"
|
| 156 |
suggestions = ["Speak a bit more clearly for better recognition"]
|
| 157 |
-
elif score >= 0.
|
| 158 |
level = "low"
|
| 159 |
-
suggestions = ["Speak more clearly", "
|
| 160 |
else:
|
| 161 |
level = "very_low"
|
| 162 |
-
suggestions = ["Audio
|
| 163 |
|
| 164 |
return {
|
| 165 |
"score": score,
|
| 166 |
"level": level,
|
| 167 |
-
"suggestions": suggestions
|
|
|
|
|
|
|
| 168 |
}
|
| 169 |
|
| 170 |
def create_language_context(user_language: str, normalized_language: str) -> str:
|
|
@@ -571,10 +601,23 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
|
|
| 571 |
})
|
| 572 |
return
|
| 573 |
else:
|
| 574 |
-
# Use server-side ASR (Whisper)
|
| 575 |
logger.info(f"🎤 Processing audio with language preference: {user_language}")
|
| 576 |
transcribed_text = await voice_service.speech_to_text(temp_file_path, normalized_language)
|
| 577 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
# Clean up temp file
|
| 579 |
Path(temp_file_path).unlink()
|
| 580 |
|
|
@@ -599,14 +642,26 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
|
|
| 599 |
"suggestions": transcription_quality['suggestions']
|
| 600 |
})
|
| 601 |
|
| 602 |
-
# Handle low-quality transcription
|
| 603 |
-
if transcription_quality['score'] < 0.
|
| 604 |
await websocket.send_json({
|
| 605 |
-
"type": "
|
| 606 |
-
"message": "
|
| 607 |
-
"suggestions": transcription_quality['suggestions']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
})
|
| 609 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
|
| 611 |
# Add comprehensive language context to the prompt for better responses
|
| 612 |
language_context = create_language_context(user_language, normalized_language)
|
|
|
|
| 116 |
}
|
| 117 |
|
| 118 |
text_clean = text.strip()
|
| 119 |
+
words = text_clean.split()
|
| 120 |
|
| 121 |
# Quality indicators
|
| 122 |
+
word_count = len(words)
|
| 123 |
+
avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
|
| 124 |
+
has_meaningful_words = any(len(word) > 2 for word in words)
|
| 125 |
+
|
| 126 |
+
# Check for garbled/nonsensical words (too many consonants, unusual patterns)
|
| 127 |
+
garbled_words = 0
|
| 128 |
+
for word in words:
|
| 129 |
+
word_clean = ''.join(c for c in word.lower() if c.isalpha())
|
| 130 |
+
if len(word_clean) > 3:
|
| 131 |
+
consonants = sum(1 for c in word_clean if c not in 'aeiou')
|
| 132 |
+
vowels = len(word_clean) - consonants
|
| 133 |
+
if consonants > vowels * 2: # Too many consonants
|
| 134 |
+
garbled_words += 1
|
| 135 |
+
|
| 136 |
+
garbled_ratio = garbled_words / max(word_count, 1)
|
| 137 |
|
| 138 |
# Language-specific checks
|
| 139 |
if language in ['en', 'hi-en']:
|
| 140 |
# Check for common English/Hinglish patterns
|
| 141 |
+
common_words = ['the', 'and', 'is', 'in', 'to', 'of', 'for', 'with', 'on', 'at', 'by', 'from',
|
| 142 |
+
'pension', 'government', 'policy', 'rules', 'what', 'how', 'why', 'when', 'where',
|
| 143 |
+
'benefits', 'allowance', 'service', 'employee', 'officer', 'department']
|
| 144 |
+
has_common_words = any(word.lower() in common_words for word in words)
|
| 145 |
+
|
| 146 |
+
# Check for obvious nonsensical combinations
|
| 147 |
+
nonsensical_patterns = ['benchern', 'trend rules', 'rinterpret', 'wht']
|
| 148 |
+
has_nonsensical = any(pattern in text_clean.lower() for pattern in nonsensical_patterns)
|
| 149 |
else:
|
| 150 |
has_common_words = True # Assume valid for other languages
|
| 151 |
+
has_nonsensical = False
|
| 152 |
|
| 153 |
# Calculate quality score
|
| 154 |
score = 0.0
|
| 155 |
if word_count > 0:
|
| 156 |
+
score += 0.2
|
| 157 |
if word_count >= 3:
|
| 158 |
score += 0.2
|
| 159 |
if avg_word_length > 2:
|
|
|
|
| 161 |
if has_meaningful_words:
|
| 162 |
score += 0.2
|
| 163 |
if has_common_words:
|
| 164 |
+
score += 0.2
|
| 165 |
+
|
| 166 |
+
# Apply penalties
|
| 167 |
+
if garbled_ratio > 0.3: # More than 30% garbled words
|
| 168 |
+
score *= 0.3
|
| 169 |
+
elif garbled_ratio > 0.1: # More than 10% garbled words
|
| 170 |
+
score *= 0.6
|
| 171 |
+
|
| 172 |
+
if has_nonsensical:
|
| 173 |
+
score *= 0.2
|
| 174 |
|
|
|
|
| 175 |
if word_count < 2 or avg_word_length < 2:
|
| 176 |
score *= 0.5
|
| 177 |
|
| 178 |
+
# Determine quality level and suggestions
|
| 179 |
+
if score >= 0.7:
|
| 180 |
level = "high"
|
| 181 |
suggestions = []
|
| 182 |
+
elif score >= 0.4:
|
| 183 |
level = "medium"
|
| 184 |
suggestions = ["Speak a bit more clearly for better recognition"]
|
| 185 |
+
elif score >= 0.2:
|
| 186 |
level = "low"
|
| 187 |
+
suggestions = ["Speak more clearly", "Try speaking slower", "Reduce background noise"]
|
| 188 |
else:
|
| 189 |
level = "very_low"
|
| 190 |
+
suggestions = ["Audio quality is poor", "Speak closer to microphone", "Reduce background noise", "Try speaking more slowly and clearly"]
|
| 191 |
|
| 192 |
return {
|
| 193 |
"score": score,
|
| 194 |
"level": level,
|
| 195 |
+
"suggestions": suggestions,
|
| 196 |
+
"garbled_ratio": garbled_ratio,
|
| 197 |
+
"word_count": word_count
|
| 198 |
}
|
| 199 |
|
| 200 |
def create_language_context(user_language: str, normalized_language: str) -> str:
|
|
|
|
| 601 |
})
|
| 602 |
return
|
| 603 |
else:
|
| 604 |
+
# Use server-side ASR (Whisper) with multiple attempts if needed
|
| 605 |
logger.info(f"🎤 Processing audio with language preference: {user_language}")
|
| 606 |
transcribed_text = await voice_service.speech_to_text(temp_file_path, normalized_language)
|
| 607 |
|
| 608 |
+
# If transcription seems poor, try with English as fallback
|
| 609 |
+
if transcribed_text and normalized_language != 'en':
|
| 610 |
+
quality_check = validate_transcription_quality(transcribed_text, normalized_language)
|
| 611 |
+
if quality_check['score'] < 0.3:
|
| 612 |
+
logger.info("🔄 Trying English transcription as fallback")
|
| 613 |
+
english_transcription = await voice_service.speech_to_text(temp_file_path, 'en')
|
| 614 |
+
if english_transcription:
|
| 615 |
+
english_quality = validate_transcription_quality(english_transcription, 'en')
|
| 616 |
+
if english_quality['score'] > quality_check['score'] + 0.2:
|
| 617 |
+
logger.info(f"🎯 English transcription better: {english_transcription}")
|
| 618 |
+
transcribed_text = english_transcription
|
| 619 |
+
normalized_language = 'en'
|
| 620 |
+
|
| 621 |
# Clean up temp file
|
| 622 |
Path(temp_file_path).unlink()
|
| 623 |
|
|
|
|
| 642 |
"suggestions": transcription_quality['suggestions']
|
| 643 |
})
|
| 644 |
|
| 645 |
+
# Handle low-quality transcription with detailed feedback
|
| 646 |
+
if transcription_quality['score'] < 0.2:
|
| 647 |
await websocket.send_json({
|
| 648 |
+
"type": "transcription_error",
|
| 649 |
+
"message": f"Could not understand the audio clearly. Transcribed: '{transcribed_text}'. Please try again.",
|
| 650 |
+
"suggestions": transcription_quality['suggestions'],
|
| 651 |
+
"quality_details": {
|
| 652 |
+
"score": transcription_quality['score'],
|
| 653 |
+
"garbled_ratio": transcription_quality.get('garbled_ratio', 0),
|
| 654 |
+
"word_count": transcription_quality.get('word_count', 0)
|
| 655 |
+
}
|
| 656 |
})
|
| 657 |
return
|
| 658 |
+
elif transcription_quality['score'] < 0.4:
|
| 659 |
+
# Continue processing but warn user
|
| 660 |
+
await websocket.send_json({
|
| 661 |
+
"type": "transcription_warning",
|
| 662 |
+
"message": f"Audio quality is low (Score: {transcription_quality['score']:.2f}). I heard: '{transcribed_text}'. Is this correct?",
|
| 663 |
+
"suggestions": transcription_quality['suggestions']
|
| 664 |
+
})
|
| 665 |
|
| 666 |
# Add comprehensive language context to the prompt for better responses
|
| 667 |
language_context = create_language_context(user_language, normalized_language)
|
voice_service.py
CHANGED
|
@@ -73,10 +73,10 @@ class VoiceService:
|
|
| 73 |
logger.warning("⚠️ FFmpeg not found - Whisper may not work properly")
|
| 74 |
raise ImportError("FFmpeg not available")
|
| 75 |
|
| 76 |
-
# Use
|
| 77 |
-
self.whisper_model = whisper.load_model("
|
| 78 |
self.asr_available = True
|
| 79 |
-
logger.info("✅ Whisper ASR initialized (
|
| 80 |
except Exception as whisper_error:
|
| 81 |
logger.error(f"❌ Failed to initialize Whisper: {whisper_error}")
|
| 82 |
logger.info("🔄 Falling back to browser-native ASR")
|
|
@@ -232,10 +232,15 @@ class VoiceService:
|
|
| 232 |
# Use enhanced transcription options for better accuracy
|
| 233 |
transcribe_options = {
|
| 234 |
"fp16": False, # Use FP32 for better accuracy on CPU
|
| 235 |
-
"temperature": 0.0, # Deterministic output
|
| 236 |
-
"
|
| 237 |
-
"
|
| 238 |
-
"patience":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
}
|
| 240 |
|
| 241 |
if language_code and language_code != 'en':
|
|
|
|
| 73 |
logger.warning("⚠️ FFmpeg not found - Whisper may not work properly")
|
| 74 |
raise ImportError("FFmpeg not available")
|
| 75 |
|
| 76 |
+
# Use small model for better accuracy than base, while maintaining reasonable speed
|
| 77 |
+
self.whisper_model = whisper.load_model("small")
|
| 78 |
self.asr_available = True
|
| 79 |
+
logger.info("✅ Whisper ASR initialized (small model for better accuracy)")
|
| 80 |
except Exception as whisper_error:
|
| 81 |
logger.error(f"❌ Failed to initialize Whisper: {whisper_error}")
|
| 82 |
logger.info("🔄 Falling back to browser-native ASR")
|
|
|
|
| 232 |
# Use enhanced transcription options for better accuracy
|
| 233 |
transcribe_options = {
|
| 234 |
"fp16": False, # Use FP32 for better accuracy on CPU
|
| 235 |
+
"temperature": 0.0, # Deterministic output for consistency
|
| 236 |
+
"beam_size": 5, # Better beam search for accuracy
|
| 237 |
+
"best_of": 5, # Generate 5 candidates and pick best one
|
| 238 |
+
"patience": 2.0, # More patience for better results
|
| 239 |
+
"suppress_tokens": [-1], # Suppress silence tokens
|
| 240 |
+
"condition_on_previous_text": False, # Don't use previous context to avoid errors
|
| 241 |
+
"no_speech_threshold": 0.6, # Higher threshold to avoid false positives
|
| 242 |
+
"logprob_threshold": -1.0, # Lower threshold for better detection
|
| 243 |
+
"compression_ratio_threshold": 2.4, # Reasonable compression ratio
|
| 244 |
}
|
| 245 |
|
| 246 |
if language_code and language_code != 'en':
|