Spaces:

ChAbhishek28
/

PensionBot

Runtime error

App Files Files Community

ChAbhishek28 committed on Oct 3, 2025

Commit

9341027

1 Parent(s): dad5387

Add 8999999999999999999999999

Browse files

Files changed (2) hide show

enhanced_websocket_handler.py +76 -21
voice_service.py +12 -7

enhanced_websocket_handler.py CHANGED Viewed

@@ -116,24 +116,44 @@ def validate_transcription_quality(text: str, language: str) -> dict:
         }
     text_clean = text.strip()
     # Quality indicators
-    word_count = len(text_clean.split())
-    avg_word_length = sum(len(word) for word in text_clean.split()) / max(word_count, 1)
-    has_meaningful_words = any(len(word) > 2 for word in text_clean.split())
     # Language-specific checks
     if language in ['en', 'hi-en']:
         # Check for common English/Hinglish patterns
-        common_words = ['the', 'and', 'is', 'in', 'to', 'of', 'for', 'with', 'on', 'pension', 'government']
-        has_common_words = any(word.lower() in common_words for word in text_clean.split())
     else:
         has_common_words = True  # Assume valid for other languages
     # Calculate quality score
     score = 0.0
     if word_count > 0:
-        score += 0.3
     if word_count >= 3:
         score += 0.2
     if avg_word_length > 2:
@@ -141,30 +161,40 @@ def validate_transcription_quality(text: str, language: str) -> dict:
     if has_meaningful_words:
         score += 0.2
     if has_common_words:
-        score += 0.1
-    # Penalize very short or nonsensical text
     if word_count < 2 or avg_word_length < 2:
         score *= 0.5
-    # Determine quality level
-    if score >= 0.8:
         level = "high"
         suggestions = []
-    elif score >= 0.5:
         level = "medium"
         suggestions = ["Speak a bit more clearly for better recognition"]
-    elif score >= 0.3:
         level = "low"
-        suggestions = ["Speak more clearly", "Reduce background noise", "Speak closer to microphone"]
     else:
         level = "very_low"
-        suggestions = ["Audio unclear", "Check microphone", "Reduce noise", "Speak more slowly"]
     return {
         "score": score,
         "level": level,
-        "suggestions": suggestions
     }
 def create_language_context(user_language: str, normalized_language: str) -> str:
@@ -571,10 +601,23 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
                 })
                 return
         else:
-            # Use server-side ASR (Whisper)
             logger.info(f"🎤 Processing audio with language preference: {user_language}")
             transcribed_text = await voice_service.speech_to_text(temp_file_path, normalized_language)
             # Clean up temp file
             Path(temp_file_path).unlink()
@@ -599,14 +642,26 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
             "suggestions": transcription_quality['suggestions']
         })
-        # Handle low-quality transcription
-        if transcription_quality['score'] < 0.3:
             await websocket.send_json({
-                "type": "transcription_warning",
-                "message": "The audio quality seems low. Please speak clearly and try again.",
-                "suggestions": transcription_quality['suggestions']
             })
             return
         # Add comprehensive language context to the prompt for better responses
         language_context = create_language_context(user_language, normalized_language)

         }
     text_clean = text.strip()
+    words = text_clean.split()
     # Quality indicators
+    word_count = len(words)
+    avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
+    has_meaningful_words = any(len(word) > 2 for word in words)
+    # Check for garbled/nonsensical words (too many consonants, unusual patterns)
+    garbled_words = 0
+    for word in words:
+        word_clean = ''.join(c for c in word.lower() if c.isalpha())
+        if len(word_clean) > 3:
+            consonants = sum(1 for c in word_clean if c not in 'aeiou')
+            vowels = len(word_clean) - consonants
+            if consonants > vowels * 2:  # Too many consonants
+                garbled_words += 1
+    garbled_ratio = garbled_words / max(word_count, 1)
     # Language-specific checks
     if language in ['en', 'hi-en']:
         # Check for common English/Hinglish patterns
+        common_words = ['the', 'and', 'is', 'in', 'to', 'of', 'for', 'with', 'on', 'at', 'by', 'from',
+                       'pension', 'government', 'policy', 'rules', 'what', 'how', 'why', 'when', 'where',
+                       'benefits', 'allowance', 'service', 'employee', 'officer', 'department']
+        has_common_words = any(word.lower() in common_words for word in words)
+        # Check for obvious nonsensical combinations
+        nonsensical_patterns = ['benchern', 'trend rules', 'rinterpret', 'wht']
+        has_nonsensical = any(pattern in text_clean.lower() for pattern in nonsensical_patterns)
     else:
         has_common_words = True  # Assume valid for other languages
+        has_nonsensical = False
     # Calculate quality score
     score = 0.0
     if word_count > 0:
+        score += 0.2
     if word_count >= 3:
         score += 0.2
     if avg_word_length > 2:
     if has_meaningful_words:
         score += 0.2
     if has_common_words:
+        score += 0.2
+    # Apply penalties
+    if garbled_ratio > 0.3:  # More than 30% garbled words
+        score *= 0.3
+    elif garbled_ratio > 0.1:  # More than 10% garbled words
+        score *= 0.6
+    if has_nonsensical:
+        score *= 0.2
     if word_count < 2 or avg_word_length < 2:
         score *= 0.5
+    # Determine quality level and suggestions
+    if score >= 0.7:
         level = "high"
         suggestions = []
+    elif score >= 0.4:
         level = "medium"
         suggestions = ["Speak a bit more clearly for better recognition"]
+    elif score >= 0.2:
         level = "low"
+        suggestions = ["Speak more clearly", "Try speaking slower", "Reduce background noise"]
     else:
         level = "very_low"
+        suggestions = ["Audio quality is poor", "Speak closer to microphone", "Reduce background noise", "Try speaking more slowly and clearly"]
     return {
         "score": score,
         "level": level,
+        "suggestions": suggestions,
+        "garbled_ratio": garbled_ratio,
+        "word_count": word_count
     }
 def create_language_context(user_language: str, normalized_language: str) -> str:
                 })
                 return
         else:
+            # Use server-side ASR (Whisper) with multiple attempts if needed
             logger.info(f"🎤 Processing audio with language preference: {user_language}")
             transcribed_text = await voice_service.speech_to_text(temp_file_path, normalized_language)
+            # If transcription seems poor, try with English as fallback
+            if transcribed_text and normalized_language != 'en':
+                quality_check = validate_transcription_quality(transcribed_text, normalized_language)
+                if quality_check['score'] < 0.3:
+                    logger.info("🔄 Trying English transcription as fallback")
+                    english_transcription = await voice_service.speech_to_text(temp_file_path, 'en')
+                    if english_transcription:
+                        english_quality = validate_transcription_quality(english_transcription, 'en')
+                        if english_quality['score'] > quality_check['score'] + 0.2:
+                            logger.info(f"🎯 English transcription better: {english_transcription}")
+                            transcribed_text = english_transcription
+                            normalized_language = 'en'
             # Clean up temp file
             Path(temp_file_path).unlink()
             "suggestions": transcription_quality['suggestions']
         })
+        # Handle low-quality transcription with detailed feedback
+        if transcription_quality['score'] < 0.2:
             await websocket.send_json({
+                "type": "transcription_error",
+                "message": f"Could not understand the audio clearly. Transcribed: '{transcribed_text}'. Please try again.",
+                "suggestions": transcription_quality['suggestions'],
+                "quality_details": {
+                    "score": transcription_quality['score'],
+                    "garbled_ratio": transcription_quality.get('garbled_ratio', 0),
+                    "word_count": transcription_quality.get('word_count', 0)
+                }
             })
             return
+        elif transcription_quality['score'] < 0.4:
+            # Continue processing but warn user
+            await websocket.send_json({
+                "type": "transcription_warning",
+                "message": f"Audio quality is low (Score: {transcription_quality['score']:.2f}). I heard: '{transcribed_text}'. Is this correct?",
+                "suggestions": transcription_quality['suggestions']
+            })
         # Add comprehensive language context to the prompt for better responses
         language_context = create_language_context(user_language, normalized_language)

voice_service.py CHANGED Viewed

@@ -73,10 +73,10 @@ class VoiceService:
                         logger.warning("⚠️ FFmpeg not found - Whisper may not work properly")
                         raise ImportError("FFmpeg not available")
-                    # Use base model for balance between speed and accuracy
-                    self.whisper_model = whisper.load_model("base")
                     self.asr_available = True
-                    logger.info("✅ Whisper ASR initialized (base model for accuracy)")
                 except Exception as whisper_error:
                     logger.error(f"❌ Failed to initialize Whisper: {whisper_error}")
                     logger.info("🔄 Falling back to browser-native ASR")
@@ -232,10 +232,15 @@ class VoiceService:
                 # Use enhanced transcription options for better accuracy
                 transcribe_options = {
                     "fp16": False,  # Use FP32 for better accuracy on CPU
-                    "temperature": 0.0,  # Deterministic output
-                    "best_of": 1,  # Use best transcription
-                    "beam_size": 5,  # Better beam search
-                    "patience": 1.0,  # Wait for better results
                 }
                 if language_code and language_code != 'en':

                         logger.warning("⚠️ FFmpeg not found - Whisper may not work properly")
                         raise ImportError("FFmpeg not available")
+                    # Use small model for better accuracy than base, while maintaining reasonable speed
+                    self.whisper_model = whisper.load_model("small")
                     self.asr_available = True
+                    logger.info("✅ Whisper ASR initialized (small model for better accuracy)")
                 except Exception as whisper_error:
                     logger.error(f"❌ Failed to initialize Whisper: {whisper_error}")
                     logger.info("🔄 Falling back to browser-native ASR")
                 # Use enhanced transcription options for better accuracy
                 transcribe_options = {
                     "fp16": False,  # Use FP32 for better accuracy on CPU
+                    "temperature": 0.0,  # Deterministic output for consistency
+                    "beam_size": 5,  # Better beam search for accuracy
+                    "best_of": 5,  # Generate 5 candidates and pick best one
+                    "patience": 2.0,  # More patience for better results
+                    "suppress_tokens": [-1],  # Suppress silence tokens
+                    "condition_on_previous_text": False,  # Don't use previous context to avoid errors
+                    "no_speech_threshold": 0.6,  # Higher threshold to avoid false positives
+                    "logprob_threshold": -1.0,  # Lower threshold for better detection
+                    "compression_ratio_threshold": 2.4,  # Reasonable compression ratio
                 }
                 if language_code and language_code != 'en':