Spaces:

ChAbhishek28
/

PensionBot

Sleeping

App Files Files Community

ChAbhishek28 committed on Oct 3, 2025

Commit

2b377e5

1 Parent(s): ecd279c

Add 89999999999999999999999999

Browse files

Files changed (2) hide show

enhanced_websocket_handler.py +106 -15
voice_service.py +3 -6

enhanced_websocket_handler.py CHANGED Viewed

@@ -334,6 +334,86 @@ def select_voice_for_language(user_language: str, preferred_voice: str = None) -
     return voice_map.get(lang_lower, 'en-US-AriaNeural')
 async def handle_enhanced_websocket_connection(websocket: WebSocket):
     """Enhanced WebSocket handler with hybrid LLM and voice features"""
     await websocket.accept()
@@ -696,44 +776,55 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
                 })
                 return
-        # Validate transcription quality
         transcription_quality = validate_transcription_quality(transcribed_text, normalized_language)
         logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text} | Quality: {transcription_quality['score']:.2f}")
         # Send transcription with quality info
         await websocket.send_json({
             "type": "transcription",
-            "text": transcribed_text,
             "language": user_language or "auto-detected",
-            "confidence": transcription_quality['level'],
-            "quality_score": transcription_quality['score'],
-            "suggestions": transcription_quality['suggestions']
         })
         # Handle low-quality transcription with detailed feedback
-        if transcription_quality['score'] < 0.2:
             await websocket.send_json({
                 "type": "transcription_error",
-                "message": f"Could not understand the audio clearly. Transcribed: '{transcribed_text}'. Please try again.",
-                "suggestions": transcription_quality['suggestions'],
                 "quality_details": {
-                    "score": transcription_quality['score'],
-                    "garbled_ratio": transcription_quality.get('garbled_ratio', 0),
-                    "word_count": transcription_quality.get('word_count', 0)
                 }
             })
             return
-        elif transcription_quality['score'] < 0.4:
             # Continue processing but warn user
             await websocket.send_json({
                 "type": "transcription_warning",
-                "message": f"Audio quality is low (Score: {transcription_quality['score']:.2f}). I heard: '{transcribed_text}'. Is this correct?",
-                "suggestions": transcription_quality['suggestions']
             })
         # Add comprehensive language context to the prompt for better responses
         language_context = create_language_context(user_language, normalized_language)
-        enhanced_message = transcribed_text + language_context
         # Process as text message with language context
         if use_hybrid:

     return voice_map.get(lang_lower, 'en-US-AriaNeural')
def attempt_transcription_correction(text: str, quality_info: dict) -> str:
    """Attempt to correct common transcription errors, especially for government terms.

    Each whitespace-separated word is stripped of surrounding ``.,!?;:``
    punctuation, lower-cased and looked up in a fixed table of known
    mis-hearings. Matching words are replaced while keeping the original
    capitalization pattern (ALL CAPS / Title / as-is) and the punctuation
    on both sides of the word.

    Args:
        text: The raw transcription.
        quality_info: Quality dictionary; only the ``'score'`` key is read.
            A missing score is treated as 1 (good quality).

    Returns:
        ``text`` itself, unchanged, when it is empty, when the score is
        above 0.6, or when no word matched the table. Otherwise the
        corrected sentence, with words re-joined by single spaces.
    """
    if not text or quality_info.get('score', 1) > 0.6:
        return text  # Don't correct if quality is already good

    # Known mis-hearings -> intended word. Keys are lower-case.
    # NOTE(review): 'mention', 'were', 'no', 'cruise', 'bruised' are valid
    # English words; they are still rewritten whenever the score is <= 0.6.
    corrections = {
        # Pension-related corrections
        'tension': 'pension',
        'penshun': 'pension',
        'penshan': 'pension',
        'mention': 'pension',
        'bruised': 'rules',
        'bruce': 'rules',
        'brews': 'rules',
        'cruise': 'rules',
        # Policy-related corrections
        'polity': 'policy',
        'polly': 'policy',
        # Government-related corrections
        'goverment': 'government',
        'govermint': 'government',
        # Allowance corrections
        'allowens': 'allowance',
        'alowance': 'allowance',
        # Benefits corrections
        'benifits': 'benefits',
        'benefets': 'benefits',
        # Common words
        'wat': 'what',
        'wot': 'what',
        'wen': 'when',
        'were': 'where',
        'haw': 'how',
        'no': 'know',
        'noe': 'know',
    }
    edge_punctuation = '.,!?;:'

    corrected_words = []
    changed = False
    for word in text.split():
        # Separate the word from punctuation on BOTH sides so that leading
        # punctuation is neither lost nor mistaken for a suffix.
        leading_len = len(word) - len(word.lstrip(edge_punctuation))
        core = word.strip(edge_punctuation)
        replacement = corrections.get(core.lower()) if core else None
        if replacement is None:
            corrected_words.append(word)
            continue

        # Preserve original capitalization pattern
        if core.isupper():
            replacement = replacement.upper()
        elif core.istitle():
            replacement = replacement.capitalize()

        prefix = word[:leading_len]
        suffix = word[leading_len + len(core):]
        corrected_words.append(prefix + replacement + suffix)
        changed = True

    # Return the very same string when nothing matched, so callers that
    # compare the result against the input do not see whitespace
    # normalisation as a "correction".
    if not changed:
        return text
    return ' '.join(corrected_words)
 async def handle_enhanced_websocket_connection(websocket: WebSocket):
     """Enhanced WebSocket handler with hybrid LLM and voice features"""
     await websocket.accept()
                 })
                 return
+        # Validate and potentially correct transcription
         transcription_quality = validate_transcription_quality(transcribed_text, normalized_language)
+        corrected_text = attempt_transcription_correction(transcribed_text, transcription_quality)
+        # Use corrected text if available and quality improved
+        final_text = corrected_text if corrected_text != transcribed_text else transcribed_text
+        final_quality = validate_transcription_quality(final_text, normalized_language) if corrected_text != transcribed_text else transcription_quality
         logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text} | Quality: {transcription_quality['score']:.2f}")
+        if corrected_text != transcribed_text:
+            logger.info(f"🔧 Corrected to: {final_text} | New Quality: {final_quality['score']:.2f}")
         # Send transcription with quality info
         await websocket.send_json({
             "type": "transcription",
+            "text": final_text,
+            "original_text": transcribed_text if corrected_text != transcribed_text else None,
             "language": user_language or "auto-detected",
+            "confidence": final_quality['level'],
+            "quality_score": final_quality['score'],
+            "suggestions": final_quality['suggestions'],
+            "was_corrected": corrected_text != transcribed_text
         })
         # Handle low-quality transcription with detailed feedback
+        if final_quality['score'] < 0.2:
             await websocket.send_json({
                 "type": "transcription_error",
+                "message": f"Could not understand the audio clearly. Transcribed: '{final_text}'. Please try again with clearer speech.",
+                "suggestions": final_quality['suggestions'],
                 "quality_details": {
+                    "score": final_quality['score'],
+                    "garbled_ratio": final_quality.get('garbled_ratio', 0),
+                    "word_count": final_quality.get('word_count', 0)
                 }
             })
             return
+        elif final_quality['score'] < 0.4:
             # Continue processing but warn user
+            correction_note = f" (Auto-corrected from: '{transcribed_text}')" if corrected_text != transcribed_text else ""
             await websocket.send_json({
                 "type": "transcription_warning",
+                "message": f"Audio quality is low (Score: {final_quality['score']:.2f}). I heard: '{final_text}'{correction_note}. Is this correct?",
+                "suggestions": final_quality['suggestions'] + ["Try speaking more slowly", "Ensure microphone is close to your mouth", "Reduce background noise"]
             })
         # Add comprehensive language context to the prompt for better responses
         language_context = create_language_context(user_language, normalized_language)
+        enhanced_message = final_text + language_context
         # Process as text message with language context
         if use_hybrid:

voice_service.py CHANGED Viewed

@@ -231,16 +231,13 @@ class VoiceService:
                 # Use enhanced transcription options for better accuracy
                 transcribe_options = {
-                    "fp16": False,  # Use FP32 for better accuracy on CPU
                     "temperature": 0.0,  # Deterministic output for consistency
-                    "beam_size": 5,  # Better beam search for accuracy
-                    "best_of": 5,  # Generate 5 candidates and pick best one
-                    "patience": 2.0,  # More patience for better results
-                    "suppress_tokens": [-1],  # Suppress silence tokens
                     "condition_on_previous_text": False,  # Don't use previous context to avoid errors
-                    "no_speech_threshold": 0.6,  # Higher threshold to avoid false positives
                     "logprob_threshold": -1.0,  # Lower threshold for better detection
                     "compression_ratio_threshold": 2.4,  # Reasonable compression ratio
                 }
                 if language_code and language_code != 'en':

                 # Use enhanced transcription options for better accuracy
                 transcribe_options = {
+                    "fp16": False,  # Use float32 for better stability
                     "temperature": 0.0,  # Deterministic output for consistency
                     "condition_on_previous_text": False,  # Don't use previous context to avoid errors
+                    "no_speech_threshold": 0.5,  # Lower threshold to catch more speech
                     "logprob_threshold": -1.0,  # Lower threshold for better detection
                     "compression_ratio_threshold": 2.4,  # Reasonable compression ratio
+                    "initial_prompt": "Government pension rules, policy, benefits, allowance, procurement, finance, administration.",  # Context for better recognition
                 }
                 if language_code and language_code != 'en':