ChAbhishek28 committed on
Commit
2b377e5
·
1 Parent(s): ecd279c

Add 89999999999999999999999999

Browse files
Files changed (2) hide show
  1. enhanced_websocket_handler.py +106 -15
  2. voice_service.py +3 -6
enhanced_websocket_handler.py CHANGED
@@ -334,6 +334,86 @@ def select_voice_for_language(user_language: str, preferred_voice: str = None) -
334
 
335
  return voice_map.get(lang_lower, 'en-US-AriaNeural')
336
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  async def handle_enhanced_websocket_connection(websocket: WebSocket):
338
  """Enhanced WebSocket handler with hybrid LLM and voice features"""
339
  await websocket.accept()
@@ -696,44 +776,55 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
696
  })
697
  return
698
 
699
- # Validate transcription quality
700
  transcription_quality = validate_transcription_quality(transcribed_text, normalized_language)
 
 
 
 
 
 
701
  logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text} | Quality: {transcription_quality['score']:.2f}")
 
 
702
 
703
  # Send transcription with quality info
704
  await websocket.send_json({
705
  "type": "transcription",
706
- "text": transcribed_text,
 
707
  "language": user_language or "auto-detected",
708
- "confidence": transcription_quality['level'],
709
- "quality_score": transcription_quality['score'],
710
- "suggestions": transcription_quality['suggestions']
 
711
  })
712
 
713
  # Handle low-quality transcription with detailed feedback
714
- if transcription_quality['score'] < 0.2:
715
  await websocket.send_json({
716
  "type": "transcription_error",
717
- "message": f"Could not understand the audio clearly. Transcribed: '{transcribed_text}'. Please try again.",
718
- "suggestions": transcription_quality['suggestions'],
719
  "quality_details": {
720
- "score": transcription_quality['score'],
721
- "garbled_ratio": transcription_quality.get('garbled_ratio', 0),
722
- "word_count": transcription_quality.get('word_count', 0)
723
  }
724
  })
725
  return
726
- elif transcription_quality['score'] < 0.4:
727
  # Continue processing but warn user
 
728
  await websocket.send_json({
729
  "type": "transcription_warning",
730
- "message": f"Audio quality is low (Score: {transcription_quality['score']:.2f}). I heard: '{transcribed_text}'. Is this correct?",
731
- "suggestions": transcription_quality['suggestions']
732
  })
733
 
734
  # Add comprehensive language context to the prompt for better responses
735
  language_context = create_language_context(user_language, normalized_language)
736
- enhanced_message = transcribed_text + language_context
737
 
738
  # Process as text message with language context
739
  if use_hybrid:
 
334
 
335
  return voice_map.get(lang_lower, 'en-US-AriaNeural')
336
 
337
def attempt_transcription_correction(text: str, quality_info: dict) -> str:
    """Attempt to correct common transcription errors, especially for government terms.

    Args:
        text: Raw transcribed text from the speech-to-text step.
        quality_info: Quality dict (as produced by validate_transcription_quality);
            only the 'score' key (0..1, missing treated as 1) is read here.

    Returns:
        The corrected text when any substitution changed it, otherwise the
        original text unchanged.
    """
    # Skip correction entirely for empty input or already-good transcriptions.
    if not text or quality_info.get('score', 1) > 0.6:
        return text

    # Common mis-hearings mapped to the intended government/service terms.
    # NOTE(review): entries like 'no' -> 'know' and 'were' -> 'where' rewrite
    # valid English words; this is tolerable only because the function runs
    # solely on low-quality transcriptions (score <= 0.6) — confirm before
    # widening that threshold.
    corrections = {
        # Pension-related corrections
        'tension': 'pension',
        'penshun': 'pension',
        'penshan': 'pension',
        'mention': 'pension',
        'bruised': 'rules',
        'bruce': 'rules',
        'brews': 'rules',
        'cruise': 'rules',

        # Policy-related corrections
        'polity': 'policy',
        'polly': 'policy',

        # Government-related corrections
        'goverment': 'government',
        'govermint': 'government',

        # Allowance corrections
        'allowens': 'allowance',
        'alowance': 'allowance',

        # Benefits corrections
        'benifits': 'benefits',
        'benefets': 'benefits',

        # Common words
        'wat': 'what',
        'wot': 'what',
        'wen': 'when',
        'were': 'where',
        'haw': 'how',
        'no': 'know',
        'noe': 'know',
    }

    punct = '.,!?;:'
    corrected_words = []
    for word in text.split():
        # Separate leading AND trailing punctuation so both can be re-attached.
        # (A naive `word[len(core):]` suffix slice corrupts words that carry
        # leading punctuation, e.g. "!tension" -> "pensionn".)
        start = len(word) - len(word.lstrip(punct))
        end = len(word.rstrip(punct))
        core = word[start:end]

        replacement = corrections.get(core.lower())
        if replacement is None:
            # No correction (also covers all-punctuation "words", where core is '').
            corrected_words.append(word)
            continue

        # Preserve the original capitalization pattern of the word.
        if core.isupper():
            replacement = replacement.upper()
        elif core.istitle():
            replacement = replacement.capitalize()
        corrected_words.append(word[:start] + replacement + word[end:])

    final_corrected = ' '.join(corrected_words)

    # Only report a correction if it is significantly different
    # (case-insensitive compare, so capitalization-only churn is ignored).
    if final_corrected.lower() != text.lower():
        return final_corrected

    return text
416
+
417
  async def handle_enhanced_websocket_connection(websocket: WebSocket):
418
  """Enhanced WebSocket handler with hybrid LLM and voice features"""
419
  await websocket.accept()
 
776
  })
777
  return
778
 
779
+ # Validate and potentially correct transcription
780
  transcription_quality = validate_transcription_quality(transcribed_text, normalized_language)
781
+ corrected_text = attempt_transcription_correction(transcribed_text, transcription_quality)
782
+
783
+ # Use corrected text if available and quality improved
784
+ final_text = corrected_text if corrected_text != transcribed_text else transcribed_text
785
+ final_quality = validate_transcription_quality(final_text, normalized_language) if corrected_text != transcribed_text else transcription_quality
786
+
787
  logger.info(f"🎤 Transcribed ({user_language}): {transcribed_text} | Quality: {transcription_quality['score']:.2f}")
788
+ if corrected_text != transcribed_text:
789
+ logger.info(f"🔧 Corrected to: {final_text} | New Quality: {final_quality['score']:.2f}")
790
 
791
  # Send transcription with quality info
792
  await websocket.send_json({
793
  "type": "transcription",
794
+ "text": final_text,
795
+ "original_text": transcribed_text if corrected_text != transcribed_text else None,
796
  "language": user_language or "auto-detected",
797
+ "confidence": final_quality['level'],
798
+ "quality_score": final_quality['score'],
799
+ "suggestions": final_quality['suggestions'],
800
+ "was_corrected": corrected_text != transcribed_text
801
  })
802
 
803
  # Handle low-quality transcription with detailed feedback
804
+ if final_quality['score'] < 0.2:
805
  await websocket.send_json({
806
  "type": "transcription_error",
807
+ "message": f"Could not understand the audio clearly. Transcribed: '{final_text}'. Please try again with clearer speech.",
808
+ "suggestions": final_quality['suggestions'],
809
  "quality_details": {
810
+ "score": final_quality['score'],
811
+ "garbled_ratio": final_quality.get('garbled_ratio', 0),
812
+ "word_count": final_quality.get('word_count', 0)
813
  }
814
  })
815
  return
816
+ elif final_quality['score'] < 0.4:
817
  # Continue processing but warn user
818
+ correction_note = f" (Auto-corrected from: '{transcribed_text}')" if corrected_text != transcribed_text else ""
819
  await websocket.send_json({
820
  "type": "transcription_warning",
821
+ "message": f"Audio quality is low (Score: {final_quality['score']:.2f}). I heard: '{final_text}'{correction_note}. Is this correct?",
822
+ "suggestions": final_quality['suggestions'] + ["Try speaking more slowly", "Ensure microphone is close to your mouth", "Reduce background noise"]
823
  })
824
 
825
  # Add comprehensive language context to the prompt for better responses
826
  language_context = create_language_context(user_language, normalized_language)
827
+ enhanced_message = final_text + language_context
828
 
829
  # Process as text message with language context
830
  if use_hybrid:
voice_service.py CHANGED
@@ -231,16 +231,13 @@ class VoiceService:
231
 
232
  # Use enhanced transcription options for better accuracy
233
  transcribe_options = {
234
- "fp16": False, # Use FP32 for better accuracy on CPU
235
  "temperature": 0.0, # Deterministic output for consistency
236
- "beam_size": 5, # Better beam search for accuracy
237
- "best_of": 5, # Generate 5 candidates and pick best one
238
- "patience": 2.0, # More patience for better results
239
- "suppress_tokens": [-1], # Suppress silence tokens
240
  "condition_on_previous_text": False, # Don't use previous context to avoid errors
241
- "no_speech_threshold": 0.6, # Higher threshold to avoid false positives
242
  "logprob_threshold": -1.0, # Lower threshold for better detection
243
  "compression_ratio_threshold": 2.4, # Reasonable compression ratio
 
244
  }
245
 
246
  if language_code and language_code != 'en':
 
231
 
232
  # Use enhanced transcription options for better accuracy
233
  transcribe_options = {
234
+ "fp16": False, # Use float32 for better stability
235
  "temperature": 0.0, # Deterministic output for consistency
 
 
 
 
236
  "condition_on_previous_text": False, # Don't use previous context to avoid errors
237
+ "no_speech_threshold": 0.5, # Lower threshold to catch more speech
238
  "logprob_threshold": -1.0, # Lower threshold for better detection
239
  "compression_ratio_threshold": 2.4, # Reasonable compression ratio
240
+ "initial_prompt": "Government pension rules, policy, benefits, allowance, procurement, finance, administration.", # Context for better recognition
241
  }
242
 
243
  if language_code and language_code != 'en':