ChAbhishek28 committed on
Commit
9341027
·
1 Parent(s): dad5387

Add 8999999999999999999999999

Browse files
Files changed (2) hide show
  1. enhanced_websocket_handler.py +76 -21
  2. voice_service.py +12 -7
enhanced_websocket_handler.py CHANGED
@@ -116,24 +116,44 @@ def validate_transcription_quality(text: str, language: str) -> dict:
116
  }
117
 
118
  text_clean = text.strip()
 
119
 
120
  # Quality indicators
121
- word_count = len(text_clean.split())
122
- avg_word_length = sum(len(word) for word in text_clean.split()) / max(word_count, 1)
123
- has_meaningful_words = any(len(word) > 2 for word in text_clean.split())
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  # Language-specific checks
126
  if language in ['en', 'hi-en']:
127
  # Check for common English/Hinglish patterns
128
- common_words = ['the', 'and', 'is', 'in', 'to', 'of', 'for', 'with', 'on', 'pension', 'government']
129
- has_common_words = any(word.lower() in common_words for word in text_clean.split())
 
 
 
 
 
 
130
  else:
131
  has_common_words = True # Assume valid for other languages
 
132
 
133
  # Calculate quality score
134
  score = 0.0
135
  if word_count > 0:
136
- score += 0.3
137
  if word_count >= 3:
138
  score += 0.2
139
  if avg_word_length > 2:
@@ -141,30 +161,40 @@ def validate_transcription_quality(text: str, language: str) -> dict:
141
  if has_meaningful_words:
142
  score += 0.2
143
  if has_common_words:
144
- score += 0.1
 
 
 
 
 
 
 
 
 
145
 
146
- # Penalize very short or nonsensical text
147
  if word_count < 2 or avg_word_length < 2:
148
  score *= 0.5
149
 
150
- # Determine quality level
151
- if score >= 0.8:
152
  level = "high"
153
  suggestions = []
154
- elif score >= 0.5:
155
  level = "medium"
156
  suggestions = ["Speak a bit more clearly for better recognition"]
157
- elif score >= 0.3:
158
  level = "low"
159
- suggestions = ["Speak more clearly", "Reduce background noise", "Speak closer to microphone"]
160
  else:
161
  level = "very_low"
162
- suggestions = ["Audio unclear", "Check microphone", "Reduce noise", "Speak more slowly"]
163
 
164
  return {
165
  "score": score,
166
  "level": level,
167
- "suggestions": suggestions
 
 
168
  }
169
 
170
  def create_language_context(user_language: str, normalized_language: str) -> str:
@@ -571,10 +601,23 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
571
  })
572
  return
573
  else:
574
- # Use server-side ASR (Whisper)
575
  logger.info(f"🎀 Processing audio with language preference: {user_language}")
576
  transcribed_text = await voice_service.speech_to_text(temp_file_path, normalized_language)
577
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  # Clean up temp file
579
  Path(temp_file_path).unlink()
580
 
@@ -599,14 +642,26 @@ async def handle_voice_message(websocket: WebSocket, data: dict, session_data: d
599
  "suggestions": transcription_quality['suggestions']
600
  })
601
 
602
- # Handle low-quality transcription
603
- if transcription_quality['score'] < 0.3:
604
  await websocket.send_json({
605
- "type": "transcription_warning",
606
- "message": "The audio quality seems low. Please speak clearly and try again.",
607
- "suggestions": transcription_quality['suggestions']
 
 
 
 
 
608
  })
609
  return
 
 
 
 
 
 
 
610
 
611
  # Add comprehensive language context to the prompt for better responses
612
  language_context = create_language_context(user_language, normalized_language)
 
116
  }
117
 
118
  text_clean = text.strip()
119
+ words = text_clean.split()
120
 
121
  # Quality indicators
122
+ word_count = len(words)
123
+ avg_word_length = sum(len(word) for word in words) / max(word_count, 1)
124
+ has_meaningful_words = any(len(word) > 2 for word in words)
125
+
126
+ # Check for garbled/nonsensical words (too many consonants, unusual patterns)
127
+ garbled_words = 0
128
+ for word in words:
129
+ word_clean = ''.join(c for c in word.lower() if c.isalpha())
130
+ if len(word_clean) > 3:
131
+ consonants = sum(1 for c in word_clean if c not in 'aeiou')
132
+ vowels = len(word_clean) - consonants
133
+ if consonants > vowels * 2: # Too many consonants
134
+ garbled_words += 1
135
+
136
+ garbled_ratio = garbled_words / max(word_count, 1)
137
 
138
  # Language-specific checks
139
  if language in ['en', 'hi-en']:
140
  # Check for common English/Hinglish patterns
141
+ common_words = ['the', 'and', 'is', 'in', 'to', 'of', 'for', 'with', 'on', 'at', 'by', 'from',
142
+ 'pension', 'government', 'policy', 'rules', 'what', 'how', 'why', 'when', 'where',
143
+ 'benefits', 'allowance', 'service', 'employee', 'officer', 'department']
144
+ has_common_words = any(word.lower() in common_words for word in words)
145
+
146
+ # Check for obvious nonsensical combinations
147
+ nonsensical_patterns = ['benchern', 'trend rules', 'rinterpret', 'wht']
148
+ has_nonsensical = any(pattern in text_clean.lower() for pattern in nonsensical_patterns)
149
  else:
150
  has_common_words = True # Assume valid for other languages
151
+ has_nonsensical = False
152
 
153
  # Calculate quality score
154
  score = 0.0
155
  if word_count > 0:
156
+ score += 0.2
157
  if word_count >= 3:
158
  score += 0.2
159
  if avg_word_length > 2:
 
161
  if has_meaningful_words:
162
  score += 0.2
163
  if has_common_words:
164
+ score += 0.2
165
+
166
+ # Apply penalties
167
+ if garbled_ratio > 0.3: # More than 30% garbled words
168
+ score *= 0.3
169
+ elif garbled_ratio > 0.1: # More than 10% garbled words
170
+ score *= 0.6
171
+
172
+ if has_nonsensical:
173
+ score *= 0.2
174
 
 
175
  if word_count < 2 or avg_word_length < 2:
176
  score *= 0.5
177
 
178
+ # Determine quality level and suggestions
179
+ if score >= 0.7:
180
  level = "high"
181
  suggestions = []
182
+ elif score >= 0.4:
183
  level = "medium"
184
  suggestions = ["Speak a bit more clearly for better recognition"]
185
+ elif score >= 0.2:
186
  level = "low"
187
+ suggestions = ["Speak more clearly", "Try speaking slower", "Reduce background noise"]
188
  else:
189
  level = "very_low"
190
+ suggestions = ["Audio quality is poor", "Speak closer to microphone", "Reduce background noise", "Try speaking more slowly and clearly"]
191
 
192
  return {
193
  "score": score,
194
  "level": level,
195
+ "suggestions": suggestions,
196
+ "garbled_ratio": garbled_ratio,
197
+ "word_count": word_count
198
  }
199
 
200
  def create_language_context(user_language: str, normalized_language: str) -> str:
 
601
  })
602
  return
603
  else:
604
+ # Use server-side ASR (Whisper) with multiple attempts if needed
605
  logger.info(f"🎀 Processing audio with language preference: {user_language}")
606
  transcribed_text = await voice_service.speech_to_text(temp_file_path, normalized_language)
607
 
608
+ # If transcription seems poor, try with English as fallback
609
+ if transcribed_text and normalized_language != 'en':
610
+ quality_check = validate_transcription_quality(transcribed_text, normalized_language)
611
+ if quality_check['score'] < 0.3:
612
+ logger.info("πŸ”„ Trying English transcription as fallback")
613
+ english_transcription = await voice_service.speech_to_text(temp_file_path, 'en')
614
+ if english_transcription:
615
+ english_quality = validate_transcription_quality(english_transcription, 'en')
616
+ if english_quality['score'] > quality_check['score'] + 0.2:
617
+ logger.info(f"🎯 English transcription better: {english_transcription}")
618
+ transcribed_text = english_transcription
619
+ normalized_language = 'en'
620
+
621
  # Clean up temp file
622
  Path(temp_file_path).unlink()
623
 
 
642
  "suggestions": transcription_quality['suggestions']
643
  })
644
 
645
+ # Handle low-quality transcription with detailed feedback
646
+ if transcription_quality['score'] < 0.2:
647
  await websocket.send_json({
648
+ "type": "transcription_error",
649
+ "message": f"Could not understand the audio clearly. Transcribed: '{transcribed_text}'. Please try again.",
650
+ "suggestions": transcription_quality['suggestions'],
651
+ "quality_details": {
652
+ "score": transcription_quality['score'],
653
+ "garbled_ratio": transcription_quality.get('garbled_ratio', 0),
654
+ "word_count": transcription_quality.get('word_count', 0)
655
+ }
656
  })
657
  return
658
+ elif transcription_quality['score'] < 0.4:
659
+ # Continue processing but warn user
660
+ await websocket.send_json({
661
+ "type": "transcription_warning",
662
+ "message": f"Audio quality is low (Score: {transcription_quality['score']:.2f}). I heard: '{transcribed_text}'. Is this correct?",
663
+ "suggestions": transcription_quality['suggestions']
664
+ })
665
 
666
  # Add comprehensive language context to the prompt for better responses
667
  language_context = create_language_context(user_language, normalized_language)
voice_service.py CHANGED
@@ -73,10 +73,10 @@ class VoiceService:
73
  logger.warning("⚠️ FFmpeg not found - Whisper may not work properly")
74
  raise ImportError("FFmpeg not available")
75
 
76
- # Use base model for balance between speed and accuracy
77
- self.whisper_model = whisper.load_model("base")
78
  self.asr_available = True
79
- logger.info("βœ… Whisper ASR initialized (base model for accuracy)")
80
  except Exception as whisper_error:
81
  logger.error(f"❌ Failed to initialize Whisper: {whisper_error}")
82
  logger.info("πŸ”„ Falling back to browser-native ASR")
@@ -232,10 +232,15 @@ class VoiceService:
232
  # Use enhanced transcription options for better accuracy
233
  transcribe_options = {
234
  "fp16": False, # Use FP32 for better accuracy on CPU
235
- "temperature": 0.0, # Deterministic output
236
- "best_of": 1, # Use best transcription
237
- "beam_size": 5, # Better beam search
238
- "patience": 1.0, # Wait for better results
 
 
 
 
 
239
  }
240
 
241
  if language_code and language_code != 'en':
 
73
  logger.warning("⚠️ FFmpeg not found - Whisper may not work properly")
74
  raise ImportError("FFmpeg not available")
75
 
76
+ # Use small model for better accuracy than base, while maintaining reasonable speed
77
+ self.whisper_model = whisper.load_model("small")
78
  self.asr_available = True
79
+ logger.info("βœ… Whisper ASR initialized (small model for better accuracy)")
80
  except Exception as whisper_error:
81
  logger.error(f"❌ Failed to initialize Whisper: {whisper_error}")
82
  logger.info("πŸ”„ Falling back to browser-native ASR")
 
232
  # Use enhanced transcription options for better accuracy
233
  transcribe_options = {
234
  "fp16": False, # Use FP32 for better accuracy on CPU
235
+ "temperature": 0.0, # Deterministic output for consistency
236
+ "beam_size": 5, # Better beam search for accuracy
237
+ "best_of": 5, # Generate 5 candidates and pick best one
238
+ "patience": 2.0, # More patience for better results
239
+ "suppress_tokens": [-1], # Suppress silence tokens
240
+ "condition_on_previous_text": False, # Don't use previous context to avoid errors
241
+ "no_speech_threshold": 0.6, # Higher threshold to avoid false positives
242
+ "logprob_threshold": -1.0, # Lower threshold for better detection
243
+ "compression_ratio_threshold": 2.4, # Reasonable compression ratio
244
  }
245
 
246
  if language_code and language_code != 'en':