Rulga committed on
Commit
7073ee1
·
1 Parent(s): fb855e1

Refactor language utilities to enhance language support and detection accuracy

Browse files
Files changed (2) hide show
  1. app.py +93 -29
  2. src/language_utils.py +68 -73
app.py CHANGED
@@ -181,31 +181,63 @@ def get_context(message, conversation_id):
181
  logger.error(f"Error getting context: {str(e)}")
182
  return ""
183
 
184
- def translate_with_llm(text: str, target_lang: str) -> str:
185
- """Translate text using the active LLM"""
186
  try:
187
- prompt = (
188
- f"Translate the following text to {target_lang}. "
189
- f"Provide ONLY the direct translation, no explanations or additional text. "
190
- f"Maintain the same tone and style:\n\n{text}"
191
- )
192
 
193
- response = client.chat_completion(
194
- messages=[
195
- {"role": "system", "content": "You are a professional translator."},
196
- {"role": "user", "content": prompt}
197
- ],
198
- max_tokens=ACTIVE_MODEL['parameters']['max_length'],
199
- temperature=0.3,
200
- top_p=0.9,
201
- stream=False
202
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
- return response.choices[0].message.content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  except Exception as e:
207
- logger.error(f"Translation failed: {e}")
208
- return text
209
 
210
  def post_process_response(user_message, bot_response):
211
  """Check if the response language matches the user's language and translate if needed"""
@@ -268,14 +300,39 @@ def load_vector_store():
268
  return None
269
 
270
  def detect_language(text: str) -> str:
271
- """Detect language with fallback"""
272
  try:
273
- if len(text.strip()) < 5:
 
274
  logger.debug(f"Text too short for reliable detection: '{text}'")
275
  return "en"
276
 
277
- return detect(text.strip())
 
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  except Exception as e:
280
  logger.error(f"Language detection error: {str(e)} for text: '{text[:50]}...'")
281
  return "en"
@@ -290,11 +347,12 @@ def respond(
290
  top_p,
291
  attempt_fallback=True
292
  ):
293
- """Generate response with proper error handling"""
294
  try:
295
  # Reset and determine user language for new request
296
  user_lang = detect_language(message)
297
- logger.debug(f"Detected user language: {user_lang}")
 
298
 
299
  # Create clean history without system messages
300
  clean_history = [
@@ -302,9 +360,15 @@ def respond(
302
  if msg["role"] != "system"
303
  ]
304
 
305
- # Add fresh system message with current language instruction
306
- language_instruction = f"\nIMPORTANT: You MUST respond in {user_lang} language ONLY."
307
- full_system_message = system_message + language_instruction
 
 
 
 
 
 
308
 
309
  # --- API Request ---
310
  response = client.chat_completion(
@@ -321,7 +385,7 @@ def respond(
321
 
322
  bot_response = response.choices[0].message.content
323
 
324
- # Post-process response to check language
325
  processed_response = post_process_response(message, bot_response)
326
 
327
  # --- Format Successful Response ---
 
181
  logger.error(f"Error getting context: {str(e)}")
182
  return ""
183
 
184
def post_process_response(user_message, bot_response):
    """Return ``bot_response`` in the user's language, translating if needed.

    The user's language is detected from ``user_message`` and normalized with
    ``LanguageUtils.get_closest_supported_language``. Every other detected
    code (the bot reply, the translated sample) is normalized the same way
    before comparison, so that regional detector codes such as ``zh-cn`` are
    compared as ``zh`` instead of always being reported as a mismatch.

    Args:
        user_message: Text the user sent; used only for language detection.
        bot_response: Reply text produced by the model.

    Returns:
        One of:
        * ``bot_response`` unchanged (user language is English, the reply is
          already in the user's language, or any error occurred);
        * the translated reply;
        * an English apology followed by ``bot_response`` when the language
          is unsupported or the translation could not be verified.
    """
    # NOTE(review): a second ``def post_process_response`` appears right after
    # this one in the diff; if both stay in app.py the later definition
    # shadows this one - confirm which is intended to survive.
    try:
        user_lang = LanguageUtils.get_closest_supported_language(
            detect_language(user_message)
        )
        logger.info(f"User language detected: {user_lang} ({LanguageUtils.get_language_name(user_lang)})")

        # English replies are what the model is asked for; nothing to do.
        if user_lang == 'en':
            return bot_response

        if not LanguageUtils.is_supported(user_lang):
            logger.warning(f"Unsupported language: {user_lang}")
            apology = ("I apologize, but I cannot respond in your language. "
                       "I will answer in English instead.\n\n")
            return apology + bot_response

        # Very short replies are too short to detect reliably: translate
        # them directly instead of guessing their language.
        if len(bot_response.strip()) < 20:
            return translate_with_llm(bot_response, user_lang)

        # Normalize the detected code so it is comparable with user_lang.
        bot_lang = LanguageUtils.get_closest_supported_language(
            detect_language(bot_response)
        )
        logger.info(f"Bot response language: {bot_lang}")

        if bot_lang == user_lang:
            return bot_response

        logger.warning(f"Language mismatch! User: {user_lang}, Bot: {bot_lang}")

        translated_response = translate_with_llm(bot_response, user_lang)

        # Verify the translation on a leading sample (at most 100 characters,
        # at most half of the text) rather than on the whole text.
        sample_size = min(100, len(translated_response) // 2)
        if sample_size > 20:  # Only verify if we have enough text
            sample = translated_response[:sample_size]
            translated_lang = LanguageUtils.get_closest_supported_language(
                detect_language(sample)
            )

            if translated_lang != user_lang:
                logger.error(f"Translation verification failed: got {translated_lang} instead of {user_lang}")
                apology = (f"I apologize, but I cannot translate my response to {LanguageUtils.get_language_name(user_lang)}. "
                           "Here is my answer in English:\n\n")
                return apology + bot_response

        return translated_response

    except Exception as e:
        # Best effort: never let post-processing break the reply itself.
        logger.error(f"Post-processing error: {e}")
        return bot_response
241
 
242
  def post_process_response(user_message, bot_response):
243
  """Check if the response language matches the user's language and translate if needed"""
 
300
  return None
301
 
302
  def detect_language(text: str) -> str:
303
+ """Enhanced language detection with better handling of edge cases"""
304
  try:
305
+ # If text is too short, don't try to detect
306
+ if len(text.strip()) < 10:
307
  logger.debug(f"Text too short for reliable detection: '{text}'")
308
  return "en"
309
 
310
+ # First detection with langdetect
311
+ from langdetect import detect, LangDetectException
312
 
313
+ try:
314
+ lang_code = detect(text.strip())
315
+ logger.debug(f"Detected language: {lang_code}")
316
+
317
+ # Verify detection with confidence check by analyzing a larger portion of text
318
+ if len(text) > 50:
319
+ from langdetect import DetectorFactory
320
+ DetectorFactory.seed = 0 # For consistent results
321
+
322
+ detector = DetectorFactory.create()
323
+ detector.append(text)
324
+ lang_probabilities = detector.get_probabilities()
325
+
326
+ # If top language has low probability, fallback to English
327
+ if lang_probabilities and lang_probabilities[0].prob < 0.5:
328
+ logger.warning(f"Low confidence detection ({lang_probabilities[0].prob:.2f}) for '{lang_code}', defaulting to English")
329
+ return "en"
330
+
331
+ return lang_code
332
+ except LangDetectException as e:
333
+ logger.warning(f"LangDetect exception: {e}")
334
+ return "en"
335
+
336
  except Exception as e:
337
  logger.error(f"Language detection error: {str(e)} for text: '{text[:50]}...'")
338
  return "en"
 
347
  top_p,
348
  attempt_fallback=True
349
  ):
350
+ """Generate response with improved language handling"""
351
  try:
352
  # Reset and determine user language for new request
353
  user_lang = detect_language(message)
354
+ user_lang = LanguageUtils.get_closest_supported_language(user_lang)
355
+ logger.info(f"User language detected for request: {user_lang} ({LanguageUtils.get_language_name(user_lang)})")
356
 
357
  # Create clean history without system messages
358
  clean_history = [
 
360
  if msg["role"] != "system"
361
  ]
362
 
363
+ # Remove language instruction from system message to avoid confusion
364
+ base_system_message = system_message.split("\nIMPORTANT:")[0] if "\nIMPORTANT:" in system_message else system_message
365
+
366
+ # Always request English response, we'll translate later
367
+ full_system_message = (
368
+ f"{base_system_message}\n\n"
369
+ f"IMPORTANT: Always respond in English, no matter what language the user speaks. "
370
+ f"Provide a complete and helpful response - we will handle translation separately."
371
+ )
372
 
373
  # --- API Request ---
374
  response = client.chat_completion(
 
385
 
386
  bot_response = response.choices[0].message.content
387
 
388
+ # Post-process response to translate if needed
389
  processed_response = post_process_response(message, bot_response)
390
 
391
  # --- Format Successful Response ---
src/language_utils.py CHANGED
@@ -10,89 +10,84 @@ DetectorFactory.seed = 0
10
  logger = logging.getLogger(__name__)
11
 
12
  class LanguageUtils:
13
- """Centralized class for language processing"""
14
 
15
- # Supported languages (can be extended)
16
- SUPPORTED_LANGUAGES = ["en", "ru", "uk", "de", "fr", "es", "it", "pt"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  @classmethod
19
- def detect_language(cls, text: str, default: str = "en") -> str:
20
- """
21
- Detects text language with enhanced error handling
22
-
23
- Args:
24
- text: Text to analyze
25
- default: Default language in case of error
26
-
27
- Returns:
28
- Language code (ISO 639-1)
29
- """
30
- try:
31
- # Minimum length for reliable detection
32
- if len(text.strip()) < 15:
33
- logger.warning(f"Text too short for reliable detection: '{text}'")
34
- return default
35
-
36
- lang = detect(text)
37
-
38
- # Check language support
39
- if lang not in cls.SUPPORTED_LANGUAGES:
40
- logger.warning(f"Unsupported language detected: {lang}. Defaulting to {default}")
41
- return default
42
-
43
- logger.debug(f"Detected language: {lang} for text: '{text[:50]}...'")
44
- return lang
45
-
46
- except Exception as e:
47
- logger.error(f"Language detection failed: {str(e)}. Text: '{text[:100]}...'")
48
- return default
49
 
50
  @classmethod
51
- def get_language_instruction(cls, target_lang: str, user_message: str) -> str:
52
  """
53
- Generates strict response language instructions
54
 
55
- Args:
56
- target_lang: Language the bot should respond in
57
- user_message: Original user message
58
-
59
- Returns:
60
- String with prompt instructions
61
  """
62
- instructions = {
63
- "en": f"CRITICAL: Respond in English only. Never switch languages.\n\nOriginal message: {user_message}",
64
- "ru": f"ВАЖНО: Отвечайте только на русском. Не переключайтесь на другие языки.\n\nОригинальное сообщение: {user_message}",
65
- "uk": f"ВАЖЛИВО: Відповідайте лише українською. Не змінюйте мову.\n\nОригінальне повідомлення: {user_message}",
66
- "de": f"KRITISCH: Antworten Sie nur auf Deutsch. Wechseln Sie nie die Sprache.\n\nOriginalnachricht: {user_message}",
67
- "fr": f"CRITIQUE: Répondez uniquement en français. Ne changez jamais de langue.\n\nMessage original: {user_message}",
68
- "es": f"CRÍTICO: Responda sólo en español. Nunca cambie de idioma.\n\nMensaje original: {user_message}",
69
- "it": f"IMPORTANTE: Rispondere solo in italiano. Non cambiare lingua.\n\nMessaggio originale: {user_message}",
70
- "pt": f"CRÍTICO: Responda apenas em português. Nunca mude de idioma.\n\nMensagem original: {user_message}"
 
 
 
 
 
 
 
71
  }
72
 
73
- return instructions.get(target_lang, instructions["en"])
74
 
75
- @classmethod
76
- def validate_response_language(cls, response: str, expected_lang: str) -> bool:
77
- """
78
- Validates if response language matches expected language
79
-
80
- Args:
81
- response: Bot's response
82
- expected_lang: Expected language (ISO 639-1)
83
-
84
- Returns:
85
- True if language matches, False if not
86
- """
87
- try:
88
- detected_lang = cls.detect_language(response)
89
- if detected_lang != expected_lang:
90
- logger.warning(f"Language mismatch! Expected {expected_lang}, got {detected_lang}")
91
- return False
92
- return True
93
- except Exception as e:
94
- logger.error(f"Language validation failed: {str(e)}")
95
- return False
96
-
97
  # Create instance for convenient import
98
  language_processor = LanguageUtils()
 
10
  logger = logging.getLogger(__name__)
11
 
12
  class LanguageUtils:
13
+ """Utility class for language operations"""
14
 
15
+ SUPPORTED_LANGUAGES = {
16
+ # Common European languages
17
+ 'en': 'English',
18
+ 'ru': 'Russian',
19
+ 'de': 'German',
20
+ 'fr': 'French',
21
+ 'es': 'Spanish',
22
+ 'it': 'Italian',
23
+ 'pt': 'Portuguese',
24
+ 'nl': 'Dutch',
25
+ 'pl': 'Polish',
26
+ 'sv': 'Swedish',
27
+ 'no': 'Norwegian',
28
+ 'da': 'Danish',
29
+ 'fi': 'Finnish',
30
+
31
+ # Asian languages
32
+ 'zh': 'Chinese',
33
+ 'ja': 'Japanese',
34
+ 'ko': 'Korean',
35
+
36
+ # Other widely used languages
37
+ 'ar': 'Arabic',
38
+ 'hi': 'Hindi',
39
+ 'tr': 'Turkish',
40
+ 'cs': 'Czech',
41
+ 'uk': 'Ukrainian',
42
+ 'bg': 'Bulgarian',
43
+ 'el': 'Greek',
44
+ 'he': 'Hebrew',
45
+ 'th': 'Thai',
46
+ 'vi': 'Vietnamese',
47
+ 'hu': 'Hungarian',
48
+ 'sk': 'Slovak',
49
+ 'ro': 'Romanian',
50
+ 'id': 'Indonesian',
51
+ 'ms': 'Malay',
52
+ }
53
 
54
  @classmethod
55
+ def get_language_name(cls, lang_code: str) -> str:
56
+ """Get language name from code"""
57
+ return cls.SUPPORTED_LANGUAGES.get(lang_code, "Unknown")
58
+
59
+ @classmethod
60
+ def is_supported(cls, lang_code: str) -> bool:
61
+ """Check if language is supported"""
62
+ return lang_code in cls.SUPPORTED_LANGUAGES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  @classmethod
65
+ def get_closest_supported_language(cls, lang_code: str) -> str:
66
  """
67
+ Get the closest supported language code
68
 
69
+ This helps with similar language detection issues
70
+ like confusing 'no' (Norwegian) with 'da' (Danish)
 
 
 
 
71
  """
72
+ if lang_code in cls.SUPPORTED_LANGUAGES:
73
+ return lang_code
74
+
75
+ # Language mapping for commonly confused languages
76
+ similar_languages = {
77
+ 'nb': 'no', # Norwegian Bokmål Norwegian
78
+ 'nn': 'no', # Norwegian Nynorsk Norwegian
79
+ 'zh-cn': 'zh', # Chinese Simplified Chinese
80
+ 'zh-tw': 'zh', # Chinese Traditional Chinese
81
+ 'hr': 'sr', # Croatian → Serbian (similar)
82
+ 'bs': 'sr', # Bosnian → Serbian (similar)
83
+ 'mk': 'bg', # Macedonian → Bulgarian (similar)
84
+ 'be': 'ru', # Belarusian → Russian (similar)
85
+ 'ca': 'es', # Catalan → Spanish (similar)
86
+ 'gl': 'pt', # Galician → Portuguese (similar)
87
+ 'af': 'nl', # Afrikaans → Dutch (similar)
88
  }
89
 
90
+ return similar_languages.get(lang_code, "en")
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  # Create instance for convenient import
93
  language_processor = LanguageUtils()