Peter Yang committed on
Commit
e632f5c
·
1 Parent(s): f04ca50

Add name translation fix for OPUS-MT - correct 'sand geese' to 'Zhang Shaian'

Browse files

- Add _fix_name_translations() method to post-process OPUS-MT translations
- Fix incorrect translation of 章沙雁 (Zhang Shaian) as 'sand geese'
- Handle 'elders of the sand geese' -> 'Zhang Shaian Elder'
- Preserve 'Elder' title when original text contains 长老
- Apply fix automatically after OPUS-MT translation

Files changed (1) hide show
  1. document_processing_agent.py +66 -0
document_processing_agent.py CHANGED
@@ -548,6 +548,69 @@ class DocumentProcessingAgent:
548
 
549
  return self._qwen_model, self._qwen_tokenizer, self._qwen_device
550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  def _validate_translation_quality(self, translation: str, original: str) -> bool:
552
  """Validate translation quality. Returns True if translation is acceptable."""
553
  if not translation or len(translation.strip()) < 2:
@@ -998,6 +1061,9 @@ English:"""
998
  # Decode result
999
  opus_result = tokenizer.decode(translated[0], skip_special_tokens=True)
1000
 
 
 
 
1001
  # HYBRID DECISION: Choose best result
1002
  # Prefer Qwen if available and valid, otherwise use OPUS-MT
1003
  if qwen_result and self._validate_translation_quality(qwen_result, text):
 
548
 
549
  return self._qwen_model, self._qwen_tokenizer, self._qwen_device
550
 
551
+ def _fix_name_translations(self, translation: str, original_text: str) -> str:
552
+ """Fix known name translation errors in OPUS-MT output.
553
+
554
+ OPUS-MT sometimes incorrectly translates Chinese names. This function
555
+ checks for known incorrect translations and replaces them with correct ones.
556
+ """
557
+ if not translation:
558
+ return translation
559
+
560
+ # Check if original text contains the Chinese name ็ซ ๆฒ™้›
561
+ if "็ซ ๆฒ™้›" not in original_text:
562
+ return translation # No need to fix if name not in original
563
+
564
+ import re
565
+
566
+ # Fix "็ซ ๆฒ™้›" (Zhang Shaian) mis-translations
567
+ # OPUS-MT translates ็ซ ๆฒ™้› as "sand geese" (ๆฒ™้› = sand geese)
568
+ corrected = translation
569
+
570
+ # Pattern 1: "elders of the sand geese" -> "Zhang Shaian Elder"
571
+ # This handles: "We have the ceremonial ceremony of the elders of the sand geese here"
572
+ if "้•ฟ่€" in original_text:
573
+ # Replace "elders of the sand geese" with "Zhang Shaian Elder"
574
+ corrected = re.sub(
575
+ r'\b(?:the\s+)?elders\s+of\s+the\s+sand\s+geese\b',
576
+ 'Zhang Shaian Elder',
577
+ corrected,
578
+ flags=re.IGNORECASE
579
+ )
580
+ # Replace "sand geese elder" with "Zhang Shaian Elder"
581
+ corrected = re.sub(
582
+ r'\b(?:the\s+)?sand\s+geese\s+elder\b',
583
+ 'Zhang Shaian Elder',
584
+ corrected,
585
+ flags=re.IGNORECASE
586
+ )
587
+ # Replace remaining "sand geese" with "Zhang Shaian" (if ้•ฟ่€ is present, add Elder)
588
+ corrected = re.sub(
589
+ r'\bsand\s+geese\b',
590
+ 'Zhang Shaian',
591
+ corrected,
592
+ flags=re.IGNORECASE
593
+ )
594
+ # If we have "Zhang Shaian" but original had ้•ฟ่€, make sure we have "Zhang Shaian Elder"
595
+ if "Zhang Shaian" in corrected and "Zhang Shaian Elder" not in corrected:
596
+ # Only add Elder if it's in a context where it makes sense (not in the middle of a sentence)
597
+ corrected = re.sub(
598
+ r'\bZhang\s+Shaian\b(?!\s+Elder)',
599
+ 'Zhang Shaian Elder',
600
+ corrected,
601
+ count=1 # Only replace first occurrence to avoid over-correction
602
+ )
603
+ else:
604
+ # If no ้•ฟ่€, just replace "sand geese" with "Zhang Shaian"
605
+ corrected = re.sub(
606
+ r'\bsand\s+geese\b',
607
+ 'Zhang Shaian',
608
+ corrected,
609
+ flags=re.IGNORECASE
610
+ )
611
+
612
+ return corrected
613
+
614
  def _validate_translation_quality(self, translation: str, original: str) -> bool:
615
  """Validate translation quality. Returns True if translation is acceptable."""
616
  if not translation or len(translation.strip()) < 2:
 
1061
  # Decode result
1062
  opus_result = tokenizer.decode(translated[0], skip_special_tokens=True)
1063
 
1064
+ # Fix known name translation errors in OPUS-MT output
1065
+ opus_result = self._fix_name_translations(opus_result, text)
1066
+
1067
  # HYBRID DECISION: Choose best result
1068
  # Prefer Qwen if available and valid, otherwise use OPUS-MT
1069
  if qwen_result and self._validate_translation_quality(qwen_result, text):