Spaces:
Sleeping
Sleeping
Peter Yang committed on
Commit ·
e632f5c
1
Parent(s): f04ca50
Add name translation fix for OPUS-MT - correct 'sand geese' to 'Zhang Shaian'
Browse files
- Add _fix_name_translations() method to post-process OPUS-MT translations
- Fix incorrect translation of 章沙雁 (Zhang Shaian) as 'sand geese'
- Handle 'elders of the sand geese' -> 'Zhang Shaian Elder'
- Preserve 'Elder' title when original text contains 长老
- Apply fix automatically after OPUS-MT translation
- document_processing_agent.py +66 -0
document_processing_agent.py
CHANGED
|
@@ -548,6 +548,69 @@ class DocumentProcessingAgent:
|
|
| 548 |
|
| 549 |
return self._qwen_model, self._qwen_tokenizer, self._qwen_device
|
| 550 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
def _validate_translation_quality(self, translation: str, original: str) -> bool:
|
| 552 |
"""Validate translation quality. Returns True if translation is acceptable."""
|
| 553 |
if not translation or len(translation.strip()) < 2:
|
|
@@ -998,6 +1061,9 @@ English:"""
|
|
| 998 |
# Decode result
|
| 999 |
opus_result = tokenizer.decode(translated[0], skip_special_tokens=True)
|
| 1000 |
|
|
|
|
|
|
|
|
|
|
| 1001 |
# HYBRID DECISION: Choose best result
|
| 1002 |
# Prefer Qwen if available and valid, otherwise use OPUS-MT
|
| 1003 |
if qwen_result and self._validate_translation_quality(qwen_result, text):
|
|
|
|
| 548 |
|
| 549 |
return self._qwen_model, self._qwen_tokenizer, self._qwen_device
|
| 550 |
|
| 551 |
+
def _fix_name_translations(self, translation: str, original_text: str) -> str:
|
| 552 |
+
"""Fix known name translation errors in OPUS-MT output.
|
| 553 |
+
|
| 554 |
+
OPUS-MT sometimes incorrectly translates Chinese names. This function
|
| 555 |
+
checks for known incorrect translations and replaces them with correct ones.
|
| 556 |
+
"""
|
| 557 |
+
if not translation:
|
| 558 |
+
return translation
|
| 559 |
+
|
| 560 |
+
# Check if original text contains the Chinese name ็ซ ๆฒ้
|
| 561 |
+
if "็ซ ๆฒ้" not in original_text:
|
| 562 |
+
return translation # No need to fix if name not in original
|
| 563 |
+
|
| 564 |
+
import re
|
| 565 |
+
|
| 566 |
+
# Fix "็ซ ๆฒ้" (Zhang Shaian) mis-translations
|
| 567 |
+
# OPUS-MT translates ็ซ ๆฒ้ as "sand geese" (ๆฒ้ = sand geese)
|
| 568 |
+
corrected = translation
|
| 569 |
+
|
| 570 |
+
# Pattern 1: "elders of the sand geese" -> "Zhang Shaian Elder"
|
| 571 |
+
# This handles: "We have the ceremonial ceremony of the elders of the sand geese here"
|
| 572 |
+
if "้ฟ่" in original_text:
|
| 573 |
+
# Replace "elders of the sand geese" with "Zhang Shaian Elder"
|
| 574 |
+
corrected = re.sub(
|
| 575 |
+
r'\b(?:the\s+)?elders\s+of\s+the\s+sand\s+geese\b',
|
| 576 |
+
'Zhang Shaian Elder',
|
| 577 |
+
corrected,
|
| 578 |
+
flags=re.IGNORECASE
|
| 579 |
+
)
|
| 580 |
+
# Replace "sand geese elder" with "Zhang Shaian Elder"
|
| 581 |
+
corrected = re.sub(
|
| 582 |
+
r'\b(?:the\s+)?sand\s+geese\s+elder\b',
|
| 583 |
+
'Zhang Shaian Elder',
|
| 584 |
+
corrected,
|
| 585 |
+
flags=re.IGNORECASE
|
| 586 |
+
)
|
| 587 |
+
# Replace remaining "sand geese" with "Zhang Shaian" (if ้ฟ่ is present, add Elder)
|
| 588 |
+
corrected = re.sub(
|
| 589 |
+
r'\bsand\s+geese\b',
|
| 590 |
+
'Zhang Shaian',
|
| 591 |
+
corrected,
|
| 592 |
+
flags=re.IGNORECASE
|
| 593 |
+
)
|
| 594 |
+
# If we have "Zhang Shaian" but original had ้ฟ่, make sure we have "Zhang Shaian Elder"
|
| 595 |
+
if "Zhang Shaian" in corrected and "Zhang Shaian Elder" not in corrected:
|
| 596 |
+
# Only add Elder if it's in a context where it makes sense (not in the middle of a sentence)
|
| 597 |
+
corrected = re.sub(
|
| 598 |
+
r'\bZhang\s+Shaian\b(?!\s+Elder)',
|
| 599 |
+
'Zhang Shaian Elder',
|
| 600 |
+
corrected,
|
| 601 |
+
count=1 # Only replace first occurrence to avoid over-correction
|
| 602 |
+
)
|
| 603 |
+
else:
|
| 604 |
+
# If no ้ฟ่, just replace "sand geese" with "Zhang Shaian"
|
| 605 |
+
corrected = re.sub(
|
| 606 |
+
r'\bsand\s+geese\b',
|
| 607 |
+
'Zhang Shaian',
|
| 608 |
+
corrected,
|
| 609 |
+
flags=re.IGNORECASE
|
| 610 |
+
)
|
| 611 |
+
|
| 612 |
+
return corrected
|
| 613 |
+
|
| 614 |
def _validate_translation_quality(self, translation: str, original: str) -> bool:
|
| 615 |
"""Validate translation quality. Returns True if translation is acceptable."""
|
| 616 |
if not translation or len(translation.strip()) < 2:
|
|
|
|
| 1061 |
# Decode result
|
| 1062 |
opus_result = tokenizer.decode(translated[0], skip_special_tokens=True)
|
| 1063 |
|
| 1064 |
+
# Fix known name translation errors in OPUS-MT output
|
| 1065 |
+
opus_result = self._fix_name_translations(opus_result, text)
|
| 1066 |
+
|
| 1067 |
# HYBRID DECISION: Choose best result
|
| 1068 |
# Prefer Qwen if available and valid, otherwise use OPUS-MT
|
| 1069 |
if qwen_result and self._validate_translation_quality(qwen_result, text):
|