Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -86,63 +86,20 @@ def detect_language(text: str) -> str:
|
|
| 86 |
return "en"
|
| 87 |
|
| 88 |
def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
|
| 89 |
-
"""Replace protected terms with placeholders
|
| 90 |
modified_text = text
|
| 91 |
replacements = {}
|
| 92 |
-
|
| 93 |
for i, term in enumerate(protected_terms):
|
| 94 |
-
|
| 95 |
-
placeholder = f"PROTECTEDTERM{i}PLACEHOLDER"
|
| 96 |
replacements[placeholder] = term
|
| 97 |
-
|
| 98 |
-
# Use multiple patterns to catch the term
|
| 99 |
-
patterns = [
|
| 100 |
-
# Exact match with word boundaries
|
| 101 |
-
r'\b' + re.escape(term) + r'\b',
|
| 102 |
-
# Case insensitive match
|
| 103 |
-
r'(?i)\b' + re.escape(term) + r'\b',
|
| 104 |
-
# Match with potential spaces/punctuation
|
| 105 |
-
re.escape(term).replace(r'\ ', r'\s+'),
|
| 106 |
-
]
|
| 107 |
-
|
| 108 |
-
for pattern in patterns:
|
| 109 |
-
if re.search(pattern, modified_text):
|
| 110 |
-
modified_text = re.sub(pattern, placeholder, modified_text)
|
| 111 |
-
logger.debug(f"Protected term '{term}' replaced with '{placeholder}'")
|
| 112 |
-
break
|
| 113 |
-
|
| 114 |
return modified_text, replacements
|
| 115 |
|
| 116 |
def restore_terms(text: str, replacements: dict) -> str:
|
| 117 |
-
"""Restore protected terms in the translated text
|
| 118 |
restored_text = text
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
# Direct replacement
|
| 122 |
-
if placeholder in restored_text:
|
| 123 |
-
restored_text = restored_text.replace(placeholder, original_term)
|
| 124 |
-
logger.debug(f"Restored '{placeholder}' to '{original_term}'")
|
| 125 |
-
else:
|
| 126 |
-
# Try to find partial matches or corrupted placeholders
|
| 127 |
-
# Sometimes translation models might alter the placeholder slightly
|
| 128 |
-
words = restored_text.split()
|
| 129 |
-
for i, word in enumerate(words):
|
| 130 |
-
# Check if word contains part of our placeholder
|
| 131 |
-
if "PROTECTEDTERM" in word and "PLACEHOLDER" in word:
|
| 132 |
-
words[i] = original_term
|
| 133 |
-
logger.debug(f"Fuzzy restored corrupted placeholder '{word}' to '{original_term}'")
|
| 134 |
-
# Also check for common corruptions
|
| 135 |
-
elif word.upper().replace(".", "").replace(",", "") == placeholder.upper():
|
| 136 |
-
words[i] = original_term
|
| 137 |
-
logger.debug(f"Restored corrupted '{word}' to '{original_term}'")
|
| 138 |
-
|
| 139 |
-
restored_text = " ".join(words)
|
| 140 |
-
|
| 141 |
-
# Clean up any remaining artifacts (dots, extra spaces)
|
| 142 |
-
restored_text = re.sub(r'\s*\.\s*\.\s*\.\s*\.+', '', restored_text) # Remove multiple dots
|
| 143 |
-
restored_text = re.sub(r'\s+', ' ', restored_text) # Normalize spaces
|
| 144 |
-
restored_text = restored_text.strip()
|
| 145 |
-
|
| 146 |
return restored_text
|
| 147 |
|
| 148 |
# FastAPI endpoints
|
|
@@ -190,25 +147,13 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
|
|
| 190 |
|
| 191 |
# Protect terms before translation
|
| 192 |
modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
|
| 193 |
-
logger.debug(f"Original text: '{text}'")
|
| 194 |
-
logger.debug(f"Modified text: '{modified_text}'")
|
| 195 |
-
logger.debug(f"Replacements: {replacements}")
|
| 196 |
|
| 197 |
-
# Perform translation
|
| 198 |
-
result = translator(
|
| 199 |
-
modified_text,
|
| 200 |
-
max_length=512,
|
| 201 |
-
num_beams=2, # Reduced from 4 to be more conservative
|
| 202 |
-
do_sample=False,
|
| 203 |
-
early_stopping=True,
|
| 204 |
-
no_repeat_ngram_size=2
|
| 205 |
-
)
|
| 206 |
translated_text = result[0]["translation_text"]
|
| 207 |
-
logger.debug(f"Raw translation: '{translated_text}'")
|
| 208 |
|
| 209 |
# Restore protected terms
|
| 210 |
final_text = restore_terms(translated_text, replacements)
|
| 211 |
-
logger.debug(f"Final text after restoration: '{final_text}'")
|
| 212 |
|
| 213 |
return TranslationResponse(
|
| 214 |
translated_text=final_text,
|
|
@@ -309,7 +254,6 @@ def create_gradio_interface():
|
|
| 309 |
gr.Examples(
|
| 310 |
examples=[
|
| 311 |
["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
|
| 312 |
-
["ฉันเลือกทานอาหารที่ดีต่อสุขภาพร่างกายเพื่อเป็นส่วนหนึ่งในการสนับสนุน 2030 Aspirations", "th"],
|
| 313 |
["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
|
| 314 |
["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
|
| 315 |
["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
|
|
|
|
| 86 |
return "en"
|
| 87 |
|
| 88 |
def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
    """Replace protected terms with placeholders.

    Each term at index ``i`` of ``protected_terms`` is mapped to the
    placeholder ``__PROTECTED_{i}__``. Every case-insensitive occurrence of
    the term in ``text`` that is not embedded in a longer Latin-script word
    is replaced by that placeholder.

    Args:
        text: The text to scan.
        protected_terms: Strings that must be replaced by placeholders.

    Returns:
        A ``(modified_text, replacements)`` tuple, where ``replacements``
        maps every placeholder to its original term (including terms that
        did not occur in ``text``).
    """
    # ``\b`` is Unicode-aware in Python, and CJK ideographs / kana count as
    # word characters, so ``\bGriffith\b`` never matches in "Griffith大学".
    # Instead, only refuse a match when the neighbouring character is a
    # Latin-script letter (ASCII, Latin-1/Extended, Vietnamese), a digit or
    # an underscore; anything else is treated as a boundary.
    word_chars = r'0-9A-Za-z_\u00C0-\u024F\u1E00-\u1EFF'
    not_preceded = f'(?<![{word_chars}])'
    not_followed = f'(?![{word_chars}])'

    # Placeholder numbering follows the order of ``protected_terms``.
    replacements = {
        f"__PROTECTED_{i}__": term for i, term in enumerate(protected_terms)
    }

    modified_text = text
    # Substitute longer terms first so that a short term (e.g. "Griffith")
    # cannot break up a longer one that contains it ("Griffith University").
    by_length = sorted(
        replacements.items(), key=lambda item: len(item[1]), reverse=True
    )
    for placeholder, term in by_length:
        if not term:
            # An empty term would match at arbitrary positions.
            continue
        pattern = not_preceded + re.escape(term) + not_followed
        modified_text = re.sub(
            pattern, placeholder, modified_text, flags=re.IGNORECASE
        )
    return modified_text, replacements
|
| 97 |
|
| 98 |
def restore_terms(text: str, replacements: dict) -> str:
    """Restore protected terms in the translated text.

    Every placeholder key in ``replacements`` that occurs in ``text`` is
    replaced by its mapped term. Matching ignores case and tolerates
    whitespace inserted between the placeholder's characters, so a
    placeholder that came back as ``__protected_0__`` or
    ``__ PROTECTED_0 __`` is still restored.

    Args:
        text: Text that may contain placeholders.
        replacements: Mapping of placeholder -> original term.

    Returns:
        ``text`` with all recognised placeholders replaced by their terms.
    """
    restored_text = text
    for placeholder, term in replacements.items():
        if not placeholder:
            # An empty placeholder would match at every position.
            continue
        # Allow optional whitespace between consecutive placeholder
        # characters; all characters must still appear in order.
        pattern = r'\s*'.join(re.escape(ch) for ch in placeholder)
        # A function replacement inserts the term literally, so backslashes
        # in the term are not interpreted as group references.
        restored_text = re.sub(
            pattern,
            lambda _match, _term=term: _term,
            restored_text,
            flags=re.IGNORECASE,
        )
    return restored_text
|
| 104 |
|
| 105 |
# FastAPI endpoints
|
|
|
|
| 147 |
|
| 148 |
# Protect terms before translation
|
| 149 |
modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
+
# Perform translation
|
| 152 |
+
result = translator(modified_text, max_length=512, num_beams=4)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
translated_text = result[0]["translation_text"]
|
|
|
|
| 154 |
|
| 155 |
# Restore protected terms
|
| 156 |
final_text = restore_terms(translated_text, replacements)
|
|
|
|
| 157 |
|
| 158 |
return TranslationResponse(
|
| 159 |
translated_text=final_text,
|
|
|
|
| 254 |
gr.Examples(
|
| 255 |
examples=[
|
| 256 |
["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
|
|
|
|
| 257 |
["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
|
| 258 |
["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
|
| 259 |
["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
|