Spaces:

entropy25
/

mt

Running

App Files Files Community

entropy25 committed on Jan 5

Commit

be31671

verified ·

1 Parent(s): 711f784

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -31

app.py CHANGED Viewed

@@ -57,6 +57,12 @@ try:
         TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
     print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
 except Exception as e:
     print(f"Warning: Could not load glossary.json: {e}")
     TERMINOLOGY_EN_TO_NO = {}
@@ -113,9 +119,10 @@ QUALITY_TEST_CASES = {
 MAX_FILE_SIZE = 1024 * 1024
 MAX_TEXT_LENGTH = 10000
-def post_process_terminology(text, direction, use_terminology):
     if not use_terminology or not text:
-        return text
     if direction == "en_to_no":
         term_dict = TERMINOLOGY_EN_TO_NO
@@ -123,26 +130,60 @@ def post_process_terminology(text, direction, use_terminology):
         term_dict = TERMINOLOGY_NO_TO_EN
     if not term_dict:
         return text
     result = text
-    replacements_made = []
-    for source_term, target_term in sorted(term_dict.items(), key=lambda x: len(x[0]), reverse=True):
-        pattern = re.compile(re.escape(source_term), re.IGNORECASE)
         def replace_match(match):
             original = match.group(0)
-            if original[0].isupper():
-                replacement = target_term.capitalize()
-            else:
-                replacement = target_term
-            if original != replacement:
-                replacements_made.append(f"{original} → {replacement}")
-            return replacement
         result = pattern.sub(replace_match, result)
     return result
@@ -164,9 +205,12 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
     else:
         return "Unsupported language pair", 0.0, []
     preset = QUALITY_PRESETS[quality_preset]
-    original_paragraphs = text.split('\n')
     final_translated_paragraphs = []
     for paragraph in original_paragraphs:
@@ -209,16 +253,17 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
     raw_translation = '\n'.join(final_translated_paragraphs)
-    corrected_translation = post_process_terminology(raw_translation, direction, use_terminology)
     elapsed_time = time.time() - start_time
-    return corrected_translation, elapsed_time, []
 @lru_cache(maxsize=512)
 def translate_cached(text, source_lang, target_lang, quality_preset, use_terminology):
-    result, elapsed, _ = translate_core(text, source_lang, target_lang, quality_preset, use_terminology)
-    return result, elapsed
 def translate(text, source_lang, target_lang, quality_preset, use_terminology):
     try:
@@ -228,9 +273,9 @@ def translate(text, source_lang, target_lang, quality_preset, use_terminology):
         if not text.strip():
             return "", ""
-        result, elapsed = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
-        terminology_status = "with terminology enforcement" if use_terminology else "without terminology enforcement"
         time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
         return result, time_info
@@ -251,7 +296,7 @@ def run_quality_tests(use_terminology):
         results.append(f"\n{src_lang} to {tgt_lang}\n")
         for i, case in enumerate(test_cases, 1):
-            translation, _, _ = translate_core(case["input"], src_lang, tgt_lang, "Professional (Best Quality)", use_terminology)
             passed_checks = []
             failed_checks = []
@@ -262,25 +307,28 @@ def run_quality_tests(use_terminology):
                 else:
                     failed_checks.append(keyword)
-            status = "PASS" if not failed_checks else "CHECK"
             results.append(f"\nTest {i}: {status}")
             results.append(f"Input:    {case['input']}")
             results.append(f"Expected: {case['expected']}")
             results.append(f"Got:      {translation}")
             if passed_checks:
-                results.append(f"Found:  {', '.join(passed_checks)}")
             if failed_checks:
-                results.append(f"Missing: {', '.join(failed_checks)}")
     results.append("\n=== TEST COMPLETE ===")
-    pass_count = sum(1 for r in results if "PASS" in r)
-    check_count = sum(1 for r in results if "CHECK" in r)
     total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
-    results.insert(2, f"\nScore: {pass_count}/{total} passed, {check_count}/{total} need review\n")
     return '\n'.join(results)
@@ -461,9 +509,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
         )
         use_terminology = gr.Checkbox(
-            label="Enable Terminology Enforcement",
             value=True,
-            info=f"Use glossary of {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms for post-processing"
         )
     with gr.Row():
@@ -523,7 +571,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
             elem_classes="time-info"
         )
-    gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • Production-Ready System</div>")
     with gr.Accordion("Example Sentences", open=True):
         with gr.Row():

         TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
     print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
+    # Debug: print the first 5 terms to verify encoding
+    print("\n术语表示例（验证编码）:")
+    for i, (en, no) in enumerate(list(TERMINOLOGY_EN_TO_NO.items())[:5]):
+        print(f"  {en!r} -> {no!r}")
 except Exception as e:
     print(f"Warning: Could not load glossary.json: {e}")
     TERMINOLOGY_EN_TO_NO = {}
 MAX_FILE_SIZE = 1024 * 1024
 MAX_TEXT_LENGTH = 10000
+def preprocess_with_terminology(text, direction, use_terminology):
+    """在翻译前添加术语提示"""
     if not use_terminology or not text:
+        return text, []
     if direction == "en_to_no":
         term_dict = TERMINOLOGY_EN_TO_NO
         term_dict = TERMINOLOGY_NO_TO_EN
     if not term_dict:
+        return text, []
+    found_terms = []
+    text_lower = text.lower()
+    # Find glossary terms in the text (sorted by length so longer terms are matched first)
+    for source_term in sorted(term_dict.keys(), key=len, reverse=True):
+        if source_term in text_lower:
+            target_term = term_dict[source_term]
+            found_terms.append((source_term, target_term))
+    # If any terms were found, add a translation hint
+    if found_terms:
+        hint = "\n[Terminology: "
+        hints = []
+        for src, tgt in found_terms[:10]:  # 最多显示10个术语
+            hints.append(f"{src}={tgt}")
+        hint += ", ".join(hints) + "]"
+        # Append the hint to the end of the text
+        text_with_hint = text + hint
+        return text_with_hint, found_terms
+    return text, []
+def post_process_terminology(text, direction, found_terms, use_terminology):
+    """Correct terminology after translation (handles small mistakes the model may make).
+
+    Removes any "[Terminology: ...]" hint left in the text, then rewrites each
+    case-insensitive, word-bounded occurrence of a found target term to a
+    normalized casing.
+
+    Args:
+        text: Translated text to clean up.
+        direction: "en_to_no" selects TERMINOLOGY_EN_TO_NO; any other value
+            selects TERMINOLOGY_NO_TO_EN.
+        found_terms: Iterable of (source_term, target_term) pairs.
+        use_terminology: When falsy, the text is returned unchanged.
+
+    Returns:
+        The cleaned text, or ``text`` unchanged when terminology is disabled,
+        the text is empty, or no terms were found.
+    """
+    if not use_terminology or not text or not found_terms:
         return text
+    if direction == "en_to_no":
+        term_dict = TERMINOLOGY_EN_TO_NO
+    else:
+        term_dict = TERMINOLOGY_NO_TO_EN
+    # NOTE(review): term_dict is assigned above but not read again in this function.
     result = text
+    # Remove the hint section, which the model may have translated
+    result = re.sub(r'\[Terminology:.*?\]', '', result, flags=re.IGNORECASE).strip()
+    # Try to correct possible variants of each term
+    for source_term, target_term in found_terms:
+        # Add word boundaries to avoid incorrect replacements
+        # e.g. make sure "drilling mud" does not match inside "mud"
+        pattern = re.compile(r'\b' + re.escape(target_term) + r'\b', re.IGNORECASE)
         def replace_match(match):
             original = match.group(0)
+            # Preserve the capitalization of the first letter
+            # NOTE(review): str.capitalize() also lowercases the remaining characters,
+            # so an all-caps term would lose its casing here - confirm this is intended.
+            if original and original[0].isupper():
+                return target_term.capitalize()
+            return target_term.lower()
+        # If a variant of the target term appears, replace it uniformly
+        result = pattern.sub(replace_match, result)
     return result
     else:
         return "Unsupported language pair", 0.0, []
+    # 🆕 Pre-processing: add terminology hints
+    processed_text, found_terms = preprocess_with_terminology(text, direction, use_terminology)
     preset = QUALITY_PRESETS[quality_preset]
+    original_paragraphs = processed_text.split('\n')
     final_translated_paragraphs = []
     for paragraph in original_paragraphs:
     raw_translation = '\n'.join(final_translated_paragraphs)
+    # 🆕 Post-processing: clean up and correct terminology
+    corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
     elapsed_time = time.time() - start_time
+    return corrected_translation, elapsed_time, found_terms
 @lru_cache(maxsize=512)
 def translate_cached(text, source_lang, target_lang, quality_preset, use_terminology):
+    result, elapsed, terms = translate_core(text, source_lang, target_lang, quality_preset, use_terminology)
+    return result, elapsed, len(terms)
 def translate(text, source_lang, target_lang, quality_preset, use_terminology):
     try:
         if not text.strip():
             return "", ""
+        result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
+        terminology_status = f"with {terms_count} terminology hints" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
         time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
         return result, time_info
         results.append(f"\n{src_lang} to {tgt_lang}\n")
         for i, case in enumerate(test_cases, 1):
+            translation, _, found_terms = translate_core(case["input"], src_lang, tgt_lang, "Professional (Best Quality)", use_terminology)
             passed_checks = []
             failed_checks = []
                 else:
                     failed_checks.append(keyword)
+            status = "✅ PASS" if not failed_checks else "⚠️ CHECK"
             results.append(f"\nTest {i}: {status}")
             results.append(f"Input:    {case['input']}")
             results.append(f"Expected: {case['expected']}")
             results.append(f"Got:      {translation}")
+            if use_terminology and found_terms:
+                results.append(f"Terms found: {len(found_terms)}")
             if passed_checks:
+                results.append(f"✓ Found:  {', '.join(passed_checks)}")
             if failed_checks:
+                results.append(f"✗ Missing: {', '.join(failed_checks)}")
     results.append("\n=== TEST COMPLETE ===")
+    pass_count = sum(1 for r in results if "✅ PASS" in r)
+    check_count = sum(1 for r in results if "⚠️ CHECK" in r)
     total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
+    results.insert(2, f"\n📊 Score: {pass_count}/{total} passed, {check_count}/{total} need review\n")
     return '\n'.join(results)
         )
         use_terminology = gr.Checkbox(
+            label="Enable Terminology Enforcement (PRE+POST)",
             value=True,
+            info=f"🆕 Uses {len(TERMINOLOGY_EN_TO_NO)} terms: adds hints before translation + cleanup after"
         )
     with gr.Row():
             elem_classes="time-info"
         )
+    gr.HTML("<div class='footer-info'>🛠️ Oil & Gas Translation • English ↔ Norwegian • 🆕 Pre+Post Terminology Processing</div>")
     with gr.Accordion("Example Sentences", open=True):
         with gr.Row():