Spaces:

entropy25
/

mt

Running

App Files Files Community

entropy25 committed on Jan 5

Commit

3e226cf

verified ·

1 Parent(s): be31671

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -46

app.py CHANGED Viewed

@@ -58,16 +58,28 @@ try:
     print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
-    # 调试：打印前5个术语验证编码
-    print("\n术语表示例（验证编码）:")
-    for i, (en, no) in enumerate(list(TERMINOLOGY_EN_TO_NO.items())[:5]):
-        print(f"  {en!r} -> {no!r}")
 except Exception as e:
     print(f"Warning: Could not load glossary.json: {e}")
     TERMINOLOGY_EN_TO_NO = {}
     TERMINOLOGY_NO_TO_EN = {}
 QUALITY_PRESETS = {
     "Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
     "Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
@@ -119,72 +131,68 @@ QUALITY_TEST_CASES = {
 MAX_FILE_SIZE = 1024 * 1024
 MAX_TEXT_LENGTH = 10000
-def preprocess_with_terminology(text, direction, use_terminology):
-    """Add terminology hints to the text before translation."""
-    if not use_terminology or not text:
-        return text, []
     if direction == "en_to_no":
         term_dict = TERMINOLOGY_EN_TO_NO
     else:
         term_dict = TERMINOLOGY_NO_TO_EN
     if not term_dict:
-        return text, []
     found_terms = []
     text_lower = text.lower()
-    # Look for glossary terms in the text (sorted by length so longer terms are matched first)
     for source_term in sorted(term_dict.keys(), key=len, reverse=True):
         if source_term in text_lower:
             target_term = term_dict[source_term]
             found_terms.append((source_term, target_term))
-    # If any terms were found, add a translation hint
-    if found_terms:
-        hint = "\n[Terminology: "
-        hints = []
-        for src, tgt in found_terms[:10]:  # show at most 10 terms
-            hints.append(f"{src}={tgt}")
-        hint += ", ".join(hints) + "]"
-        # Append the hint to the end of the text
-        text_with_hint = text + hint
-        return text_with_hint, found_terms
-    return text, []
 def post_process_terminology(text, direction, found_terms, use_terminology):
-    """Correct terminology after translation (handles small mistakes the model may make)."""
-    if not use_terminology or not text or not found_terms:
         return text
     if direction == "en_to_no":
         term_dict = TERMINOLOGY_EN_TO_NO
     else:
         term_dict = TERMINOLOGY_NO_TO_EN
     result = text
-    # Remove the hint section, which the model may have translated
-    result = re.sub(r'\[Terminology:.*?\]', '', result, flags=re.IGNORECASE).strip()
-    # Try to correct possible term variants
     for source_term, target_term in found_terms:
-        # Add word boundaries to avoid incorrect replacements
-        # e.g. make sure "drilling mud" does not match in the middle of "mud"
-        pattern = re.compile(r'\b' + re.escape(target_term) + r'\b', re.IGNORECASE)
-        def replace_match(match):
             original = match.group(0)
-            # Preserve the case of the first letter
             if original and original[0].isupper():
                 return target_term.capitalize()
             return target_term.lower()
-        # If variants of the target term appear, replace them uniformly
-        result = pattern.sub(replace_match, result)
     return result
@@ -205,12 +213,11 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
     else:
         return "Unsupported language pair", 0.0, []
-    # 🆕 预处理：添加术语提示
-    processed_text, found_terms = preprocess_with_terminology(text, direction, use_terminology)
     preset = QUALITY_PRESETS[quality_preset]
-    original_paragraphs = processed_text.split('\n')
     final_translated_paragraphs = []
     for paragraph in original_paragraphs:
@@ -253,7 +260,6 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
     raw_translation = '\n'.join(final_translated_paragraphs)
-    # 🆕 后处理：清理和修正术语
     corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
     elapsed_time = time.time() - start_time
@@ -275,7 +281,7 @@ def translate(text, source_lang, target_lang, quality_preset, use_terminology):
         result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
-        terminology_status = f"with {terms_count} terminology hints" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
         time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
         return result, time_info
@@ -509,9 +515,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
         )
         use_terminology = gr.Checkbox(
-            label="Enable Terminology Enforcement (PRE+POST)",
             value=True,
-            info=f"🆕 Uses {len(TERMINOLOGY_EN_TO_NO)} terms: adds hints before translation + cleanup after"
         )
     with gr.Row():
@@ -571,7 +577,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
             elem_classes="time-info"
         )
-    gr.HTML("<div class='footer-info'>🛠️ Oil & Gas Translation • English ↔ Norwegian • 🆕 Pre+Post Terminology Processing</div>")
     with gr.Accordion("Example Sentences", open=True):
         with gr.Row():
@@ -627,7 +633,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
         )
         run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
         run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
     gr.HTML(f"""
     <div class='disclaimer'>
         <strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary

     print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
 except Exception as e:
     print(f"Warning: Could not load glossary.json: {e}")
     TERMINOLOGY_EN_TO_NO = {}
     TERMINOLOGY_NO_TO_EN = {}
+COMMON_ERRORS = {  # direction -> source term -> renderings to replace in the translated text
+    "en_to_no": {  # selected by post_process_terminology when direction == "en_to_no"
+        "mud weight": ["mudgevekten", "mudvekt", "slam vekt"],  # each variant is swapped for the glossary target of the key
+        "christmas tree": ["juletræet", "jule tre", "juletre"],
+        "permeability": ["permeabiliteten"],
+        "porosity": ["porøsiteten"],
+        "training": ["utdanning"],
+        "working pressure": ["arbeidstrykk"],
+    },
+    "no_to_en": {  # selected for every other direction value
+        "slamvekt": ["slam weight", "mudweight"],
+        "juletre": ["yule tree", "christmas-tree"],
+        "permeabilitet": ["permeabiliteten"],
+        "porøsitet": ["porøsiteten"],
+    }
+}
 QUALITY_PRESETS = {
     "Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
     "Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
 MAX_FILE_SIZE = 1024 * 1024
 MAX_TEXT_LENGTH = 10000
def fix_number_format(text, target_lang):
    """Rewrite digit grouping and decimal marks in *text* for the target language.

    Args:
        text: Translated text to normalise. Empty or ``None`` is returned as is.
        target_lang: ``"Norwegian"`` selects Norwegian formatting (space between
            thousands groups, decimal comma). Any other value selects English
            formatting (comma between thousands groups, decimal period).

    Returns:
        The text with number formatting converted, e.g. ``1,234.56`` becomes
        ``1 234,56`` for Norwegian and ``1 234,56`` becomes ``1,234.56``
        otherwise.

    Rules:
        * A separator is a thousands separator only when it follows a run of at
          most three digits and precedes exactly three digits. Every group of a
          long number is converted (``1,234,567`` -> ``1 234 567``), and a
          four-digit year followed by a count (``2023 100``) is left alone.
        * A separator is a decimal mark only when it precedes one or two digits
          and is not part of a dotted/comma-separated sequence, so dates and
          section numbers such as ``05.01.2024`` or ``1,2,3`` are not touched.
        * Line breaks are never treated as a thousands separator.

    Known ambiguity: a value with exactly three decimals (``0.125``) cannot be
    told apart from a thousands group and is handled as one.
    """
    if not text:
        return text

    if target_lang == "Norwegian":
        # Thousands first: look-arounds do not consume digits, so adjacent
        # groups of the same number are all converted in one pass.
        text = re.sub(r'(?<=\d)(?<!\d{4})[,.](?=\d{3}(?!\d))', ' ', text)
        # Decimal period -> comma, skipping dotted sequences (dates, versions).
        text = re.sub(r'(?<![.\d])(\d+)\.(\d{1,2})(?!\d|\.\d)', r'\1,\2', text)
    else:
        # Decimal comma first, while thousands groups are still space-separated
        # and therefore cannot be confused with a comma-separated list.
        text = re.sub(r'(?<![,\d])(\d+),(\d{1,2})(?!\d|,\d)', r'\1.\2', text)
        # Space-like thousands separator -> comma; [^\S\r\n] is "whitespace
        # except line breaks" so numbers on adjacent lines are never merged.
        text = re.sub(r'(?<=\d)(?<!\d{4})[^\S\r\n](?=\d{3}(?!\d))', ',', text)
    return text
def find_source_terms_in_input(text, direction, term_dict=None):
    """Return the glossary terms that occur in *text*.

    Args:
        text: Source text to scan. ``None`` or an empty string yields ``[]``.
        direction: ``"en_to_no"`` selects ``TERMINOLOGY_EN_TO_NO``; any other
            value selects ``TERMINOLOGY_NO_TO_EN``. Ignored when *term_dict*
            is supplied.
        term_dict: Optional ``{source_term: target_term}`` mapping to search
            instead of the module-level glossary.

    Returns:
        A list of ``(source_term, target_term)`` tuples, longest source term
        first. ``source_term`` is the glossary key exactly as stored.

    Matching is a case-insensitive substring test, so a term is also reported
    when it appears inside a longer word (inflected or compound forms).
    """
    if term_dict is None:
        if direction == "en_to_no":
            term_dict = TERMINOLOGY_EN_TO_NO
        else:
            term_dict = TERMINOLOGY_NO_TO_EN
    if not text or not term_dict:
        return []

    text_lower = text.lower()
    found_terms = []
    for source_term in sorted(term_dict, key=len, reverse=True):
        # Compare on the lowercased key: the haystack is lowercased, so a key
        # stored with capitals (e.g. an acronym) could otherwise never match.
        needle = source_term.lower()
        # An empty key is a substring of every text; never report it.
        if needle and needle in text_lower:
            found_terms.append((source_term, term_dict[source_term]))
    return found_terms
 def post_process_terminology(text, direction, found_terms, use_terminology):
+    if not use_terminology or not text:
         return text
     if direction == "en_to_no":
         term_dict = TERMINOLOGY_EN_TO_NO
+        error_dict = COMMON_ERRORS.get("en_to_no", {})
     else:
         term_dict = TERMINOLOGY_NO_TO_EN
+        error_dict = COMMON_ERRORS.get("no_to_en", {})
     result = text
     for source_term, target_term in found_terms:
+        correct_term = target_term.lower()
+        pattern = re.compile(r'\b' + re.escape(correct_term) + r'\b', re.IGNORECASE)
+        def preserve_case(match):
             original = match.group(0)
             if original and original[0].isupper():
                 return target_term.capitalize()
             return target_term.lower()
+        result = pattern.sub(preserve_case, result)
+        if source_term in error_dict:
+            for error_variant in error_dict[source_term]:
+                error_pattern = re.compile(r'\b' + re.escape(error_variant) + r'\b', re.IGNORECASE)
+                result = error_pattern.sub(preserve_case, result)
+    result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
     return result
     else:
         return "Unsupported language pair", 0.0, []
+    found_terms = find_source_terms_in_input(text, direction)
     preset = QUALITY_PRESETS[quality_preset]
+    original_paragraphs = text.split('\n')
     final_translated_paragraphs = []
     for paragraph in original_paragraphs:
     raw_translation = '\n'.join(final_translated_paragraphs)
     corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
     elapsed_time = time.time() - start_time
         result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
+        terminology_status = f"with {terms_count} terms enforced" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
         time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
         return result, time_info
         )
         use_terminology = gr.Checkbox(
+            label="Enable Terminology Enforcement (POST)",
             value=True,
+            info=f"Uses {len(TERMINOLOGY_EN_TO_NO)} terms + error variants + number format fixing"
         )
     with gr.Row():
             elem_classes="time-info"
         )
+    gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • POST-only Terminology Processing</div>")
     with gr.Accordion("Example Sentences", open=True):
         with gr.Row():
         )
         run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
         run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
     gr.HTML(f"""
     <div class='disclaimer'>
         <strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary