Update app.py
Browse files
app.py
CHANGED
|
@@ -58,16 +58,28 @@ try:
|
|
| 58 |
|
| 59 |
print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
|
| 60 |
|
| 61 |
-
# 调试:打印前5个术语验证编码
|
| 62 |
-
print("\n术语表示例(验证编码):")
|
| 63 |
-
for i, (en, no) in enumerate(list(TERMINOLOGY_EN_TO_NO.items())[:5]):
|
| 64 |
-
print(f" {en!r} -> {no!r}")
|
| 65 |
-
|
| 66 |
except Exception as e:
|
| 67 |
print(f"Warning: Could not load glossary.json: {e}")
|
| 68 |
TERMINOLOGY_EN_TO_NO = {}
|
| 69 |
TERMINOLOGY_NO_TO_EN = {}
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
QUALITY_PRESETS = {
|
| 72 |
"Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
|
| 73 |
"Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
|
|
@@ -119,72 +131,68 @@ QUALITY_TEST_CASES = {
|
|
| 119 |
MAX_FILE_SIZE = 1024 * 1024
|
| 120 |
MAX_TEXT_LENGTH = 10000
|
| 121 |
|
| 122 |
-
def
|
| 123 |
-
""
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
|
|
|
|
|
|
|
|
|
| 127 |
if direction == "en_to_no":
|
| 128 |
term_dict = TERMINOLOGY_EN_TO_NO
|
| 129 |
else:
|
| 130 |
term_dict = TERMINOLOGY_NO_TO_EN
|
| 131 |
|
| 132 |
if not term_dict:
|
| 133 |
-
return
|
| 134 |
|
| 135 |
found_terms = []
|
| 136 |
text_lower = text.lower()
|
| 137 |
|
| 138 |
-
# 查找文本中的术语(按长度排序,优先匹配长术语)
|
| 139 |
for source_term in sorted(term_dict.keys(), key=len, reverse=True):
|
| 140 |
if source_term in text_lower:
|
| 141 |
target_term = term_dict[source_term]
|
| 142 |
found_terms.append((source_term, target_term))
|
| 143 |
|
| 144 |
-
|
| 145 |
-
if found_terms:
|
| 146 |
-
hint = "\n[Terminology: "
|
| 147 |
-
hints = []
|
| 148 |
-
for src, tgt in found_terms[:10]: # 最多显示10个术语
|
| 149 |
-
hints.append(f"{src}={tgt}")
|
| 150 |
-
hint += ", ".join(hints) + "]"
|
| 151 |
-
|
| 152 |
-
# 将提示添加到文本末尾
|
| 153 |
-
text_with_hint = text + hint
|
| 154 |
-
return text_with_hint, found_terms
|
| 155 |
-
|
| 156 |
-
return text, []
|
| 157 |
|
| 158 |
def post_process_terminology(text, direction, found_terms, use_terminology):
|
| 159 |
-
|
| 160 |
-
if not use_terminology or not text or not found_terms:
|
| 161 |
return text
|
| 162 |
|
| 163 |
if direction == "en_to_no":
|
| 164 |
term_dict = TERMINOLOGY_EN_TO_NO
|
|
|
|
| 165 |
else:
|
| 166 |
term_dict = TERMINOLOGY_NO_TO_EN
|
|
|
|
| 167 |
|
| 168 |
result = text
|
| 169 |
|
| 170 |
-
# 移除可能被模型翻译的提示部分
|
| 171 |
-
result = re.sub(r'\[Terminology:.*?\]', '', result, flags=re.IGNORECASE).strip()
|
| 172 |
-
|
| 173 |
-
# 尝试修正可能的术语变体
|
| 174 |
for source_term, target_term in found_terms:
|
| 175 |
-
|
| 176 |
-
# 例如:确保 "drilling mud" 不会匹配到 "mud" 中间
|
| 177 |
-
pattern = re.compile(r'\b' + re.escape(target_term) + r'\b', re.IGNORECASE)
|
| 178 |
|
| 179 |
-
|
|
|
|
|
|
|
| 180 |
original = match.group(0)
|
| 181 |
-
# 保持首字母大小写
|
| 182 |
if original and original[0].isupper():
|
| 183 |
return target_term.capitalize()
|
| 184 |
return target_term.lower()
|
| 185 |
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
return result
|
| 190 |
|
|
@@ -205,12 +213,11 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
|
|
| 205 |
else:
|
| 206 |
return "Unsupported language pair", 0.0, []
|
| 207 |
|
| 208 |
-
|
| 209 |
-
processed_text, found_terms = preprocess_with_terminology(text, direction, use_terminology)
|
| 210 |
|
| 211 |
preset = QUALITY_PRESETS[quality_preset]
|
| 212 |
|
| 213 |
-
original_paragraphs =
|
| 214 |
final_translated_paragraphs = []
|
| 215 |
|
| 216 |
for paragraph in original_paragraphs:
|
|
@@ -253,7 +260,6 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
|
|
| 253 |
|
| 254 |
raw_translation = '\n'.join(final_translated_paragraphs)
|
| 255 |
|
| 256 |
-
# 🆕 后处理:清理和修正术语
|
| 257 |
corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
|
| 258 |
|
| 259 |
elapsed_time = time.time() - start_time
|
|
@@ -275,7 +281,7 @@ def translate(text, source_lang, target_lang, quality_preset, use_terminology):
|
|
| 275 |
|
| 276 |
result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
|
| 277 |
|
| 278 |
-
terminology_status = f"with {terms_count}
|
| 279 |
time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
|
| 280 |
return result, time_info
|
| 281 |
|
|
@@ -509,9 +515,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 509 |
)
|
| 510 |
|
| 511 |
use_terminology = gr.Checkbox(
|
| 512 |
-
label="Enable Terminology Enforcement (
|
| 513 |
value=True,
|
| 514 |
-
info=f"
|
| 515 |
)
|
| 516 |
|
| 517 |
with gr.Row():
|
|
@@ -571,7 +577,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 571 |
elem_classes="time-info"
|
| 572 |
)
|
| 573 |
|
| 574 |
-
gr.HTML("<div class='footer-info'>
|
| 575 |
|
| 576 |
with gr.Accordion("Example Sentences", open=True):
|
| 577 |
with gr.Row():
|
|
@@ -627,7 +633,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 627 |
)
|
| 628 |
run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
|
| 629 |
run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
|
| 630 |
-
|
| 631 |
gr.HTML(f"""
|
| 632 |
<div class='disclaimer'>
|
| 633 |
<strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary
|
|
|
|
| 58 |
|
| 59 |
print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
except Exception as e:
|
| 62 |
print(f"Warning: Could not load glossary.json: {e}")
|
| 63 |
TERMINOLOGY_EN_TO_NO = {}
|
| 64 |
TERMINOLOGY_NO_TO_EN = {}
|
| 65 |
|
| 66 |
+
# Known mistranslations, grouped by translation direction.
#
# Layout: direction -> source-language term -> list of wrong renderings that
# may show up in the translated output. post_process_terminology() looks up
# each detected source term here and replaces every listed variant with the
# glossary target for that term.
COMMON_ERRORS = {
    # English -> Norwegian: keys are English source terms.
    "en_to_no": {
        "mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
        "christmas tree": ["juletræet", "jule tre", "juletre"],
        "permeability": ["permeabiliteten"],
        "porosity": ["porøsiteten"],
        "training": ["utdanning"],
        "working pressure": ["arbeidstrykk"],
    },
    # Norwegian -> English: keys are Norwegian source terms.
    "no_to_en": {
        "slamvekt": ["slam weight", "mudweight"],
        "juletre": ["yule tree", "christmas-tree"],
        "permeabilitet": ["permeabiliteten"],
        "porøsitet": ["porøsiteten"],
    },
}
|
| 82 |
+
|
| 83 |
QUALITY_PRESETS = {
|
| 84 |
"Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
|
| 85 |
"Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
|
|
|
|
| 131 |
MAX_FILE_SIZE = 1024 * 1024
|
| 132 |
MAX_TEXT_LENGTH = 10000
|
| 133 |
|
| 134 |
+
def fix_number_format(text, target_lang):
    """Rewrite digit-group and decimal separators for the target language.

    This is a regex heuristic, not a number parser.

    Args:
        text: Translated text whose numbers should be normalised.
        target_lang: ``"Norwegian"`` selects space-grouped thousands with a
            decimal comma; any other value selects comma-grouped thousands
            with a decimal point.

    Returns:
        The text with number separators rewritten.

    Notes:
        * For Norwegian output, a period followed by exactly three digits is
          treated as a thousands separator (``3.500`` -> ``3 500``), so a
          three-decimal value such as ``0.125`` is rewritten the same way.
        * Dotted or comma-separated sequences (dates, section numbers, lists)
          are only partially protected by the digit-count checks.
    """
    if target_lang == "Norwegian":
        # Thousands separators. Lookarounds keep the neighbouring digits out
        # of the match so that every group of "1,234,567" is converted; a
        # consuming pattern would stop after the first group.
        text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', ' ', text)
        text = re.sub(r'(?<=\d)\.(?=\d{3}(?!\d))', ' ', text)
        # Decimal point -> decimal comma for one- or two-digit fractions.
        text = re.sub(r'(\d)\.(\d{1,2})(?!\d)', r'\1,\2', text)
    else:
        # Only real spaces (plain, no-break, narrow no-break) count as a
        # thousands separator. "\s" would also match "\n" and glue a line
        # ending in a digit to a following line that starts with three digits.
        text = re.sub(r'(?<=\d)[ \u00a0\u202f](?=\d{3}(?!\d))', ',', text)
        # Decimal comma -> decimal point for one- or two-digit fractions.
        text = re.sub(r'(\d),(\d{1,2})(?!\d)', r'\1.\2', text)

    return text
|
| 144 |
+
|
| 145 |
+
def find_source_terms_in_input(text, direction):
    """Return the glossary entries whose source term occurs in *text*.

    Args:
        text: Source-language input text.
        direction: ``"en_to_no"`` selects the English->Norwegian glossary;
            any other value selects the Norwegian->English glossary.

    Returns:
        A list of ``(source_term, target_term)`` tuples, longest source term
        first. Empty when the selected glossary is empty.

    Matching is a plain substring test against the lower-cased input, so a
    glossary key only matches if it is itself lower-case, and a key may match
    inside a longer word.
    """
    glossary = TERMINOLOGY_EN_TO_NO if direction == "en_to_no" else TERMINOLOGY_NO_TO_EN
    if not glossary:
        return []

    haystack = text.lower()
    longest_first = sorted(glossary, key=len, reverse=True)
    return [(term, glossary[term]) for term in longest_first if term in haystack]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
def post_process_terminology(text, direction, found_terms, use_terminology):
    """Enforce glossary terms and number formatting on a translation.

    For every ``(source_term, target_term)`` pair, occurrences of the target
    term and of the known wrong variants listed in ``COMMON_ERRORS`` for that
    source term are replaced by the glossary target. Number separators are
    then normalised with ``fix_number_format``.

    Args:
        text: Raw translated text.
        direction: ``"en_to_no"`` for English->Norwegian; any other value is
            treated as Norwegian->English.
        found_terms: Iterable of ``(source_term, target_term)`` pairs that
            were detected in the input text.
        use_terminology: When falsy, ``text`` is returned untouched and
            number formatting is skipped as well.

    Returns:
        The corrected translation, or ``text`` unchanged when enforcement is
        disabled or ``text`` is empty.
    """
    if not use_terminology or not text:
        return text

    is_en_to_no = direction == "en_to_no"
    error_dict = COMMON_ERRORS.get("en_to_no" if is_en_to_no else "no_to_en", {})

    result = text

    for source_term, target_term in found_terms:
        if not target_term:
            # An empty replacement would delete every matched error variant.
            continue

        # Terms with capitals after the first character (acronyms, proper
        # nouns such as "API-gravitet") keep their glossary spelling.
        # str.capitalize() / str.lower() would flatten them.
        if target_term[1:] != target_term[1:].lower():
            canonical = target_term
        else:
            canonical = target_term.lower()

        def preserve_case(match, canonical=canonical):
            # Keep a leading capital when the matched text had one.
            matched = match.group(0)
            if matched and matched[0].isupper():
                return canonical[:1].upper() + canonical[1:]
            return canonical

        # The target term itself first (case normalisation), then each known
        # mistranslation of this source term.
        for variant in (target_term, *error_dict.get(source_term, ())):
            # (?<!\w)...(?!\w) equals \b...\b for terms that start and end
            # with a word character, and still matches terms that do not.
            pattern = re.compile(
                r'(?<!\w)' + re.escape(variant) + r'(?!\w)', re.IGNORECASE
            )
            result = pattern.sub(preserve_case, result)

    return fix_number_format(result, "Norwegian" if is_en_to_no else "English")
|
| 198 |
|
|
|
|
| 213 |
else:
|
| 214 |
return "Unsupported language pair", 0.0, []
|
| 215 |
|
| 216 |
+
found_terms = find_source_terms_in_input(text, direction)
|
|
|
|
| 217 |
|
| 218 |
preset = QUALITY_PRESETS[quality_preset]
|
| 219 |
|
| 220 |
+
original_paragraphs = text.split('\n')
|
| 221 |
final_translated_paragraphs = []
|
| 222 |
|
| 223 |
for paragraph in original_paragraphs:
|
|
|
|
| 260 |
|
| 261 |
raw_translation = '\n'.join(final_translated_paragraphs)
|
| 262 |
|
|
|
|
| 263 |
corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
|
| 264 |
|
| 265 |
elapsed_time = time.time() - start_time
|
|
|
|
| 281 |
|
| 282 |
result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
|
| 283 |
|
| 284 |
+
terminology_status = f"with {terms_count} terms enforced" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
|
| 285 |
time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
|
| 286 |
return result, time_info
|
| 287 |
|
|
|
|
| 515 |
)
|
| 516 |
|
| 517 |
use_terminology = gr.Checkbox(
|
| 518 |
+
label="Enable Terminology Enforcement (POST)",
|
| 519 |
value=True,
|
| 520 |
+
info=f"Uses {len(TERMINOLOGY_EN_TO_NO)} terms + error variants + number format fixing"
|
| 521 |
)
|
| 522 |
|
| 523 |
with gr.Row():
|
|
|
|
| 577 |
elem_classes="time-info"
|
| 578 |
)
|
| 579 |
|
| 580 |
+
gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • POST-only Terminology Processing</div>")
|
| 581 |
|
| 582 |
with gr.Accordion("Example Sentences", open=True):
|
| 583 |
with gr.Row():
|
|
|
|
| 633 |
)
|
| 634 |
run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
|
| 635 |
run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
|
| 636 |
+
|
| 637 |
gr.HTML(f"""
|
| 638 |
<div class='disclaimer'>
|
| 639 |
<strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary
|