entropy25 committed on
Commit
be31671
·
verified ·
1 Parent(s): 711f784

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -31
app.py CHANGED
@@ -57,6 +57,12 @@ try:
57
  TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
58
 
59
  print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
 
 
 
 
 
 
60
  except Exception as e:
61
  print(f"Warning: Could not load glossary.json: {e}")
62
  TERMINOLOGY_EN_TO_NO = {}
@@ -113,9 +119,10 @@ QUALITY_TEST_CASES = {
113
  MAX_FILE_SIZE = 1024 * 1024
114
  MAX_TEXT_LENGTH = 10000
115
 
116
- def post_process_terminology(text, direction, use_terminology):
 
117
  if not use_terminology or not text:
118
- return text
119
 
120
  if direction == "en_to_no":
121
  term_dict = TERMINOLOGY_EN_TO_NO
@@ -123,26 +130,60 @@ def post_process_terminology(text, direction, use_terminology):
123
  term_dict = TERMINOLOGY_NO_TO_EN
124
 
125
  if not term_dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  return text
127
 
 
 
 
 
 
128
  result = text
129
- replacements_made = []
130
 
131
- for source_term, target_term in sorted(term_dict.items(), key=lambda x: len(x[0]), reverse=True):
132
- pattern = re.compile(re.escape(source_term), re.IGNORECASE)
 
 
 
 
 
 
133
 
134
  def replace_match(match):
135
  original = match.group(0)
136
- if original[0].isupper():
137
- replacement = target_term.capitalize()
138
- else:
139
- replacement = target_term
140
-
141
- if original != replacement:
142
- replacements_made.append(f"{original} → {replacement}")
143
-
144
- return replacement
145
 
 
146
  result = pattern.sub(replace_match, result)
147
 
148
  return result
@@ -164,9 +205,12 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
164
  else:
165
  return "Unsupported language pair", 0.0, []
166
 
 
 
 
167
  preset = QUALITY_PRESETS[quality_preset]
168
 
169
- original_paragraphs = text.split('\n')
170
  final_translated_paragraphs = []
171
 
172
  for paragraph in original_paragraphs:
@@ -209,16 +253,17 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
209
 
210
  raw_translation = '\n'.join(final_translated_paragraphs)
211
 
212
- corrected_translation = post_process_terminology(raw_translation, direction, use_terminology)
 
213
 
214
  elapsed_time = time.time() - start_time
215
 
216
- return corrected_translation, elapsed_time, []
217
 
218
  @lru_cache(maxsize=512)
219
  def translate_cached(text, source_lang, target_lang, quality_preset, use_terminology):
220
- result, elapsed, _ = translate_core(text, source_lang, target_lang, quality_preset, use_terminology)
221
- return result, elapsed
222
 
223
  def translate(text, source_lang, target_lang, quality_preset, use_terminology):
224
  try:
@@ -228,9 +273,9 @@ def translate(text, source_lang, target_lang, quality_preset, use_terminology):
228
  if not text.strip():
229
  return "", ""
230
 
231
- result, elapsed = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
232
 
233
- terminology_status = "with terminology enforcement" if use_terminology else "without terminology enforcement"
234
  time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
235
  return result, time_info
236
 
@@ -251,7 +296,7 @@ def run_quality_tests(use_terminology):
251
  results.append(f"\n{src_lang} to {tgt_lang}\n")
252
 
253
  for i, case in enumerate(test_cases, 1):
254
- translation, _, _ = translate_core(case["input"], src_lang, tgt_lang, "Professional (Best Quality)", use_terminology)
255
 
256
  passed_checks = []
257
  failed_checks = []
@@ -262,25 +307,28 @@ def run_quality_tests(use_terminology):
262
  else:
263
  failed_checks.append(keyword)
264
 
265
- status = "PASS" if not failed_checks else "CHECK"
266
 
267
  results.append(f"\nTest {i}: {status}")
268
  results.append(f"Input: {case['input']}")
269
  results.append(f"Expected: {case['expected']}")
270
  results.append(f"Got: {translation}")
271
 
 
 
 
272
  if passed_checks:
273
- results.append(f"Found: {', '.join(passed_checks)}")
274
  if failed_checks:
275
- results.append(f"Missing: {', '.join(failed_checks)}")
276
 
277
  results.append("\n=== TEST COMPLETE ===")
278
 
279
- pass_count = sum(1 for r in results if "PASS" in r)
280
- check_count = sum(1 for r in results if "CHECK" in r)
281
  total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
282
 
283
- results.insert(2, f"\nScore: {pass_count}/{total} passed, {check_count}/{total} need review\n")
284
 
285
  return '\n'.join(results)
286
 
@@ -461,9 +509,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
461
  )
462
 
463
  use_terminology = gr.Checkbox(
464
- label="Enable Terminology Enforcement",
465
  value=True,
466
- info=f"Use glossary of {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms for post-processing"
467
  )
468
 
469
  with gr.Row():
@@ -523,7 +571,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
523
  elem_classes="time-info"
524
  )
525
 
526
- gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • Production-Ready System</div>")
527
 
528
  with gr.Accordion("Example Sentences", open=True):
529
  with gr.Row():
 
57
  TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
58
 
59
  print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
60
+
61
+ # 调试:打印前5个术语验证编码
62
+ print("\n术语表示例(验证编码):")
63
+ for i, (en, no) in enumerate(list(TERMINOLOGY_EN_TO_NO.items())[:5]):
64
+ print(f" {en!r} -> {no!r}")
65
+
66
  except Exception as e:
67
  print(f"Warning: Could not load glossary.json: {e}")
68
  TERMINOLOGY_EN_TO_NO = {}
 
119
  MAX_FILE_SIZE = 1024 * 1024
120
  MAX_TEXT_LENGTH = 10000
121
 
122
+ def preprocess_with_terminology(text, direction, use_terminology):
123
+ """在翻译前添加术语提示"""
124
  if not use_terminology or not text:
125
+ return text, []
126
 
127
  if direction == "en_to_no":
128
  term_dict = TERMINOLOGY_EN_TO_NO
 
130
  term_dict = TERMINOLOGY_NO_TO_EN
131
 
132
  if not term_dict:
133
+ return text, []
134
+
135
+ found_terms = []
136
+ text_lower = text.lower()
137
+
138
+ # 查找文本中的术语(按长度排序,优先匹配长术语)
139
+ for source_term in sorted(term_dict.keys(), key=len, reverse=True):
140
+ if source_term in text_lower:
141
+ target_term = term_dict[source_term]
142
+ found_terms.append((source_term, target_term))
143
+
144
+ # 如果找到术语,添加翻译提示
145
+ if found_terms:
146
+ hint = "\n[Terminology: "
147
+ hints = []
148
+ for src, tgt in found_terms[:10]: # 最多显示10个术语
149
+ hints.append(f"{src}={tgt}")
150
+ hint += ", ".join(hints) + "]"
151
+
152
+ # 将提示添加到文本末尾
153
+ text_with_hint = text + hint
154
+ return text_with_hint, found_terms
155
+
156
+ return text, []
157
+
158
def post_process_terminology(text, direction, found_terms, use_terminology):
    """Clean up a translation and normalise the glossary terms it contains.

    Two things happen, in order:

    1. Any ``[Terminology: ...]`` hint that survived translation is removed
       and the result is stripped of surrounding whitespace.
    2. Every occurrence of a glossary target term (matched case-insensitively
       as a whole word) is rewritten to the glossary spelling.

    Args:
        text: Translated text to clean up.
        direction: Translation direction (e.g. ``"en_to_no"``). Not needed
            for the clean-up itself; kept so existing callers keep working.
        found_terms: Iterable of ``(source_term, target_term)`` pairs that
            were detected in the source text before translation.
        use_terminology: When falsy the text is returned untouched.

    Returns:
        The cleaned text, or ``text`` unchanged when terminology handling is
        disabled, the text is empty, or no terms were found.
    """
    if not use_terminology or not text or not found_terms:
        return text

    # Drop the hint block in case the model copied or translated it through.
    result = re.sub(r'\[Terminology:.*?\]', '', text, flags=re.IGNORECASE).strip()

    for _source_term, target_term in found_terms:
        if not target_term:
            # An empty target would produce a pattern matching everywhere.
            continue

        # Lookarounds instead of \b: they behave the same for terms that
        # start/end with a word character, and still anchor correctly when a
        # term starts or ends with punctuation (e.g. a trailing ")").
        pattern = re.compile(
            r'(?<!\w)' + re.escape(target_term) + r'(?!\w)',
            re.IGNORECASE,
        )

        def replace_match(match, canonical=target_term):
            original = match.group(0)
            # Keep the glossary's own casing (acronyms such as "BOP" must not
            # be turned into "Bop"/"bop"); only upper-case the first letter
            # when the matched text itself started with a capital, so
            # sentence-initial terms stay capitalised.
            if original[0].isupper():
                return canonical[0].upper() + canonical[1:]
            return canonical

        result = pattern.sub(replace_match, result)

    return result
 
205
  else:
206
  return "Unsupported language pair", 0.0, []
207
 
208
+ # 🆕 预处理:添加术语提示
209
+ processed_text, found_terms = preprocess_with_terminology(text, direction, use_terminology)
210
+
211
  preset = QUALITY_PRESETS[quality_preset]
212
 
213
+ original_paragraphs = processed_text.split('\n')
214
  final_translated_paragraphs = []
215
 
216
  for paragraph in original_paragraphs:
 
253
 
254
  raw_translation = '\n'.join(final_translated_paragraphs)
255
 
256
+ # 🆕 后处理:清理和修正术语
257
+ corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
258
 
259
  elapsed_time = time.time() - start_time
260
 
261
+ return corrected_translation, elapsed_time, found_terms
262
 
263
  @lru_cache(maxsize=512)
264
  def translate_cached(text, source_lang, target_lang, quality_preset, use_terminology):
265
+ result, elapsed, terms = translate_core(text, source_lang, target_lang, quality_preset, use_terminology)
266
+ return result, elapsed, len(terms)
267
 
268
  def translate(text, source_lang, target_lang, quality_preset, use_terminology):
269
  try:
 
273
  if not text.strip():
274
  return "", ""
275
 
276
+ result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
277
 
278
+ terminology_status = f"with {terms_count} terminology hints" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
279
  time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
280
  return result, time_info
281
 
 
296
  results.append(f"\n{src_lang} to {tgt_lang}\n")
297
 
298
  for i, case in enumerate(test_cases, 1):
299
+ translation, _, found_terms = translate_core(case["input"], src_lang, tgt_lang, "Professional (Best Quality)", use_terminology)
300
 
301
  passed_checks = []
302
  failed_checks = []
 
307
  else:
308
  failed_checks.append(keyword)
309
 
310
+ status = "PASS" if not failed_checks else "⚠️ CHECK"
311
 
312
  results.append(f"\nTest {i}: {status}")
313
  results.append(f"Input: {case['input']}")
314
  results.append(f"Expected: {case['expected']}")
315
  results.append(f"Got: {translation}")
316
 
317
+ if use_terminology and found_terms:
318
+ results.append(f"Terms found: {len(found_terms)}")
319
+
320
  if passed_checks:
321
+ results.append(f"Found: {', '.join(passed_checks)}")
322
  if failed_checks:
323
+ results.append(f"Missing: {', '.join(failed_checks)}")
324
 
325
  results.append("\n=== TEST COMPLETE ===")
326
 
327
+ pass_count = sum(1 for r in results if "PASS" in r)
328
+ check_count = sum(1 for r in results if "⚠️ CHECK" in r)
329
  total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
330
 
331
+ results.insert(2, f"\n📊 Score: {pass_count}/{total} passed, {check_count}/{total} need review\n")
332
 
333
  return '\n'.join(results)
334
 
 
509
  )
510
 
511
  use_terminology = gr.Checkbox(
512
+ label="Enable Terminology Enforcement (PRE+POST)",
513
  value=True,
514
+ info=f"🆕 Uses {len(TERMINOLOGY_EN_TO_NO)} terms: adds hints before translation + cleanup after"
515
  )
516
 
517
  with gr.Row():
 
571
  elem_classes="time-info"
572
  )
573
 
574
+ gr.HTML("<div class='footer-info'>🛠️ Oil & Gas Translation • English ↔ Norwegian • 🆕 Pre+Post Terminology Processing</div>")
575
 
576
  with gr.Accordion("Example Sentences", open=True):
577
  with gr.Row():