entropy25 committed on
Commit
3e226cf
·
verified ·
1 Parent(s): be31671

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -46
app.py CHANGED
@@ -58,16 +58,28 @@ try:
58
 
59
  print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
60
 
61
- # 调试:打印前5个术语验证编码
62
- print("\n术语表示例(验证编码):")
63
- for i, (en, no) in enumerate(list(TERMINOLOGY_EN_TO_NO.items())[:5]):
64
- print(f" {en!r} -> {no!r}")
65
-
66
  except Exception as e:
67
  print(f"Warning: Could not load glossary.json: {e}")
68
  TERMINOLOGY_EN_TO_NO = {}
69
  TERMINOLOGY_NO_TO_EN = {}
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  QUALITY_PRESETS = {
72
  "Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
73
  "Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
@@ -119,72 +131,68 @@ QUALITY_TEST_CASES = {
119
  MAX_FILE_SIZE = 1024 * 1024
120
  MAX_TEXT_LENGTH = 10000
121
 
122
- def preprocess_with_terminology(text, direction, use_terminology):
123
- """在翻译前添加术语提示"""
124
- if not use_terminology or not text:
125
- return text, []
 
 
 
 
126
 
 
 
 
127
  if direction == "en_to_no":
128
  term_dict = TERMINOLOGY_EN_TO_NO
129
  else:
130
  term_dict = TERMINOLOGY_NO_TO_EN
131
 
132
  if not term_dict:
133
- return text, []
134
 
135
  found_terms = []
136
  text_lower = text.lower()
137
 
138
- # 查找文本中的术语(按长度排序,优先匹配长术语)
139
  for source_term in sorted(term_dict.keys(), key=len, reverse=True):
140
  if source_term in text_lower:
141
  target_term = term_dict[source_term]
142
  found_terms.append((source_term, target_term))
143
 
144
- # 如果找到术语,添加翻译提示
145
- if found_terms:
146
- hint = "\n[Terminology: "
147
- hints = []
148
- for src, tgt in found_terms[:10]: # 最多显示10个术语
149
- hints.append(f"{src}={tgt}")
150
- hint += ", ".join(hints) + "]"
151
-
152
- # 将提示添加到文本末尾
153
- text_with_hint = text + hint
154
- return text_with_hint, found_terms
155
-
156
- return text, []
157
 
158
  def post_process_terminology(text, direction, found_terms, use_terminology):
159
- """翻译后修正术语(处理模型可能的小错误)"""
160
- if not use_terminology or not text or not found_terms:
161
  return text
162
 
163
  if direction == "en_to_no":
164
  term_dict = TERMINOLOGY_EN_TO_NO
 
165
  else:
166
  term_dict = TERMINOLOGY_NO_TO_EN
 
167
 
168
  result = text
169
 
170
- # 移除可能被模型翻译的提示部分
171
- result = re.sub(r'\[Terminology:.*?\]', '', result, flags=re.IGNORECASE).strip()
172
-
173
- # 尝试修正可能的术语变体
174
  for source_term, target_term in found_terms:
175
- # 添加词边界以避免错误替换
176
- # 例如:确保 "drilling mud" 不会匹配到 "mud" 中间
177
- pattern = re.compile(r'\b' + re.escape(target_term) + r'\b', re.IGNORECASE)
178
 
179
- def replace_match(match):
 
 
180
  original = match.group(0)
181
- # 保持首字母大小写
182
  if original and original[0].isupper():
183
  return target_term.capitalize()
184
  return target_term.lower()
185
 
186
- # 如果目标术语的变体出现,统一替换
187
- result = pattern.sub(replace_match, result)
 
 
 
 
 
 
188
 
189
  return result
190
 
@@ -205,12 +213,11 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
205
  else:
206
  return "Unsupported language pair", 0.0, []
207
 
208
- # 🆕 预处理:添加术语提示
209
- processed_text, found_terms = preprocess_with_terminology(text, direction, use_terminology)
210
 
211
  preset = QUALITY_PRESETS[quality_preset]
212
 
213
- original_paragraphs = processed_text.split('\n')
214
  final_translated_paragraphs = []
215
 
216
  for paragraph in original_paragraphs:
@@ -253,7 +260,6 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
253
 
254
  raw_translation = '\n'.join(final_translated_paragraphs)
255
 
256
- # 🆕 后处理:清理和修正术语
257
  corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
258
 
259
  elapsed_time = time.time() - start_time
@@ -275,7 +281,7 @@ def translate(text, source_lang, target_lang, quality_preset, use_terminology):
275
 
276
  result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
277
 
278
- terminology_status = f"with {terms_count} terminology hints" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
279
  time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
280
  return result, time_info
281
 
@@ -509,9 +515,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
509
  )
510
 
511
  use_terminology = gr.Checkbox(
512
- label="Enable Terminology Enforcement (PRE+POST)",
513
  value=True,
514
- info=f"🆕 Uses {len(TERMINOLOGY_EN_TO_NO)} terms: adds hints before translation + cleanup after"
515
  )
516
 
517
  with gr.Row():
@@ -571,7 +577,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
571
  elem_classes="time-info"
572
  )
573
 
574
- gr.HTML("<div class='footer-info'>🛠️ Oil & Gas Translation • English ↔ Norwegian • 🆕 Pre+Post Terminology Processing</div>")
575
 
576
  with gr.Accordion("Example Sentences", open=True):
577
  with gr.Row():
@@ -627,7 +633,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
627
  )
628
  run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
629
  run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
630
-
631
  gr.HTML(f"""
632
  <div class='disclaimer'>
633
  <strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary
 
58
 
59
  print(f"Loaded {len(TERMINOLOGY_EN_TO_NO)} terminology entries")
60
 
 
 
 
 
 
61
  except Exception as e:
62
  print(f"Warning: Could not load glossary.json: {e}")
63
  TERMINOLOGY_EN_TO_NO = {}
64
  TERMINOLOGY_NO_TO_EN = {}
65
 
# Known mistranslations produced by the model, grouped by translation
# direction and keyed by the source-language term found in the input.
# post_process_terminology replaces every listed variant that shows up in the
# output with the glossary target for that source term.
COMMON_ERRORS = {
    # English input -> Norwegian output
    "en_to_no": {
        "mud weight": [
            "mudgevekten",
            "mudvekt",
            "slam vekt",
        ],
        "christmas tree": [
            "juletræet",
            "jule tre",
            "juletre",
        ],
        "permeability": ["permeabiliteten"],
        "porosity": ["porøsiteten"],
        "training": ["utdanning"],
        "working pressure": ["arbeidstrykk"],
    },
    # Norwegian input -> English output
    "no_to_en": {
        "slamvekt": [
            "slam weight",
            "mudweight",
        ],
        "juletre": [
            "yule tree",
            "christmas-tree",
        ],
        "permeabilitet": ["permeabiliteten"],
        "porøsitet": ["porøsiteten"],
    },
}
82
+
83
  QUALITY_PRESETS = {
84
  "Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
85
  "Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
 
131
  MAX_FILE_SIZE = 1024 * 1024
132
  MAX_TEXT_LENGTH = 10000
133
 
# Number-separator patterns. Every pattern matches ONLY the separator character
# (digits are checked with look-arounds), so consecutive groups such as
# "1,000,000" are all rewritten in a single pass.
#
# A thousands separator must sit after a leading group of at most three digits
# and be followed by exactly three digits.
_GROUP_COMMA = re.compile(r'(?<=\d)(?<!\d{4}),(?=\d{3}(?!\d))')
_GROUP_DOT = re.compile(r'(?<=\d)(?<!\d{4})\.(?=\d{3}(?!\d))')
# Plain space, no-break space or narrow no-break space; never a line break,
# so numbers on adjacent lines/paragraphs are not glued together.
_GROUP_SPACE = re.compile(r'(?<=\d)(?<!\d{4})[ \u00a0\u202f](?=\d{3}(?!\d))')
# A decimal separator is followed by one or two digits and is not part of a
# longer separator chain (dates such as 12.05.2023, lists such as 1,2,3).
_DECIMAL_DOT = re.compile(r'(?<=\d)(?<!\.\d)(?<!\.\d\d)\.(?=\d{1,2}(?!\d|\.\d))')
_DECIMAL_COMMA = re.compile(r'(?<=\d)(?<!,\d)(?<!,\d\d),(?=\d{1,2}(?!\d|,\d))')


def fix_number_format(text, target_lang):
    """Rewrite digit separators in *text* to the target language's convention.

    Args:
        text: Translated text. Empty or ``None`` input is returned unchanged.
        target_lang: ``"Norwegian"`` selects space-grouped thousands with a
            decimal comma; any other value selects comma-grouped thousands
            with a decimal point.

    Returns:
        The text with thousands and decimal separators converted.

    NOTE(review): a dot followed by exactly three digits is treated as a
    thousands separator for Norwegian output, so an English-style decimal such
    as "0.125" becomes "0 125". That ambiguity is inherited from the original
    rules - confirm it is acceptable for this glossary domain.
    """
    if not text:
        return text

    if target_lang == "Norwegian":
        text = _GROUP_COMMA.sub(' ', text)    # 1,000,000 -> 1 000 000
        text = _GROUP_DOT.sub(' ', text)      # 1.000     -> 1 000
        text = _DECIMAL_DOT.sub(',', text)    # 3.5       -> 3,5
    else:
        text = _GROUP_SPACE.sub(',', text)    # 1 000 000 -> 1,000,000
        text = _DECIMAL_COMMA.sub('.', text)  # 3,5       -> 3.5

    return text
144
+
def find_source_terms_in_input(text, direction):
    """Return the glossary entries whose source term occurs in *text*.

    Args:
        text: Source-language input. Empty or ``None`` yields an empty list.
        direction: ``"en_to_no"`` selects ``TERMINOLOGY_EN_TO_NO``; any other
            value selects ``TERMINOLOGY_NO_TO_EN``.

    Returns:
        A list of ``(source_term, target_term)`` tuples, longest source term
        first. ``source_term`` is the glossary key exactly as stored.

    NOTE(review): matching is a case-insensitive substring test, so a short
    term also matches inside a longer word ("training" inside "restraining").
    Substring matching is kept so that inflected or compound forms still hit;
    tighten it if false positives become a problem.
    """
    if not text:
        return []

    if direction == "en_to_no":
        term_dict = TERMINOLOGY_EN_TO_NO
    else:
        term_dict = TERMINOLOGY_NO_TO_EN

    if not term_dict:
        return []

    haystack = text.lower()

    # Blank keys are skipped: "" is a substring of every string and would make
    # such an entry match every input. Keys are lowercased for the comparison
    # so glossary entries stored with capitals can still match.
    return [
        (source_term, term_dict[source_term])
        for source_term in sorted(term_dict, key=len, reverse=True)
        if source_term.strip() and source_term.lower() in haystack
    ]
 
 
 
 
 
 
 
 
 
 
 
 
163
 
def post_process_terminology(text, direction, found_terms, use_terminology):
    """Enforce glossary terms and number formatting on a finished translation.

    For every ``(source_term, target_term)`` pair in *found_terms* the
    translation is scanned for the target term and for the known wrong
    renderings listed under that source term in ``COMMON_ERRORS``. Each hit is
    replaced by the lowercased target term, capitalised when the replaced text
    started with an uppercase letter. Afterwards ``fix_number_format`` is
    applied for the output language.

    Args:
        text: Raw translation. Empty or ``None`` is returned unchanged.
        direction: ``"en_to_no"`` for Norwegian output; any other value is
            treated as English output.
        found_terms: Iterable of ``(source_term, target_term)`` tuples found in
            the input; ``None`` is treated as empty.
        use_terminology: When false, *text* is returned untouched (number
            formatting is skipped as well).

    Returns:
        The corrected translation.
    """
    if not use_terminology or not text:
        return text

    if direction == "en_to_no":
        error_dict = COMMON_ERRORS.get("en_to_no", {})
        output_language = "Norwegian"
    else:
        error_dict = COMMON_ERRORS.get("no_to_en", {})
        output_language = "English"

    result = text

    for source_term, target_term in found_terms or ():
        if not target_term:
            # An empty target would compile to a pattern that matches at every
            # position; there is nothing meaningful to enforce.
            continue

        canonical = target_term.lower()

        def preserve_case(match, canonical=canonical):
            # Keep a leading capital (e.g. sentence start) from the replaced text.
            replaced = match.group(0)
            if replaced and replaced[0].isupper():
                return canonical.capitalize()
            return canonical

        # The target term itself is processed first (case normalisation), then
        # each known wrong rendering, in the order they are listed.
        for variant in [canonical, *error_dict.get(source_term, [])]:
            if not variant:
                continue
            # (?<!\w) / (?!\w) behave like \b for ordinary words but also work
            # for terms that begin or end with punctuation such as "(BOP)".
            variant_pattern = re.compile(
                r'(?<!\w)' + re.escape(variant) + r'(?!\w)',
                re.IGNORECASE,
            )
            result = variant_pattern.sub(preserve_case, result)

    result = fix_number_format(result, output_language)

    return result
198
 
 
213
  else:
214
  return "Unsupported language pair", 0.0, []
215
 
216
+ found_terms = find_source_terms_in_input(text, direction)
 
217
 
218
  preset = QUALITY_PRESETS[quality_preset]
219
 
220
+ original_paragraphs = text.split('\n')
221
  final_translated_paragraphs = []
222
 
223
  for paragraph in original_paragraphs:
 
260
 
261
  raw_translation = '\n'.join(final_translated_paragraphs)
262
 
 
263
  corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
264
 
265
  elapsed_time = time.time() - start_time
 
281
 
282
  result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
283
 
284
+ terminology_status = f"with {terms_count} terms enforced" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
285
  time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
286
  return result, time_info
287
 
 
515
  )
516
 
517
  use_terminology = gr.Checkbox(
518
+ label="Enable Terminology Enforcement (POST)",
519
  value=True,
520
+ info=f"Uses {len(TERMINOLOGY_EN_TO_NO)} terms + error variants + number format fixing"
521
  )
522
 
523
  with gr.Row():
 
577
  elem_classes="time-info"
578
  )
579
 
580
+ gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • POST-only Terminology Processing</div>")
581
 
582
  with gr.Accordion("Example Sentences", open=True):
583
  with gr.Row():
 
633
  )
634
  run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
635
  run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
636
+
637
  gr.HTML(f"""
638
  <div class='disclaimer'>
639
  <strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary