entropy25 committed on
Commit
43d8748
·
verified ·
1 Parent(s): f876d44

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -182
app.py CHANGED
@@ -12,7 +12,6 @@ import re
12
  try:
13
  nltk.data.find('tokenizers/punkt')
14
  except LookupError:
15
- print("Downloading NLTK punkt tokenizer...")
16
  nltk.download('punkt')
17
  try:
18
  nltk.download('punkt_tab')
@@ -27,7 +26,7 @@ ADAPTER_NO_TO_EN = os.getenv("ADAPTER_NO_TO_EN", "entropy25/mt_no_en_oil")
27
 
28
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
29
 
30
- print("Loading shared base model with 8-bit quantization...")
31
  quantization_config = BitsAndBytesConfig(load_in_8bit=True)
32
 
33
  base_model = AutoModelForSeq2SeqLM.from_pretrained(
@@ -52,7 +51,6 @@ try:
52
  for entry in glossary_data:
53
  en_term = entry['en'].strip()
54
  no_term = entry['no'].strip()
55
-
56
  TERMINOLOGY_EN_TO_NO[en_term.lower()] = no_term
57
  TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
58
 
@@ -63,6 +61,16 @@ except Exception as e:
63
  TERMINOLOGY_EN_TO_NO = {}
64
  TERMINOLOGY_NO_TO_EN = {}
65
 
 
 
 
 
 
 
 
 
 
 
66
  COMMON_ERRORS = {
67
  "en_to_no": {
68
  "mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
@@ -80,56 +88,29 @@ COMMON_ERRORS = {
80
  }
81
  }
82
 
83
- QUALITY_PRESETS = {
84
- "Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
85
- "Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
86
- "Draft (Fastest)": {"num_beams": 2, "max_length": 128, "batch_size": 5}
87
- }
 
88
 
89
- QUALITY_TEST_CASES = {
90
- "en_to_no": [
91
- {
92
- "input": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
93
- "expected": "Slamvekt justert til 1,82 spesifikk tyngde ved 3 247 meters dybde.",
94
- "check": ["slamvekt", "1,82", "3 247"]
95
- },
96
- {
97
- "input": "Christmas tree rated for 10,000 psi working pressure.",
98
- "expected": "Juletre dimensjonert for 10 000 psi arbeidstrykk.",
99
- "check": ["juletre", "10 000", "psi"]
100
- },
101
- {
102
- "input": "H2S training required before site access.",
103
- "expected": "H2S-opplæring påkrevd før tilgang til området.",
104
- "check": ["H2S", "opplæring", "påkrevd"]
105
- },
106
- {
107
- "input": "Permeability is 250 millidarcy with 22 percent porosity.",
108
- "expected": "Permeabilitet er 250 millidarcy med 22 prosent porøsitet.",
109
- "check": ["permeabilitet", "250", "22"]
110
- }
111
- ],
112
- "no_to_en": [
113
- {
114
- "input": "Permeabilitet er 250 millidarcy med 22 prosent porøsitet.",
115
- "expected": "Permeability is 250 millidarcy with 22 percent porosity.",
116
- "check": ["permeability", "250", "22"]
117
- },
118
- {
119
- "input": "Subsea produksjonssystemet består av et vertikalt juletre.",
120
- "expected": "The subsea production system consists of a vertical Christmas tree.",
121
- "check": ["subsea", "Christmas tree", "vertical"]
122
- },
123
- {
124
- "input": "Slamvekt justert til 1,82 spesifikk tyngde ved 3 247 meters dybde.",
125
- "expected": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
126
- "check": ["mud weight", "1.82", "3,247"]
127
- }
128
  ]
129
- }
130
 
131
  MAX_FILE_SIZE = 1024 * 1024
132
  MAX_TEXT_LENGTH = 10000
 
 
 
 
 
 
 
133
 
134
  def fix_number_format(text, target_lang):
135
  if target_lang == "Norwegian":
@@ -139,7 +120,6 @@ def fix_number_format(text, target_lang):
139
  else:
140
  text = re.sub(r'(\d)\s(\d{3})', r'\1,\2', text)
141
  text = re.sub(r'(\d),(\d{1,2})(?=\s|$|[^\d])', r'\1.\2', text)
142
-
143
  return text
144
 
145
  def find_source_terms_in_input(text, direction):
@@ -166,33 +146,44 @@ def post_process_terminology(text, direction, found_terms, use_terminology):
166
  return text
167
 
168
  if direction == "en_to_no":
169
- error_dict = COMMON_ERRORS.get("en_to_no", {})
 
170
  else:
171
- error_dict = COMMON_ERRORS.get("no_to_en", {})
 
172
 
173
  result = text
174
 
175
  for source_term, target_term in found_terms:
176
-
177
  def preserve_case(match):
178
  original = match.group(0)
179
  if original and original[0].isupper():
180
  return target_term.capitalize()
181
  return target_term.lower()
182
 
183
- source_pattern = re.compile(r'\b' + re.escape(source_term) + r'\b', re.IGNORECASE)
184
- result = source_pattern.sub(preserve_case, result)
185
 
186
- if source_term in error_dict:
187
- for error_variant in error_dict[source_term]:
188
- error_pattern = re.compile(r'\b' + re.escape(error_variant) + r'\b', re.IGNORECASE)
189
  result = error_pattern.sub(preserve_case, result)
190
 
191
  result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
192
 
193
  return result
194
 
195
- def translate_core(text, source_lang, target_lang, quality_preset, use_terminology=True):
 
 
 
 
 
 
 
 
 
 
 
196
  if not text.strip() or source_lang == target_lang:
197
  return text, 0.0, []
198
 
@@ -211,8 +202,6 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
211
 
212
  found_terms = find_source_terms_in_input(text, direction)
213
 
214
- preset = QUALITY_PRESETS[quality_preset]
215
-
216
  original_paragraphs = text.split('\n')
217
  final_translated_paragraphs = []
218
 
@@ -221,20 +210,18 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
221
  final_translated_paragraphs.append("")
222
  continue
223
 
224
- sentences = sent_tokenize(paragraph)
225
-
226
- batch_size = preset["batch_size"]
227
  paragraph_results = []
228
 
229
- for i in range(0, len(sentences), batch_size):
230
- batch = sentences[i:i+batch_size]
231
 
232
  inputs = tokenizer(
233
  batch,
234
  return_tensors="pt",
235
  padding=True,
236
  truncation=True,
237
- max_length=preset["max_length"]
238
  )
239
 
240
  if hasattr(model, 'device'):
@@ -244,8 +231,8 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
244
  outputs = model.generate(
245
  **inputs,
246
  forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
247
- max_length=preset["max_length"],
248
- num_beams=preset["num_beams"],
249
  early_stopping=True
250
  )
251
 
@@ -255,110 +242,62 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
255
  final_translated_paragraphs.append(" ".join(paragraph_results))
256
 
257
  raw_translation = '\n'.join(final_translated_paragraphs)
258
-
259
  corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
260
-
261
  elapsed_time = time.time() - start_time
262
 
263
  return corrected_translation, elapsed_time, found_terms
264
 
265
  @lru_cache(maxsize=512)
266
- def translate_cached(text, source_lang, target_lang, quality_preset, use_terminology):
267
- result, elapsed, terms = translate_core(text, source_lang, target_lang, quality_preset, use_terminology)
268
  return result, elapsed, len(terms)
269
 
270
- def translate(text, source_lang, target_lang, quality_preset, use_terminology):
271
  try:
272
  if len(text) > MAX_TEXT_LENGTH:
273
- return f"Error: Text too long (max {MAX_TEXT_LENGTH:,} characters)", ""
274
 
275
  if not text.strip():
276
- return "", ""
277
 
278
- result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
279
 
280
- terminology_status = f"with {terms_count} terms enforced" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
281
- time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
282
- return result, time_info
283
 
284
- except Exception as e:
285
- return f"Translation error: {str(e)}. Please try again.", ""
286
-
287
- def run_quality_tests(use_terminology):
288
- results = []
289
- results.append("=== QUALITY REGRESSION TEST ===\n")
290
- results.append(f"Terminology Enforcement: {'ENABLED' if use_terminology else 'DISABLED'}\n")
291
-
292
- for direction, test_cases in QUALITY_TEST_CASES.items():
293
- if direction == "en_to_no":
294
- src_lang, tgt_lang = "English", "Norwegian"
295
- else:
296
- src_lang, tgt_lang = "Norwegian", "English"
297
 
298
- results.append(f"\n{src_lang} to {tgt_lang}\n")
299
 
300
- for i, case in enumerate(test_cases, 1):
301
- translation, _, found_terms = translate_core(case["input"], src_lang, tgt_lang, "Professional (Best Quality)", use_terminology)
302
-
303
- passed_checks = []
304
- failed_checks = []
305
-
306
- for keyword in case["check"]:
307
- if keyword.lower() in translation.lower():
308
- passed_checks.append(keyword)
309
- else:
310
- failed_checks.append(keyword)
311
-
312
- status = "✅ PASS" if not failed_checks else "⚠️ CHECK"
313
-
314
- results.append(f"\nTest {i}: {status}")
315
- results.append(f"Input: {case['input']}")
316
- results.append(f"Expected: {case['expected']}")
317
- results.append(f"Got: {translation}")
318
-
319
- if use_terminology and found_terms:
320
- results.append(f"Terms found: {len(found_terms)}")
321
-
322
- if passed_checks:
323
- results.append(f"✓ Found: {', '.join(passed_checks)}")
324
- if failed_checks:
325
- results.append(f"✗ Missing: {', '.join(failed_checks)}")
326
-
327
- results.append("\n=== TEST COMPLETE ===")
328
-
329
- pass_count = sum(1 for r in results if "✅ PASS" in r)
330
- check_count = sum(1 for r in results if "⚠️ CHECK" in r)
331
- total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
332
-
333
- results.insert(2, f"\n📊 Score: {pass_count}/{total} passed, {check_count}/{total} need review\n")
334
-
335
- return '\n'.join(results)
336
 
337
  def swap_languages(src, tgt, input_txt, output_txt):
338
  return tgt, src, output_txt, input_txt
339
 
340
  def load_file(file):
341
  if file is None:
342
- return "", ""
343
 
344
  try:
345
  if os.path.getsize(file.name) > MAX_FILE_SIZE:
346
- return "Error: File too large (max 1MB)", ""
347
 
348
  with open(file.name, 'r', encoding='utf-8') as f:
349
  content = f.read()
350
  if len(content) > MAX_TEXT_LENGTH:
351
- return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", ""
352
- return content, ""
353
  except:
354
  try:
355
  with open(file.name, 'r', encoding='latin-1') as f:
356
  content = f.read()
357
  if len(content) > MAX_TEXT_LENGTH:
358
- return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", ""
359
- return content, ""
360
  except Exception as e:
361
- return f"Error reading file: {str(e)}", ""
362
 
363
  EXAMPLES_EN = {
364
  "drilling_short": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
@@ -481,11 +420,6 @@ custom_css = """
481
  font-size: 13px !important;
482
  padding: 20px !important;
483
  }
484
- .quality-selector {
485
- background: #f0f7ff !important;
486
- border: 1px solid #0f6fff !important;
487
- border-radius: 4px !important;
488
- }
489
  .disclaimer {
490
  background: #fff9e6 !important;
491
  border-left: 4px solid #ff8c00 !important;
@@ -502,18 +436,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
502
  gr.HTML("<div style='height: 20px'></div>")
503
 
504
  with gr.Row():
505
- quality_preset = gr.Radio(
506
- choices=list(QUALITY_PRESETS.keys()),
507
- value="Professional (Best Quality)",
508
- label="Translation Quality",
509
- info="Professional: beam=3, max=256 | Balanced: beam=2, max=256 | Draft: beam=2, max=128",
510
- elem_classes="quality-selector"
511
- )
512
-
513
  use_terminology = gr.Checkbox(
514
- label="Enable Terminology Enforcement (POST)",
515
  value=True,
516
- info=f"Uses {len(TERMINOLOGY_EN_TO_NO)} terms + error variants + number format fixing"
517
  )
518
 
519
  with gr.Row():
@@ -552,14 +478,18 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
552
  elem_classes="lang-selector",
553
  scale=1
554
  )
555
- output_text = gr.Textbox(
556
  placeholder="Translation",
557
  show_label=False,
558
  lines=8,
559
  max_lines=20,
560
  container=False,
561
  elem_classes="text-area",
562
- interactive=False
 
 
 
 
563
  )
564
 
565
  with gr.Row():
@@ -573,7 +503,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
573
  elem_classes="time-info"
574
  )
575
 
576
- gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • POST-only Terminology Processing</div>")
577
 
578
  with gr.Accordion("Example Sentences", open=True):
579
  with gr.Row():
@@ -587,18 +517,18 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
587
  use_example_btn = gr.Button("Use This Example", variant="primary", size="sm")
588
 
589
  with gr.Row():
590
- btn1 = gr.Button("Drilling (Short)", size="sm")
591
- btn2 = gr.Button("Drilling (Long)", size="sm")
592
- btn3 = gr.Button("Reservoir (Short)", size="sm")
593
- btn4 = gr.Button("Reservoir (Long)", size="sm")
594
- btn5 = gr.Button("Subsea (Short)", size="sm")
595
 
596
  with gr.Row():
597
- btn6 = gr.Button("Subsea (Long)", size="sm")
598
- btn7 = gr.Button("Seismic (Short)", size="sm")
599
- btn8 = gr.Button("Seismic (Long)", size="sm")
600
- btn9 = gr.Button("Safety (Short)", size="sm")
601
- btn10 = gr.Button("Safety (Long)", size="sm")
602
 
603
  btn1.click(lambda sl: get_example("drilling_short", sl), inputs=[source_lang], outputs=example_text)
604
  btn2.click(lambda sl: get_example("drilling_long", sl), inputs=[source_lang], outputs=example_text)
@@ -619,24 +549,14 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
619
  file_types=[".txt"],
620
  type="filepath"
621
  )
622
-
623
- with gr.Accordion("Quality Test (Developer)", open=False):
624
- test_output = gr.Textbox(
625
- label="Test Results",
626
- lines=20,
627
- max_lines=30,
628
- interactive=False
629
- )
630
- run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
631
- run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
632
 
633
  gr.HTML(f"""
634
  <div class='disclaimer'>
635
- <strong>Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary
636
  <br>
637
- <strong>Privacy & Compliance:</strong> Fine-tuned on public domain data. Local inference ensures GDPR compliance.
638
  <br>
639
- <strong>Technical Features:</strong> Sentence-level batching prevents truncation. Post-processing ensures terminology consistency.
640
  </div>
641
  """)
642
 
@@ -644,16 +564,16 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
644
 
645
  translate_btn.click(
646
  fn=translate,
647
- inputs=[input_text, source_lang, target_lang, quality_preset, use_terminology],
648
- outputs=[output_text, time_display]
649
  )
650
 
651
  swap_btn.click(
652
  fn=swap_languages,
653
- inputs=[source_lang, target_lang, input_text, output_text],
654
- outputs=[source_lang, target_lang, input_text, output_text]
655
  )
656
 
657
- file_input.change(fn=load_file, inputs=file_input, outputs=[input_text, time_display])
658
 
659
  demo.queue().launch()
 
12
  try:
13
  nltk.data.find('tokenizers/punkt')
14
  except LookupError:
 
15
  nltk.download('punkt')
16
  try:
17
  nltk.download('punkt_tab')
 
26
 
27
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
28
 
29
+ print("Loading model with 8-bit quantization...")
30
  quantization_config = BitsAndBytesConfig(load_in_8bit=True)
31
 
32
  base_model = AutoModelForSeq2SeqLM.from_pretrained(
 
51
  for entry in glossary_data:
52
  en_term = entry['en'].strip()
53
  no_term = entry['no'].strip()
 
54
  TERMINOLOGY_EN_TO_NO[en_term.lower()] = no_term
55
  TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
56
 
 
61
  TERMINOLOGY_EN_TO_NO = {}
62
  TERMINOLOGY_NO_TO_EN = {}
63
 
64
+ COMPILED_PATTERNS_EN_TO_NO = {
65
+ term: re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
66
+ for term in TERMINOLOGY_EN_TO_NO.keys()
67
+ }
68
+
69
+ COMPILED_PATTERNS_NO_TO_EN = {
70
+ term: re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
71
+ for term in TERMINOLOGY_NO_TO_EN.keys()
72
+ }
73
+
74
  COMMON_ERRORS = {
75
  "en_to_no": {
76
  "mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
 
88
  }
89
  }
90
 
91
+ COMPILED_ERRORS_EN_TO_NO = {}
92
+ for source_term, error_variants in COMMON_ERRORS["en_to_no"].items():
93
+ COMPILED_ERRORS_EN_TO_NO[source_term] = [
94
+ re.compile(r'\b' + re.escape(variant) + r'\b', re.IGNORECASE)
95
+ for variant in error_variants
96
+ ]
97
 
98
+ COMPILED_ERRORS_NO_TO_EN = {}
99
+ for source_term, error_variants in COMMON_ERRORS["no_to_en"].items():
100
+ COMPILED_ERRORS_NO_TO_EN[source_term] = [
101
+ re.compile(r'\b' + re.escape(variant) + r'\b', re.IGNORECASE)
102
+ for variant in error_variants
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  ]
 
104
 
105
  MAX_FILE_SIZE = 1024 * 1024
106
  MAX_TEXT_LENGTH = 10000
107
+ BATCH_SIZE = 10
108
+ NUM_BEAMS = 3
109
+ MAX_LENGTH = 256
110
+
111
+ @lru_cache(maxsize=512)
112
+ def cached_sent_tokenize(text):
113
+ return tuple(sent_tokenize(text))
114
 
115
  def fix_number_format(text, target_lang):
116
  if target_lang == "Norwegian":
 
120
  else:
121
  text = re.sub(r'(\d)\s(\d{3})', r'\1,\2', text)
122
  text = re.sub(r'(\d),(\d{1,2})(?=\s|$|[^\d])', r'\1.\2', text)
 
123
  return text
124
 
125
  def find_source_terms_in_input(text, direction):
 
146
  return text
147
 
148
  if direction == "en_to_no":
149
+ compiled_patterns = COMPILED_PATTERNS_EN_TO_NO
150
+ compiled_errors = COMPILED_ERRORS_EN_TO_NO
151
  else:
152
+ compiled_patterns = COMPILED_PATTERNS_NO_TO_EN
153
+ compiled_errors = COMPILED_ERRORS_NO_TO_EN
154
 
155
  result = text
156
 
157
  for source_term, target_term in found_terms:
 
158
  def preserve_case(match):
159
  original = match.group(0)
160
  if original and original[0].isupper():
161
  return target_term.capitalize()
162
  return target_term.lower()
163
 
164
+ if source_term in compiled_patterns:
165
+ result = compiled_patterns[source_term].sub(preserve_case, result)
166
 
167
+ if source_term in compiled_errors:
168
+ for error_pattern in compiled_errors[source_term]:
 
169
  result = error_pattern.sub(preserve_case, result)
170
 
171
  result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
172
 
173
  return result
174
 
175
+ def highlight_terminology(text, found_terms):
176
+ if not found_terms:
177
+ return text
178
+
179
+ highlighted = text
180
+ for source_term, target_term in found_terms:
181
+ pattern = re.compile(r'\b(' + re.escape(target_term) + r')\b', re.IGNORECASE)
182
+ highlighted = pattern.sub(r'<mark style="background-color: #fff3cd; padding: 2px 4px; border-radius: 3px;">\1</mark>', highlighted)
183
+
184
+ return highlighted
185
+
186
+ def translate_core(text, source_lang, target_lang, use_terminology=True):
187
  if not text.strip() or source_lang == target_lang:
188
  return text, 0.0, []
189
 
 
202
 
203
  found_terms = find_source_terms_in_input(text, direction)
204
 
 
 
205
  original_paragraphs = text.split('\n')
206
  final_translated_paragraphs = []
207
 
 
210
  final_translated_paragraphs.append("")
211
  continue
212
 
213
+ sentences = cached_sent_tokenize(paragraph)
 
 
214
  paragraph_results = []
215
 
216
+ for i in range(0, len(sentences), BATCH_SIZE):
217
+ batch = sentences[i:i+BATCH_SIZE]
218
 
219
  inputs = tokenizer(
220
  batch,
221
  return_tensors="pt",
222
  padding=True,
223
  truncation=True,
224
+ max_length=MAX_LENGTH
225
  )
226
 
227
  if hasattr(model, 'device'):
 
231
  outputs = model.generate(
232
  **inputs,
233
  forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
234
+ max_length=MAX_LENGTH,
235
+ num_beams=NUM_BEAMS,
236
  early_stopping=True
237
  )
238
 
 
242
  final_translated_paragraphs.append(" ".join(paragraph_results))
243
 
244
  raw_translation = '\n'.join(final_translated_paragraphs)
 
245
  corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
 
246
  elapsed_time = time.time() - start_time
247
 
248
  return corrected_translation, elapsed_time, found_terms
249
 
250
  @lru_cache(maxsize=512)
251
+ def translate_cached(text, source_lang, target_lang, use_terminology):
252
+ result, elapsed, terms = translate_core(text, source_lang, target_lang, use_terminology)
253
  return result, elapsed, len(terms)
254
 
255
+ def translate(text, source_lang, target_lang, use_terminology):
256
  try:
257
  if len(text) > MAX_TEXT_LENGTH:
258
+ return f"Error: Text too long (max {MAX_TEXT_LENGTH:,} characters)", "", ""
259
 
260
  if not text.strip():
261
+ return "", "", ""
262
 
263
+ result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, use_terminology)
264
 
265
+ terminology_status = f"{terms_count} terms enforced" if use_terminology and terms_count > 0 else "No terminology enforcement" if not use_terminology else "No terms found"
266
+ time_info = f"Completed in {elapsed:.2f}s | {terminology_status}"
 
267
 
268
+ found_terms = find_source_terms_in_input(text, "en_to_no" if source_lang == "English" else "no_to_en")
269
+ highlighted_result = highlight_terminology(result, found_terms) if use_terminology else result
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+ return result, highlighted_result, time_info
272
 
273
+ except Exception as e:
274
+ return f"Translation error: {str(e)}", "", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
  def swap_languages(src, tgt, input_txt, output_txt):
277
  return tgt, src, output_txt, input_txt
278
 
279
  def load_file(file):
280
  if file is None:
281
+ return "", "", ""
282
 
283
  try:
284
  if os.path.getsize(file.name) > MAX_FILE_SIZE:
285
+ return "Error: File too large (max 1MB)", "", ""
286
 
287
  with open(file.name, 'r', encoding='utf-8') as f:
288
  content = f.read()
289
  if len(content) > MAX_TEXT_LENGTH:
290
+ return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", "", ""
291
+ return content, "", ""
292
  except:
293
  try:
294
  with open(file.name, 'r', encoding='latin-1') as f:
295
  content = f.read()
296
  if len(content) > MAX_TEXT_LENGTH:
297
+ return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", "", ""
298
+ return content, "", ""
299
  except Exception as e:
300
+ return f"Error reading file: {str(e)}", "", ""
301
 
302
  EXAMPLES_EN = {
303
  "drilling_short": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
 
420
  font-size: 13px !important;
421
  padding: 20px !important;
422
  }
 
 
 
 
 
423
  .disclaimer {
424
  background: #fff9e6 !important;
425
  border-left: 4px solid #ff8c00 !important;
 
436
  gr.HTML("<div style='height: 20px'></div>")
437
 
438
  with gr.Row():
 
 
 
 
 
 
 
 
439
  use_terminology = gr.Checkbox(
440
+ label="Enable Terminology Enforcement",
441
  value=True,
442
+ info=f"Post-processing with {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms"
443
  )
444
 
445
  with gr.Row():
 
478
  elem_classes="lang-selector",
479
  scale=1
480
  )
481
+ output_text_plain = gr.Textbox(
482
  placeholder="Translation",
483
  show_label=False,
484
  lines=8,
485
  max_lines=20,
486
  container=False,
487
  elem_classes="text-area",
488
+ interactive=False,
489
+ visible=False
490
+ )
491
+ output_text_html = gr.HTML(
492
+ value="<div style='padding: 20px; min-height: 200px; font-size: 17px; line-height: 1.7;'>Translation</div>"
493
  )
494
 
495
  with gr.Row():
 
503
  elem_classes="time-info"
504
  )
505
 
506
+ gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • Terminology Highlighting</div>")
507
 
508
  with gr.Accordion("Example Sentences", open=True):
509
  with gr.Row():
 
517
  use_example_btn = gr.Button("Use This Example", variant="primary", size="sm")
518
 
519
  with gr.Row():
520
+ btn1 = gr.Button("Drilling Short", size="sm")
521
+ btn2 = gr.Button("Drilling Long", size="sm")
522
+ btn3 = gr.Button("Reservoir Short", size="sm")
523
+ btn4 = gr.Button("Reservoir Long", size="sm")
524
+ btn5 = gr.Button("Subsea Short", size="sm")
525
 
526
  with gr.Row():
527
+ btn6 = gr.Button("Subsea Long", size="sm")
528
+ btn7 = gr.Button("Seismic Short", size="sm")
529
+ btn8 = gr.Button("Seismic Long", size="sm")
530
+ btn9 = gr.Button("Safety Short", size="sm")
531
+ btn10 = gr.Button("Safety Long", size="sm")
532
 
533
  btn1.click(lambda sl: get_example("drilling_short", sl), inputs=[source_lang], outputs=example_text)
534
  btn2.click(lambda sl: get_example("drilling_long", sl), inputs=[source_lang], outputs=example_text)
 
549
  file_types=[".txt"],
550
  type="filepath"
551
  )
 
 
 
 
 
 
 
 
 
 
552
 
553
  gr.HTML(f"""
554
  <div class='disclaimer'>
555
+ <strong>Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms with automatic highlighting
556
  <br>
557
+ <strong>Privacy & Compliance:</strong> Local inference ensures GDPR compliance
558
  <br>
559
+ <strong>Technical Features:</strong> Optimized batch processing with pre-compiled regex patterns
560
  </div>
561
  """)
562
 
 
564
 
565
  translate_btn.click(
566
  fn=translate,
567
+ inputs=[input_text, source_lang, target_lang, use_terminology],
568
+ outputs=[output_text_plain, output_text_html, time_display]
569
  )
570
 
571
  swap_btn.click(
572
  fn=swap_languages,
573
+ inputs=[source_lang, target_lang, input_text, output_text_plain],
574
+ outputs=[source_lang, target_lang, input_text, output_text_plain]
575
  )
576
 
577
+ file_input.change(fn=load_file, inputs=file_input, outputs=[input_text, output_text_html, time_display])
578
 
579
  demo.queue().launch()