Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,6 @@ import re
|
|
| 12 |
try:
|
| 13 |
nltk.data.find('tokenizers/punkt')
|
| 14 |
except LookupError:
|
| 15 |
-
print("Downloading NLTK punkt tokenizer...")
|
| 16 |
nltk.download('punkt')
|
| 17 |
try:
|
| 18 |
nltk.download('punkt_tab')
|
|
@@ -27,7 +26,7 @@ ADAPTER_NO_TO_EN = os.getenv("ADAPTER_NO_TO_EN", "entropy25/mt_no_en_oil")
|
|
| 27 |
|
| 28 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
| 29 |
|
| 30 |
-
print("Loading
|
| 31 |
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
| 32 |
|
| 33 |
base_model = AutoModelForSeq2SeqLM.from_pretrained(
|
|
@@ -52,7 +51,6 @@ try:
|
|
| 52 |
for entry in glossary_data:
|
| 53 |
en_term = entry['en'].strip()
|
| 54 |
no_term = entry['no'].strip()
|
| 55 |
-
|
| 56 |
TERMINOLOGY_EN_TO_NO[en_term.lower()] = no_term
|
| 57 |
TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
|
| 58 |
|
|
@@ -63,6 +61,16 @@ except Exception as e:
|
|
| 63 |
TERMINOLOGY_EN_TO_NO = {}
|
| 64 |
TERMINOLOGY_NO_TO_EN = {}
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
COMMON_ERRORS = {
|
| 67 |
"en_to_no": {
|
| 68 |
"mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
|
|
@@ -80,56 +88,29 @@ COMMON_ERRORS = {
|
|
| 80 |
}
|
| 81 |
}
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
"check": ["slamvekt", "1,82", "3 247"]
|
| 95 |
-
},
|
| 96 |
-
{
|
| 97 |
-
"input": "Christmas tree rated for 10,000 psi working pressure.",
|
| 98 |
-
"expected": "Juletre dimensjonert for 10 000 psi arbeidstrykk.",
|
| 99 |
-
"check": ["juletre", "10 000", "psi"]
|
| 100 |
-
},
|
| 101 |
-
{
|
| 102 |
-
"input": "H2S training required before site access.",
|
| 103 |
-
"expected": "H2S-opplæring påkrevd før tilgang til området.",
|
| 104 |
-
"check": ["H2S", "opplæring", "påkrevd"]
|
| 105 |
-
},
|
| 106 |
-
{
|
| 107 |
-
"input": "Permeability is 250 millidarcy with 22 percent porosity.",
|
| 108 |
-
"expected": "Permeabilitet er 250 millidarcy med 22 prosent porøsitet.",
|
| 109 |
-
"check": ["permeabilitet", "250", "22"]
|
| 110 |
-
}
|
| 111 |
-
],
|
| 112 |
-
"no_to_en": [
|
| 113 |
-
{
|
| 114 |
-
"input": "Permeabilitet er 250 millidarcy med 22 prosent porøsitet.",
|
| 115 |
-
"expected": "Permeability is 250 millidarcy with 22 percent porosity.",
|
| 116 |
-
"check": ["permeability", "250", "22"]
|
| 117 |
-
},
|
| 118 |
-
{
|
| 119 |
-
"input": "Subsea produksjonssystemet består av et vertikalt juletre.",
|
| 120 |
-
"expected": "The subsea production system consists of a vertical Christmas tree.",
|
| 121 |
-
"check": ["subsea", "Christmas tree", "vertical"]
|
| 122 |
-
},
|
| 123 |
-
{
|
| 124 |
-
"input": "Slamvekt justert til 1,82 spesifikk tyngde ved 3 247 meters dybde.",
|
| 125 |
-
"expected": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
|
| 126 |
-
"check": ["mud weight", "1.82", "3,247"]
|
| 127 |
-
}
|
| 128 |
]
|
| 129 |
-
}
|
| 130 |
|
| 131 |
MAX_FILE_SIZE = 1024 * 1024
|
| 132 |
MAX_TEXT_LENGTH = 10000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
def fix_number_format(text, target_lang):
|
| 135 |
if target_lang == "Norwegian":
|
|
@@ -139,7 +120,6 @@ def fix_number_format(text, target_lang):
|
|
| 139 |
else:
|
| 140 |
text = re.sub(r'(\d)\s(\d{3})', r'\1,\2', text)
|
| 141 |
text = re.sub(r'(\d),(\d{1,2})(?=\s|$|[^\d])', r'\1.\2', text)
|
| 142 |
-
|
| 143 |
return text
|
| 144 |
|
| 145 |
def find_source_terms_in_input(text, direction):
|
|
@@ -166,33 +146,44 @@ def post_process_terminology(text, direction, found_terms, use_terminology):
|
|
| 166 |
return text
|
| 167 |
|
| 168 |
if direction == "en_to_no":
|
| 169 |
-
|
|
|
|
| 170 |
else:
|
| 171 |
-
|
|
|
|
| 172 |
|
| 173 |
result = text
|
| 174 |
|
| 175 |
for source_term, target_term in found_terms:
|
| 176 |
-
|
| 177 |
def preserve_case(match):
|
| 178 |
original = match.group(0)
|
| 179 |
if original and original[0].isupper():
|
| 180 |
return target_term.capitalize()
|
| 181 |
return target_term.lower()
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
|
| 186 |
-
if source_term in
|
| 187 |
-
for
|
| 188 |
-
error_pattern = re.compile(r'\b' + re.escape(error_variant) + r'\b', re.IGNORECASE)
|
| 189 |
result = error_pattern.sub(preserve_case, result)
|
| 190 |
|
| 191 |
result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
|
| 192 |
|
| 193 |
return result
|
| 194 |
|
| 195 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
if not text.strip() or source_lang == target_lang:
|
| 197 |
return text, 0.0, []
|
| 198 |
|
|
@@ -211,8 +202,6 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
|
|
| 211 |
|
| 212 |
found_terms = find_source_terms_in_input(text, direction)
|
| 213 |
|
| 214 |
-
preset = QUALITY_PRESETS[quality_preset]
|
| 215 |
-
|
| 216 |
original_paragraphs = text.split('\n')
|
| 217 |
final_translated_paragraphs = []
|
| 218 |
|
|
@@ -221,20 +210,18 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
|
|
| 221 |
final_translated_paragraphs.append("")
|
| 222 |
continue
|
| 223 |
|
| 224 |
-
sentences =
|
| 225 |
-
|
| 226 |
-
batch_size = preset["batch_size"]
|
| 227 |
paragraph_results = []
|
| 228 |
|
| 229 |
-
for i in range(0, len(sentences),
|
| 230 |
-
batch = sentences[i:i+
|
| 231 |
|
| 232 |
inputs = tokenizer(
|
| 233 |
batch,
|
| 234 |
return_tensors="pt",
|
| 235 |
padding=True,
|
| 236 |
truncation=True,
|
| 237 |
-
max_length=
|
| 238 |
)
|
| 239 |
|
| 240 |
if hasattr(model, 'device'):
|
|
@@ -244,8 +231,8 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
|
|
| 244 |
outputs = model.generate(
|
| 245 |
**inputs,
|
| 246 |
forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
|
| 247 |
-
max_length=
|
| 248 |
-
num_beams=
|
| 249 |
early_stopping=True
|
| 250 |
)
|
| 251 |
|
|
@@ -255,110 +242,62 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
|
|
| 255 |
final_translated_paragraphs.append(" ".join(paragraph_results))
|
| 256 |
|
| 257 |
raw_translation = '\n'.join(final_translated_paragraphs)
|
| 258 |
-
|
| 259 |
corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
|
| 260 |
-
|
| 261 |
elapsed_time = time.time() - start_time
|
| 262 |
|
| 263 |
return corrected_translation, elapsed_time, found_terms
|
| 264 |
|
| 265 |
@lru_cache(maxsize=512)
|
| 266 |
-
def translate_cached(text, source_lang, target_lang,
|
| 267 |
-
result, elapsed, terms = translate_core(text, source_lang, target_lang,
|
| 268 |
return result, elapsed, len(terms)
|
| 269 |
|
| 270 |
-
def translate(text, source_lang, target_lang,
|
| 271 |
try:
|
| 272 |
if len(text) > MAX_TEXT_LENGTH:
|
| 273 |
-
return f"Error: Text too long (max {MAX_TEXT_LENGTH:,} characters)", ""
|
| 274 |
|
| 275 |
if not text.strip():
|
| 276 |
-
return "", ""
|
| 277 |
|
| 278 |
-
result, elapsed, terms_count = translate_cached(text, source_lang, target_lang,
|
| 279 |
|
| 280 |
-
terminology_status = f"
|
| 281 |
-
time_info = f"
|
| 282 |
-
return result, time_info
|
| 283 |
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
def run_quality_tests(use_terminology):
|
| 288 |
-
results = []
|
| 289 |
-
results.append("=== QUALITY REGRESSION TEST ===\n")
|
| 290 |
-
results.append(f"Terminology Enforcement: {'ENABLED' if use_terminology else 'DISABLED'}\n")
|
| 291 |
-
|
| 292 |
-
for direction, test_cases in QUALITY_TEST_CASES.items():
|
| 293 |
-
if direction == "en_to_no":
|
| 294 |
-
src_lang, tgt_lang = "English", "Norwegian"
|
| 295 |
-
else:
|
| 296 |
-
src_lang, tgt_lang = "Norwegian", "English"
|
| 297 |
|
| 298 |
-
|
| 299 |
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
passed_checks = []
|
| 304 |
-
failed_checks = []
|
| 305 |
-
|
| 306 |
-
for keyword in case["check"]:
|
| 307 |
-
if keyword.lower() in translation.lower():
|
| 308 |
-
passed_checks.append(keyword)
|
| 309 |
-
else:
|
| 310 |
-
failed_checks.append(keyword)
|
| 311 |
-
|
| 312 |
-
status = "✅ PASS" if not failed_checks else "⚠️ CHECK"
|
| 313 |
-
|
| 314 |
-
results.append(f"\nTest {i}: {status}")
|
| 315 |
-
results.append(f"Input: {case['input']}")
|
| 316 |
-
results.append(f"Expected: {case['expected']}")
|
| 317 |
-
results.append(f"Got: {translation}")
|
| 318 |
-
|
| 319 |
-
if use_terminology and found_terms:
|
| 320 |
-
results.append(f"Terms found: {len(found_terms)}")
|
| 321 |
-
|
| 322 |
-
if passed_checks:
|
| 323 |
-
results.append(f"✓ Found: {', '.join(passed_checks)}")
|
| 324 |
-
if failed_checks:
|
| 325 |
-
results.append(f"✗ Missing: {', '.join(failed_checks)}")
|
| 326 |
-
|
| 327 |
-
results.append("\n=== TEST COMPLETE ===")
|
| 328 |
-
|
| 329 |
-
pass_count = sum(1 for r in results if "✅ PASS" in r)
|
| 330 |
-
check_count = sum(1 for r in results if "⚠️ CHECK" in r)
|
| 331 |
-
total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
|
| 332 |
-
|
| 333 |
-
results.insert(2, f"\n📊 Score: {pass_count}/{total} passed, {check_count}/{total} need review\n")
|
| 334 |
-
|
| 335 |
-
return '\n'.join(results)
|
| 336 |
|
| 337 |
def swap_languages(src, tgt, input_txt, output_txt):
|
| 338 |
return tgt, src, output_txt, input_txt
|
| 339 |
|
| 340 |
def load_file(file):
|
| 341 |
if file is None:
|
| 342 |
-
return "", ""
|
| 343 |
|
| 344 |
try:
|
| 345 |
if os.path.getsize(file.name) > MAX_FILE_SIZE:
|
| 346 |
-
return "Error: File too large (max 1MB)", ""
|
| 347 |
|
| 348 |
with open(file.name, 'r', encoding='utf-8') as f:
|
| 349 |
content = f.read()
|
| 350 |
if len(content) > MAX_TEXT_LENGTH:
|
| 351 |
-
return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", ""
|
| 352 |
-
return content, ""
|
| 353 |
except:
|
| 354 |
try:
|
| 355 |
with open(file.name, 'r', encoding='latin-1') as f:
|
| 356 |
content = f.read()
|
| 357 |
if len(content) > MAX_TEXT_LENGTH:
|
| 358 |
-
return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", ""
|
| 359 |
-
return content, ""
|
| 360 |
except Exception as e:
|
| 361 |
-
return f"Error reading file: {str(e)}", ""
|
| 362 |
|
| 363 |
EXAMPLES_EN = {
|
| 364 |
"drilling_short": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
|
|
@@ -481,11 +420,6 @@ custom_css = """
|
|
| 481 |
font-size: 13px !important;
|
| 482 |
padding: 20px !important;
|
| 483 |
}
|
| 484 |
-
.quality-selector {
|
| 485 |
-
background: #f0f7ff !important;
|
| 486 |
-
border: 1px solid #0f6fff !important;
|
| 487 |
-
border-radius: 4px !important;
|
| 488 |
-
}
|
| 489 |
.disclaimer {
|
| 490 |
background: #fff9e6 !important;
|
| 491 |
border-left: 4px solid #ff8c00 !important;
|
|
@@ -502,18 +436,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 502 |
gr.HTML("<div style='height: 20px'></div>")
|
| 503 |
|
| 504 |
with gr.Row():
|
| 505 |
-
quality_preset = gr.Radio(
|
| 506 |
-
choices=list(QUALITY_PRESETS.keys()),
|
| 507 |
-
value="Professional (Best Quality)",
|
| 508 |
-
label="Translation Quality",
|
| 509 |
-
info="Professional: beam=3, max=256 | Balanced: beam=2, max=256 | Draft: beam=2, max=128",
|
| 510 |
-
elem_classes="quality-selector"
|
| 511 |
-
)
|
| 512 |
-
|
| 513 |
use_terminology = gr.Checkbox(
|
| 514 |
-
label="Enable Terminology Enforcement
|
| 515 |
value=True,
|
| 516 |
-
info=f"
|
| 517 |
)
|
| 518 |
|
| 519 |
with gr.Row():
|
|
@@ -552,14 +478,18 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 552 |
elem_classes="lang-selector",
|
| 553 |
scale=1
|
| 554 |
)
|
| 555 |
-
|
| 556 |
placeholder="Translation",
|
| 557 |
show_label=False,
|
| 558 |
lines=8,
|
| 559 |
max_lines=20,
|
| 560 |
container=False,
|
| 561 |
elem_classes="text-area",
|
| 562 |
-
interactive=False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
)
|
| 564 |
|
| 565 |
with gr.Row():
|
|
@@ -573,7 +503,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 573 |
elem_classes="time-info"
|
| 574 |
)
|
| 575 |
|
| 576 |
-
gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian •
|
| 577 |
|
| 578 |
with gr.Accordion("Example Sentences", open=True):
|
| 579 |
with gr.Row():
|
|
@@ -587,18 +517,18 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 587 |
use_example_btn = gr.Button("Use This Example", variant="primary", size="sm")
|
| 588 |
|
| 589 |
with gr.Row():
|
| 590 |
-
btn1 = gr.Button("Drilling
|
| 591 |
-
btn2 = gr.Button("Drilling
|
| 592 |
-
btn3 = gr.Button("Reservoir
|
| 593 |
-
btn4 = gr.Button("Reservoir
|
| 594 |
-
btn5 = gr.Button("Subsea
|
| 595 |
|
| 596 |
with gr.Row():
|
| 597 |
-
btn6 = gr.Button("Subsea
|
| 598 |
-
btn7 = gr.Button("Seismic
|
| 599 |
-
btn8 = gr.Button("Seismic
|
| 600 |
-
btn9 = gr.Button("Safety
|
| 601 |
-
btn10 = gr.Button("Safety
|
| 602 |
|
| 603 |
btn1.click(lambda sl: get_example("drilling_short", sl), inputs=[source_lang], outputs=example_text)
|
| 604 |
btn2.click(lambda sl: get_example("drilling_long", sl), inputs=[source_lang], outputs=example_text)
|
|
@@ -619,24 +549,14 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 619 |
file_types=[".txt"],
|
| 620 |
type="filepath"
|
| 621 |
)
|
| 622 |
-
|
| 623 |
-
with gr.Accordion("Quality Test (Developer)", open=False):
|
| 624 |
-
test_output = gr.Textbox(
|
| 625 |
-
label="Test Results",
|
| 626 |
-
lines=20,
|
| 627 |
-
max_lines=30,
|
| 628 |
-
interactive=False
|
| 629 |
-
)
|
| 630 |
-
run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
|
| 631 |
-
run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
|
| 632 |
|
| 633 |
gr.HTML(f"""
|
| 634 |
<div class='disclaimer'>
|
| 635 |
-
<strong>
|
| 636 |
<br>
|
| 637 |
-
<strong>
|
| 638 |
<br>
|
| 639 |
-
<strong>
|
| 640 |
</div>
|
| 641 |
""")
|
| 642 |
|
|
@@ -644,16 +564,16 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
|
|
| 644 |
|
| 645 |
translate_btn.click(
|
| 646 |
fn=translate,
|
| 647 |
-
inputs=[input_text, source_lang, target_lang,
|
| 648 |
-
outputs=[
|
| 649 |
)
|
| 650 |
|
| 651 |
swap_btn.click(
|
| 652 |
fn=swap_languages,
|
| 653 |
-
inputs=[source_lang, target_lang, input_text,
|
| 654 |
-
outputs=[source_lang, target_lang, input_text,
|
| 655 |
)
|
| 656 |
|
| 657 |
-
file_input.change(fn=load_file, inputs=file_input, outputs=[input_text, time_display])
|
| 658 |
|
| 659 |
demo.queue().launch()
|
|
|
|
| 12 |
try:
|
| 13 |
nltk.data.find('tokenizers/punkt')
|
| 14 |
except LookupError:
|
|
|
|
| 15 |
nltk.download('punkt')
|
| 16 |
try:
|
| 17 |
nltk.download('punkt_tab')
|
|
|
|
| 26 |
|
| 27 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
| 28 |
|
| 29 |
+
print("Loading model with 8-bit quantization...")
|
| 30 |
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
| 31 |
|
| 32 |
base_model = AutoModelForSeq2SeqLM.from_pretrained(
|
|
|
|
| 51 |
for entry in glossary_data:
|
| 52 |
en_term = entry['en'].strip()
|
| 53 |
no_term = entry['no'].strip()
|
|
|
|
| 54 |
TERMINOLOGY_EN_TO_NO[en_term.lower()] = no_term
|
| 55 |
TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
|
| 56 |
|
|
|
|
| 61 |
TERMINOLOGY_EN_TO_NO = {}
|
| 62 |
TERMINOLOGY_NO_TO_EN = {}
|
| 63 |
|
| 64 |
+
def _compile_whole_word(term):
    """Return a case-insensitive regex matching `term` between word boundaries."""
    return re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)


# One pre-compiled matcher per glossary key, built once at import time so
# post-processing can look patterns up instead of compiling them per call.
COMPILED_PATTERNS_EN_TO_NO = {
    source: _compile_whole_word(source) for source in TERMINOLOGY_EN_TO_NO
}

COMPILED_PATTERNS_NO_TO_EN = {
    source: _compile_whole_word(source) for source in TERMINOLOGY_NO_TO_EN
}
|
| 73 |
+
|
| 74 |
COMMON_ERRORS = {
|
| 75 |
"en_to_no": {
|
| 76 |
"mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
|
|
|
|
| 88 |
}
|
| 89 |
}
|
| 90 |
|
| 91 |
+
# Maps each key of COMMON_ERRORS["en_to_no"] to whole-word, case-insensitive
# matchers for the variants listed under it, compiled once at import time.
COMPILED_ERRORS_EN_TO_NO = {
    term: [
        re.compile(r'\b' + re.escape(bad_form) + r'\b', re.IGNORECASE)
        for bad_form in bad_forms
    ]
    for term, bad_forms in COMMON_ERRORS["en_to_no"].items()
}
|
| 97 |
|
| 98 |
+
# Maps each key of COMMON_ERRORS["no_to_en"] to whole-word, case-insensitive
# matchers for the variants listed under it, compiled once at import time.
COMPILED_ERRORS_NO_TO_EN = {
    term: [
        re.compile(r'\b' + re.escape(bad_form) + r'\b', re.IGNORECASE)
        for bad_form in bad_forms
    ]
    for term, bad_forms in COMMON_ERRORS["no_to_en"].items()
}
|
|
|
|
| 104 |
|
| 105 |
# Input limits: load_file() compares the on-disk size against MAX_FILE_SIZE,
# and both load_file() and translate() compare len(text) to MAX_TEXT_LENGTH.
MAX_FILE_SIZE = 1024 ** 2    # bytes
MAX_TEXT_LENGTH = 10_000     # characters

# Fixed inference settings read by translate_core().
BATCH_SIZE = 10     # sentences handed to the tokenizer/model per step
NUM_BEAMS = 3       # passed to generate() as num_beams
MAX_LENGTH = 256    # passed as max_length to both tokenizer and generate()
|
| 110 |
+
|
| 111 |
+
@lru_cache(maxsize=512)
def cached_sent_tokenize(text):
    """Split `text` into sentences, memoising the result per input string.

    The sentences are returned as a tuple, which keeps the value shared
    through the cache immutable.
    """
    sentences = sent_tokenize(text)
    return tuple(sentences)
|
| 114 |
|
| 115 |
def fix_number_format(text, target_lang):
|
| 116 |
if target_lang == "Norwegian":
|
|
|
|
| 120 |
else:
|
| 121 |
text = re.sub(r'(\d)\s(\d{3})', r'\1,\2', text)
|
| 122 |
text = re.sub(r'(\d),(\d{1,2})(?=\s|$|[^\d])', r'\1.\2', text)
|
|
|
|
| 123 |
return text
|
| 124 |
|
| 125 |
def find_source_terms_in_input(text, direction):
|
|
|
|
| 146 |
return text
|
| 147 |
|
| 148 |
if direction == "en_to_no":
|
| 149 |
+
compiled_patterns = COMPILED_PATTERNS_EN_TO_NO
|
| 150 |
+
compiled_errors = COMPILED_ERRORS_EN_TO_NO
|
| 151 |
else:
|
| 152 |
+
compiled_patterns = COMPILED_PATTERNS_NO_TO_EN
|
| 153 |
+
compiled_errors = COMPILED_ERRORS_NO_TO_EN
|
| 154 |
|
| 155 |
result = text
|
| 156 |
|
| 157 |
for source_term, target_term in found_terms:
|
|
|
|
| 158 |
def preserve_case(match):
|
| 159 |
original = match.group(0)
|
| 160 |
if original and original[0].isupper():
|
| 161 |
return target_term.capitalize()
|
| 162 |
return target_term.lower()
|
| 163 |
|
| 164 |
+
if source_term in compiled_patterns:
|
| 165 |
+
result = compiled_patterns[source_term].sub(preserve_case, result)
|
| 166 |
|
| 167 |
+
if source_term in compiled_errors:
|
| 168 |
+
for error_pattern in compiled_errors[source_term]:
|
|
|
|
| 169 |
result = error_pattern.sub(preserve_case, result)
|
| 170 |
|
| 171 |
result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
|
| 172 |
|
| 173 |
return result
|
| 174 |
|
| 175 |
+
def highlight_terminology(text, found_terms):
    """Render `text` as HTML with every glossary target term wrapped in <mark>.

    Args:
        text: Plain-text translation to display.
        found_terms: Iterable of ``(source_term, target_term)`` pairs. Only
            the target terms are searched for in `text`.

    Returns:
        An HTML string. All of `text` is HTML-escaped, so markup in the
        translation is shown literally instead of being interpreted; each
        whole-word, case-insensitive occurrence of a target term is wrapped
        in a highlighted ``<mark>`` element.
    """
    # Function-scope import so this block does not depend on the file header.
    import html

    mark_open = '<mark style="background-color: #fff3cd; padding: 2px 4px; border-radius: 3px;">'
    mark_close = '</mark>'

    # Deduplicate and drop empty terms (an empty alternative would match at
    # every word boundary). Longest first, so "mud weight" wins over "mud".
    targets = sorted(
        {target for _, target in found_terms if target},
        key=lambda term: (-len(term), term),
    )
    if not targets:
        return html.escape(text)

    # A single pass over the *raw* text: a term can never match inside markup
    # inserted for another term, nor inside an entity produced by escaping.
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(term) for term in targets) + r')\b',
        re.IGNORECASE,
    )

    # With one capturing group, split() alternates plain text (even indexes)
    # and matched terms (odd indexes).
    pieces = pattern.split(text)
    return ''.join(
        mark_open + html.escape(piece) + mark_close if index % 2 else html.escape(piece)
        for index, piece in enumerate(pieces)
    )
|
| 185 |
+
|
| 186 |
+
def translate_core(text, source_lang, target_lang, use_terminology=True):
|
| 187 |
if not text.strip() or source_lang == target_lang:
|
| 188 |
return text, 0.0, []
|
| 189 |
|
|
|
|
| 202 |
|
| 203 |
found_terms = find_source_terms_in_input(text, direction)
|
| 204 |
|
|
|
|
|
|
|
| 205 |
original_paragraphs = text.split('\n')
|
| 206 |
final_translated_paragraphs = []
|
| 207 |
|
|
|
|
| 210 |
final_translated_paragraphs.append("")
|
| 211 |
continue
|
| 212 |
|
| 213 |
+
sentences = cached_sent_tokenize(paragraph)
|
|
|
|
|
|
|
| 214 |
paragraph_results = []
|
| 215 |
|
| 216 |
+
for i in range(0, len(sentences), BATCH_SIZE):
|
| 217 |
+
batch = sentences[i:i+BATCH_SIZE]
|
| 218 |
|
| 219 |
inputs = tokenizer(
|
| 220 |
batch,
|
| 221 |
return_tensors="pt",
|
| 222 |
padding=True,
|
| 223 |
truncation=True,
|
| 224 |
+
max_length=MAX_LENGTH
|
| 225 |
)
|
| 226 |
|
| 227 |
if hasattr(model, 'device'):
|
|
|
|
| 231 |
outputs = model.generate(
|
| 232 |
**inputs,
|
| 233 |
forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
|
| 234 |
+
max_length=MAX_LENGTH,
|
| 235 |
+
num_beams=NUM_BEAMS,
|
| 236 |
early_stopping=True
|
| 237 |
)
|
| 238 |
|
|
|
|
| 242 |
final_translated_paragraphs.append(" ".join(paragraph_results))
|
| 243 |
|
| 244 |
raw_translation = '\n'.join(final_translated_paragraphs)
|
|
|
|
| 245 |
corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
|
|
|
|
| 246 |
elapsed_time = time.time() - start_time
|
| 247 |
|
| 248 |
return corrected_translation, elapsed_time, found_terms
|
| 249 |
|
| 250 |
@lru_cache(maxsize=512)
def translate_cached(text, source_lang, target_lang, use_terminology):
    """Memoised front end to `translate_core`.

    Returns:
        ``(translation, elapsed_seconds, term_count)`` where `term_count` is
        the length of the term list reported by `translate_core`.

    NOTE: the elapsed time is stored in the cache with the translation, so a
    cache hit reports the duration of the original run.
    """
    translation, seconds, matched_terms = translate_core(
        text, source_lang, target_lang, use_terminology
    )
    return translation, seconds, len(matched_terms)
|
| 254 |
|
| 255 |
+
def translate(text, source_lang, target_lang, use_terminology):
    """Gradio handler: translate `text` and build the three UI outputs.

    Args:
        text: Source text from the input box.
        source_lang: "English" selects the en->no direction; any other value
            is treated as Norwegian->English.
        target_lang: Target language name, forwarded to `translate_cached`.
        use_terminology: Whether terminology enforcement/highlighting is on.

    Returns:
        ``(plain_text, html, status_line)``. On failure `plain_text` holds the
        error message and `html` holds the same message HTML-escaped, so the
        error is also shown in the HTML output pane.
    """
    # Function-scope import so this block does not depend on the file header.
    import html

    def failure(message):
        # The plain textbox is created with visible=False, so the message
        # must also go to the HTML output or the user never sees it.
        return message, html.escape(message), ""

    try:
        if len(text) > MAX_TEXT_LENGTH:
            return failure(f"Error: Text too long (max {MAX_TEXT_LENGTH:,} characters)")

        if not text.strip():
            return "", "", ""

        result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, use_terminology)

        if not use_terminology:
            terminology_status = "No terminology enforcement"
        elif terms_count > 0:
            terminology_status = f"{terms_count} terms enforced"
        else:
            terminology_status = "No terms found"
        time_info = f"Completed in {elapsed:.2f}s | {terminology_status}"

        if use_terminology:
            # translate_cached only returns a count, so the term pairs are
            # looked up again here for highlighting.
            direction = "en_to_no" if source_lang == "English" else "no_to_en"
            found_terms = find_source_terms_in_input(text, direction)
            highlighted_result = highlight_terminology(result, found_terms)
        else:
            # This value is rendered as HTML; escape the raw model output.
            highlighted_result = html.escape(result)

        return result, highlighted_result, time_info

    except Exception as e:
        # UI boundary: report the problem instead of crashing the handler.
        return failure(f"Translation error: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
def swap_languages(src, tgt, input_txt, output_txt):
    """Exchange the language pair and the input/output texts.

    Returns:
        ``(tgt, src, output_txt, input_txt)`` - the previous target language
        becomes the source and the previous output becomes the input.
    """
    swapped_langs = (tgt, src)
    swapped_texts = (output_txt, input_txt)
    return (*swapped_langs, *swapped_texts)
|
| 278 |
|
| 279 |
def load_file(file):
    """Read an uploaded text file so its content can fill the input box.

    Args:
        file: ``None``, a filesystem path (``str`` / ``os.PathLike``), or an
            object exposing the path as ``.name``.

    Returns:
        ``(input_text, html_output, status)``. `input_text` is the file
        content or an ``"Error: ..."`` message; the other two are always
        empty strings, which clears those outputs.
    """
    if file is None:
        return "", "", ""

    # The upload may arrive as a bare path (the file component is declared
    # with type="filepath") or as a wrapper carrying the path in `.name`.
    path = file if isinstance(file, (str, os.PathLike)) else file.name

    try:
        if os.path.getsize(path) > MAX_FILE_SIZE:
            return "Error: File too large (max 1MB)", "", ""

        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to latin-1, which decodes any byte.
            with open(path, 'r', encoding='latin-1') as f:
                content = f.read()
    except OSError as e:
        # Missing or unreadable file: a real I/O failure, not an encoding issue.
        return f"Error reading file: {str(e)}", "", ""

    if len(content) > MAX_TEXT_LENGTH:
        return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", "", ""
    return content, "", ""
|
| 301 |
|
| 302 |
EXAMPLES_EN = {
|
| 303 |
"drilling_short": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
|
|
|
|
| 420 |
font-size: 13px !important;
|
| 421 |
padding: 20px !important;
|
| 422 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
.disclaimer {
|
| 424 |
background: #fff9e6 !important;
|
| 425 |
border-left: 4px solid #ff8c00 !important;
|
|
|
|
| 436 |
gr.HTML("<div style='height: 20px'></div>")
|
| 437 |
|
| 438 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
use_terminology = gr.Checkbox(
|
| 440 |
+
label="Enable Terminology Enforcement",
|
| 441 |
value=True,
|
| 442 |
+
info=f"Post-processing with {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms"
|
| 443 |
)
|
| 444 |
|
| 445 |
with gr.Row():
|
|
|
|
| 478 |
elem_classes="lang-selector",
|
| 479 |
scale=1
|
| 480 |
)
|
| 481 |
+
output_text_plain = gr.Textbox(
|
| 482 |
placeholder="Translation",
|
| 483 |
show_label=False,
|
| 484 |
lines=8,
|
| 485 |
max_lines=20,
|
| 486 |
container=False,
|
| 487 |
elem_classes="text-area",
|
| 488 |
+
interactive=False,
|
| 489 |
+
visible=False
|
| 490 |
+
)
|
| 491 |
+
output_text_html = gr.HTML(
|
| 492 |
+
value="<div style='padding: 20px; min-height: 200px; font-size: 17px; line-height: 1.7;'>Translation</div>"
|
| 493 |
)
|
| 494 |
|
| 495 |
with gr.Row():
|
|
|
|
| 503 |
elem_classes="time-info"
|
| 504 |
)
|
| 505 |
|
| 506 |
+
gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • Terminology Highlighting</div>")
|
| 507 |
|
| 508 |
with gr.Accordion("Example Sentences", open=True):
|
| 509 |
with gr.Row():
|
|
|
|
| 517 |
use_example_btn = gr.Button("Use This Example", variant="primary", size="sm")
|
| 518 |
|
| 519 |
with gr.Row():
|
| 520 |
+
btn1 = gr.Button("Drilling Short", size="sm")
|
| 521 |
+
btn2 = gr.Button("Drilling Long", size="sm")
|
| 522 |
+
btn3 = gr.Button("Reservoir Short", size="sm")
|
| 523 |
+
btn4 = gr.Button("Reservoir Long", size="sm")
|
| 524 |
+
btn5 = gr.Button("Subsea Short", size="sm")
|
| 525 |
|
| 526 |
with gr.Row():
|
| 527 |
+
btn6 = gr.Button("Subsea Long", size="sm")
|
| 528 |
+
btn7 = gr.Button("Seismic Short", size="sm")
|
| 529 |
+
btn8 = gr.Button("Seismic Long", size="sm")
|
| 530 |
+
btn9 = gr.Button("Safety Short", size="sm")
|
| 531 |
+
btn10 = gr.Button("Safety Long", size="sm")
|
| 532 |
|
| 533 |
btn1.click(lambda sl: get_example("drilling_short", sl), inputs=[source_lang], outputs=example_text)
|
| 534 |
btn2.click(lambda sl: get_example("drilling_long", sl), inputs=[source_lang], outputs=example_text)
|
|
|
|
| 549 |
file_types=[".txt"],
|
| 550 |
type="filepath"
|
| 551 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
|
| 553 |
gr.HTML(f"""
|
| 554 |
<div class='disclaimer'>
|
| 555 |
+
<strong>Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms with automatic highlighting
|
| 556 |
<br>
|
| 557 |
+
<strong>Privacy & Compliance:</strong> Local inference ensures GDPR compliance
|
| 558 |
<br>
|
| 559 |
+
<strong>Technical Features:</strong> Optimized batch processing with pre-compiled regex patterns
|
| 560 |
</div>
|
| 561 |
""")
|
| 562 |
|
|
|
|
| 564 |
|
| 565 |
translate_btn.click(
|
| 566 |
fn=translate,
|
| 567 |
+
inputs=[input_text, source_lang, target_lang, use_terminology],
|
| 568 |
+
outputs=[output_text_plain, output_text_html, time_display]
|
| 569 |
)
|
| 570 |
|
| 571 |
swap_btn.click(
|
| 572 |
fn=swap_languages,
|
| 573 |
+
inputs=[source_lang, target_lang, input_text, output_text_plain],
|
| 574 |
+
outputs=[source_lang, target_lang, input_text, output_text_plain]
|
| 575 |
)
|
| 576 |
|
| 577 |
+
file_input.change(fn=load_file, inputs=file_input, outputs=[input_text, output_text_html, time_display])
|
| 578 |
|
| 579 |
demo.queue().launch()
|