Spaces:

entropy25
/

mt

Running

App Files Files Community

entropy25 committed on Jan 6

Commit

43d8748

verified ·

1 Parent(s): f876d44

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -182

app.py CHANGED Viewed

@@ -12,7 +12,6 @@ import re
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
-    print("Downloading NLTK punkt tokenizer...")
     nltk.download('punkt')
     try:
         nltk.download('punkt_tab')
@@ -27,7 +26,7 @@ ADAPTER_NO_TO_EN = os.getenv("ADAPTER_NO_TO_EN", "entropy25/mt_no_en_oil")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-print("Loading shared base model with 8-bit quantization...")
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 base_model = AutoModelForSeq2SeqLM.from_pretrained(
@@ -52,7 +51,6 @@ try:
     for entry in glossary_data:
         en_term = entry['en'].strip()
         no_term = entry['no'].strip()
         TERMINOLOGY_EN_TO_NO[en_term.lower()] = no_term
         TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
@@ -63,6 +61,16 @@ except Exception as e:
     TERMINOLOGY_EN_TO_NO = {}
     TERMINOLOGY_NO_TO_EN = {}
 COMMON_ERRORS = {
     "en_to_no": {
         "mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
@@ -80,56 +88,29 @@ COMMON_ERRORS = {
     }
 }
-QUALITY_PRESETS = {
-    "Professional (Best Quality)": {"num_beams": 3, "max_length": 256, "batch_size": 4},
-    "Balanced (Faster)": {"num_beams": 2, "max_length": 256, "batch_size": 5},
-    "Draft (Fastest)": {"num_beams": 2, "max_length": 128, "batch_size": 5}
-}
-QUALITY_TEST_CASES = {
-    "en_to_no": [
-        {
-            "input": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
-            "expected": "Slamvekt justert til 1,82 spesifikk tyngde ved 3 247 meters dybde.",
-            "check": ["slamvekt", "1,82", "3 247"]
-        },
-        {
-            "input": "Christmas tree rated for 10,000 psi working pressure.",
-            "expected": "Juletre dimensjonert for 10 000 psi arbeidstrykk.",
-            "check": ["juletre", "10 000", "psi"]
-        },
-        {
-            "input": "H2S training required before site access.",
-            "expected": "H2S-opplæring påkrevd før tilgang til området.",
-            "check": ["H2S", "opplæring", "påkrevd"]
-        },
-        {
-            "input": "Permeability is 250 millidarcy with 22 percent porosity.",
-            "expected": "Permeabilitet er 250 millidarcy med 22 prosent porøsitet.",
-            "check": ["permeabilitet", "250", "22"]
-        }
-    ],
-    "no_to_en": [
-        {
-            "input": "Permeabilitet er 250 millidarcy med 22 prosent porøsitet.",
-            "expected": "Permeability is 250 millidarcy with 22 percent porosity.",
-            "check": ["permeability", "250", "22"]
-        },
-        {
-            "input": "Subsea produksjonssystemet består av et vertikalt juletre.",
-            "expected": "The subsea production system consists of a vertical Christmas tree.",
-            "check": ["subsea", "Christmas tree", "vertical"]
-        },
-        {
-            "input": "Slamvekt justert til 1,82 spesifikk tyngde ved 3 247 meters dybde.",
-            "expected": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
-            "check": ["mud weight", "1.82", "3,247"]
-        }
     ]
-}
 MAX_FILE_SIZE = 1024 * 1024
 MAX_TEXT_LENGTH = 10000
 def fix_number_format(text, target_lang):
     if target_lang == "Norwegian":
@@ -139,7 +120,6 @@ def fix_number_format(text, target_lang):
     else:
         text = re.sub(r'(\d)\s(\d{3})', r'\1,\2', text)
         text = re.sub(r'(\d),(\d{1,2})(?=\s|$|[^\d])', r'\1.\2', text)
     return text
 def find_source_terms_in_input(text, direction):
@@ -166,33 +146,44 @@ def post_process_terminology(text, direction, found_terms, use_terminology):
         return text
     if direction == "en_to_no":
-        error_dict = COMMON_ERRORS.get("en_to_no", {})
     else:
-        error_dict = COMMON_ERRORS.get("no_to_en", {})
     result = text
     for source_term, target_term in found_terms:
         def preserve_case(match):
             original = match.group(0)
             if original and original[0].isupper():
                 return target_term.capitalize()
             return target_term.lower()
-        source_pattern = re.compile(r'\b' + re.escape(source_term) + r'\b', re.IGNORECASE)
-        result = source_pattern.sub(preserve_case, result)
-        if source_term in error_dict:
-            for error_variant in error_dict[source_term]:
-                error_pattern = re.compile(r'\b' + re.escape(error_variant) + r'\b', re.IGNORECASE)
                 result = error_pattern.sub(preserve_case, result)
     result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
     return result
-def translate_core(text, source_lang, target_lang, quality_preset, use_terminology=True):
     if not text.strip() or source_lang == target_lang:
         return text, 0.0, []
@@ -211,8 +202,6 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
     found_terms = find_source_terms_in_input(text, direction)
-    preset = QUALITY_PRESETS[quality_preset]
     original_paragraphs = text.split('\n')
     final_translated_paragraphs = []
@@ -221,20 +210,18 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
             final_translated_paragraphs.append("")
             continue
-        sentences = sent_tokenize(paragraph)
-        batch_size = preset["batch_size"]
         paragraph_results = []
-        for i in range(0, len(sentences), batch_size):
-            batch = sentences[i:i+batch_size]
             inputs = tokenizer(
                 batch,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
-                max_length=preset["max_length"]
             )
             if hasattr(model, 'device'):
@@ -244,8 +231,8 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
                 outputs = model.generate(
                     **inputs,
                     forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
-                    max_length=preset["max_length"],
-                    num_beams=preset["num_beams"],
                     early_stopping=True
                 )
@@ -255,110 +242,62 @@ def translate_core(text, source_lang, target_lang, quality_preset, use_terminolo
         final_translated_paragraphs.append(" ".join(paragraph_results))
     raw_translation = '\n'.join(final_translated_paragraphs)
     corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
     elapsed_time = time.time() - start_time
     return corrected_translation, elapsed_time, found_terms
 @lru_cache(maxsize=512)
-def translate_cached(text, source_lang, target_lang, quality_preset, use_terminology):
-    result, elapsed, terms = translate_core(text, source_lang, target_lang, quality_preset, use_terminology)
     return result, elapsed, len(terms)
-def translate(text, source_lang, target_lang, quality_preset, use_terminology):
     try:
         if len(text) > MAX_TEXT_LENGTH:
-            return f"Error: Text too long (max {MAX_TEXT_LENGTH:,} characters)", ""
         if not text.strip():
-            return "", ""
-        result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, quality_preset, use_terminology)
-        terminology_status = f"with {terms_count} terms enforced" if use_terminology and terms_count > 0 else "without terminology enforcement" if not use_terminology else "no terms found"
-        time_info = f"Translation completed in {elapsed:.2f}s ({terminology_status})"
-        return result, time_info
-    except Exception as e:
-        return f"Translation error: {str(e)}. Please try again.", ""
-def run_quality_tests(use_terminology):
-    results = []
-    results.append("=== QUALITY REGRESSION TEST ===\n")
-    results.append(f"Terminology Enforcement: {'ENABLED' if use_terminology else 'DISABLED'}\n")
-    for direction, test_cases in QUALITY_TEST_CASES.items():
-        if direction == "en_to_no":
-            src_lang, tgt_lang = "English", "Norwegian"
-        else:
-            src_lang, tgt_lang = "Norwegian", "English"
-        results.append(f"\n{src_lang} to {tgt_lang}\n")
-        for i, case in enumerate(test_cases, 1):
-            translation, _, found_terms = translate_core(case["input"], src_lang, tgt_lang, "Professional (Best Quality)", use_terminology)
-            passed_checks = []
-            failed_checks = []
-            for keyword in case["check"]:
-                if keyword.lower() in translation.lower():
-                    passed_checks.append(keyword)
-                else:
-                    failed_checks.append(keyword)
-            status = "✅ PASS" if not failed_checks else "⚠️ CHECK"
-            results.append(f"\nTest {i}: {status}")
-            results.append(f"Input:    {case['input']}")
-            results.append(f"Expected: {case['expected']}")
-            results.append(f"Got:      {translation}")
-            if use_terminology and found_terms:
-                results.append(f"Terms found: {len(found_terms)}")
-            if passed_checks:
-                results.append(f"✓ Found:  {', '.join(passed_checks)}")
-            if failed_checks:
-                results.append(f"✗ Missing: {', '.join(failed_checks)}")
-    results.append("\n=== TEST COMPLETE ===")
-    pass_count = sum(1 for r in results if "✅ PASS" in r)
-    check_count = sum(1 for r in results if "⚠️ CHECK" in r)
-    total = len(QUALITY_TEST_CASES["en_to_no"]) + len(QUALITY_TEST_CASES["no_to_en"])
-    results.insert(2, f"\n📊 Score: {pass_count}/{total} passed, {check_count}/{total} need review\n")
-    return '\n'.join(results)
 def swap_languages(src, tgt, input_txt, output_txt):
     """Exchange the language pair and the two text panes.

     Returns a 4-tuple ordered as (new source language, new target language,
     new input text, new output text): the previous target becomes the source
     and the previous output becomes the input, and vice versa.
     """
     new_source, new_target = tgt, src
     new_input, new_output = output_txt, input_txt
     return new_source, new_target, new_input, new_output
 def load_file(file):
     if file is None:
-        return "", ""
     try:
         if os.path.getsize(file.name) > MAX_FILE_SIZE:
-            return "Error: File too large (max 1MB)", ""
         with open(file.name, 'r', encoding='utf-8') as f:
             content = f.read()
             if len(content) > MAX_TEXT_LENGTH:
-                return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", ""
-            return content, ""
     except:
         try:
             with open(file.name, 'r', encoding='latin-1') as f:
                 content = f.read()
                 if len(content) > MAX_TEXT_LENGTH:
-                    return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", ""
-                return content, ""
         except Exception as e:
-            return f"Error reading file: {str(e)}", ""
 EXAMPLES_EN = {
     "drilling_short": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
@@ -481,11 +420,6 @@ custom_css = """
     font-size: 13px !important;
     padding: 20px !important;
 }
-.quality-selector {
-    background: #f0f7ff !important;
-    border: 1px solid #0f6fff !important;
-    border-radius: 4px !important;
-}
 .disclaimer {
     background: #fff9e6 !important;
     border-left: 4px solid #ff8c00 !important;
@@ -502,18 +436,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
     gr.HTML("<div style='height: 20px'></div>")
     with gr.Row():
-        quality_preset = gr.Radio(
-            choices=list(QUALITY_PRESETS.keys()),
-            value="Professional (Best Quality)",
-            label="Translation Quality",
-            info="Professional: beam=3, max=256 | Balanced: beam=2, max=256 | Draft: beam=2, max=128",
-            elem_classes="quality-selector"
-        )
         use_terminology = gr.Checkbox(
-            label="Enable Terminology Enforcement (POST)",
             value=True,
-            info=f"Uses {len(TERMINOLOGY_EN_TO_NO)} terms + error variants + number format fixing"
         )
     with gr.Row():
@@ -552,14 +478,18 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
                         elem_classes="lang-selector",
                         scale=1
                     )
-                output_text = gr.Textbox(
                     placeholder="Translation",
                     show_label=False,
                     lines=8,
                     max_lines=20,
                     container=False,
                     elem_classes="text-area",
-                    interactive=False
                 )
     with gr.Row():
@@ -573,7 +503,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
             elem_classes="time-info"
         )
-    gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • POST-only Terminology Processing</div>")
     with gr.Accordion("Example Sentences", open=True):
         with gr.Row():
@@ -587,18 +517,18 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
             use_example_btn = gr.Button("Use This Example", variant="primary", size="sm")
         with gr.Row():
-            btn1 = gr.Button("Drilling (Short)", size="sm")
-            btn2 = gr.Button("Drilling (Long)", size="sm")
-            btn3 = gr.Button("Reservoir (Short)", size="sm")
-            btn4 = gr.Button("Reservoir (Long)", size="sm")
-            btn5 = gr.Button("Subsea (Short)", size="sm")
         with gr.Row():
-            btn6 = gr.Button("Subsea (Long)", size="sm")
-            btn7 = gr.Button("Seismic (Short)", size="sm")
-            btn8 = gr.Button("Seismic (Long)", size="sm")
-            btn9 = gr.Button("Safety (Short)", size="sm")
-            btn10 = gr.Button("Safety (Long)", size="sm")
         btn1.click(lambda sl: get_example("drilling_short", sl), inputs=[source_lang], outputs=example_text)
         btn2.click(lambda sl: get_example("drilling_long", sl), inputs=[source_lang], outputs=example_text)
@@ -619,24 +549,14 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
             file_types=[".txt"],
             type="filepath"
         )
-    with gr.Accordion("Quality Test (Developer)", open=False):
-        test_output = gr.Textbox(
-            label="Test Results",
-            lines=20,
-            max_lines=30,
-            interactive=False
-        )
-        run_test_btn = gr.Button("Run Quality Regression Test", variant="secondary")
-        run_test_btn.click(fn=run_quality_tests, inputs=[use_terminology], outputs=test_output)
     gr.HTML(f"""
     <div class='disclaimer'>
-        <strong>✓ Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms from NPD glossary
         <br>
-        <strong>✓ Privacy & Compliance:</strong> Fine-tuned on public domain data. Local inference ensures GDPR compliance.
         <br>
-        <strong>✓ Technical Features:</strong> Sentence-level batching prevents truncation. Post-processing ensures terminology consistency.
     </div>
     """)
@@ -644,16 +564,16 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Default()) as demo:
     translate_btn.click(
         fn=translate,
-        inputs=[input_text, source_lang, target_lang, quality_preset, use_terminology],
-        outputs=[output_text, time_display]
     )
     swap_btn.click(
         fn=swap_languages,
-        inputs=[source_lang, target_lang, input_text, output_text],
-        outputs=[source_lang, target_lang, input_text, output_text]
     )
-    file_input.change(fn=load_file, inputs=file_input, outputs=[input_text, time_display])
 demo.queue().launch()

 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
     try:
         nltk.download('punkt_tab')
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+print("Loading model with 8-bit quantization...")
 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 base_model = AutoModelForSeq2SeqLM.from_pretrained(
     for entry in glossary_data:
         en_term = entry['en'].strip()
         no_term = entry['no'].strip()
         TERMINOLOGY_EN_TO_NO[en_term.lower()] = no_term
         TERMINOLOGY_NO_TO_EN[no_term.lower()] = en_term
     TERMINOLOGY_EN_TO_NO = {}
     TERMINOLOGY_NO_TO_EN = {}
# Pre-compiled, case-insensitive whole-word matchers, one per glossary key.
# Built once at import time so post-processing does not recompile a regex for
# every term on every request. Keys are the lower-cased glossary terms.
COMPILED_PATTERNS_EN_TO_NO = {
    english_term: re.compile(r'\b' + re.escape(english_term) + r'\b', re.IGNORECASE)
    for english_term in TERMINOLOGY_EN_TO_NO
}
# Same table for the reverse direction, keyed by the Norwegian terms.
COMPILED_PATTERNS_NO_TO_EN = {
    norwegian_term: re.compile(r'\b' + re.escape(norwegian_term) + r'\b', re.IGNORECASE)
    for norwegian_term in TERMINOLOGY_NO_TO_EN
}
 COMMON_ERRORS = {
     "en_to_no": {
         "mud weight": ["mudgevekten", "mudvekt", "slam vekt"],
     }
 }
# For each source term listed in COMMON_ERRORS, pre-compile one
# case-insensitive whole-word matcher per known wrong rendering, so the
# post-processing step can replace those variants without compiling regexes
# per request.
COMPILED_ERRORS_EN_TO_NO = {
    term: [
        re.compile(r'\b' + re.escape(wrong_form) + r'\b', re.IGNORECASE)
        for wrong_form in wrong_forms
    ]
    for term, wrong_forms in COMMON_ERRORS["en_to_no"].items()
}
# Same structure for the Norwegian -> English direction.
COMPILED_ERRORS_NO_TO_EN = {
    term: [
        re.compile(r'\b' + re.escape(wrong_form) + r'\b', re.IGNORECASE)
        for wrong_form in wrong_forms
    ]
    for term, wrong_forms in COMMON_ERRORS["no_to_en"].items()
}
 MAX_FILE_SIZE = 1024 * 1024
 MAX_TEXT_LENGTH = 10000
+BATCH_SIZE = 10
+NUM_BEAMS = 3
+MAX_LENGTH = 256
@lru_cache(maxsize=512)
def cached_sent_tokenize(text):
    """Split ``text`` into sentences, memoising up to 512 distinct inputs.

    The sentences are returned as a tuple rather than the list produced by
    ``sent_tokenize``; the same tuple object is handed back on every cache
    hit for an identical ``text``.
    """
    sentences = sent_tokenize(text)
    return tuple(sentences)
 def fix_number_format(text, target_lang):
     if target_lang == "Norwegian":
     else:
         text = re.sub(r'(\d)\s(\d{3})', r'\1,\2', text)
         text = re.sub(r'(\d),(\d{1,2})(?=\s|$|[^\d])', r'\1.\2', text)
     return text
 def find_source_terms_in_input(text, direction):
         return text
     if direction == "en_to_no":
+        compiled_patterns = COMPILED_PATTERNS_EN_TO_NO
+        compiled_errors = COMPILED_ERRORS_EN_TO_NO
     else:
+        compiled_patterns = COMPILED_PATTERNS_NO_TO_EN
+        compiled_errors = COMPILED_ERRORS_NO_TO_EN
     result = text
     for source_term, target_term in found_terms:
         def preserve_case(match):
             original = match.group(0)
             if original and original[0].isupper():
                 return target_term.capitalize()
             return target_term.lower()
+        if source_term in compiled_patterns:
+            result = compiled_patterns[source_term].sub(preserve_case, result)
+        if source_term in compiled_errors:
+            for error_pattern in compiled_errors[source_term]:
                 result = error_pattern.sub(preserve_case, result)
     result = fix_number_format(result, "Norwegian" if direction == "en_to_no" else "English")
     return result
def highlight_terminology(text, found_terms):
    """Return ``text`` as HTML with every enforced target term wrapped in ``<mark>``.

    Args:
        text: Plain-text translation to display.
        found_terms: Iterable of ``(source_term, target_term)`` pairs; only the
            target terms are searched for in ``text``. May be empty or None.

    Returns:
        An HTML string. All text taken from ``text`` is HTML-escaped, so the
        only markup in the result is the ``<mark>`` wrappers added here.
    """
    # Local import: the rest of the file is not known to import ``html``.
    import html

    if not found_terms:
        return html.escape(text, quote=False)

    # De-duplicate case-insensitively so a repeated term cannot produce nested marks.
    unique_targets = {}
    for _source_term, target_term in found_terms:
        if target_term and target_term.strip():
            unique_targets.setdefault(target_term.lower(), target_term)
    if not unique_targets:
        return html.escape(text, quote=False)

    # Longest first, so a multi-word term wins over a shorter term it contains.
    ordered_targets = sorted(unique_targets.values(), key=len, reverse=True)
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(term) for term in ordered_targets) + r')\b',
        re.IGNORECASE,
    )

    mark_open = '<mark style="background-color: #fff3cd; padding: 2px 4px; border-radius: 3px;">'
    mark_close = '</mark>'

    # Single pass over the raw text: matching never sees markup inserted for an
    # earlier term (e.g. "border" inside the style attribute) nor the entities
    # produced by escaping.
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        pieces.append(mark_open + html.escape(match.group(0), quote=False) + mark_close)
        cursor = match.end()
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)
+def translate_core(text, source_lang, target_lang, use_terminology=True):
     if not text.strip() or source_lang == target_lang:
         return text, 0.0, []
     found_terms = find_source_terms_in_input(text, direction)
     original_paragraphs = text.split('\n')
     final_translated_paragraphs = []
             final_translated_paragraphs.append("")
             continue
+        sentences = cached_sent_tokenize(paragraph)
         paragraph_results = []
+        for i in range(0, len(sentences), BATCH_SIZE):
+            batch = sentences[i:i+BATCH_SIZE]
             inputs = tokenizer(
                 batch,
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
+                max_length=MAX_LENGTH
             )
             if hasattr(model, 'device'):
                 outputs = model.generate(
                     **inputs,
                     forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_code),
+                    max_length=MAX_LENGTH,
+                    num_beams=NUM_BEAMS,
                     early_stopping=True
                 )
         final_translated_paragraphs.append(" ".join(paragraph_results))
     raw_translation = '\n'.join(final_translated_paragraphs)
     corrected_translation = post_process_terminology(raw_translation, direction, found_terms, use_terminology)
     elapsed_time = time.time() - start_time
     return corrected_translation, elapsed_time, found_terms
 @lru_cache(maxsize=512)
 def translate_cached(text, source_lang, target_lang, use_terminology):
     """Memoised front end to ``translate_core`` (up to 512 distinct argument sets).

     Returns ``(translation, elapsed_seconds, term_count)``. Only the number of
     matched terms is returned, not the term list itself. Because the whole
     tuple is cached, a cache hit reports the elapsed time measured when the
     entry was first computed.
     """
     translation, seconds, matched_terms = translate_core(text, source_lang, target_lang, use_terminology)
     term_count = len(matched_terms)
     return translation, seconds, term_count
def translate(text, source_lang, target_lang, use_terminology):
    """UI callback: translate ``text`` and build the three output values.

    Args:
        text: Source text entered by the user.
        source_lang: Source language name; "English" selects the en_to_no
            glossary direction, anything else selects no_to_en.
        target_lang: Target language name.
        use_terminology: Whether terminology enforcement/highlighting is on.

    Returns:
        ``(plain_text, html_text, status_line)``, in the order the click
        handler binds them to the plain output, the HTML output and the
        status display. On failure the message is placed in both the plain
        and the HTML slot and the status line is empty.
    """
    # Local import: the rest of the file is not known to import ``html``.
    import html

    def _failure(message):
        # The plain-text output component is created hidden in this file's UI,
        # so a message returned only in the first slot would never be shown.
        # Mirror it (escaped) into the HTML slot, which is the visible one.
        return message, html.escape(message), ""

    try:
        if len(text) > MAX_TEXT_LENGTH:
            return _failure(f"Error: Text too long (max {MAX_TEXT_LENGTH:,} characters)")
        if not text.strip():
            return "", "", ""

        result, elapsed, terms_count = translate_cached(text, source_lang, target_lang, use_terminology)

        if not use_terminology:
            terminology_status = "No terminology enforcement"
        elif terms_count > 0:
            terminology_status = f"{terms_count} terms enforced"
        else:
            terminology_status = "No terms found"
        time_info = f"Completed in {elapsed:.2f}s | {terminology_status}"

        # translate_cached only returns a count, so the term list is looked up
        # again here -- but only when it will actually be used for highlighting.
        if use_terminology:
            direction = "en_to_no" if source_lang == "English" else "no_to_en"
            found_terms = find_source_terms_in_input(text, direction)
        else:
            found_terms = []
        highlighted_result = highlight_terminology(result, found_terms)
        return result, highlighted_result, time_info
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return _failure(f"Translation error: {str(e)}")
 def swap_languages(src, tgt, input_txt, output_txt):
     """Exchange the language pair and the two text panes.

     Returns a 4-tuple ordered as (new source language, new target language,
     new input text, new output text): the previous target becomes the source
     and the previous output becomes the input, and vice versa.
     """
     new_source, new_target = tgt, src
     new_input, new_output = output_txt, input_txt
     return new_source, new_target, new_input, new_output
 def load_file(file):
     """Read an uploaded text file and return values for the three bound outputs.

     Args:
         file: ``None`` (nothing uploaded), a filesystem path, or an upload
             object that exposes its path as ``.name``.

     Returns:
         ``(text_or_error, "", "")``. The first element is the file content,
         or a message starting with "Error" when the file is larger than
         MAX_FILE_SIZE bytes, longer than MAX_TEXT_LENGTH characters, or
         cannot be read. The other two elements are always empty strings.
     """
     if file is None:
         return "", "", ""
     try:
         # The upload component is declared with type="filepath", so a plain
         # path must work as well as an object carrying the path in ``.name``.
         path = file if isinstance(file, (str, os.PathLike)) else file.name
         if os.path.getsize(path) > MAX_FILE_SIZE:
             return "Error: File too large (max 1MB)", "", ""
         try:
             with open(path, 'r', encoding='utf-8') as f:
                 content = f.read()
         except UnicodeDecodeError:
             # Not valid UTF-8: retry as latin-1, which can decode any byte.
             with open(path, 'r', encoding='latin-1') as f:
                 content = f.read()
     except (OSError, TypeError, ValueError, AttributeError) as e:
         return f"Error reading file: {str(e)}", "", ""
     if len(content) > MAX_TEXT_LENGTH:
         return f"Error: File content too long (max {MAX_TEXT_LENGTH:,} characters)", "", ""
     return content, "", ""
 EXAMPLES_EN = {
     "drilling_short": "Mud weight adjusted to 1.82 specific gravity at 3,247 meters depth.",
     font-size: 13px !important;
     padding: 20px !important;
 }
 .disclaimer {
     background: #fff9e6 !important;
     border-left: 4px solid #ff8c00 !important;
     gr.HTML("<div style='height: 20px'></div>")
     with gr.Row():
         use_terminology = gr.Checkbox(
+            label="Enable Terminology Enforcement",
             value=True,
+            info=f"Post-processing with {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms"
         )
     with gr.Row():
                         elem_classes="lang-selector",
                         scale=1
                     )
+                output_text_plain = gr.Textbox(
                     placeholder="Translation",
                     show_label=False,
                     lines=8,
                     max_lines=20,
                     container=False,
                     elem_classes="text-area",
+                    interactive=False,
+                    visible=False
+                )
+                output_text_html = gr.HTML(
+                    value="<div style='padding: 20px; min-height: 200px; font-size: 17px; line-height: 1.7;'>Translation</div>"
                 )
     with gr.Row():
             elem_classes="time-info"
         )
+    gr.HTML("<div class='footer-info'>Oil & Gas Translation • English ↔ Norwegian • Terminology Highlighting</div>")
     with gr.Accordion("Example Sentences", open=True):
         with gr.Row():
             use_example_btn = gr.Button("Use This Example", variant="primary", size="sm")
         with gr.Row():
+            btn1 = gr.Button("Drilling Short", size="sm")
+            btn2 = gr.Button("Drilling Long", size="sm")
+            btn3 = gr.Button("Reservoir Short", size="sm")
+            btn4 = gr.Button("Reservoir Long", size="sm")
+            btn5 = gr.Button("Subsea Short", size="sm")
         with gr.Row():
+            btn6 = gr.Button("Subsea Long", size="sm")
+            btn7 = gr.Button("Seismic Short", size="sm")
+            btn8 = gr.Button("Seismic Long", size="sm")
+            btn9 = gr.Button("Safety Short", size="sm")
+            btn10 = gr.Button("Safety Long", size="sm")
         btn1.click(lambda sl: get_example("drilling_short", sl), inputs=[source_lang], outputs=example_text)
         btn2.click(lambda sl: get_example("drilling_long", sl), inputs=[source_lang], outputs=example_text)
             file_types=[".txt"],
             type="filepath"
         )
     gr.HTML(f"""
     <div class='disclaimer'>
+        <strong>Terminology Enforcement:</strong> {len(TERMINOLOGY_EN_TO_NO)} oil & gas terms with automatic highlighting
         <br>
+        <strong>Privacy & Compliance:</strong> Local inference ensures GDPR compliance
         <br>
+        <strong>Technical Features:</strong> Optimized batch processing with pre-compiled regex patterns
     </div>
     """)
     translate_btn.click(
         fn=translate,
+        inputs=[input_text, source_lang, target_lang, use_terminology],
+        outputs=[output_text_plain, output_text_html, time_display]
     )
     swap_btn.click(
         fn=swap_languages,
+        inputs=[source_lang, target_lang, input_text, output_text_plain],
+        outputs=[source_lang, target_lang, input_text, output_text_plain]
     )
+    file_input.change(fn=load_file, inputs=file_input, outputs=[input_text, output_text_html, time_display])
 demo.queue().launch()