Update utils/translator.py
utils/translator.py (CHANGED: +47 −57)

@@ -1,31 +1,33 @@
-# utils/
+# utils/translator.py
 
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 from docx import Document
 
-# ==========
-en_pt_model_name = "unicamp-dl/translation-en-pt-t5"
-tokenizer_en_pt = AutoTokenizer.from_pretrained(en_pt_model_name)
-model_en_pt = AutoModelForSeq2SeqLM.from_pretrained(en_pt_model_name)
+# ========== Model Loading (Cached Once) ==========
 
-pt_en_model_name = "unicamp-dl/translation-pt-en-t5"
-tokenizer_pt_en = AutoTokenizer.from_pretrained(pt_en_model_name)
-model_pt_en = AutoModelForSeq2SeqLM.from_pretrained(pt_en_model_name)
-
+def load_model_and_tokenizer(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+    return tokenizer, model
 
-#
+# English → Portuguese
+tokenizer_en_pt, model_en_pt = load_model_and_tokenizer("unicamp-dl/translation-en-pt-t5")
 
-def clean_text(text):
+# Portuguese → English
+tokenizer_pt_en, model_pt_en = load_model_and_tokenizer("unicamp-dl/translation-pt-en-t5")
+
+# ========== Preprocessing ==========
+
+def clean_text(text: str) -> str:
     return text.replace("\n", " ").replace("  ", " ").strip()
 
-def chunk_text(text, max_chunk_chars=500):
+def chunk_text(text: str, max_chunk_chars: int = 500):
     """
-
+    Split long text into token-safe chunks.
     """
     words = text.split()
-    chunks = []
-    current_chunk = ""
+    chunks, current_chunk = [], ""
 
     for word in words:
         if len(current_chunk) + len(word) + 1 <= max_chunk_chars:
@@ -35,71 +37,59 @@ def chunk_text(text, max_chunk_chars=500):
             current_chunk = word
     if current_chunk:
         chunks.append(current_chunk.strip())
-
+
     return chunks
 
-# ========== Translation
+# ========== Translation Core Logic ==========
+
+def translate_chunks(chunks, tokenizer, model):
+    translated = []
+    for chunk in chunks:
+        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            outputs = model.generate(**inputs, max_length=512, num_beams=4)
+        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        translated.append(decoded)
+    return " ".join(translated)
 
-def translate_to_portuguese(text):
+def translate_to_portuguese(text: str) -> str:
     """
-    🇺🇸 ➡️ 🇧🇷 Translate English to Portuguese.
+    🇺🇸 ➡️ 🇧🇷 Translate from English to Portuguese.
     """
     if not text.strip():
         return "No input provided."
+    chunks = chunk_text(clean_text(text))
+    return translate_chunks(chunks, tokenizer_en_pt, model_en_pt)
 
-
-    chunks = chunk_text(text)
-
-    translated_chunks = []
-    for chunk in chunks:
-        inputs = tokenizer_en_pt(chunk, return_tensors="pt", truncation=True, padding=True)
-        with torch.no_grad():
-            outputs = model_en_pt.generate(**inputs, max_length=512, num_beams=4)
-        translated = tokenizer_en_pt.decode(outputs[0], skip_special_tokens=True)
-        translated_chunks.append(translated)
-
-    return " ".join(translated_chunks)
-
-
-def translate_to_english(text):
+def translate_to_english(text: str) -> str:
     """
-    🇧🇷 ➡️ 🇺🇸 Translate Portuguese to English.
+    🇧🇷 ➡️ 🇺🇸 Translate from Portuguese to English.
    """
     if not text.strip():
         return "No input provided."
+    chunks = chunk_text(clean_text(text))
+    return translate_chunks(chunks, tokenizer_pt_en, model_pt_en)
 
-
-    chunks = chunk_text(text)
+# ========== Bilingual View ==========
 
-    translated_chunks = []
-    for chunk in chunks:
-        inputs = tokenizer_pt_en(chunk, return_tensors="pt", truncation=True, padding=True)
-        with torch.no_grad():
-            outputs = model_pt_en.generate(**inputs, max_length=512, num_beams=4)
-        translated = tokenizer_pt_en.decode(outputs[0], skip_special_tokens=True)
-        translated_chunks.append(translated)
-
-    return " ".join(translated_chunks)
-
-# ========== Bilingual Layout ==========
-
-def bilingual_clauses(english_text):
+def bilingual_clauses(english_text: str) -> str:
     """
-
+    Create side-by-side bilingual clauses.
     """
     clauses_en = chunk_text(clean_text(english_text), max_chunk_chars=300)
-    clauses_pt = [translate_to_portuguese(
+    clauses_pt = [translate_to_portuguese(clause) for clause in clauses_en]
 
-
+    bilingual_output = []
     for en, pt in zip(clauses_en, clauses_pt):
-
-
+        bilingual_output.append(f"🌍 EN: {en}\n📜 PT: {pt}\n" + "-" * 60)
+
+    return "\n\n".join(bilingual_output)
 
 # ========== Export to DOCX ==========
 
-def export_to_word(text, filename="translated_contract.docx"):
+def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
     """
-
+    Export bilingual translation to a Word document.
     """
     doc = Document()
     doc.add_heading("Legal Translation Output", level=1)
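
One property of the new loading section worth noting: because load_model_and_tokenizer is called at module level, both T5 checkpoints are downloaded and loaded exactly once per process, at import time. A lazy variant (an alternative sketch, not what this commit does) would defer that cost until the first translation:

    from functools import lru_cache
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    @lru_cache(maxsize=None)
    def load_model_and_tokenizer(model_name: str):
        # First call downloads/loads the checkpoint; later calls hit the cache.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return tokenizer, model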
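chunk_text packs whole words into chunks of at most max_chunk_chars characters, so only word boundaries are ever used as split points (a single word longer than the budget would still become its own oversized chunk). A quick check of that behavior, assuming the module is importable as utils.translator (and noting that importing it loads both models):

    from utils.translator import chunk_text, clean_text

    sample = "This Agreement shall be governed by the laws of Brazil. " * 20
    chunks = chunk_text(clean_text(sample), max_chunk_chars=100)

    # Holds here because every word in the sample fits the 100-character budget.
    assert all(len(c) <= 100 for c in chunks)
    print(len(chunks), "chunks; first:", chunks[0])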
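The diff view cuts export_to_word off after the heading. Its -> str hint suggests it returns the saved path; a typical completion along those lines (an assumption, not part of the visible commit, reusing the module's existing Document import) would be:

    def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
        doc = Document()
        doc.add_heading("Legal Translation Output", level=1)
        for block in text.split("\n\n"):  # e.g. one bilingual clause per paragraph
            doc.add_paragraph(block)
        doc.save(filename)
        return filename  # hypothetical: the return is not shown in the diff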
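Taken together, the module's public surface after this commit can be exercised like this (the sample clause is illustrative, and export_to_word's return value is assumed as above):

    from utils.translator import bilingual_clauses, export_to_word, translate_to_portuguese

    clause = "The Seller shall deliver the goods within thirty (30) days."
    print(translate_to_portuguese(clause))

    side_by_side = bilingual_clauses(clause)
    path = export_to_word(side_by_side)
    print("Saved to", path)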