Spaces:
Sleeping
Sleeping
| # utils/translator.py | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| import torch | |
| from docx import Document | |
| # ========== Model Loading (Cached Once) ========== | |
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and seq2seq model for *model_name* (called once per direction at import time)."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )
# English → Portuguese translation pair (downloaded/cached on first import)
tokenizer_en_pt, model_en_pt = load_model_and_tokenizer("unicamp-dl/translation-en-pt-t5")
# Portuguese → English translation pair
tokenizer_pt_en, model_pt_en = load_model_and_tokenizer("unicamp-dl/translation-pt-en-t5")
| # ========== Preprocessing ========== | |
def clean_text(text: str) -> str:
    """Normalize whitespace: collapse every run of whitespace (spaces, newlines,
    tabs) into a single space and strip leading/trailing whitespace.

    The original `replace("  ", " ")` made only one left-to-right pass, so a run
    of three or more spaces was not fully collapsed ("a   b" -> "a  b");
    `str.split()` handles arbitrary whitespace runs in one step.
    """
    return " ".join(text.split())
def chunk_text(text: str, max_chunk_chars: int = 500):
    """Split *text* into word-boundary chunks of at most ``max_chunk_chars`` characters.

    Fixes two defects in the original:
    - an empty-string chunk was emitted when the very first word exceeded
      ``max_chunk_chars`` (the "" accumulator was appended before being filled);
    - the length check counted a phantom leading space (the accumulator started
      with " word"), over-counting by one and splitting chunks that fit exactly.

    A single word longer than ``max_chunk_chars`` still becomes its own
    (oversized) chunk, as before.
    """
    chunks = []
    current_words = []
    current_len = 0
    for word in text.split():
        # +1 for the joining space, except before the first word of a chunk.
        extra = len(word) + (1 if current_words else 0)
        if current_len + extra <= max_chunk_chars:
            current_words.append(word)
            current_len += extra
        else:
            if current_words:  # never emit an empty chunk
                chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
| # ========== Translation Core Logic ========== | |
def translate_chunks(chunks, tokenizer, model):
    """Translate each text chunk with *model*/*tokenizer* and join the results with spaces."""
    results = []
    # Inference only: keep the whole loop under no_grad to skip autograd bookkeeping.
    with torch.no_grad():
        for piece in chunks:
            encoded = tokenizer(piece, return_tensors="pt", truncation=True, padding=True)
            generated = model.generate(**encoded, max_length=512, num_beams=4)
            results.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    return " ".join(results)
def translate_to_portuguese(text: str) -> str:
    """Translate English *text* to Portuguese; returns a placeholder message for blank input."""
    if not text.strip():
        return "No input provided."
    pieces = chunk_text(clean_text(text))
    return translate_chunks(pieces, tokenizer_en_pt, model_en_pt)
def translate_to_english(text: str) -> str:
    """Translate Portuguese *text* to English; returns a placeholder message for blank input."""
    if not text.strip():
        return "No input provided."
    pieces = chunk_text(clean_text(text))
    return translate_chunks(pieces, tokenizer_pt_en, model_pt_en)
def translate_text(text: str, direction: str = "en-pt") -> str:
    """Dispatch translation by *direction*: 'en-pt' or 'pt-en'.

    Any other direction yields an error message string (no exception raised).
    """
    handlers = {
        "en-pt": translate_to_portuguese,
        "pt-en": translate_to_english,
    }
    handler = handlers.get(direction)
    if handler is None:
        return "Unsupported translation direction."
    return handler(text)
| # ========== Bilingual View ========== | |
def bilingual_clauses(text: str) -> str:
    """Build a clause-by-clause bilingual (EN + PT) rendering of *text*.

    Each ~300-character clause is shown in English, then its Portuguese
    translation, separated by a dashed rule.
    """
    if not text.strip():
        return "No input provided."
    # NOTE(review): the "π" label prefixes below look like mojibake for emoji
    # markers in the original source — confirm intended characters before changing.
    sections = []
    for segment in chunk_text(clean_text(text), max_chunk_chars=300):
        rendered = translate_to_portuguese(segment)
        sections.append(f"π EN: {segment}\nπ PT: {rendered}\n" + "-" * 60)
    return "\n\n".join(sections)
| # ========== Export to DOCX ========== | |
def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
    """Export *text* to a Word DOCX file and return the saved filename.

    Blank-line-separated blocks of *text* each become one paragraph under a
    "Legal Translation Output" heading.
    """
    document = Document()
    document.add_heading("Legal Translation Output", level=1)
    for block in text.split("\n\n"):
        document.add_paragraph(block)
    document.save(filename)
    return filename