SCBconsulting committed on
Commit
bd5c699
·
verified ·
1 Parent(s): 7aa9e67

Update utils/translator.py

Browse files
Files changed (1) hide show
  1. utils/translator.py +47 -57
utils/translator.py CHANGED
@@ -1,31 +1,33 @@
1
- # utils/translate.py
2
 
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
  import torch
5
  from docx import Document
6
 
7
- # ========== Load EN → PT model ==========
8
- en_pt_model_name = "unicamp-dl/translation-en-pt-t5"
9
- tokenizer_en_pt = AutoTokenizer.from_pretrained(en_pt_model_name)
10
- model_en_pt = AutoModelForSeq2SeqLM.from_pretrained(en_pt_model_name)
11
 
12
- # ========== Load PT → EN model ==========
13
- pt_en_model_name = "unicamp-dl/translation-pt-en-t5"
14
- tokenizer_pt_en = AutoTokenizer.from_pretrained(pt_en_model_name)
15
- model_pt_en = AutoModelForSeq2SeqLM.from_pretrained(pt_en_model_name)
16
 
17
- # ========== Text Cleaning & Chunking ==========
 
18
 
19
- def clean_text(text):
 
 
 
 
 
20
  return text.replace("\n", " ").replace(" ", " ").strip()
21
 
22
- def chunk_text(text, max_chunk_chars=500):
23
  """
24
- 🔪 Break long input into token-safe chunks.
25
  """
26
  words = text.split()
27
- chunks = []
28
- current_chunk = ""
29
 
30
  for word in words:
31
  if len(current_chunk) + len(word) + 1 <= max_chunk_chars:
@@ -35,71 +37,59 @@ def chunk_text(text, max_chunk_chars=500):
35
  current_chunk = word
36
  if current_chunk:
37
  chunks.append(current_chunk.strip())
38
-
39
  return chunks
40
 
41
- # ========== Translation Functions ==========
 
 
 
 
 
 
 
 
 
 
42
 
43
- def translate_to_portuguese(text):
44
  """
45
- 🇺🇸 ➡️ 🇧🇷 Translate English to Portuguese.
46
  """
47
  if not text.strip():
48
  return "No input provided."
 
 
49
 
50
- text = clean_text(text)
51
- chunks = chunk_text(text)
52
-
53
- translated_chunks = []
54
- for chunk in chunks:
55
- inputs = tokenizer_en_pt(chunk, return_tensors="pt", truncation=True, padding=True)
56
- with torch.no_grad():
57
- outputs = model_en_pt.generate(**inputs, max_length=512, num_beams=4)
58
- translated = tokenizer_en_pt.decode(outputs[0], skip_special_tokens=True)
59
- translated_chunks.append(translated)
60
-
61
- return " ".join(translated_chunks)
62
-
63
-
64
- def translate_to_english(text):
65
  """
66
- 🇧🇷 ➡️ 🇺🇸 Translate Portuguese to English.
67
  """
68
  if not text.strip():
69
  return "No input provided."
 
 
70
 
71
- text = clean_text(text)
72
- chunks = chunk_text(text)
73
 
74
- translated_chunks = []
75
- for chunk in chunks:
76
- inputs = tokenizer_pt_en(chunk, return_tensors="pt", truncation=True, padding=True)
77
- with torch.no_grad():
78
- outputs = model_pt_en.generate(**inputs, max_length=512, num_beams=4)
79
- translated = tokenizer_pt_en.decode(outputs[0], skip_special_tokens=True)
80
- translated_chunks.append(translated)
81
-
82
- return " ".join(translated_chunks)
83
-
84
- # ========== Bilingual Layout ==========
85
-
86
- def bilingual_clauses(english_text):
87
  """
88
- 📄 Generate side-by-side bilingual clauses.
89
  """
90
  clauses_en = chunk_text(clean_text(english_text), max_chunk_chars=300)
91
- clauses_pt = [translate_to_portuguese(c) for c in clauses_en]
92
 
93
- bilingual = []
94
  for en, pt in zip(clauses_en, clauses_pt):
95
- bilingual.append(f"πŸ“˜ EN: {en}\nπŸ“— PT: {pt}\n" + "-" * 60)
96
- return "\n\n".join(bilingual)
 
97
 
98
  # ========== Export to DOCX ==========
99
 
100
- def export_to_word(text, filename="translated_contract.docx"):
101
  """
102
- 📝 Export text block to Word document.
103
  """
104
  doc = Document()
105
  doc.add_heading("Legal Translation Output", level=1)
 
1
+ # utils/translator.py
2
 
3
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
  import torch
5
  from docx import Document
6
 
7
+ # ========== Model Loading (Cached Once) ==========
 
 
 
8
 
9
+ def load_model_and_tokenizer(model_name):
10
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
11
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
12
+ return tokenizer, model
13
 
14
+ # English → Portuguese
15
+ tokenizer_en_pt, model_en_pt = load_model_and_tokenizer("unicamp-dl/translation-en-pt-t5")
16
 
17
+ # Portuguese → English
18
+ tokenizer_pt_en, model_pt_en = load_model_and_tokenizer("unicamp-dl/translation-pt-en-t5")
19
+
20
+ # ========== Preprocessing ==========
21
+
22
+ def clean_text(text: str) -> str:
23
  return text.replace("\n", " ").replace(" ", " ").strip()
24
 
25
+ def chunk_text(text: str, max_chunk_chars: int = 500):
26
  """
27
+ Split long text into token-safe chunks.
28
  """
29
  words = text.split()
30
+ chunks, current_chunk = [], ""
 
31
 
32
  for word in words:
33
  if len(current_chunk) + len(word) + 1 <= max_chunk_chars:
 
37
  current_chunk = word
38
  if current_chunk:
39
  chunks.append(current_chunk.strip())
40
+
41
  return chunks
42
 
43
+ # ========== Translation Core Logic ==========
44
+
45
+ def translate_chunks(chunks, tokenizer, model):
46
+ translated = []
47
+ for chunk in chunks:
48
+ inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
49
+ with torch.no_grad():
50
+ outputs = model.generate(**inputs, max_length=512, num_beams=4)
51
+ decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
52
+ translated.append(decoded)
53
+ return " ".join(translated)
54
 
55
+ def translate_to_portuguese(text: str) -> str:
56
  """
57
+ 🇺🇸 ➡️ 🇧🇷 Translate from English to Portuguese.
58
  """
59
  if not text.strip():
60
  return "No input provided."
61
+ chunks = chunk_text(clean_text(text))
62
+ return translate_chunks(chunks, tokenizer_en_pt, model_en_pt)
63
 
64
+ def translate_to_english(text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  """
66
+ 🇧🇷 ➡️ 🇺🇸 Translate from Portuguese to English.
67
  """
68
  if not text.strip():
69
  return "No input provided."
70
+ chunks = chunk_text(clean_text(text))
71
+ return translate_chunks(chunks, tokenizer_pt_en, model_pt_en)
72
 
73
+ # ========== Bilingual View ==========
 
74
 
75
+ def bilingual_clauses(english_text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
76
  """
77
+ Create side-by-side bilingual clauses.
78
  """
79
  clauses_en = chunk_text(clean_text(english_text), max_chunk_chars=300)
80
+ clauses_pt = [translate_to_portuguese(clause) for clause in clauses_en]
81
 
82
+ bilingual_output = []
83
  for en, pt in zip(clauses_en, clauses_pt):
84
+ bilingual_output.append(f"πŸ“˜ EN: {en}\nπŸ“— PT: {pt}\n" + "-" * 60)
85
+
86
+ return "\n\n".join(bilingual_output)
87
 
88
  # ========== Export to DOCX ==========
89
 
90
+ def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
91
  """
92
+ Export bilingual translation to a Word document.
93
  """
94
  doc = Document()
95
  doc.add_heading("Legal Translation Output", level=1)