cstr committed on
Commit
43fa17a
Β·
verified Β·
1 Parent(s): 1715de1

Create translator.py

Browse files
Files changed (1) hide show
  1. translator.py +1651 -0
translator.py ADDED
@@ -0,0 +1,1651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import asyncio
4
+ import logging
5
+ import os
6
+ import re
7
+ import sys
8
+ import gc
9
+ from collections import defaultdict
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Dict, List, Optional, Set, Tuple, Any
13
+ from enum import Enum
14
+ import requests
15
+ from tqdm import tqdm
16
+
17
+ # --- Core Library Diagnostics ---
18
+ print("πŸ” System Check...")
19
+
20
def check_library(name, import_stmt):
    """Probe an optional dependency and print a one-line status for it.

    Args:
        name: Human-readable library name used in the status line.
        import_stmt: Python source executed with ``exec`` in this module's
            globals, so any names it imports become module-level names.

    Returns:
        True when the statement ran without raising, False otherwise.
    """
    try:
        exec(import_stmt, globals())
        print(f" βœ“ {name}")
    except ImportError:
        # A missing package is expected: the library is treated as optional.
        print(f" ⊘ {name} (optional)")
        return False
    except Exception as e:
        # Anything else means the package exists but failed while importing.
        print(f" βœ— {name} error: {e}")
        return False
    return True
31
+
32
+ HAS_DOCX = check_library("python-docx", "from docx import Document; from docx.shared import Pt, RGBColor; from docx.text.paragraph import Paragraph; from docx.oxml.shared import OxmlElement; from docx.oxml.ns import qn")
33
+ HAS_TORCH = check_library("torch", "import torch")
34
+ HAS_CT2 = check_library("CTranslate2", "import ctranslate2; from huggingface_hub import snapshot_download")
35
+ HAS_TRANSFORMERS = check_library("Transformers", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM")
36
+ HAS_OPENAI = check_library("OpenAI", "from openai import AsyncOpenAI")
37
+ HAS_ANTHROPIC = check_library("Anthropic", "from anthropic import AsyncAnthropic")
38
+ HAS_SIMALIGN = check_library("simalign", "from simalign import SentenceAligner")
39
+
40
+ # --- Device & Backend Configuration ---
41
def get_torch_device():
    """Pick the PyTorch device to run on.

    Returns:
        The plain string ``"cpu"`` when torch was not importable; otherwise a
        ``torch.device`` for CUDA if available, else MPS if available, else CPU.
    """
    if not HAS_TORCH:
        return "cpu"

    import torch

    if torch.cuda.is_available():
        return torch.device("cuda")

    # Older torch builds have no ``torch.backends.mps`` attribute at all.
    mps_ready = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    device_name = "mps" if mps_ready else "cpu"
    return torch.device(device_name)
48
+
49
def get_ct2_settings():
    """Return the ``(device, compute_type)`` pair to use for CTranslate2.

    Returns:
        ``("cuda", "float16")`` when torch is importable and reports CUDA,
        otherwise ``("cpu", "int8")``.
    """
    cuda_ready = False
    if HAS_TORCH:
        import torch
        cuda_ready = torch.cuda.is_available()

    if cuda_ready:
        return "cuda", "float16"
    # Everything else (including Apple Silicon) runs CT2 on the CPU in int8.
    return "cpu", "int8"
56
+
57
def check_fast_align():
    """Report whether an executable ``fast_align`` binary is present.

    Relative candidate locations are resolved against the directory that
    contains this script; a candidate starting with ``/`` is used as-is.

    Returns:
        True if any candidate is a regular file with the execute bit set.
    """
    base_dir = Path(__file__).parent
    candidates = (
        "../fast_align/build/fast_align",
        "./fast_align/build/fast_align",
        "fast_align",
    )

    def resolve(location):
        return Path(location) if location.startswith('/') else base_dir / location

    return any(
        os.path.isfile(candidate) and os.access(candidate, os.X_OK)
        for candidate in map(resolve, candidates)
    )
64
+
65
+ HAS_FAST_ALIGN = check_fast_align()
66
+ print(f" {'βœ“' if HAS_FAST_ALIGN else '⊘'} fast_align")
67
+ print("-" * 60)
68
+
69
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
70
+ logger = logging.getLogger(__name__)
71
+
72
+ # ============================================================================
73
+ # ENUMS FOR CONFIGURATION
74
+ # ============================================================================
75
+
76
class TranslationBackend(Enum):
    """Selectable translation engines; each value is the lowercase key string."""

    # Local neural machine translation engines.
    CT2 = "ct2"
    NLLB = "nllb"

    # Large-language-model providers.
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    OLLAMA = "ollama"

    # Placeholder meaning "no explicit choice was made".
    AUTO = "auto"
84
+
85
+
86
class AlignerBackend(Enum):
    """Selectable word-alignment engines; each value is the lowercase key string."""

    LINDAT = "lindat"
    FAST_ALIGN = "fast_align"
    SIMALIGN = "simalign"
    HEURISTIC = "heuristic"

    # Placeholder meaning "no explicit choice was made".
    AUTO = "auto"
93
+
94
+
95
class TranslationMode(Enum):
    """Overall translation strategies, keyed by a short string identifier."""

    NMT_ONLY = "nmt"
    LLM_WITH_ALIGN = "llm-align"
    LLM_WITHOUT_ALIGN = "llm-plain"
    HYBRID = "hybrid"
101
+
102
+
103
+ # ============================================================================
104
+ # DATA STRUCTURES
105
+ # ============================================================================
106
+
107
@dataclass
class FormatRun:
    """One contiguous piece of text together with its character formatting.

    Every formatting attribute defaults to ``None``.
    """

    # The literal text of the run.
    text: str

    # Emphasis flags.
    bold: Optional[bool] = None
    italic: Optional[bool] = None
    underline: Optional[bool] = None

    # Font description; the colour is a 3-tuple of ints.
    font_name: Optional[str] = None
    font_size: Optional[float] = None
    font_color: Optional[Tuple[int, int, int]] = None
117
+
118
+
119
@dataclass
class TranslatableParagraph:
    """A paragraph stored as an ordered list of formatted runs plus metadata."""

    runs: List[FormatRun] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def get_text(self) -> str:
        """Return the paragraph's plain text: all run texts concatenated."""
        pieces = [run.text for run in self.runs]
        return ''.join(pieces)

    def get_words(self) -> List[str]:
        """Return the ``\\w+`` sequences of the text, i.e. words without punctuation.

        This is the token list handed to the aligner.
        """
        return re.findall(r"\w+", self.get_text())

    def get_formatted_word_indices(self) -> Dict[str, Set[int]]:
        """Map each formatting class to the indices of the words that carry it.

        Indices refer to the list returned by :meth:`get_words`. A word is put
        in ``'italic_bold'`` if a run covering one of its characters is both
        bold and italic, otherwise in ``'italic'`` or ``'bold'``.

        Returns:
            Dict with keys ``'italic'``, ``'bold'`` and ``'italic_bold'``.
        """
        formatted = {'italic': set(), 'bold': set(), 'italic_bold': set()}
        text = self.get_text()
        words = self.get_words()
        if not words:
            return formatted

        # Build a lookup from character offset to the index of the word that
        # owns it, scanning forward so repeated words map to the right place.
        char_to_word = {}
        cursor = 0
        for word_idx, word in enumerate(words):
            start = text.find(word, cursor)
            if start == -1:
                continue
            cursor = start + len(word)
            for offset in range(start, cursor):
                char_to_word[offset] = word_idx

        # Walk the runs in order, tracking the absolute character offset.
        offset = 0
        for run in self.runs:
            if not run.text:
                continue

            if run.bold and run.italic:
                bucket = 'italic_bold'
            elif run.italic:
                bucket = 'italic'
            elif run.bold:
                bucket = 'bold'
            else:
                bucket = None

            for ch in run.text:
                word_idx = char_to_word.get(offset)
                if word_idx is not None and bucket is not None and not ch.isspace():
                    formatted[bucket].add(word_idx)
                offset += 1

        return formatted
170
+
171
+
172
+ # ============================================================================
173
+ # TRANSLATION BACKENDS (keeping previous implementations)
174
+ # ============================================================================
175
+
176
class CTranslate2Translator:
    """WMT21 backend: dense model served through CTranslate2.

    Only pairs where one side is English and the other is listed in
    ``SUPPORTED_LANGS`` are handled. ``available`` stays False when
    CTranslate2 is missing, the pair is unsupported, or loading fails.
    """

    # Hub repositories holding the converted weights, per direction.
    MODELS = {
        'en_to_x': 'cstr/wmt21ct2_int8',
        'x_to_en': 'cstr/wmt21-ct2-x-en-int8',
    }

    SUPPORTED_LANGS = ['de', 'es', 'fr', 'it', 'ja', 'zh', 'ru', 'pt', 'nl', 'cs', 'uk']

    def __init__(self, src_lang: str, tgt_lang: str):
        """Load the model and tokenizer for ``src_lang`` -> ``tgt_lang``.

        Never raises: failures are logged and leave ``available`` False.
        """
        self.src_lang, self.tgt_lang = src_lang, tgt_lang
        self.available = False
        self.ct2_dev, self.ct2_compute = get_ct2_settings()

        if not HAS_CT2:
            return

        if src_lang == 'en' and tgt_lang in self.SUPPORTED_LANGS:
            self.direction, self.tokenizer_name = 'en_to_x', "facebook/wmt21-dense-24-wide-en-x"
        elif tgt_lang == 'en' and src_lang in self.SUPPORTED_LANGS:
            self.direction, self.tokenizer_name = 'x_to_en', "facebook/wmt21-dense-24-wide-x-en"
        else:
            return

        try:
            logger.info(f"Loading WMT21-CT2 ({self.direction})...")
            model_path = self._get_or_download_model()
            self.translator = ctranslate2.Translator(
                model_path, device=self.ct2_dev, compute_type=self.ct2_compute
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
            # The tokenizer prepends a source-language tag when encoding.
            # Without this the default tag is used, which is wrong for the
            # x->en direction. An unknown code raises here and is handled
            # below, leaving the backend unavailable.
            self.tokenizer.src_lang = self.src_lang
            self.available = True
            logger.info(f"βœ“ WMT21 CT2 ready on {self.ct2_dev}")
        except Exception as e:
            logger.error(f"WMT21 initialization failure: {e}")

    def _get_or_download_model(self) -> str:
        """Return a local directory containing the converted model.

        Looks for an existing snapshot (with ``model.bin``) under
        ``~/.cache/huggingface/hub`` first and only calls
        ``snapshot_download`` when none is found.
        """
        cache_base = Path.home() / '.cache' / 'huggingface' / 'hub'
        model_repo = self.MODELS[self.direction]
        model_dir = cache_base / f"models--{model_repo.replace('/', '--')}"
        ref_path = model_dir / 'refs' / 'main'

        if ref_path.exists():
            # refs/main stores the commit hash naming the snapshot directory.
            with open(ref_path) as f:
                commit_hash = f.read().strip()
            snapshot_path = model_dir / 'snapshots' / commit_hash
            if (snapshot_path / 'model.bin').exists():
                return str(snapshot_path)

        logger.info("Downloading CT2 model...")
        return snapshot_download(repo_id=model_repo)

    def translate_batch(self, texts: List[str]) -> List[str]:
        """Translate ``texts``.

        Returns:
            One translation per input. The input list itself is returned
            unchanged when the backend is unavailable, ``texts`` is empty, or
            any error occurs during translation.
        """
        if not self.available or not texts:
            return texts

        try:
            # The target language token is forced as the first decoded token.
            tgt_token = self.tokenizer.lang_code_to_token[self.tgt_lang]
            target_prefix = [[tgt_token] for _ in texts]

            source_tokens = [
                self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(t))
                for t in texts
            ]

            results = self.translator.translate_batch(
                source_tokens,
                target_prefix=target_prefix,
                beam_size=5,
                max_batch_size=16,
                repetition_penalty=1.2,
                # Forbid the unknown token in the generated output.
                disable_unk=True
            )

            translated = []
            for res in results:
                tokens = res.hypotheses[0]
                # Drop everything up to and including the language token.
                if tgt_token in tokens:
                    tokens = tokens[tokens.index(tgt_token) + 1:]

                decoded = self.tokenizer.decode(
                    self.tokenizer.convert_tokens_to_ids(tokens),
                    skip_special_tokens=True
                )
                translated.append(decoded.strip())

            return translated
        except Exception as e:
            logger.error(f"CT2 Critical Error: {e}")
            return texts
264
class OpusMTTranslator:
    """Opus-MT backend: small bilingual models served through CTranslate2.

    Weights come from ``cstr/opus-mt-<src>-<tgt>-ct2-int8`` with a
    ``michaelfeil/ct2fast-opus-mt-<src>-<tgt>`` fallback. The tokenizer is
    always loaded from the ``Helsinki-NLP`` repository.
    """

    def __init__(self, src_lang: str, tgt_lang: str):
        """Load the model for ``src_lang`` -> ``tgt_lang``.

        Never raises for ordinary errors: failures are logged and leave
        ``available`` False.
        """
        self.src_lang, self.tgt_lang = src_lang, tgt_lang
        self.available = False
        self.ct2_dev, self.ct2_compute = get_ct2_settings()

        # The standard name format for Opus-MT (used for the tokenizer).
        original_repo = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
        # Converted int8 weights.
        self.custom_repo = f"cstr/opus-mt-{src_lang}-{tgt_lang}-ct2-int8"

        try:
            logger.info(f"NMT | Loading weights from {self.custom_repo}...")
            self._load(self.custom_repo, original_repo)
            logger.info(f"βœ“ NMT | Opus-MT ready (Weights: cstr / Tokenizer: original)")
        except Exception as e:
            logger.warning(f"NMT | Opus-MT primary load failed: {e}. Trying michaelfeil fallback...")
            fallback = f"michaelfeil/ct2fast-opus-mt-{src_lang}-{tgt_lang}"
            try:
                self._load(fallback, original_repo)
                logger.info(f"βœ“ NMT | Opus-MT ready using {fallback}")
            except Exception:
                # ``Exception`` rather than a bare ``except`` so Ctrl-C during
                # a download still aborts; the traceback is logged.
                logger.exception("NMT | All Opus-MT paths failed.")

    def _load(self, weights_repo: str, tokenizer_repo: str) -> None:
        """Download ``weights_repo``, build the translator and tokenizer, mark ready."""
        model_path = snapshot_download(repo_id=weights_repo)
        self.translator = ctranslate2.Translator(
            model_path, device=self.ct2_dev, compute_type=self.ct2_compute
        )
        # The tokenizer comes from the original repo, not the converted one.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)
        self.available = True

    def translate_batch(self, texts: List[str]) -> List[str]:
        """Translate ``texts`` with beam size 5.

        Returns the input list unchanged when the backend is unavailable or
        ``texts`` is empty. Errors raised while translating propagate.
        """
        if not self.available or not texts:
            return texts
        source_tokens = [
            self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(t))
            for t in texts
        ]
        results = self.translator.translate_batch(source_tokens, beam_size=5)
        return [
            self.tokenizer.decode(
                self.tokenizer.convert_tokens_to_ids(r.hypotheses[0]),
                skip_special_tokens=True
            )
            for r in results
        ]
305
+
306
class Madlad400Translator:
    """Madlad-400 backend running converted weights through CTranslate2.

    Weights are fetched from ``cstr/madlad400-<size>-ct2-int8`` and the
    tokenizer from ``google/madlad400-<size>-mt``.
    """

    def __init__(self, src_lang: str, tgt_lang: str, model_size: str = "3b"):
        """Load model and tokenizer; on failure log and leave ``available`` False."""
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.available = False
        self.ct2_dev, self.ct2_compute = get_ct2_settings()

        tokenizer_repo = f"google/madlad400-{model_size}-mt"
        self.custom_repo = f"cstr/madlad400-{model_size}-ct2-int8"
        # Tag placed in front of every source sentence to select the target.
        self.tgt_prefix = f"<2{tgt_lang}>"

        try:
            logger.info(f"NMT | Loading Madlad-400 from {self.custom_repo}...")
            weights_dir = snapshot_download(repo_id=self.custom_repo)
            self.translator = ctranslate2.Translator(
                weights_dir,
                device=self.ct2_dev,
                compute_type=self.ct2_compute,
            )
            # The tokenizer is taken from the original Google repository.
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo)
            self.available = True
            logger.info("βœ“ NMT | Madlad-400 ready.")
        except Exception as e:
            logger.error(f"NMT | Madlad-400 load failed: {e}")

    def translate_batch(self, texts: List[str]) -> List[str]:
        """Translate ``texts`` greedily (beam size 1, repetition penalty 2.0).

        Returns the input list unchanged when the backend is unavailable or
        ``texts`` is empty.
        """
        if not (self.available and texts):
            return texts

        tok = self.tokenizer
        source_tokens = []
        for sentence in texts:
            tagged_ids = tok.encode(f"{self.tgt_prefix} {sentence}")
            source_tokens.append(tok.convert_ids_to_tokens(tagged_ids))

        results = self.translator.translate_batch(
            source_tokens, beam_size=1, repetition_penalty=2.0
        )

        translations = []
        for result in results:
            best_ids = tok.convert_tokens_to_ids(result.hypotheses[0])
            translations.append(tok.decode(best_ids, skip_special_tokens=True))
        return translations
337
class NLLBTranslator:
    """NLLB-200 translator.

    Prefers converted CTranslate2 weights (``mode == "ct2"``) and falls back
    to the standard PyTorch checkpoint (``mode == "torch"``). ``available``
    stays False when a language is unknown or both loading paths fail.
    """

    # Map sizes to specific CTranslate2 optimized repositories
    REPOS = {
        "600M": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
        "1.3B": "OpenNMT/nllb-200-distilled-1.3B-ct2-int8",
        "3.3B": "OpenNMT/nllb-200-3.3B-ct2-int8"
    }

    # Two-letter language code -> NLLB language token.
    LANG_CODES = {
        'en': 'eng_Latn', 'de': 'deu_Latn', 'fr': 'fra_Latn',
        'es': 'spa_Latn', 'it': 'ita_Latn', 'pt': 'por_Latn',
        'ru': 'rus_Cyrl', 'zh': 'zho_Hans', 'ja': 'jpn_Jpan',
        'ko': 'kor_Hang', 'ar': 'arb_Arab', 'hi': 'hin_Deva',
        'nl': 'nld_Latn', 'pl': 'pol_Latn', 'tr': 'tur_Latn',
        'cs': 'ces_Latn', 'uk': 'ukr_Cyrl', 'vi': 'vie_Latn',
    }

    @staticmethod
    def _standard_repo(model_size: str) -> str:
        """Return the ``facebook/`` repo holding the tokenizer / PyTorch weights."""
        # The 3.3B checkpoint is not a distilled model, so its repository
        # name has no "distilled-" component.
        if model_size == "3.3B":
            return "facebook/nllb-200-3.3B"
        return f"facebook/nllb-200-distilled-{model_size}"

    def __init__(self, src_lang: str, tgt_lang: str, model_size: str = "600M"):
        """Load the model for ``src_lang`` -> ``tgt_lang``.

        Args:
            src_lang: Two-letter source code (a key of ``LANG_CODES``).
            tgt_lang: Two-letter target code (a key of ``LANG_CODES``).
            model_size: A key of ``REPOS``; unknown sizes fall back to "600M".

        Never raises: failures are logged and leave ``available`` False.
        """
        self.src_lang, self.tgt_lang = src_lang, tgt_lang
        self.available = False
        self.mode = None
        self.device = get_torch_device()
        self.ct2_dev, self.ct2_compute = get_ct2_settings()

        self.src_code = self.LANG_CODES.get(src_lang)
        self.tgt_code = self.LANG_CODES.get(tgt_lang)

        if not self.src_code or not self.tgt_code:
            return

        # Normalise first so the CT2 weights and the tokenizer always refer
        # to the same checkpoint size.
        if model_size not in self.REPOS:
            model_size = "600M"
        ct2_repo = self.REPOS[model_size]
        standard_repo = self._standard_repo(model_size)

        if HAS_CT2:
            try:
                logger.info(f"Loading NLLB-CT2 from {ct2_repo}...")
                model_path = snapshot_download(repo_id=ct2_repo)
                try:
                    self.translator = ctranslate2.Translator(
                        model_path, device=self.ct2_dev, compute_type=self.ct2_compute
                    )
                except Exception:
                    # Retry with plain CPU/int8 if the preferred settings
                    # are rejected.
                    self.translator = ctranslate2.Translator(
                        model_path, device="cpu", compute_type="int8"
                    )

                self.tokenizer = AutoTokenizer.from_pretrained(standard_repo, src_lang=self.src_code)
                self.mode, self.available = "ct2", True
                logger.info(f"βœ“ NLLB-{model_size} CT2 ready.")
                return
            except Exception as e:
                logger.warning(f"NLLB-CT2 failed: {e}")

        if HAS_TRANSFORMERS and HAS_TORCH:
            try:
                logger.info(f"Fallback: Loading standard PyTorch NLLB...")
                self.model = AutoModelForSeq2SeqLM.from_pretrained(standard_repo)
                # AutoTokenizer is the tokenizer class this file imports.
                self.tokenizer = AutoTokenizer.from_pretrained(standard_repo, src_lang=self.src_code)
                if self.device.type == "mps":
                    self.model = self.model.half()
                self.model.to(self.device).eval()
                self.mode, self.available = "torch", True
                logger.info(f"βœ“ NLLB PyTorch ready on {self.device}")
            except Exception as e:
                logger.error(f"Critical: All NLLB paths failed: {e}")

    def _translate_ct2(self, batch: List[str], batch_size: int) -> List[str]:
        """Translate one batch with the CTranslate2 translator."""
        source_tokens = [
            self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(t))
            for t in batch
        ]
        # The target language code is forced as the first decoded token.
        step_results = self.translator.translate_batch(
            source_tokens,
            target_prefix=[[self.tgt_code] for _ in batch],
            beam_size=4,
            max_batch_size=batch_size,
            repetition_penalty=1.1
        )

        translated = []
        for res in step_results:
            tokens = res.hypotheses[0]
            # Strip the language code; guard against an empty hypothesis.
            if tokens and tokens[0] == self.tgt_code:
                tokens = tokens[1:]
            decoded = self.tokenizer.decode(
                self.tokenizer.convert_tokens_to_ids(tokens),
                skip_special_tokens=True
            )
            translated.append(decoded.strip())
        return translated

    def _translate_torch(self, batch: List[str]) -> List[str]:
        """Translate one batch with the PyTorch model loaded by the fallback path."""
        import torch

        inputs = self.tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        ).to(self.device)
        with torch.no_grad():
            generated = self.model.generate(
                **inputs,
                # Force the target language token as the first generated token.
                forced_bos_token_id=self.tokenizer.convert_tokens_to_ids(self.tgt_code),
                num_beams=4,
                repetition_penalty=1.1,
                max_new_tokens=512,
            )
        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return [text.strip() for text in decoded]

    def translate_batch(self, texts: List[str], batch_size: int = 16) -> List[str]:
        """Translate ``texts`` in chunks of ``batch_size``.

        Returns:
            One translation per input. The input list is returned unchanged
            when the backend is unavailable, ``texts`` is empty, or any error
            occurs during translation.
        """
        if not self.available or not texts:
            return texts

        try:
            results = []
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]
                if self.mode == "torch":
                    results.extend(self._translate_torch(batch))
                else:
                    results.extend(self._translate_ct2(batch, batch_size))
            return results
        except Exception as e:
            logger.error(f"NLLB-CT2 translation failed: {e}")
            return texts
449
+
450
class LLMTranslator:
    """LLM translator (OpenAI/Anthropic/Ollama).

    ``providers`` maps a provider name to its configuration. Providers are
    tried in insertion order until one returns a translation.
    """

    def __init__(self, src_lang: str, tgt_lang: str, preferred_provider: Optional[str] = None):
        """Discover usable providers.

        Args:
            src_lang: Source language, inserted verbatim into the prompt.
            tgt_lang: Target language, inserted verbatim into the prompt.
            preferred_provider: If given and usable, only this provider is kept.
        """
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.providers = self._init_providers(preferred_provider)

        if self.providers:
            logger.info(f"βœ“ LLM available ({list(self.providers.keys())})")

    def _init_providers(self, preferred: Optional[str] = None) -> Dict[str, Any]:
        """Build the provider table from installed SDKs, env vars and a local Ollama.

        Returns:
            ``{preferred: config}`` when ``preferred`` is usable, otherwise
            every usable provider.
        """
        providers = {}

        if HAS_OPENAI and os.getenv("OPENAI_API_KEY"):
            providers["openai"] = {
                "client": AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")),
                "model": os.getenv("OPENAI_MODEL", "gpt-4o-mini")
            }

        if HAS_ANTHROPIC and os.getenv("ANTHROPIC_API_KEY"):
            providers["anthropic"] = {
                "client": AsyncAnthropic(api_key=os.getenv("ANTHROPIC_API_KEY")),
                "model": os.getenv("ANTHROPIC_MODEL", "claude-3-5-sonnet-20241022")
            }

        if self._check_ollama():
            providers["ollama"] = {
                "url": "http://localhost:11434/api/generate",
                "model": self._get_ollama_model()
            }

        if preferred and preferred in providers:
            return {preferred: providers[preferred]}

        return providers

    def _check_ollama(self) -> bool:
        """Return True if a local Ollama server answers and lists at least one model."""
        try:
            r = requests.get("http://localhost:11434/api/tags", timeout=2)
            return r.status_code == 200 and len(r.json().get("models", [])) > 0
        except Exception:
            # Best-effort probe: any connection or parsing problem means
            # "not available". ``Exception`` keeps Ctrl-C working.
            return False

    def _get_ollama_model(self) -> str:
        """Return the name of the first model Ollama lists, or ``"llama3.2"``."""
        try:
            r = requests.get("http://localhost:11434/api/tags", timeout=2)
            models = r.json().get("models", [])
            return models[0]["name"] if models else "llama3.2"
        except Exception:
            return "llama3.2"

    async def translate_text(self, text: str, use_alignment: bool = True) -> Optional[str]:
        """Translate a single text.

        Args:
            text: Text to translate.
            use_alignment: When True the prompt asks the model to preserve
                word order; otherwise it asks for a natural translation.

        Returns:
            The first provider's answer (stripped), or None when ``text`` is
            blank, no provider is configured, or every provider failed.
        """
        if not text.strip() or not self.providers:
            return None

        if use_alignment:
            prompt = (
                f"Translate the following text from {self.src_lang} to {self.tgt_lang}. "
                f"Preserve the word order as much as possible for alignment purposes. "
                f"Return ONLY the translation:\n\n{text}"
            )
        else:
            prompt = (
                f"Translate the following text from {self.src_lang} to {self.tgt_lang}. "
                f"Provide a natural, fluent translation. "
                f"Return ONLY the translation:\n\n{text}"
            )

        for provider_name, provider in self.providers.items():
            try:
                if provider_name == "openai":
                    response = await provider["client"].chat.completions.create(
                        model=provider["model"],
                        messages=[{"role": "user", "content": prompt}],
                        temperature=0.3,
                        max_tokens=4000
                    )
                    return response.choices[0].message.content.strip()

                elif provider_name == "anthropic":
                    response = await provider["client"].messages.create(
                        model=provider["model"],
                        messages=[{"role": "user", "content": prompt}],
                        temperature=0.3,
                        max_tokens=4000
                    )
                    return response.content[0].text.strip()

                elif provider_name == "ollama":
                    # NOTE(review): this is a blocking HTTP call inside a
                    # coroutine, so Ollama requests run one after another
                    # even under asyncio.gather.
                    r = requests.post(
                        provider["url"],
                        json={"model": provider["model"], "prompt": prompt, "stream": False},
                        timeout=120
                    )
                    if r.status_code == 200:
                        return r.json().get("response", "").strip()

            except Exception as e:
                # Fall through to the next provider.
                logger.debug(f"{provider_name} failed: {e}")
                continue

        return None

    async def translate_batch(self, texts: List[str], use_alignment: bool = True) -> List[str]:
        """Translate several texts concurrently.

        Returns:
            One entry per input; an input whose translation is missing or
            empty is passed through unchanged.
        """
        tasks = [self.translate_text(text, use_alignment) for text in texts]
        results = await asyncio.gather(*tasks)
        return [res if res else text for res, text in zip(results, texts)]
560
+
561
+
562
+ # ============================================================================
563
+ # WORD ALIGNERS
564
+ # ============================================================================
565
+
566
class LindatAligner:
    """Word aligner backed by the remote Lindat text-aligner HTTP service."""

    _BASE_URL = "https://lindat.cz/services/text-aligner/align"

    def __init__(self, src_lang: str, tgt_lang: str):
        """Probe the service for the ``src_lang``-``tgt_lang`` pair."""
        self.src_lang, self.tgt_lang = src_lang, tgt_lang
        self.available = self._check_available()
        if self.available:
            logger.info(f"βœ“ ALIGN | Lindat API available ({src_lang}-{tgt_lang})")

    def _endpoint(self) -> str:
        """Return the service URL for this language pair."""
        return f"{self._BASE_URL}/{self.src_lang}-{self.tgt_lang}"

    def _check_available(self) -> bool:
        """Return True if the endpoint answers a GET with status 200 or 405."""
        try:
            r = requests.get(self._endpoint(), timeout=3)
            return r.status_code in [200, 405]
        except Exception:
            # Best-effort probe: network errors mean "not available".
            # ``Exception`` rather than bare ``except`` keeps Ctrl-C working.
            return False

    def align(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Request an alignment for one sentence pair.

        Returns:
            ``(source_index, target_index)`` pairs from the service response,
            or an empty list when the aligner is unavailable, an input is
            empty, the status is not 200, or the request fails.
        """
        if not self.available or not src_words or not tgt_words:
            return []
        try:
            r = requests.post(
                self._endpoint(),
                headers={'Content-Type': 'application/json'},
                # The service takes batches; send a batch of one sentence.
                json={'src_tokens': [src_words], 'trg_tokens': [tgt_words]},
                timeout=15
            )
            if r.status_code == 200:
                alignment = r.json()["alignment"][0]
                return [(int(a[0]), int(a[1])) for a in alignment]
        except Exception as e:
            logger.debug(f"ALIGN | Lindat request failed: {e}")
        return []
596
+
597
class AwesomeAlignAligner:
    """Embedding-based word aligner built on multilingual BERT.

    Loads a CTranslate2 encoder when possible (``mode == "ct2"``), otherwise
    a PyTorch ``BertModel`` (``mode == "torch"``).
    """

    def __init__(self):
        """Load an encoder and tokenizer; leave ``available`` False on failure."""
        self.available = False
        self.mode = None
        self.device = get_torch_device()
        self.ct2_dev, self.ct2_compute = get_ct2_settings()
        self.ct2_repo = "cstr/bert-base-multilingual-cased-ct2-int8"
        self.standard_repo = "bert-base-multilingual-cased"

        if HAS_CT2:
            try:
                logger.info(f"Loading CT2 Aligner from {self.ct2_repo}...")
                encoder_dir = snapshot_download(repo_id=self.ct2_repo)
                self.encoder = ctranslate2.Encoder(
                    encoder_dir,
                    device=self.ct2_dev,
                    compute_type=self.ct2_compute,
                    intra_threads=1,
                )
                # Tokenizer is loaded by hub name, not from the CT2 directory.
                self.tokenizer = AutoTokenizer.from_pretrained(self.standard_repo)
                self.mode = "ct2"
                self.available = True
                logger.info("βœ“ Awesome-Align CT2 ready.")
                return
            except Exception as e:
                logger.warning(f"CT2 Aligner load failed: {e}")

        if HAS_TRANSFORMERS and HAS_TORCH:
            try:
                from transformers import BertModel, BertTokenizer
                self.model = BertModel.from_pretrained(self.standard_repo)
                self.tokenizer = BertTokenizer.from_pretrained(self.standard_repo)
                if self.device.type == "mps":
                    self.model = self.model.half()
                self.model.to(self.device).eval()
                self.mode = "torch"
                self.available = True
                logger.info(f"βœ“ Awesome-Align PyTorch ready on {self.device}")
            except Exception as e:
                logger.error(f"Critical: Aligner fallback failed: {e}")

    def align(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Align two word lists via mutual-argmax over subword embeddings.

        A source/target subword pair is linked when each is the other's best
        cosine match and the similarity exceeds a small threshold. Links are
        then mapped back to word indices.

        Returns:
            Sorted unique ``(src_word_index, tgt_word_index)`` pairs, or an
            empty list when unavailable, an input is empty, or an error occurs.
        """
        if not (self.available and src_words and tgt_words):
            return []

        import numpy as np

        try:
            def split_into_subwords(words):
                # Returns the flat subword list plus, per subword, the index
                # of the word it came from. Words that tokenize to nothing
                # are represented by the unknown token.
                pieces, owners = [], []
                for word_idx, word in enumerate(words):
                    parts = self.tokenizer.tokenize(word) or [self.tokenizer.unk_token]
                    pieces += parts
                    owners += [word_idx] * len(parts)
                return pieces, owners

            src_pieces, src_owner = split_into_subwords(src_words)
            tgt_pieces, tgt_owner = split_into_subwords(tgt_words)

            if self.mode == "ct2":
                src_batch = [["[CLS]", *src_pieces, "[SEP]"]]
                tgt_batch = [["[CLS]", *tgt_pieces, "[SEP]"]]

                src_result = self.encoder.forward_batch(src_batch)
                tgt_result = self.encoder.forward_batch(tgt_batch)

                # Final hidden state, with the [CLS]/[SEP] positions dropped.
                src_vecs = np.array(src_result.last_hidden_state)[0, 1:-1]
                tgt_vecs = np.array(tgt_result.last_hidden_state)[0, 1:-1]
            else:
                import torch

                def hidden_layer_8(pieces):
                    ids = self.tokenizer.convert_tokens_to_ids(pieces)
                    framed = [self.tokenizer.cls_token_id] + ids + [self.tokenizer.sep_token_id]
                    batch = torch.tensor(framed).to(self.device).unsqueeze(0)
                    # Index 2 is the hidden-states tuple; take entry 8 and
                    # drop the [CLS]/[SEP] positions.
                    return self.model(batch, output_hidden_states=True)[2][8][0, 1:-1]

                with torch.no_grad():
                    src_hidden = hidden_layer_8(src_pieces)
                    tgt_hidden = hidden_layer_8(tgt_pieces)
                    src_vecs = src_hidden.detach().cpu().float().numpy()
                    tgt_vecs = tgt_hidden.detach().cpu().float().numpy()

            # Unit-normalise so the dot product is cosine similarity.
            src_unit = src_vecs / np.linalg.norm(src_vecs, axis=-1, keepdims=True)
            tgt_unit = tgt_vecs / np.linalg.norm(tgt_vecs, axis=-1, keepdims=True)
            similarity = np.dot(src_unit, tgt_unit.T)

            forward_best = np.argmax(similarity, axis=1)
            backward_best = np.argmax(similarity, axis=0)

            threshold = 1e-3
            links = {
                (src_owner[i], tgt_owner[j])
                for i, j in enumerate(forward_best)
                if backward_best[j] == i and similarity[i, j] > threshold
            }
            final_alignments = sorted(links)

            logger.debug(f"TRACE | Awesome-Align | Mode: {self.mode.upper()} | Links: {len(final_alignments)}")

            return final_alignments

        except Exception as e:
            logger.error(f"ALIGN | Logic failure: {e}")
            return []

    def align_old(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Earlier alignment routine that only uses the CTranslate2 encoder.

        Unlike :meth:`align`, words that tokenize to nothing contribute no
        subwords, and failures are logged at debug level.
        """
        if not (self.available and src_words and tgt_words):
            return []

        try:
            import numpy as np

            def split_into_subwords(words):
                pieces, owners = [], []
                for word_idx, word in enumerate(words):
                    parts = self.tokenizer.tokenize(word)
                    pieces += parts
                    owners += [word_idx] * len(parts)
                return pieces, owners

            src_pieces, src_owner = split_into_subwords(src_words)
            tgt_pieces, tgt_owner = split_into_subwords(tgt_words)

            src_batch = [["[CLS]", *src_pieces, "[SEP]"]]
            tgt_batch = [["[CLS]", *tgt_pieces, "[SEP]"]]

            # Convert the encoder output to numpy and drop [CLS]/[SEP].
            src_vecs = np.array(self.encoder.forward_batch(src_batch).last_hidden_state)[0, 1:-1]
            tgt_vecs = np.array(self.encoder.forward_batch(tgt_batch).last_hidden_state)[0, 1:-1]

            # In-place unit normalisation, then cosine similarity.
            src_vecs /= np.linalg.norm(src_vecs, axis=-1, keepdims=True)
            tgt_vecs /= np.linalg.norm(tgt_vecs, axis=-1, keepdims=True)
            similarity = np.dot(src_vecs, tgt_vecs.T)

            threshold = 1e-3
            forward_best = np.argmax(similarity, axis=1)
            backward_best = np.argmax(similarity, axis=0)

            links = {
                (src_owner[i], tgt_owner[j])
                for i, j in enumerate(forward_best)
                if backward_best[j] == i and similarity[i, j] > threshold
            }
            return sorted(links)

        except Exception as e:
            logger.debug(f"CT2 Alignment failed: {e}")
            return []
769
+
770
class FastAlignAligner:
    """fast_align local aligner - uses temp file for binary.

    Two execution modes are probed at construction time:

    * ``"python"`` - the ``fast_align`` Python package is importable.
    * ``"binary"`` - an executable ``fast_align`` is found in one of the
      ``fast_align/build`` directories next to (or one level above) this file.

    ``available`` stays ``False`` when neither is found, in which case
    ``align`` always returns an empty list.
    """

    def __init__(self):
        self.available = False
        self.mode = None
        self.binary_path = None
        self.atools_path = None

        # Check for Python package first
        try:
            import fast_align  # noqa: F401 - imported only to probe availability
        except ImportError:
            pass
        else:
            self.available = True
            self.mode = "python"
            return

        # Binary search logic
        script_dir = Path(__file__).parent
        search_dirs = [script_dir / "../fast_align/build", script_dir / "./fast_align/build"]

        for d in search_dirs:
            fa = d / "fast_align"
            at = d / "atools"
            if fa.exists() and os.access(fa, os.X_OK):
                self.binary_path = str(fa)
                self.atools_path = str(at) if at.exists() else None
                self.available = True
                self.mode = "binary"
                return

    @staticmethod
    def _parse_pharaoh(output: str) -> List[Tuple[int, int]]:
        """Parse Pharaoh-format links (``"0-0 1-2"``) into integer pairs.

        Whitespace-separated tokens without a ``-`` are ignored. A token that
        does not split into exactly two integers raises ``ValueError``.
        """
        links = []
        for pair in output.split():
            if '-' in pair:
                i, j = map(int, pair.split('-'))
                links.append((i, j))
        return links

    def align(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Align one sentence pair and return ``(src_index, tgt_index)`` links.

        Args:
            src_words: Source-side words; joined with single spaces.
            tgt_words: Target-side words; joined with single spaces.

        Returns:
            The parsed alignment links, or an empty list when the aligner is
            unavailable, an input is empty, the binary exits non-zero, or any
            step raises (the failure is logged at debug level).
        """
        if not self.available or not src_words or not tgt_words:
            return []

        try:
            if self.mode == "python":
                from fast_align import align
                src_text = ' '.join(src_words)
                tgt_text = ' '.join(tgt_words)
                result = align([f"{src_text} ||| {tgt_text}"], forward=True)
                return self._parse_pharaoh(result[0])

            if self.mode == "binary":
                import subprocess
                import tempfile

                input_str = f"{' '.join(src_words)} ||| {' '.join(tgt_words)}\n"

                # Explicit UTF-8: the locale default cannot encode arbitrary
                # translated text on every platform.
                with tempfile.NamedTemporaryFile(
                    mode='w', suffix='.txt', delete=False, encoding='utf-8'
                ) as f:
                    f.write(input_str)
                    temp_path = f.name

                try:
                    # -d: diagonal, -o: optimize tension, -v: variational bayes
                    # -I 1: single pass is enough for alignment of a known translation
                    cmd = [self.binary_path, '-i', temp_path, '-d', '-o', '-v', '-I', '1']
                    result = subprocess.run(
                        cmd, capture_output=True, text=True, encoding='utf-8', timeout=10
                    )

                    if result.returncode != 0:
                        return []

                    # fast_align binary outputs to stdout in Pharaoh format (i-j)
                    return self._parse_pharaoh(result.stdout)
                finally:
                    # delete=False above means cleanup is our job
                    try:
                        os.unlink(temp_path)
                    except FileNotFoundError:
                        pass

            # Unknown mode: keep the declared return type instead of None
            return []
        except Exception as e:
            logger.debug(f"fast_align execution failed: {e}")
            return []
847
+
848
class SimAlignAligner:
    """SimAlign Aligner: Heavy PyTorch BERT (1.2GB RAM)."""

    def __init__(self):
        """Build the SimAlign sentence aligner when the module flag allows it.

        ``available`` mirrors the module-level ``HAS_SIMALIGN`` flag (treated
        as ``False`` when absent) and is reset to ``False`` if construction
        of the underlying aligner fails.
        """
        self.available = globals().get('HAS_SIMALIGN', False)
        if not self.available:
            return

        try:
            from simalign import SentenceAligner
            # matching_methods takes a short string code;
            # "m" is reported under the 'mwmf' key that align() reads back
            self.aligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="m", device="cpu")
            logger.info("βœ“ ALIGN | SimAlign (Standard BERT) ready.")
        except Exception as e:
            logger.error(f"ALIGN | SimAlign init failed: {e}")
            self.available = False

    def align(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Return the ``'mwmf'`` word links as ``(src_index, tgt_index)`` int pairs.

        Yields an empty list when the aligner is unavailable, either word
        list is empty, or the underlying call raises.
        """
        if not (self.available and src_words and tgt_words):
            return []

        try:
            # get_word_aligns returns a dict of lists: {'mwmf': [(0,0), ...]}
            word_aligns = self.aligner.get_word_aligns(src_words, tgt_words)
            links = []
            for pair in word_aligns.get('mwmf', []):
                links.append((int(pair[0]), int(pair[1])))
            return links
        except Exception as e:
            logger.error(f"ALIGN | SimAlign failed: {e}")
            return []
873
+
874
class HeuristicAligner:
    """Heuristic fallback - align words that appear in both"""

    def __init__(self):
        logger.info("βœ“ Heuristic aligner (fallback)")

    def align(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Link every source/target pair whose normalised forms are identical.

        Words are lower-cased and stripped of leading/trailing ``.,!?;:``.
        Only forms longer than two characters are linked. Pairs come back in
        source order, then target order.
        """
        def normalise(words):
            return [word.lower().strip('.,!?;:') for word in words]

        src_forms = normalise(src_words)
        tgt_forms = normalise(tgt_words)

        links = []
        for src_idx, src_form in enumerate(src_forms):
            # Short forms can never match, so skip the inner scan entirely
            if len(src_form) <= 2:
                continue
            for tgt_idx, tgt_form in enumerate(tgt_forms):
                if tgt_form == src_form:
                    links.append((src_idx, tgt_idx))
        return links
893
+
894
+
895
class MultiAligner:
    """Try multiple aligners with proper fallback.

    The chain is built once at construction time. A specific ``preferred``
    backend is used alone when it is available; otherwise (or in ``"auto"``
    mode) every available backend is chained in priority order. The
    heuristic aligner is always the last link.
    """

    def __init__(self, src_lang: str, tgt_lang: str, preferred: Optional[str] = None):
        """Assemble the aligner chain.

        Args:
            src_lang: Source language code, forwarded to the Lindat aligner.
            tgt_lang: Target language code, forwarded to the Lindat aligner.
            preferred: One of ``"lindat"``, ``"awesome"``, ``"fast_align"``,
                ``"simalign"``, ``"heuristic"``, ``"auto"`` or ``None``.
                Unrecognised values behave like ``"auto"``.
        """
        self.aligners = []

        # Auto-mode priority order: (selector key, display name, factory).
        # fast_align comes last (original note: works right only with snapshot).
        backends = [
            ("lindat", "Lindat", lambda: LindatAligner(src_lang, tgt_lang)),
            ("awesome", "awesome-align", lambda: AwesomeAlignAligner()),
            ("simalign", "SimAlign", lambda: SimAlignAligner()),
            ("fast_align", "fast_align", lambda: FastAlignAligner()),
        ]
        display_names = {key: name for key, name, _ in backends}
        factories = {key: factory for key, _, factory in backends}
        instances = {}

        def usable(key):
            """Build backend `key` at most once; return (name, aligner) or None."""
            # Caching avoids constructing an unavailable preferred backend a
            # second time when we fall through to the auto chain below.
            if key not in instances:
                instances[key] = factories[key]()
            candidate = instances[key]
            return (display_names[key], candidate) if candidate.available else None

        # If a specific aligner was requested
        if preferred and preferred != "auto":
            if preferred == "heuristic":
                self.aligners.append(("Heuristic", HeuristicAligner()))
            elif preferred in factories:
                entry = usable(preferred)
                if entry:
                    self.aligners.append(entry)
                elif preferred == "fast_align":
                    # Decided by the instance's own probe (package or
                    # binary), consistent with the auto chain.
                    logger.warning(f"fast_align requested but binary/package not found.")
                else:
                    logger.warning(f"Requested aligner '{preferred}' not available, will try others")

        # Auto mode or fallback: try all available aligners
        if not self.aligners or preferred == "auto":
            for key, _, _ in backends:
                entry = usable(key)
                if entry:
                    self.aligners.append(entry)

        # Always add heuristic as final fallback
        if not any(name == "Heuristic" for name, _ in self.aligners):
            self.aligners.append(("Heuristic", HeuristicAligner()))

        logger.info(f"Aligner chain: {[name for name, _ in self.aligners]}")

    def align(self, src_words: List[str], tgt_words: List[str]) -> List[Tuple[int, int]]:
        """Try aligners in order until one succeeds.

        Returns the first non-empty result from the chain, or an empty list
        when every aligner (including the heuristic one) produced nothing.
        """
        for name, aligner in self.aligners:
            result = aligner.align(src_words, tgt_words)
            if result:
                logger.debug(f"βœ“ {name} alignment: {len(result)} links")
                return result

        logger.debug("Using heuristic fallback (no quality alignments found)")
        return []
969
+
970
+
971
+ # ============================================================================
972
+ # DOCUMENT TRANSLATOR
973
+ # ============================================================================
974
+
975
+ class UltimateDocumentTranslator:
976
+ """Document translator with configurable backends and M1 optimization."""
977
+
978
def __init__(
    self,
    src_lang: str,
    tgt_lang: str,
    mode: TranslationMode = TranslationMode.HYBRID,
    nmt_backend: Optional[str] = "nllb",
    llm_provider: Optional[str] = None,
    aligner: Optional[str] = None,
    nllb_model_size: str = "600M"
):
    """Load the translation, LLM and alignment backends the mode calls for.

    Args:
        src_lang: Source language code.
        tgt_lang: Target language code.
        mode: Strategy deciding which backend families are initialised.
        nmt_backend: ``"nllb"``, ``"opus"``, ``"madlad"``, ``"ct2"``,
            ``"auto"`` or ``None`` (same as ``"auto"``).
        llm_provider: Provider name forwarded to the LLM translator.
        aligner: Aligner name; ``None``/``"auto"`` selects ``"awesome"``.
        nllb_model_size: NLLB variant, used by the NLLB backend only.

    Raises:
        RuntimeError: When no NMT engine and no LLM provider is usable.
    """
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.mode = mode

    # Backend slots: ct2 = WMT translator, nllb = NLLB translator
    self.ct2 = self.nllb = self.llm = None
    self.aligner = self.opus = self.madlad = None

    logger.info(f"INIT | Starting Translator ({src_lang}β†’{tgt_lang})")
    logger.info(f"INIT | Mode: {mode.value} | NMT: {nmt_backend} | Aligner: {aligner or 'auto'}")
    self.log_memory("Initialization Start")

    wants_nmt = mode in (TranslationMode.NMT_ONLY, TranslationMode.HYBRID)
    wants_llm = mode in (
        TranslationMode.LLM_WITH_ALIGN,
        TranslationMode.LLM_WITHOUT_ALIGN,
        TranslationMode.HYBRID,
    )
    wants_aligner = mode in (
        TranslationMode.LLM_WITH_ALIGN,
        TranslationMode.HYBRID,
        TranslationMode.NMT_ONLY,
    )

    # 1. NMT BACKEND SELECTION (Exclusive)
    if wants_nmt:
        if nmt_backend is None or nmt_backend == "auto":
            # Priority: NLLB (Lightweight) -> CT2 (Dense)
            self.nllb = NLLBTranslator(src_lang, tgt_lang, nllb_model_size)
            if not (self.nllb and self.nllb.available):
                logger.info("INIT | NLLB unavailable, trying CTranslate2/WMT...")
                self.ct2 = CTranslate2Translator(src_lang, tgt_lang)
        elif nmt_backend == "nllb":
            self.nllb = NLLBTranslator(src_lang, tgt_lang, nllb_model_size)
            if not self.nllb.available:
                logger.error("INIT | CRITICAL: NLLB specifically requested but failed to load.")
        elif nmt_backend == "ct2":
            self.ct2 = CTranslate2Translator(src_lang, tgt_lang)
            if not self.ct2.available:
                logger.error("INIT | CRITICAL: CTranslate2/WMT specifically requested but failed to load.")
        elif nmt_backend == "opus":
            self.opus = OpusMTTranslator(src_lang, tgt_lang)
        elif nmt_backend == "madlad":
            self.madlad = Madlad400Translator(src_lang, tgt_lang, "3b")

    # 2. LLM BACKEND (Hybrid or LLM modes)
    if wants_llm:
        self.llm = LLMTranslator(src_lang, tgt_lang, llm_provider)
        if not self.llm.providers:
            logger.warning("INIT | Mode requires LLM but no providers (OpenAI/Anthropic/Ollama) available.")

    # 3. ALIGNER SELECTION (Priority: Awesome-Align)
    if wants_aligner:
        # 'awesome' stands in whenever no explicit aligner was requested
        explicit_choice = bool(aligner) and aligner != "auto"
        aligner_choice = aligner if explicit_choice else "awesome"
        self.aligner = MultiAligner(src_lang, tgt_lang, aligner_choice)

    self.log_memory("Initialization Complete")

    # FINAL VERIFICATION: Ensure at least one engine is ready
    readiness = [
        self.nllb and self.nllb.available,
        self.ct2 and self.ct2.available,
        self.opus and self.opus.available,
        self.madlad and self.madlad.available,
        self.llm and self.llm.providers
    ]
    if any(readiness):
        return

    logger.error("INIT | CRITICAL FAILURE: No translation engines were able to load.")
    raise RuntimeError("No translation backends available. Check your model paths and API keys.")
1051
+
1052
def log_memory(self, stage: str):
    """Log current RAM usage (requires psutil).

    Emits the resident set size of this process in MB at debug level,
    tagged with `stage`. Does nothing when psutil is not installed; any
    other failure is reported at debug level instead of being raised.
    """
    bytes_per_mb = 1024 * 1024
    try:
        import psutil
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        mem_mb = rss_bytes / bytes_per_mb
        logger.debug(f"MEM | Stage: {stage} | Usage: {mem_mb:.2f} MB")
    except ImportError:
        # psutil is optional; memory tracing is simply skipped without it
        pass
    except Exception as e:
        logger.debug(f"Memory log failed: {e}")
1063
+
1064
async def translate_text(self, text: str) -> str:
    """Routes text through the active neural engine chain.

    Local engines are consulted in fixed order (NLLB, WMT21 'ct2', OPUS-MT,
    MADLAD-400). The first one whose output is non-empty and differs from
    the input (ignoring surrounding whitespace) wins. If none does, the LLM
    is tried when providers exist. The input is returned unchanged when it
    is blank or when nothing produced a translation.
    """
    if not text.strip():
        return text

    stripped_input = text.strip()

    # 1-4. Local CT2-backed engines, in priority order
    for slot in ("nllb", "ct2", "opus", "madlad"):
        engine = getattr(self, slot)
        if not (engine and engine.available):
            continue
        candidate = engine.translate_batch([text])[0]
        if candidate and candidate.strip() != stripped_input:
            return candidate

    # 5. Try LLM (Hybrid/LLM modes)
    if self.llm and self.llm.providers:
        # use_alignment depends on mode
        with_alignment = self.mode != TranslationMode.LLM_WITHOUT_ALIGN
        llm_output = await self.llm.translate_text(text, use_alignment=with_alignment)
        if llm_output:
            return llm_output

    return text
1096
+
1097
def extract_paragraph(self, para: Paragraph) -> TranslatableParagraph:
    """Extracts paragraph with resolved font hierarchy to prevent Theme reversion.

    Each run is captured as a ``FormatRun`` with its text, bold/italic/
    underline flags, a concrete font name, a point size and an optional RGB
    colour tuple. The paragraph's style, alignment and layout metrics are
    stored in the result's ``metadata`` for later reconstruction.

    Args:
        para: The paragraph to snapshot.

    Returns:
        A ``TranslatableParagraph`` holding the runs plus ``'style'``,
        ``'alignment'`` and ``'layout'`` metadata entries.
    """
    # 1. Resolve the "Base Font" for this paragraph
    def get_resolved_font_name(p):
        # Check runs first
        for r in p.runs:
            if r.font.name:
                return r.font.name
        # Check style hierarchy
        curr_style = p.style
        while curr_style:
            if curr_style.font.name:
                return curr_style.font.name
            curr_style = curr_style.base_style
        return "Times New Roman"  # Fallback if everything is 'None'

    def get_run_color(run):
        """Return the run's explicit colour as an (r, g, b) tuple, or None."""
        try:
            color = run.font.color
            rgb = color.rgb if color else None
            if rgb:
                return (rgb[0], rgb[1], rgb[2])
        except Exception as e:
            # Colour is best-effort, but never swallow KeyboardInterrupt /
            # SystemExit the way a bare `except:` would.
            logger.debug(f"TRACE | Run colour unavailable: {e}")
        return None

    resolved_base_font = get_resolved_font_name(para)

    # Size fallback: paragraph style size, else 12pt
    style_size = para.style.font.size if para.style is not None else None
    fallback_size = style_size.pt if style_size else 12.0

    runs = []
    for run in para.runs:
        run_size = run.font.size
        runs.append(FormatRun(
            text=run.text,
            bold=run.bold,
            italic=run.italic,
            underline=run.underline,
            # If run has no font name, use the resolved base font we found
            font_name=run.font.name if run.font.name else resolved_base_font,
            font_size=run_size.pt if run_size else fallback_size,
            font_color=get_run_color(run)
        ))

    trans_para = TranslatableParagraph(runs=runs)
    trans_para.metadata['style'] = para.style
    trans_para.metadata['alignment'] = para.alignment

    pf = para.paragraph_format
    trans_para.metadata['layout'] = {
        'left_indent': pf.left_indent,
        'right_indent': pf.right_indent,
        'first_line_indent': pf.first_line_indent,
        'line_spacing': pf.line_spacing,
        'space_before': pf.space_before,
        'space_after': pf.space_after
    }
    return trans_para
1146
+
1147
def log_memory(self, stage: str):
    """Log current RAM usage (requires psutil).

    Emits the resident set size of this process in MB at debug level,
    tagged with `stage`. Does nothing when psutil is not installed.

    NOTE: this is the second `log_memory` definition in the class, so it is
    the one that takes effect. It therefore carries the same generic
    exception guard as the earlier definition: a psutil runtime error must
    not abort a translation just because diagnostics failed.
    """
    try:
        import psutil
        process = psutil.Process(os.getpid())
        mem_mb = process.memory_info().rss / (1024 * 1024)
        logger.debug(f"MEM | Stage: {stage} | Usage: {mem_mb:.2f} MB")
    except ImportError:
        # psutil is optional; memory tracing is simply skipped without it
        pass
    except Exception as e:
        logger.debug(f"Memory log failed: {e}")
1156
+
1157
def copy_font_properties(self, target_run, source_run: FormatRun):
    """Forces Font Name, Size, and Color into XML. Does NOT touch bold/italic.

    Copies size, colour and underline from `source_run` onto `target_run`
    when they are set, then writes the font name into every ``w:rFonts``
    slot so the theme default cannot override it. Failures are logged at
    debug level and otherwise ignored.
    """
    try:
        from docx.shared import Pt, RGBColor
        from docx.oxml.ns import qn

        size_pt = source_run.font_size
        if size_pt:
            target_run.font.size = Pt(size_pt)

        rgb = source_run.font_color
        if rgb:
            target_run.font.color.rgb = RGBColor(*rgb)

        if source_run.underline is not None:
            target_run.font.underline = source_run.underline

        face = source_run.font_name
        if face:
            # The XML 'rFonts' injection to bypass Theme defaults
            run_props = target_run._element.get_or_add_rPr()
            font_slots = run_props.get_or_add_rFonts()
            for slot in ('w:ascii', 'w:hAnsi', 'w:eastAsia', 'w:cs'):
                font_slots.set(qn(slot), face)
            target_run.font.name = face
    except Exception as e:
        logger.debug(f"TRACE | Font Force Error: {e}")
1181
+
1182
def apply_aligned_formatting(self, para: Paragraph, trans_para: TranslatableParagraph, translated_text: str, alignment: List[Tuple[int, int]]):
    """
    Reconstructs the Word paragraph by mapping source formatting onto translated text
    using neural word alignments. Preserves layout, font themes, and footnote anchors.

    Args:
        para: Paragraph to rewrite in place.
        trans_para: Snapshot of the original paragraph (runs + metadata).
        translated_text: Full translated text for the paragraph.
        alignment: ``(src_word_index, tgt_word_index)`` links. Target
            indices refer to the ``\\w+`` tokens of `translated_text`, which
            is how the caller in this class builds the target word list.
    """
    # 1. INITIAL TRACE & ANCHOR EXTRACTION
    self.log_para_trace(para, "INPUT")
    p_element = para._p

    # Extract all footnote reference elements (w:footnoteReference) or
    # auto-numbered markers (w:footnoteRef) before we clear the paragraph.
    footnote_refs = p_element.xpath('.//w:r[w:footnoteReference or w:footnoteRef]')
    if footnote_refs:
        logger.debug(f"TRACE | Found {len(footnote_refs)} footnote anchors to re-attach.")

    # 2. RESTORE PARAGRAPH-LEVEL METADATA (Layout & Style)
    if trans_para.metadata.get('style'):
        para.style = trans_para.metadata['style']
    para.alignment = trans_para.metadata.get('alignment')

    layout = trans_para.metadata.get('layout', {})
    pf = para.paragraph_format
    try:
        # We explicitly set these to bypass Word's 'Normal' style defaults
        for key in ('left_indent', 'right_indent', 'first_line_indent',
                    'line_spacing', 'space_before', 'space_after'):
            value = layout.get(key)
            if value is not None:
                setattr(pf, key, value)
        logger.debug(f"TRACE | Layout metrics restored: Indent={pf.left_indent}, Spacing={pf.line_spacing}")
    except Exception as e:
        logger.error(f"TRACE | Layout Restore Error: {e}")

    # 3. CLEAR EXISTING CONTENT
    # We must remove runs one-by-one to keep the paragraph container intact
    for run in list(para.runs):
        p_element.remove(run._element)

    # 4. HANDLE FOOTNOTE-SPECIFIC STARTUP
    # For paragraphs that ARE footnote text, the marker must come first.
    is_footnote_content_para = "footnote" in str(para.style.name).lower()
    if is_footnote_content_para and footnote_refs:
        for ref in footnote_refs:
            p_element.append(ref)
        para.add_run("\u00A0")  # Add a non-breaking space after the number
        logger.debug("TRACE | Footnote number marker re-attached to start.")

    # 5. MAPPING LOGIC PREPARATION
    tgt_raw_units = translated_text.split()
    formatted_indices = trans_para.get_formatted_word_indices()

    # Map clean aligner indices to raw whitespace-split units. A single unit
    # can hold several \w+ tokens ("well-known", "don't"); each one consumes
    # its own aligner index. Counting one word per unit would shift every
    # later link onto the wrong unit.
    clean_to_raw_tgt = {}
    clean_idx = 0
    for raw_idx, unit in enumerate(tgt_raw_units):
        for _ in re.findall(r'\w+', unit):
            clean_to_raw_tgt[clean_idx] = raw_idx
            clean_idx += 1

    # Invert once: raw unit index -> aligned source word indices (in
    # alignment order), instead of rescanning `alignment` for every unit.
    src_indices_by_raw = {}
    for s_idx, t_idx in alignment:
        raw_idx = clean_to_raw_tgt.get(t_idx)
        if raw_idx is not None:
            src_indices_by_raw.setdefault(raw_idx, []).append(s_idx)

    # Use the first source run as the baseline "Aesthetic" (Font Name/Size/Color)
    font_template = trans_para.runs[0] if trans_para.runs else None

    # 6. RECONSTRUCT RUNS WITH INLINE STYLES
    logger.debug(f"TRACE | Reconstructing {len(tgt_raw_units)} target units...")

    last_idx = len(tgt_raw_units) - 1
    for i, unit in enumerate(tgt_raw_units):
        # Single space between units, none after the last
        run_text = unit + (" " if i < last_idx else "")
        run = para.add_run(run_text)

        # Determine Bold/Italic for this specific unit.
        # Style priority: Bold+Italic > Bold > Italic
        matched_src_indices = src_indices_by_raw.get(i, ())
        if any(s_idx in formatted_indices['italic_bold'] for s_idx in matched_src_indices):
            run.bold = run.italic = True
        elif any(s_idx in formatted_indices['bold'] for s_idx in matched_src_indices):
            run.bold = True
        elif any(s_idx in formatted_indices['italic'] for s_idx in matched_src_indices):
            run.italic = True

        # Apply Baseline Aesthetics (The "Look" of the document)
        if font_template:
            self.copy_font_properties(run, font_template)

    # 7. RE-ANCHOR BODY FOOTNOTES
    # For main text, footnote citations usually go at the end of the translated block
    if not is_footnote_content_para and footnote_refs:
        logger.debug(f"TRACE | Body Footnote | Re-anchoring {len(footnote_refs)} refs to end of paragraph.")
        for ref in footnote_refs:
            p_element.append(ref)

    self.log_para_trace(para, "OUTPUT")
1289
+
1290
def is_paragraph_safe_to_translate(self, para: Paragraph) -> bool:
    """Check if paragraph can be safely translated.

    A paragraph is rejected when it is blank, when it is a single character
    with no run carrying text, when any run contains a drawing/picture, or
    when it contains a field character without also containing a footnote
    reference. The XML probes are best-effort: if one raises, that probe is
    skipped and the paragraph is not rejected on its account.
    """
    # para.text is read once and reused below
    text = para.text
    if not text or not text.strip():
        return False

    if len(text.strip()) <= 1 and not any(run.text.strip() for run in para.runs):
        return False

    try:
        for run in para.runs:
            if run._element.xpath('.//w:drawing | .//w:pict'):
                logger.debug(f"Skipping paragraph with drawing/image")
                return False
    except Exception as e:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit propagate
        logger.debug(f"Drawing probe failed: {e}")

    try:
        field_chars = para._element.xpath('.//w:fldChar')
        if field_chars:
            is_footnote_field = para._element.xpath('.//w:footnoteReference')
            if not is_footnote_field:
                logger.debug(f"Skipping paragraph with field")
                return False
    except Exception as e:
        logger.debug(f"Field probe failed: {e}")

    return True
1321
+
1322
async def translate_paragraph(self, para: Paragraph):
    """Paragraph translation with multi-backend routing and alignment.

    Blank or unsafe paragraphs are left untouched. Otherwise the paragraph
    text is split into sentences, each sentence goes through
    ``translate_text``, and the joined result is written back: as plain
    text when either side has no words, or via ``apply_aligned_formatting``
    with whatever links the aligner produced. Any exception is logged and
    swallowed so one paragraph cannot abort the document.
    """
    has_text = bool(para.text) and bool(para.text.strip())
    if not (has_text and self.is_paragraph_safe_to_translate(para)):
        return

    try:
        self.log_memory("Pre-Paragraph")
        trans_para = self.extract_paragraph(para)
        original_text = trans_para.get_text()

        # 1. Split into sentences for higher NMT quality
        pieces = re.split(r'(?<=[.!?])\s+', original_text)
        sentences = [piece.strip() for piece in pieces if piece.strip()]
        if not sentences:
            return

        # 2. Use the central router for each sentence, one at a time
        translated_text = " ".join([await self.translate_text(sentence) for sentence in sentences])

        # 3. Preparation for Aligner
        src_words = trans_para.get_words()
        tgt_words_clean = re.findall(r"\w+", translated_text)

        # Log the translation for the CLI trace
        logger.info("-" * 30)
        logger.info(f"TRANS | Out: {translated_text[:60]}...")

        if not (src_words and tgt_words_clean):
            # Nothing to align: write the translation as plain text
            para.text = translated_text
            return

        # 4. Alignment Pass
        alignment = self.aligner.align(src_words, tgt_words_clean) if self.aligner else []
        logger.info(f"ALIGN | Matches: {len(alignment)}")

        # 5. Reconstruction
        self.apply_aligned_formatting(para, trans_para, translated_text, alignment)
        self.log_memory("Post-Paragraph")

    except Exception as e:
        logger.error(f"PARA | Translation Failed: {e}", exc_info=True)
1368
+
1369
def get_footnotes(self, doc: Document) -> List[Paragraph]:
    """Extract footnotes. We pass doc as parent to avoid Part attribute error.

    Locates the footnotes part through the document relationships, parses
    its XML and wraps every non-blank ``w:p`` of every footnote with a
    positive id in a ``Paragraph``. On success the parsed root and the part
    are remembered on ``self`` (``_footnote_root`` / ``_footnote_part``).

    Returns:
        The footnote paragraphs, or an empty list when the document has no
        footnotes part or extraction fails (a warning is logged).
    """
    try:
        footnote_part = next(
            (
                rel.target_part
                for rel in doc.part.rels.values()
                if "relationships/footnotes" in rel.reltype
            ),
            None,
        )
        if not footnote_part:
            return []

        from docx.oxml import parse_xml
        ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
        id_attr = f'{{{ns["w"]}}}id'
        root = parse_xml(footnote_part.blob)

        paragraphs = []
        for footnote in root.xpath('//w:footnote', namespaces=ns):
            f_id = footnote.get(id_attr)
            # Skip internal Word markers (id 0 and -1)
            if f_id and int(f_id) <= 0:
                continue

            # 'doc' is the parent so that para.part is valid
            candidates = (Paragraph(p_elem, doc) for p_elem in footnote.xpath('.//w:p', namespaces=ns))
            paragraphs.extend(p for p in candidates if p.text.strip())

        self._footnote_root = root
        self._footnote_part = footnote_part
        return paragraphs

    except Exception as e:
        logger.warning(f"Could not extract footnotes: {e}")
        return []
1407
+
1408
def get_all_paragraphs(self, doc: Document) -> List[Tuple[Paragraph, str]]:
    """Aggregate all paragraphs from document.

    Returns ``(paragraph, location)`` pairs in this order: body paragraphs
    (``"body"``), table cell paragraphs (``"table"``), footnotes
    (``"footnote"``), then for each section its header (``"header"``) and
    footer (``"footer"``) paragraphs.
    """
    # Main body
    collected = [(p, "body") for p in doc.paragraphs]

    # Tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                collected.extend((p, "table") for p in cell.paragraphs)

    # Footnotes - a failure here must not lose the rest of the document
    try:
        footnote_paras = self.get_footnotes(doc)
        collected.extend((p, "footnote") for p in footnote_paras)
    except Exception as e:
        logger.warning(f"Skipping footnotes due to error: {e}")

    # Headers/Footers
    for section in doc.sections:
        collected.extend((p, "header") for p in section.header.paragraphs)
        collected.extend((p, "footer") for p in section.footer.paragraphs)

    return collected
1439
+
1440
def log_document_info(self, doc: Document, label: str):
    """Logs global document properties and style inventory.

    For every section: page size, margins, gutter and header distance in
    points. Then the names of all styles whose ``type`` equals 1. Everything
    is emitted at debug level under a banner carrying `label`.
    """
    banner = '=' * 20
    logger.debug(f"{banner} DOCUMENT INFO [{label}] {banner}")

    for idx, sec in enumerate(doc.sections):
        logger.debug(f"Section {idx} | Size: {sec.page_width.pt:.1f}x{sec.page_height.pt:.1f}pt")
        logger.debug(f"Section {idx} | Margins: L:{sec.left_margin.pt:.1f} R:{sec.right_margin.pt:.1f} T:{sec.top_margin.pt:.1f} B:{sec.bottom_margin.pt:.1f}")
        logger.debug(f"Section {idx} | Gutter: {sec.gutter.pt:.1f}pt | Header-Dist: {sec.header_distance.pt:.1f}pt")

    # Log all paragraph styles present in the document
    style_names = []
    for style in doc.styles:
        if style.type == 1:
            style_names.append(style.name)
    logger.debug(f"Style Inventory ({len(style_names)}): {', '.join(style_names)}")
1451
+
1452
def log_para_trace(self, para: Paragraph, label: str):
    """Detailed debug log showing style, font, and layout metrics.

    Reports the paragraph style name, the first run's font name and size
    (``Inherited`` / ``Default`` when unset or when there are no runs),
    whether any run is bold or italic, and the left indent and space-after
    in points (0 when unset).
    """
    fmt = para.paragraph_format
    style = para.style

    # Inline formatting present on any run
    has_bold = any(r.bold for r in para.runs)
    has_ital = any(r.italic for r in para.runs)

    # Font details come from the first run only
    runs = para.runs
    first_font = runs[0].font if runs else None
    if first_font is not None and first_font.name:
        f_name = first_font.name
    else:
        f_name = "Inherited"
    if first_font is not None and first_font.size:
        size_value = first_font.size.pt
    else:
        size_value = 'Default'
    f_size = f"{size_value}pt"

    left_indent = fmt.left_indent
    space_after = fmt.space_after
    indent_pt = left_indent.pt if left_indent else 0
    after_pt = space_after.pt if space_after else 0

    logger.debug(f"PARA [{label}] | Style: '{style.name}'")
    logger.debug(f" > Font: {f_name} @ {f_size} | Bold-Any: {has_bold} | Ital-Any: {has_ital}")
    logger.debug(f" > Indent-L: {indent_pt:.1f}pt | Spacing-A: {after_pt:.1f}pt")
1467
+
1468
async def translate_document(self, input_path: Path, output_path: Path):
    """Full document lifecycle with robust XML commitment and verification logs.

    Opens `input_path`, translates every paragraph that is non-blank and
    safe to translate (body, tables, footnotes, headers, footers), writes
    the modified footnote XML back into its part, and saves the result to
    `output_path`.

    Args:
        input_path: Path of the .docx file to read.
        output_path: Path the translated .docx is written to.
    """
    self.log_memory("Initialization")
    doc = Document(str(input_path))

    # Forget footnote state from a previously translated document. Without
    # this reset, a document with no footnotes part would still trigger the
    # commitment step below with a stale root/part pair.
    self._footnote_root = None
    self._footnote_part = None

    if logger.isEnabledFor(logging.DEBUG):
        self.log_document_info(doc, "INPUT")

    # get_all_paragraphs() calls get_footnotes(), which (re)populates
    # _footnote_root / _footnote_part for this document. The returned
    # footnote paragraphs belong to exactly that root, so no separate
    # priming call is needed.
    all_paras = self.get_all_paragraphs(doc)
    translatable = [(p, l) for p, l in all_paras if p.text.strip() and self.is_paragraph_safe_to_translate(p)]

    logger.info(f"Processing {len(translatable)} paragraphs across {len(all_paras)} total entities.")

    for para, location in tqdm(translatable, desc="Translating"):
        try:
            await self.translate_paragraph(para)
        except Exception as e:
            logger.error(f"Error in {location} translation: {e}")

    # FOOTNOTE XML COMMITMENT
    # Footnote paragraphs were edited on a separately parsed tree, so the
    # tree has to be serialized back into the part before saving.
    if self._footnote_part is not None and self._footnote_root is not None:
        try:
            from lxml import etree
            # Standard etree tostring ensures namespace 'w' is preserved correctly
            updated_xml = etree.tostring(self._footnote_root, encoding='utf-8', xml_declaration=True)
            self._footnote_part._blob = updated_xml
            logger.info("βœ“ Success | Footnote XML blob successfully serialized.")
        except Exception as e:
            logger.error(f"Error | Footnote commitment failed: {e}")

    logger.info(f"Saving file to {output_path}")
    doc.save(str(output_path))

    if logger.isEnabledFor(logging.DEBUG):
        # Re-open the saved file so the log reflects what was actually written
        self.log_document_info(Document(str(output_path)), "OUTPUT")
    logger.info("βœ“ Document Translation Complete.")
1506
+
1507
+
1508
+ # ============================================================================
1509
+ # CLI
1510
+ # ============================================================================
1511
+
1512
def _build_arg_parser():
    """Build the argparse parser describing the translator's command line."""
    parser = argparse.ArgumentParser(
        description='🚀 Ultimate Document Translator - Multi-Backend Production Version',
        # Raw formatter keeps the hand-formatted epilog layout intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Backends Comparison:
  nllb      - Distilled 600M (Default). Best speed/RAM ratio. Great general support.
  madlad    - Google's 3B model. Superior academic/formal quality (~3GB RAM).
  opus      - Specialized bilingual models. Tiny (~200MB), extremely fast, literal.
  ct2 (wmt) - Dense Facebook models. Peak German/European quality (~6GB RAM).

Examples:
  # Standard use (NLLB-600M)
  %(prog)s input.docx output.docx -s en -t de

  # High-quality academic translation (Madlad-400)
  %(prog)s input.docx output.docx -s en -t de --nmt madlad

  # Maximum speed for EN-DE specialized pair (Opus-MT)
  %(prog)s input.docx output.docx -s en -t de --nmt opus

  # Use LLM (Claude) with local neural alignment
  %(prog)s input.docx output.docx -s en -t es --mode llm-align --llm anthropic

  # Larger NLLB model for rare languages
  %(prog)s input.docx output.docx -s en -t ja --nmt nllb --nllb-size 1.3B

Environment Variables:
  OPENAI_API_KEY, ANTHROPIC_API_KEY - Required for LLM backends.
"""
    )

    # 1. POSITIONAL ARGUMENTS
    parser.add_argument('input', help='Input .docx file path')
    parser.add_argument('output', help='Output .docx file path')

    # 2. LANGUAGE ARGUMENTS
    parser.add_argument('-s', '--source', default='en', help='Source language code (default: en)')
    parser.add_argument('-t', '--target', default='de', help='Target language code (default: de)')

    # 3. TRANSLATION MODE
    parser.add_argument(
        '--mode',
        choices=['nmt', 'llm-align', 'llm-plain', 'hybrid'],
        default='hybrid',
        help='Translation strategy (default: hybrid)'
    )

    # 4. NMT ENGINE SELECTION
    parser.add_argument(
        '--nmt',
        choices=['nllb', 'madlad', 'opus', 'ct2', 'auto'],
        default='nllb',
        help='Local NMT Engine: NLLB (general), Madlad (academic), Opus (specialized), CT2 (dense)'
    )

    parser.add_argument(
        '--nllb-size',
        choices=['600M', '1.3B', '3.3B'],
        default='600M',
        help='NLLB variant only: 600M (fastest), 1.3B (balanced), 3.3B (heavy)'
    )

    # 5. LLM PROVIDER
    parser.add_argument(
        '--llm',
        choices=['openai', 'anthropic', 'ollama'],
        help='LLM provider for hybrid/llm modes'
    )

    # 6. ALIGNER SELECTION
    parser.add_argument(
        '--aligner',
        choices=['awesome', 'simalign', 'lindat', 'fast_align', 'heuristic', 'auto'],
        default='auto',
        help='Word Aligner: awesome (M1 optimized), simalign (heavy), lindat (API)'
    )

    parser.add_argument('-v', '--verbose', action='store_true', help='Enable detailed TRACE logging')

    return parser


def _print_run_banner(args, input_path):
    """Print the status header summarising the options chosen for this run."""
    rule = '=' * 60
    # The model size is only shown for the NLLB engine, the one it applies to.
    size_note = '(' + args.nllb_size + ')' if args.nmt == 'nllb' else ''

    print(f"\n{rule}")
    print("🌍 DOCUMENT TRANSLATOR - PRODUCTION v12")
    print(rule)
    print(f"Input: {input_path.name}")
    print(f"Output: {args.output}")
    print(f"Direction: {args.source.upper()} → {args.target.upper()}")
    print(f"Mode: {args.mode.upper()}")
    print(f"NMT Engine: {args.nmt.upper()} {size_note}")
    print(f"Aligner: {args.aligner.upper()}")
    if args.llm:
        print(f"LLM: {args.llm.upper()}")
    print(f"{rule}\n")


async def main():
    """Command-line entry point: parse arguments and translate one document.

    Reads the options from ``sys.argv``, validates the input path, prepares
    the output location, prints a status banner, then builds an
    ``UltimateDocumentTranslator`` and awaits ``translate_document``.

    Exits the process with status 1 when the input path is missing or is not
    a regular file, when the output directory cannot be created, or when
    translator construction or translation raises.
    """
    args = _build_arg_parser().parse_args()

    # Set global logging level based on verbose flag
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Path validation: the input must exist and be a file, not a directory.
    input_path = Path(args.input)
    if not input_path.exists():
        logger.error(f"File not found: {input_path}")
        sys.exit(1)
    if not input_path.is_file():
        logger.error(f"Input path is not a file: {input_path}")
        sys.exit(1)

    # Create the destination directory up front so a missing folder is
    # reported now instead of when the document is saved after translation.
    output_path = Path(args.output)
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.error(f"Cannot create output directory {output_path.parent}: {e}")
        sys.exit(1)

    # Mapping CLI strings to Enums
    mode_map = {
        'nmt': TranslationMode.NMT_ONLY,
        'llm-align': TranslationMode.LLM_WITH_ALIGN,
        'llm-plain': TranslationMode.LLM_WITHOUT_ALIGN,
        'hybrid': TranslationMode.HYBRID
    }

    _print_run_banner(args, input_path)

    # Execute lifecycle. Construction sits inside the try block so that
    # initialisation errors take the same logged failure path as
    # translation errors.
    try:
        translator = UltimateDocumentTranslator(
            src_lang=args.source,
            tgt_lang=args.target,
            mode=mode_map[args.mode],
            nmt_backend=args.nmt,
            llm_provider=args.llm,
            aligner=args.aligner,
            nllb_model_size=args.nllb_size
        )
        await translator.translate_document(input_path, output_path)

        print(f"\n{'='*60}")
        print(f"✅ Success! Document processed in {args.mode} mode.")
        print(f"💾 File saved to: {args.output}")
        print(f"{'='*60}\n")
    except Exception as e:
        # Top-level boundary: log, include the traceback only when verbose.
        logger.error(f"FAILED | Document translation aborted: {e}", exc_info=args.verbose)
        sys.exit(1)
1648
+
1649
+
1650
if __name__ == "__main__":
    # Run the CLI only when executed as a script, not when imported.
    cli_coroutine = main()
    asyncio.run(cli_coroutine)