fokan committed on
Commit
3e2ca56
·
1 Parent(s): c52255c

first push

Browse files
Files changed (4) hide show
  1. app/main.py +19 -1
  2. debug_translation.py +69 -0
  3. requirements.txt +2 -1
  4. translator.py +216 -36
app/main.py CHANGED
@@ -19,7 +19,14 @@ import logging
19
  from translator import DocumentTranslator, TranslationReport
20
 
21
  # Configure logging
22
- logging.basicConfig(level=logging.INFO)
 
 
 
 
 
 
 
23
  logger = logging.getLogger(__name__)
24
 
25
  app = FastAPI(title="Document Translator", description="Translate PDF and DOCX documents using OpenRouter")
@@ -91,6 +98,7 @@ async def translate_document(
91
 
92
  try:
93
  # Perform translation
 
94
  result = await translator.translate_document(
95
  input_file=input_file,
96
  model=model,
@@ -99,6 +107,16 @@ async def translate_document(
99
  output_dir=temp_path
100
  )
101
 
 
 
 
 
 
 
 
 
 
 
102
  # Move files to uploads directory for serving
103
  timestamp = int(asyncio.get_event_loop().time())
104
  result_dir = UPLOAD_DIR / f"translation_{timestamp}"
 
19
  from translator import DocumentTranslator, TranslationReport
20
 
21
  # Configure logging
22
+ logging.basicConfig(
23
+ level=logging.INFO,
24
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
25
+ handlers=[
26
+ logging.StreamHandler(),
27
+ logging.FileHandler('translation.log') if os.path.exists('.') else logging.StreamHandler()
28
+ ]
29
+ )
30
  logger = logging.getLogger(__name__)
31
 
32
  app = FastAPI(title="Document Translator", description="Translate PDF and DOCX documents using OpenRouter")
 
98
 
99
  try:
100
  # Perform translation
101
+ logger.info(f"Starting translation of {input_file} using model {model}")
102
  result = await translator.translate_document(
103
  input_file=input_file,
104
  model=model,
 
107
  output_dir=temp_path
108
  )
109
 
110
+ # Check if translation was successful
111
+ if result.status == "failed":
112
+ error_details = f"Translation failed: {result.errors[0] if result.errors else 'Unknown error'}"
113
+ logger.error(error_details)
114
+ raise HTTPException(status_code=500, detail=error_details)
115
+
116
+ if result.paragraphs_count == 0:
117
+ logger.warning("Translation completed but no paragraphs were translated")
118
+ # Still proceed but log the issue
119
+
120
  # Move files to uploads directory for serving
121
  timestamp = int(asyncio.get_event_loop().time())
122
  result_dir = UPLOAD_DIR / f"translation_{timestamp}"
debug_translation.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Debug script to test translation functionality
4
+ """
5
+
6
+ import os
7
+ import asyncio
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Add the current directory to Python path
12
+ sys.path.insert(0, str(Path(__file__).parent))
13
+
14
+ from translator import DocumentTranslator
15
+
16
+ async def test_translation():
17
+ """Test the translation system"""
18
+ print("🧪 Testing Document Translation System...")
19
+
20
+ # Check API key
21
+ api_key = os.getenv('OPENROUTER_API_KEY')
22
+ if not api_key:
23
+ print("❌ OPENROUTER_API_KEY not found!")
24
+ print("Set it with: export OPENROUTER_API_KEY='your_key_here'")
25
+ return
26
+
27
+ print(f"✅ API key found (length: {len(api_key)})")
28
+
29
+ # Initialize translator
30
+ translator = DocumentTranslator()
31
+
32
+ if not translator.is_ready():
33
+ print("❌ Translator not ready")
34
+ return
35
+
36
+ print("✅ Translator initialized")
37
+
38
+ # Test model availability
39
+ models = await translator.get_available_models()
40
+ print(f"✅ Available models: {len(models)}")
41
+ for model in models:
42
+ print(f" - {model['name']}: {model['id']}")
43
+
44
+ # Test basic translation
45
+ test_text = "Hello, this is a test sentence for translation."
46
+ print(f"\n🔤 Testing basic translation...")
47
+ print(f"Original: {test_text}")
48
+
49
+ try:
50
+ translated = await translator.translate_text(
51
+ test_text,
52
+ "google/gemini-2.5-pro-exp-03-25",
53
+ "en",
54
+ "ar"
55
+ )
56
+ print(f"Translated: {translated}")
57
+
58
+ if translated != test_text:
59
+ print("✅ Basic translation working!")
60
+ else:
61
+ print("⚠️ Translation returned original text - check API key and credits")
62
+
63
+ except Exception as e:
64
+ print(f"❌ Translation test failed: {e}")
65
+
66
+ print("\n🎯 Translation system test complete!")
67
+
68
+ if __name__ == "__main__":
69
+ asyncio.run(test_translation())
requirements.txt CHANGED
@@ -6,4 +6,5 @@ aiohttp==3.9.1
6
  python-docx==1.1.0
7
  requests==2.31.0
8
  Pillow==10.1.0
9
- typing-extensions==4.8.0
 
 
6
  python-docx==1.1.0
7
  requests==2.31.0
8
  Pillow==10.1.0
9
+ typing-extensions==4.8.0
10
+ PyPDF2==3.0.1
translator.py CHANGED
@@ -12,6 +12,7 @@ from docx import Document
12
  from docx.shared import Inches
13
  import time
14
  import json
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
@@ -60,27 +61,52 @@ class DocumentTranslator:
60
  ]
61
 
62
  async def translate_text(self, text: str, model: str, source_lang: str = "auto", target_lang: str = "en") -> str:
63
- """Translate text using OpenRouter API"""
64
  if not text.strip():
65
  return text
66
 
67
- prompt = f"""Please translate the following text from {source_lang} to {target_lang}.
68
- Only return the translated text, without any explanations or additional content.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  Text to translate:
71
- {text}"""
 
 
72
 
73
  try:
74
  async with aiohttp.ClientSession() as session:
75
  payload = {
76
  "model": model,
77
  "messages": [
 
78
  {"role": "user", "content": prompt}
79
  ],
80
  "temperature": 0.1,
81
- "max_tokens": len(text) * 2 + 100 # Rough estimate for translation length
82
  }
83
 
 
 
84
  async with session.post(
85
  f"{self.base_url}/chat/completions",
86
  headers=self.headers,
@@ -89,6 +115,12 @@ Text to translate:
89
  if response.status == 200:
90
  data = await response.json()
91
  translated = data["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
92
  return translated
93
  else:
94
  error_text = await response.text()
@@ -98,11 +130,83 @@ Text to translate:
98
  logger.error(f"Translation error: {e}")
99
  return text # Return original text if translation fails
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def pdf_to_docx(self, pdf_path: Path, output_dir: Path) -> Path:
102
- """Convert PDF to DOCX using LibreOffice"""
103
  try:
104
  docx_path = output_dir / f"{pdf_path.stem}.docx"
105
 
 
 
 
106
  # Use LibreOffice to convert PDF to DOCX
107
  cmd = [
108
  "libreoffice",
@@ -112,17 +216,41 @@ Text to translate:
112
  str(pdf_path)
113
  ]
114
 
115
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
 
116
 
117
- if result.returncode == 0 and docx_path.exists():
118
- logger.info(f"Successfully converted {pdf_path} to {docx_path}")
119
- return docx_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  else:
121
- logger.error(f"LibreOffice conversion failed: {result.stderr}")
122
- raise Exception(f"PDF to DOCX conversion failed: {result.stderr}")
123
 
124
  except subprocess.TimeoutExpired:
125
- raise Exception("PDF conversion timed out")
126
  except Exception as e:
127
  logger.error(f"Error converting PDF to DOCX: {e}")
128
  raise
@@ -156,44 +284,76 @@ Text to translate:
156
  raise
157
 
158
  async def translate_docx(self, docx_path: Path, model: str, source_lang: str, target_lang: str, output_dir: Path) -> Tuple[Path, int]:
159
- """Translate DOCX document paragraph by paragraph"""
160
  try:
161
  # Load the document
 
162
  doc = Document(docx_path)
163
  paragraphs_count = 0
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  # Translate each paragraph
166
- for paragraph in doc.paragraphs:
167
  if paragraph.text.strip():
168
- original_text = paragraph.text
 
 
169
  translated_text = await self.translate_text(
170
  original_text, model, source_lang, target_lang
171
  )
 
 
 
 
 
 
 
172
  paragraph.text = translated_text
173
  paragraphs_count += 1
174
 
175
  # Add small delay to avoid rate limiting
176
- await asyncio.sleep(0.1)
177
 
178
  # Translate tables if any
179
- for table in doc.tables:
180
- for row in table.rows:
181
- for cell in row.cells:
 
 
182
  if cell.text.strip():
183
- original_text = cell.text
184
  translated_text = await self.translate_text(
185
  original_text, model, source_lang, target_lang
186
  )
187
  cell.text = translated_text
188
- paragraphs_count += 1
189
  await asyncio.sleep(0.1)
190
 
 
 
 
191
  # Save translated document
192
  translated_path = output_dir / f"translated_{docx_path.name}"
193
  doc.save(translated_path)
194
 
195
- logger.info(f"Translated {paragraphs_count} paragraphs in {docx_path}")
196
- return translated_path, paragraphs_count
 
 
 
 
 
 
197
 
198
  except Exception as e:
199
  logger.error(f"Error translating DOCX: {e}")
@@ -218,19 +378,39 @@ Text to translate:
218
 
219
  try:
220
  if file_extension == ".pdf":
221
- # Convert PDF to DOCX first
222
- logger.info(f"Converting PDF {input_file} to DOCX")
223
- docx_file = self.pdf_to_docx(input_file, output_dir)
224
-
225
- # Translate the DOCX
226
- logger.info(f"Translating DOCX {docx_file}")
227
- translated_docx, paragraphs_count = await self.translate_docx(
228
- docx_file, model, source_language, target_language, output_dir
229
- )
230
 
231
- # Convert translated DOCX back to PDF
232
- logger.info(f"Converting translated DOCX back to PDF")
233
- translated_file = self.docx_to_pdf(translated_docx, output_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
  # Estimate pages (rough estimate: 1 page = ~500 words)
236
  doc = Document(translated_docx)
 
12
  from docx.shared import Inches
13
  import time
14
  import json
15
+ from PyPDF2 import PdfReader
16
 
17
  logger = logging.getLogger(__name__)
18
 
 
61
  ]
62
 
63
  async def translate_text(self, text: str, model: str, source_lang: str = "auto", target_lang: str = "en") -> str:
64
+ """Translate text using OpenRouter API with improved prompt"""
65
  if not text.strip():
66
  return text
67
 
68
+ # Create a more specific translation prompt
69
+ if source_lang == "auto":
70
+ prompt = f"""You are a professional document translator. Translate the following text to {target_lang} (Arabic if 'ar', English if 'en', etc.).
71
+
72
+ IMPORTANT INSTRUCTIONS:
73
+ 1. Translate ONLY the content, do not add explanations
74
+ 2. Maintain the original formatting and structure
75
+ 3. Preserve technical terms appropriately
76
+ 4. Return ONLY the translated text
77
+
78
+ Text to translate:
79
+ {text}
80
+
81
+ Translated text:"""
82
+ else:
83
+ prompt = f"""You are a professional document translator. Translate the following text from {source_lang} to {target_lang}.
84
+
85
+ IMPORTANT INSTRUCTIONS:
86
+ 1. Translate ONLY the content, do not add explanations
87
+ 2. Maintain the original formatting and structure
88
+ 3. Preserve technical terms appropriately
89
+ 4. Return ONLY the translated text
90
 
91
  Text to translate:
92
+ {text}
93
+
94
+ Translated text:"""
95
 
96
  try:
97
  async with aiohttp.ClientSession() as session:
98
  payload = {
99
  "model": model,
100
  "messages": [
101
+ {"role": "system", "content": "You are a professional document translator. Provide direct translations without any explanations or additional text."},
102
  {"role": "user", "content": prompt}
103
  ],
104
  "temperature": 0.1,
105
+ "max_tokens": len(text) * 3 + 200 # More generous token limit for Arabic
106
  }
107
 
108
+ logger.info(f"Translating text: '{text[:50]}...' from {source_lang} to {target_lang}")
109
+
110
  async with session.post(
111
  f"{self.base_url}/chat/completions",
112
  headers=self.headers,
 
115
  if response.status == 200:
116
  data = await response.json()
117
  translated = data["choices"][0]["message"]["content"].strip()
118
+
119
+ # Clean up the response to ensure we only get the translation
120
+ if "Translated text:" in translated:
121
+ translated = translated.split("Translated text:")[-1].strip()
122
+
123
+ logger.info(f"Translation successful: '{translated[:50]}...'")
124
  return translated
125
  else:
126
  error_text = await response.text()
 
130
  logger.error(f"Translation error: {e}")
131
  return text # Return original text if translation fails
132
 
133
+ def extract_text_from_pdf(self, pdf_path: Path) -> str:
134
+ """Extract text directly from PDF as fallback method"""
135
+ try:
136
+ logger.info(f"Attempting direct text extraction from PDF: {pdf_path}")
137
+ reader = PdfReader(pdf_path)
138
+ text_content = ""
139
+
140
+ for page_num, page in enumerate(reader.pages):
141
+ page_text = page.extract_text()
142
+ if page_text.strip():
143
+ text_content += f"\n\n--- Page {page_num + 1} ---\n\n{page_text}"
144
+
145
+ logger.info(f"Extracted {len(text_content)} characters from {len(reader.pages)} pages")
146
+ return text_content
147
+
148
+ except Exception as e:
149
+ logger.error(f"Direct PDF text extraction failed: {e}")
150
+ return ""
151
+
152
+ async def translate_pdf_direct(self, pdf_path: Path, model: str, source_lang: str, target_lang: str, output_dir: Path) -> Tuple[Path, int]:
153
+ """Translate PDF by extracting text directly and creating new DOCX"""
154
+ try:
155
+ logger.info(f"Using direct PDF text extraction method for {pdf_path}")
156
+
157
+ # Extract text from PDF
158
+ pdf_text = self.extract_text_from_pdf(pdf_path)
159
+
160
+ if not pdf_text.strip():
161
+ raise Exception("No text could be extracted from PDF")
162
+
163
+ # Split text into paragraphs
164
+ paragraphs = [p.strip() for p in pdf_text.split('\n\n') if p.strip()]
165
+ logger.info(f"Split PDF text into {len(paragraphs)} paragraphs")
166
+
167
+ # Create new DOCX document
168
+ doc = Document()
169
+ doc.add_heading('Translated Document', 0)
170
+
171
+ paragraphs_translated = 0
172
+
173
+ # Translate each paragraph
174
+ for i, paragraph in enumerate(paragraphs):
175
+ if len(paragraph.strip()) > 10: # Only translate substantial paragraphs
176
+ logger.info(f"Translating paragraph {i+1}/{len(paragraphs)}: '{paragraph[:50]}...'")
177
+
178
+ translated_text = await self.translate_text(
179
+ paragraph, model, source_lang, target_lang
180
+ )
181
+
182
+ # Add translated paragraph to document
183
+ doc.add_paragraph(translated_text)
184
+ paragraphs_translated += 1
185
+
186
+ # Add delay to avoid rate limiting
187
+ await asyncio.sleep(0.2)
188
+ else:
189
+ # Add short text as-is
190
+ doc.add_paragraph(paragraph)
191
+
192
+ # Save translated document
193
+ translated_path = output_dir / f"translated_{pdf_path.stem}.docx"
194
+ doc.save(translated_path)
195
+
196
+ logger.info(f"Successfully created translated DOCX with {paragraphs_translated} translated paragraphs")
197
+ return translated_path, paragraphs_translated
198
+
199
+ except Exception as e:
200
+ logger.error(f"Direct PDF translation failed: {e}")
201
+ raise
202
+
203
  def pdf_to_docx(self, pdf_path: Path, output_dir: Path) -> Path:
 
204
  try:
205
  docx_path = output_dir / f"{pdf_path.stem}.docx"
206
 
207
+ # Log the conversion attempt
208
+ logger.info(f"Starting PDF to DOCX conversion: {pdf_path} -> {docx_path}")
209
+
210
  # Use LibreOffice to convert PDF to DOCX
211
  cmd = [
212
  "libreoffice",
 
216
  str(pdf_path)
217
  ]
218
 
219
+ logger.info(f"Running command: {' '.join(cmd)}")
220
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
221
 
222
+ logger.info(f"LibreOffice exit code: {result.returncode}")
223
+ logger.info(f"LibreOffice stdout: {result.stdout}")
224
+ logger.info(f"LibreOffice stderr: {result.stderr}")
225
+
226
+ # Check if conversion was successful
227
+ if result.returncode == 0:
228
+ if docx_path.exists():
229
+ file_size = docx_path.stat().st_size
230
+ logger.info(f"Successfully converted {pdf_path} to {docx_path} (size: {file_size} bytes)")
231
+
232
+ # Verify the DOCX file has content
233
+ try:
234
+ from docx import Document
235
+ doc = Document(docx_path)
236
+ paragraph_count = len([p for p in doc.paragraphs if p.text.strip()])
237
+ logger.info(f"DOCX contains {paragraph_count} paragraphs with text")
238
+
239
+ if paragraph_count == 0:
240
+ logger.warning("Converted DOCX appears to have no text content")
241
+ # Try alternative conversion approach if available
242
+
243
+ except Exception as e:
244
+ logger.error(f"Error validating DOCX content: {e}")
245
+
246
+ return docx_path
247
+ else:
248
+ raise Exception(f"Conversion completed but output file {docx_path} not found")
249
  else:
250
+ raise Exception(f"LibreOffice conversion failed with exit code {result.returncode}: {result.stderr}")
 
251
 
252
  except subprocess.TimeoutExpired:
253
+ raise Exception("PDF conversion timed out after 120 seconds")
254
  except Exception as e:
255
  logger.error(f"Error converting PDF to DOCX: {e}")
256
  raise
 
284
  raise
285
 
286
  async def translate_docx(self, docx_path: Path, model: str, source_lang: str, target_lang: str, output_dir: Path) -> Tuple[Path, int]:
287
+ """Translate DOCX document paragraph by paragraph with enhanced debugging"""
288
  try:
289
  # Load the document
290
+ logger.info(f"Loading DOCX document: {docx_path}")
291
  doc = Document(docx_path)
292
  paragraphs_count = 0
293
+ total_paragraphs = len(doc.paragraphs)
294
+
295
+ logger.info(f"Document has {total_paragraphs} total paragraphs")
296
+
297
+ # Count paragraphs with text first
298
+ text_paragraphs = [p for p in doc.paragraphs if p.text.strip()]
299
+ logger.info(f"Found {len(text_paragraphs)} paragraphs with text content")
300
+
301
+ # Log first few paragraphs for debugging
302
+ for i, paragraph in enumerate(text_paragraphs[:3]):
303
+ logger.info(f"Sample paragraph {i+1}: '{paragraph.text[:100]}...'")
304
 
305
  # Translate each paragraph
306
+ for i, paragraph in enumerate(doc.paragraphs):
307
  if paragraph.text.strip():
308
+ original_text = paragraph.text.strip()
309
+ logger.info(f"Translating paragraph {paragraphs_count + 1}/{len(text_paragraphs)}: '{original_text[:50]}...'")
310
+
311
  translated_text = await self.translate_text(
312
  original_text, model, source_lang, target_lang
313
  )
314
+
315
+ # Verify translation actually happened
316
+ if translated_text != original_text:
317
+ logger.info(f"Translation successful: '{translated_text[:50]}...'")
318
+ else:
319
+ logger.warning(f"Translation returned original text for: '{original_text[:50]}...'")
320
+
321
  paragraph.text = translated_text
322
  paragraphs_count += 1
323
 
324
  # Add small delay to avoid rate limiting
325
+ await asyncio.sleep(0.2)
326
 
327
  # Translate tables if any
328
+ table_cells_translated = 0
329
+ for table_idx, table in enumerate(doc.tables):
330
+ logger.info(f"Processing table {table_idx + 1} of {len(doc.tables)}")
331
+ for row_idx, row in enumerate(table.rows):
332
+ for cell_idx, cell in enumerate(row.cells):
333
  if cell.text.strip():
334
+ original_text = cell.text.strip()
335
  translated_text = await self.translate_text(
336
  original_text, model, source_lang, target_lang
337
  )
338
  cell.text = translated_text
339
+ table_cells_translated += 1
340
  await asyncio.sleep(0.1)
341
 
342
+ logger.info(f"Translated {table_cells_translated} table cells")
343
+ total_translated = paragraphs_count + table_cells_translated
344
+
345
  # Save translated document
346
  translated_path = output_dir / f"translated_{docx_path.name}"
347
  doc.save(translated_path)
348
 
349
+ logger.info(f"Successfully translated {total_translated} text elements and saved to {translated_path}")
350
+
351
+ # Verify the saved document
352
+ if translated_path.exists():
353
+ file_size = translated_path.stat().st_size
354
+ logger.info(f"Translated document saved (size: {file_size} bytes)")
355
+
356
+ return translated_path, total_translated
357
 
358
  except Exception as e:
359
  logger.error(f"Error translating DOCX: {e}")
 
378
 
379
  try:
380
  if file_extension == ".pdf":
381
+ logger.info(f"Processing PDF file: {input_file}")
 
 
 
 
 
 
 
 
382
 
383
+ try:
384
+ # Try LibreOffice conversion first
385
+ logger.info(f"Attempting LibreOffice conversion for {input_file}")
386
+ docx_file = self.pdf_to_docx(input_file, output_dir)
387
+
388
+ # Translate the DOCX
389
+ logger.info(f"Translating converted DOCX {docx_file}")
390
+ translated_docx, paragraphs_count = await self.translate_docx(
391
+ docx_file, model, source_language, target_language, output_dir
392
+ )
393
+
394
+ # If no paragraphs were translated, try direct method
395
+ if paragraphs_count == 0:
396
+ logger.warning("LibreOffice conversion produced no translatable content, trying direct extraction")
397
+ raise Exception("No content found in LibreOffice conversion")
398
+
399
+ # Convert translated DOCX back to PDF
400
+ logger.info(f"Converting translated DOCX back to PDF")
401
+ translated_file = self.docx_to_pdf(translated_docx, output_dir)
402
+
403
+ except Exception as libreoffice_error:
404
+ logger.warning(f"LibreOffice method failed: {libreoffice_error}")
405
+ logger.info("Falling back to direct PDF text extraction")
406
+
407
+ # Fallback to direct PDF text extraction
408
+ translated_docx, paragraphs_count = await self.translate_pdf_direct(
409
+ input_file, model, source_language, target_language, output_dir
410
+ )
411
+
412
+ # Convert the translated DOCX to PDF
413
+ translated_file = self.docx_to_pdf(translated_docx, output_dir)
414
 
415
  # Estimate pages (rough estimate: 1 page = ~500 words)
416
  doc = Document(translated_docx)