Spaces:

fokan
/

trabb

Sleeping

App Files Files Community

fokan committed on Sep 3, 2025

Commit

3e2ca56

1 Parent(s): c52255c

first push

Browse files

Files changed (4) hide show

app/main.py +19 -1
debug_translation.py +69 -0
requirements.txt +2 -1
translator.py +216 -36

app/main.py CHANGED Viewed

@@ -19,7 +19,14 @@ import logging
 from translator import DocumentTranslator, TranslationReport
 # Configure logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = FastAPI(title="Document Translator", description="Translate PDF and DOCX documents using OpenRouter")
@@ -91,6 +98,7 @@ async def translate_document(
         try:
             # Perform translation
             result = await translator.translate_document(
                 input_file=input_file,
                 model=model,
@@ -99,6 +107,16 @@ async def translate_document(
                 output_dir=temp_path
             )
             # Move files to uploads directory for serving
             timestamp = int(asyncio.get_event_loop().time())
             result_dir = UPLOAD_DIR / f"translation_{timestamp}"

 from translator import DocumentTranslator, TranslationReport
 # Configure logging
+# Console logging is always enabled. File logging is best-effort: opening
+# 'translation.log' can fail when the working directory is not writable
+# (for example a read-only container), and that must not stop the app
+# from starting.
+_log_handlers = [logging.StreamHandler()]
+try:
+    _log_handlers.append(logging.FileHandler('translation.log'))
+    _file_log_error = None
+except OSError as exc:
+    # Remember the failure so it can be reported once logging is configured.
+    _file_log_error = exc
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=_log_handlers,
+)
+if _file_log_error is not None:
+    logging.getLogger(__name__).warning(
+        "File logging disabled, could not open translation.log: %s", _file_log_error
+    )
 logger = logging.getLogger(__name__)
 app = FastAPI(title="Document Translator", description="Translate PDF and DOCX documents using OpenRouter")
         try:
             # Perform translation
+            logger.info(f"Starting translation of {input_file} using model {model}")
             result = await translator.translate_document(
                 input_file=input_file,
                 model=model,
                 output_dir=temp_path
             )
+            # Check if translation was successful
+            if result.status == "failed":
+                error_details = f"Translation failed: {result.errors[0] if result.errors else 'Unknown error'}"
+                logger.error(error_details)
+                raise HTTPException(status_code=500, detail=error_details)
+            if result.paragraphs_count == 0:
+                logger.warning("Translation completed but no paragraphs were translated")
+                # Still proceed but log the issue
             # Move files to uploads directory for serving
             timestamp = int(asyncio.get_event_loop().time())
             result_dir = UPLOAD_DIR / f"translation_{timestamp}"

debug_translation.py ADDED Viewed

	@@ -0,0 +1,69 @@

+#!/usr/bin/env python3
+"""
+Debug script to test translation functionality
+"""
+import os
+import asyncio
+import sys
+from pathlib import Path
+# Add the current directory to Python path
+sys.path.insert(0, str(Path(__file__).parent))
+from translator import DocumentTranslator
+async def test_translation():
+    """Smoke-test the translator: API key, readiness, model list, one sample translation.
+    Every outcome is reported with print(); nothing is returned and a failure of
+    the sample translation is caught and printed rather than raised.
+    """
+    print("🧪 Testing Document Translation System...")
+    # The key is read from the environment; without it the run stops here
+    key = os.getenv('OPENROUTER_API_KEY')
+    if not key:
+        print("❌ OPENROUTER_API_KEY not found!")
+        print("Set it with: export OPENROUTER_API_KEY='your_key_here'")
+        return
+    print(f"✅ API key found (length: {len(key)})")
+    # Build the translator and make sure it reports ready
+    translator = DocumentTranslator()
+    if not translator.is_ready():
+        print("❌ Translator not ready")
+        return
+    print("✅ Translator initialized")
+    # List every model the translator reports, one per line
+    available = await translator.get_available_models()
+    print(f"✅ Available models: {len(available)}")
+    for entry in available:
+        print(f"   - {entry['name']}: {entry['id']}")
+    # Send one English sentence through translate_text with target "ar"
+    sample = "Hello, this is a test sentence for translation."
+    print("\n🔤 Testing basic translation...")
+    print(f"Original: {sample}")
+    try:
+        output = await translator.translate_text(
+            sample,
+            "google/gemini-2.5-pro-exp-03-25",
+            "en",
+            "ar"
+        )
+        print(f"Translated: {output}")
+        # Getting the input back unchanged is reported as a warning, not a pass
+        if output == sample:
+            print("⚠️  Translation returned original text - check API key and credits")
+        else:
+            print("✅ Basic translation working!")
+    except Exception as exc:
+        print(f"❌ Translation test failed: {exc}")
+    print("\n🎯 Translation system test complete!")
+# Run the async smoke test only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(test_translation())

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ aiohttp==3.9.1
 python-docx==1.1.0
 requests==2.31.0
 Pillow==10.1.0
-typing-extensions==4.8.0

 python-docx==1.1.0
 requests==2.31.0
 Pillow==10.1.0
+typing-extensions==4.8.0
+PyPDF2==3.0.1

translator.py CHANGED Viewed

@@ -12,6 +12,7 @@ from docx import Document
 from docx.shared import Inches
 import time
 import json
 logger = logging.getLogger(__name__)
@@ -60,27 +61,52 @@ class DocumentTranslator:
         ]
     async def translate_text(self, text: str, model: str, source_lang: str = "auto", target_lang: str = "en") -> str:
-        """Translate text using OpenRouter API"""
         if not text.strip():
             return text
-        prompt = f"""Please translate the following text from {source_lang} to {target_lang}.
-Only return the translated text, without any explanations or additional content.
 Text to translate:
-{text}"""
         try:
             async with aiohttp.ClientSession() as session:
                 payload = {
                     "model": model,
                     "messages": [
                         {"role": "user", "content": prompt}
                     ],
                     "temperature": 0.1,
-                    "max_tokens": len(text) * 2 + 100  # Rough estimate for translation length
                 }
                 async with session.post(
                     f"{self.base_url}/chat/completions",
                     headers=self.headers,
@@ -89,6 +115,12 @@ Text to translate:
                     if response.status == 200:
                         data = await response.json()
                         translated = data["choices"][0]["message"]["content"].strip()
                         return translated
                     else:
                         error_text = await response.text()
@@ -98,11 +130,83 @@ Text to translate:
             logger.error(f"Translation error: {e}")
             return text  # Return original text if translation fails
     def pdf_to_docx(self, pdf_path: Path, output_dir: Path) -> Path:
-        """Convert PDF to DOCX using LibreOffice"""
         try:
             docx_path = output_dir / f"{pdf_path.stem}.docx"
             # Use LibreOffice to convert PDF to DOCX
             cmd = [
                 "libreoffice",
@@ -112,17 +216,41 @@ Text to translate:
                 str(pdf_path)
             ]
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
-            if result.returncode == 0 and docx_path.exists():
-                logger.info(f"Successfully converted {pdf_path} to {docx_path}")
-                return docx_path
             else:
-                logger.error(f"LibreOffice conversion failed: {result.stderr}")
-                raise Exception(f"PDF to DOCX conversion failed: {result.stderr}")
         except subprocess.TimeoutExpired:
-            raise Exception("PDF conversion timed out")
         except Exception as e:
             logger.error(f"Error converting PDF to DOCX: {e}")
             raise
@@ -156,44 +284,76 @@ Text to translate:
             raise
     async def translate_docx(self, docx_path: Path, model: str, source_lang: str, target_lang: str, output_dir: Path) -> Tuple[Path, int]:
-        """Translate DOCX document paragraph by paragraph"""
         try:
             # Load the document
             doc = Document(docx_path)
             paragraphs_count = 0
             # Translate each paragraph
-            for paragraph in doc.paragraphs:
                 if paragraph.text.strip():
-                    original_text = paragraph.text
                     translated_text = await self.translate_text(
                         original_text, model, source_lang, target_lang
                     )
                     paragraph.text = translated_text
                     paragraphs_count += 1
                     # Add small delay to avoid rate limiting
-                    await asyncio.sleep(0.1)
             # Translate tables if any
-            for table in doc.tables:
-                for row in table.rows:
-                    for cell in row.cells:
                         if cell.text.strip():
-                            original_text = cell.text
                             translated_text = await self.translate_text(
                                 original_text, model, source_lang, target_lang
                             )
                             cell.text = translated_text
-                            paragraphs_count += 1
                             await asyncio.sleep(0.1)
             # Save translated document
             translated_path = output_dir / f"translated_{docx_path.name}"
             doc.save(translated_path)
-            logger.info(f"Translated {paragraphs_count} paragraphs in {docx_path}")
-            return translated_path, paragraphs_count
         except Exception as e:
             logger.error(f"Error translating DOCX: {e}")
@@ -218,19 +378,39 @@ Text to translate:
         try:
             if file_extension == ".pdf":
-                # Convert PDF to DOCX first
-                logger.info(f"Converting PDF {input_file} to DOCX")
-                docx_file = self.pdf_to_docx(input_file, output_dir)
-                # Translate the DOCX
-                logger.info(f"Translating DOCX {docx_file}")
-                translated_docx, paragraphs_count = await self.translate_docx(
-                    docx_file, model, source_language, target_language, output_dir
-                )
-                # Convert translated DOCX back to PDF
-                logger.info(f"Converting translated DOCX back to PDF")
-                translated_file = self.docx_to_pdf(translated_docx, output_dir)
                 # Estimate pages (rough estimate: 1 page = ~500 words)
                 doc = Document(translated_docx)

 from docx.shared import Inches
 import time
 import json
+from PyPDF2 import PdfReader
 logger = logging.getLogger(__name__)
         ]
     async def translate_text(self, text: str, model: str, source_lang: str = "auto", target_lang: str = "en") -> str:
+        """Translate text using OpenRouter API with improved prompt"""
         if not text.strip():
             return text
+        # Create a more specific translation prompt
+        if source_lang == "auto":
+            prompt = f"""You are a professional document translator. Translate the following text to {target_lang} (Arabic if 'ar', English if 'en', etc.).
+IMPORTANT INSTRUCTIONS:
+1. Translate ONLY the content, do not add explanations
+2. Maintain the original formatting and structure
+3. Preserve technical terms appropriately
+4. Return ONLY the translated text
+Text to translate:
+{text}
+Translated text:"""
+        else:
+            prompt = f"""You are a professional document translator. Translate the following text from {source_lang} to {target_lang}.
+IMPORTANT INSTRUCTIONS:
+1. Translate ONLY the content, do not add explanations
+2. Maintain the original formatting and structure
+3. Preserve technical terms appropriately
+4. Return ONLY the translated text
 Text to translate:
+{text}
+Translated text:"""
         try:
             async with aiohttp.ClientSession() as session:
                 payload = {
                     "model": model,
                     "messages": [
+                        {"role": "system", "content": "You are a professional document translator. Provide direct translations without any explanations or additional text."},
                         {"role": "user", "content": prompt}
                     ],
                     "temperature": 0.1,
+                    "max_tokens": len(text) * 3 + 200  # More generous token limit for Arabic
                 }
+                logger.info(f"Translating text: '{text[:50]}...' from {source_lang} to {target_lang}")
                 async with session.post(
                     f"{self.base_url}/chat/completions",
                     headers=self.headers,
                     if response.status == 200:
                         data = await response.json()
                         translated = data["choices"][0]["message"]["content"].strip()
+                        # Clean up the response to ensure we only get the translation
+                        if "Translated text:" in translated:
+                            translated = translated.split("Translated text:")[-1].strip()
+                        logger.info(f"Translation successful: '{translated[:50]}...'")
                         return translated
                     else:
                         error_text = await response.text()
             logger.error(f"Translation error: {e}")
             return text  # Return original text if translation fails
+    def extract_text_from_pdf(self, pdf_path: Path) -> str:
+        """Extract text directly from a PDF as a fallback method.
+        Each page that yields non-blank text is appended after a
+        "--- Page N ---" marker (N is 1-based) surrounded by blank lines.
+        Args:
+            pdf_path: Location of the PDF to read.
+        Returns:
+            The concatenated page texts, or "" when no page produced text or
+            when reading the PDF raised (the error is logged, not propagated).
+        """
+        try:
+            logger.info(f"Attempting direct text extraction from PDF: {pdf_path}")
+            reader = PdfReader(pdf_path)
+            page_chunks = []
+            for page_num, page in enumerate(reader.pages, start=1):
+                # Defensive: if extract_text() ever returns None for a page,
+                # treat it as empty instead of raising and discarding the
+                # pages that were already read.
+                page_text = page.extract_text() or ""
+                if page_text.strip():
+                    page_chunks.append(f"\n\n--- Page {page_num} ---\n\n{page_text}")
+            # Join once rather than growing a string inside the loop.
+            text_content = "".join(page_chunks)
+            logger.info(f"Extracted {len(text_content)} characters from {len(reader.pages)} pages")
+            return text_content
+        except Exception as e:
+            # Best-effort fallback: callers treat "" as "nothing extracted".
+            logger.error(f"Direct PDF text extraction failed: {e}")
+            return ""
+    async def translate_pdf_direct(self, pdf_path: Path, model: str, source_lang: str, target_lang: str, output_dir: Path) -> Tuple[Path, int]:
+        """Translate a PDF from its extracted text and write the result as a new DOCX.
+        The text comes from extract_text_from_pdf() and is split on blank lines.
+        Chunks longer than 10 characters are passed to translate_text(); shorter
+        chunks are copied into the document unchanged.
+        Returns:
+            A tuple of (path of the saved DOCX, number of chunks sent for translation).
+        Raises:
+            Exception: when no text could be extracted. Any other failure is
+            logged and re-raised unchanged.
+        """
+        try:
+            logger.info(f"Using direct PDF text extraction method for {pdf_path}")
+            # Pull the raw text out of the PDF
+            pdf_text = self.extract_text_from_pdf(pdf_path)
+            if not pdf_text.strip():
+                raise Exception("No text could be extracted from PDF")
+            # Blank lines delimit paragraphs; whitespace-only pieces are dropped
+            paragraphs = []
+            for piece in pdf_text.split('\n\n'):
+                cleaned = piece.strip()
+                if cleaned:
+                    paragraphs.append(cleaned)
+            total = len(paragraphs)
+            logger.info(f"Split PDF text into {total} paragraphs")
+            # Start a fresh DOCX with a fixed title heading
+            doc = Document()
+            doc.add_heading('Translated Document', 0)
+            paragraphs_translated = 0
+            for position, chunk in enumerate(paragraphs, start=1):
+                if len(chunk) <= 10:
+                    # Short text is added as-is, without a translation request
+                    doc.add_paragraph(chunk)
+                    continue
+                logger.info(f"Translating paragraph {position}/{total}: '{chunk[:50]}...'")
+                translated_text = await self.translate_text(
+                    chunk, model, source_lang, target_lang
+                )
+                doc.add_paragraph(translated_text)
+                paragraphs_translated += 1
+                # Brief pause between requests to avoid rate limiting
+                await asyncio.sleep(0.2)
+            # Persist the assembled document next to the other outputs
+            translated_path = output_dir / f"translated_{pdf_path.stem}.docx"
+            doc.save(translated_path)
+            logger.info(f"Successfully created translated DOCX with {paragraphs_translated} translated paragraphs")
+            return translated_path, paragraphs_translated
+        except Exception as e:
+            logger.error(f"Direct PDF translation failed: {e}")
+            raise
     def pdf_to_docx(self, pdf_path: Path, output_dir: Path) -> Path:
         try:
             docx_path = output_dir / f"{pdf_path.stem}.docx"
+            # Log the conversion attempt
+            logger.info(f"Starting PDF to DOCX conversion: {pdf_path} -> {docx_path}")
             # Use LibreOffice to convert PDF to DOCX
             cmd = [
                 "libreoffice",
                 str(pdf_path)
             ]
+            logger.info(f"Running command: {' '.join(cmd)}")
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+            logger.info(f"LibreOffice exit code: {result.returncode}")
+            logger.info(f"LibreOffice stdout: {result.stdout}")
+            logger.info(f"LibreOffice stderr: {result.stderr}")
+            # Check if conversion was successful
+            if result.returncode == 0:
+                if docx_path.exists():
+                    file_size = docx_path.stat().st_size
+                    logger.info(f"Successfully converted {pdf_path} to {docx_path} (size: {file_size} bytes)")
+                    # Verify the DOCX file has content
+                    try:
+                        from docx import Document
+                        doc = Document(docx_path)
+                        paragraph_count = len([p for p in doc.paragraphs if p.text.strip()])
+                        logger.info(f"DOCX contains {paragraph_count} paragraphs with text")
+                        if paragraph_count == 0:
+                            logger.warning("Converted DOCX appears to have no text content")
+                            # Try alternative conversion approach if available
+                    except Exception as e:
+                        logger.error(f"Error validating DOCX content: {e}")
+                    return docx_path
+                else:
+                    raise Exception(f"Conversion completed but output file {docx_path} not found")
             else:
+                raise Exception(f"LibreOffice conversion failed with exit code {result.returncode}: {result.stderr}")
         except subprocess.TimeoutExpired:
+            raise Exception("PDF conversion timed out after 120 seconds")
         except Exception as e:
             logger.error(f"Error converting PDF to DOCX: {e}")
             raise
             raise
     async def translate_docx(self, docx_path: Path, model: str, source_lang: str, target_lang: str, output_dir: Path) -> Tuple[Path, int]:
+        """Translate DOCX document paragraph by paragraph with enhanced debugging"""
         try:
             # Load the document
+            logger.info(f"Loading DOCX document: {docx_path}")
             doc = Document(docx_path)
             paragraphs_count = 0
+            total_paragraphs = len(doc.paragraphs)
+            logger.info(f"Document has {total_paragraphs} total paragraphs")
+            # Count paragraphs with text first
+            text_paragraphs = [p for p in doc.paragraphs if p.text.strip()]
+            logger.info(f"Found {len(text_paragraphs)} paragraphs with text content")
+            # Log first few paragraphs for debugging
+            for i, paragraph in enumerate(text_paragraphs[:3]):
+                logger.info(f"Sample paragraph {i+1}: '{paragraph.text[:100]}...'")
             # Translate each paragraph
+            for i, paragraph in enumerate(doc.paragraphs):
                 if paragraph.text.strip():
+                    original_text = paragraph.text.strip()
+                    logger.info(f"Translating paragraph {paragraphs_count + 1}/{len(text_paragraphs)}: '{original_text[:50]}...'")
                     translated_text = await self.translate_text(
                         original_text, model, source_lang, target_lang
                     )
+                    # Verify translation actually happened
+                    if translated_text != original_text:
+                        logger.info(f"Translation successful: '{translated_text[:50]}...'")
+                    else:
+                        logger.warning(f"Translation returned original text for: '{original_text[:50]}...'")
                     paragraph.text = translated_text
                     paragraphs_count += 1
                     # Add small delay to avoid rate limiting
+                    await asyncio.sleep(0.2)
             # Translate tables if any
+            table_cells_translated = 0
+            for table_idx, table in enumerate(doc.tables):
+                logger.info(f"Processing table {table_idx + 1} of {len(doc.tables)}")
+                for row_idx, row in enumerate(table.rows):
+                    for cell_idx, cell in enumerate(row.cells):
                         if cell.text.strip():
+                            original_text = cell.text.strip()
                             translated_text = await self.translate_text(
                                 original_text, model, source_lang, target_lang
                             )
                             cell.text = translated_text
+                            table_cells_translated += 1
                             await asyncio.sleep(0.1)
+            logger.info(f"Translated {table_cells_translated} table cells")
+            total_translated = paragraphs_count + table_cells_translated
             # Save translated document
             translated_path = output_dir / f"translated_{docx_path.name}"
             doc.save(translated_path)
+            logger.info(f"Successfully translated {total_translated} text elements and saved to {translated_path}")
+            # Verify the saved document
+            if translated_path.exists():
+                file_size = translated_path.stat().st_size
+                logger.info(f"Translated document saved (size: {file_size} bytes)")
+            return translated_path, total_translated
         except Exception as e:
             logger.error(f"Error translating DOCX: {e}")
         try:
             if file_extension == ".pdf":
+                logger.info(f"Processing PDF file: {input_file}")
+                try:
+                    # Try LibreOffice conversion first
+                    logger.info(f"Attempting LibreOffice conversion for {input_file}")
+                    docx_file = self.pdf_to_docx(input_file, output_dir)
+                    # Translate the DOCX
+                    logger.info(f"Translating converted DOCX {docx_file}")
+                    translated_docx, paragraphs_count = await self.translate_docx(
+                        docx_file, model, source_language, target_language, output_dir
+                    )
+                    # If no paragraphs were translated, try direct method
+                    if paragraphs_count == 0:
+                        logger.warning("LibreOffice conversion produced no translatable content, trying direct extraction")
+                        raise Exception("No content found in LibreOffice conversion")
+                    # Convert translated DOCX back to PDF
+                    logger.info(f"Converting translated DOCX back to PDF")
+                    translated_file = self.docx_to_pdf(translated_docx, output_dir)
+                except Exception as libreoffice_error:
+                    logger.warning(f"LibreOffice method failed: {libreoffice_error}")
+                    logger.info("Falling back to direct PDF text extraction")
+                    # Fallback to direct PDF text extraction
+                    translated_docx, paragraphs_count = await self.translate_pdf_direct(
+                        input_file, model, source_language, target_language, output_dir
+                    )
+                    # Convert the translated DOCX to PDF
+                    translated_file = self.docx_to_pdf(translated_docx, output_dir)
                 # Estimate pages (rough estimate: 1 page = ~500 words)
                 doc = Document(translated_docx)