Spaces:

Arsive
/

lt_space

Build error

App Files Files Community

Arsive2 committed on Apr 15, 2025

Commit

d0d0352

1 Parent(s): 0bf2d2c

Updated comments

Browse files

Files changed (5) hide show

api_server.py +12 -27
app/models/document_processor.py +2 -5
app/models/html_processor.py +2 -12
app/models/text_chunker.py +6 -25
app/models/translation_model.py +5 -5

api_server.py CHANGED Viewed

@@ -1,42 +1,38 @@
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from typing import Optional, Dict, Any, List
-import torch
-import os
 import logging
 import uvicorn
-from app.models.translation_model import TranslationModel
 from app.models.html_processor import HTMLProcessor
 from app.models.text_chunker import TextChunker
-from app.models.document_processor import DocumentProcessor
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-# Initialize FastAPI app
 app = FastAPI(
     title="Universal Translator API",
     description="API for text, HTML, and document translation services",
     version="1.0.0"
 )
-# Configure CORS
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Adjust in production
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
-# Initialize model components
 try:
-    # Use the CPU-optimized translation model
     model = TranslationModel()
     html_processor = HTMLProcessor()
     text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
@@ -47,7 +43,6 @@ except Exception as e:
     logger.error(f"Error initializing components: {str(e)}")
     initialization_error = str(e)
-# Define request/response models
 class TranslationRequest(BaseModel):
     text: str
     source_lang_code: str
@@ -96,14 +91,11 @@ async def translate_text(request: TranslationRequest):
         raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
-        # Using the OPUS-MT/NLLB hybrid model for more efficient translation
         logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
-        # Create chunks using TextChunker for long texts
         chunks = text_chunker.create_chunks(request.text)
         translated_chunks = []
-        # Translate each chunk
         for chunk in chunks:
             translated_text = model.translate(
                 chunk.text,
@@ -112,7 +104,6 @@ async def translate_text(request: TranslationRequest):
             )
             translated_chunks.append(translated_text)
-        # Combine translations
         final_translation = text_chunker.combine_translations(
             request.text, chunks, translated_chunks
         )
@@ -129,16 +120,13 @@ async def translate_html(request: HTMLTranslationRequest):
         raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
-        # Extract text and maintain exact DOM structure
         text_fragments, dom_data = html_processor.extract_text(request.html)
         if not text_fragments:
             return {"translated_html": request.html}  # No text to translate
-        # Process each text fragment individually
         translated_fragments = []
-        # Process in smaller batches to avoid timeouts
         batch_size = 10
         for i in range(0, len(text_fragments), batch_size):
             batch = text_fragments[i:i+batch_size]
@@ -155,7 +143,6 @@ async def translate_html(request: HTMLTranslationRequest):
                 )
                 translated_fragments.append(translated_text)
-        # Replace the original text with translated text in the HTML structure
         translated_html = html_processor.replace_text(dom_data, translated_fragments)
         return {"translated_html": translated_html}
@@ -175,10 +162,8 @@ async def process_document(
         raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
-        # Read file content
         file_content = await file.read()
-        # Process document to extract text
         extracted_text = document_processor.process_document(
             file_data=file_content,
             filename=file.filename,
@@ -191,7 +176,6 @@ async def process_document(
                 detail="No text could be extracted from the document"
             )
-        # Translate the extracted text using our more efficient model
         translated_text = model.translate(
             extracted_text,
             source_lang_code,
@@ -207,4 +191,5 @@ async def process_document(
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
-    uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)

 import logging
+import os
+import torch
 import uvicorn
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from app.models.document_processor import DocumentProcessor
 from app.models.html_processor import HTMLProcessor
 from app.models.text_chunker import TextChunker
+from app.models.translation_model import TranslationModel
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
 app = FastAPI(
     title="Universal Translator API",
     description="API for text, HTML, and document translation services",
     version="1.0.0"
 )
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 try:
     model = TranslationModel()
     html_processor = HTMLProcessor()
     text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
     logger.error(f"Error initializing components: {str(e)}")
     initialization_error = str(e)
 class TranslationRequest(BaseModel):
     text: str
     source_lang_code: str
         raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
         logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
         chunks = text_chunker.create_chunks(request.text)
         translated_chunks = []
         for chunk in chunks:
             translated_text = model.translate(
                 chunk.text,
             )
             translated_chunks.append(translated_text)
         final_translation = text_chunker.combine_translations(
             request.text, chunks, translated_chunks
         )
         raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
         text_fragments, dom_data = html_processor.extract_text(request.html)
         if not text_fragments:
             return {"translated_html": request.html}  # No text to translate
         translated_fragments = []
         batch_size = 10
         for i in range(0, len(text_fragments), batch_size):
             batch = text_fragments[i:i+batch_size]
                 )
                 translated_fragments.append(translated_text)
         translated_html = html_processor.replace_text(dom_data, translated_fragments)
         return {"translated_html": translated_html}
         raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
         file_content = await file.read()
         extracted_text = document_processor.process_document(
             file_data=file_content,
             filename=file.filename,
                 detail="No text could be extracted from the document"
             )
         translated_text = model.translate(
             extracted_text,
             source_lang_code,
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
+    uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)

app/models/document_processor.py CHANGED Viewed

@@ -1,7 +1,8 @@
-import fitz  # PyMuPDF
 import logging
 from pathlib import Path
 logger = logging.getLogger(__name__)
 class DocumentProcessor:
@@ -35,15 +36,11 @@ class DocumentProcessor:
             if file_ext not in self.supported_formats:
                 raise ValueError(f"Unsupported file format: {file_ext}")
-            # Process PDF using PyMuPDF
             if file_ext == '.pdf':
                 return self._process_pdf(file_data)
-            # Process image (placeholder - would need OCR integration)
             else:
                 if use_ocr:
-                    # Placeholder for OCR implementation
-                    # You would integrate with an OCR service here
                     raise NotImplementedError("OCR for images not implemented")
                 else:
                     return "Text extraction from images requires OCR to be enabled"

 import logging
 from pathlib import Path
+import fitz  # PyMuPDF
 logger = logging.getLogger(__name__)
 class DocumentProcessor:
             if file_ext not in self.supported_formats:
                 raise ValueError(f"Unsupported file format: {file_ext}")
             if file_ext == '.pdf':
                 return self._process_pdf(file_data)
             else:
                 if use_ocr:
                     raise NotImplementedError("OCR for images not implemented")
                 else:
                     return "Text extraction from images requires OCR to be enabled"

app/models/html_processor.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
 from bs4 import BeautifulSoup, NavigableString, Tag
-from typing import List, Tuple, Dict, Any
 logger = logging.getLogger(__name__)
@@ -30,14 +31,11 @@ class HTMLProcessor:
             - DOM map that maintains references to the exact nodes in the original structure
         """
         try:
-            # Parse the HTML using 'html.parser' to ensure proper handling
             soup = BeautifulSoup(html_content, 'html.parser')
-            # Use a list to store text fragments and their corresponding nodes
             text_fragments = []
             dom_map = {}
-            # Process the soup to find all text nodes
             self._extract_text_from_node(soup, text_fragments, dom_map)
             return text_fragments, {'soup': soup, 'node_map': dom_map}
@@ -56,24 +54,19 @@ class HTMLProcessor:
             dom_map: Dictionary to map indices to nodes
             path: Current path in the DOM tree for debugging
         """
-        # Skip processing for certain tags
         if isinstance(node, Tag) and node.name in self.skip_tags:
             return
-        # Skip elements with notranslate class
         if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
             return
-        # Process this node
         if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
-            # Only process non-empty text
             text = str(node).strip()
             if text:
                 index = len(text_fragments)
                 text_fragments.append(text)
                 dom_map[index] = node
-        # Recursively process child nodes
         if isinstance(node, Tag):
             for child in node.children:
                 child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
@@ -98,13 +91,10 @@ class HTMLProcessor:
                 logger.error("Invalid DOM data for text replacement")
                 return ""
-            # Replace text in each node
             for index, node in node_map.items():
                 if index < len(translated_fragments):
-                    # Replace the original string with the translated string
                     node.replace_with(NavigableString(translated_fragments[index]))
-            # Return the HTML as a string
             return str(soup)
         except Exception as e:

 import logging
+from typing import Any, Dict, List, Tuple
 from bs4 import BeautifulSoup, NavigableString, Tag
 logger = logging.getLogger(__name__)
             - DOM map that maintains references to the exact nodes in the original structure
         """
         try:
             soup = BeautifulSoup(html_content, 'html.parser')
             text_fragments = []
             dom_map = {}
             self._extract_text_from_node(soup, text_fragments, dom_map)
             return text_fragments, {'soup': soup, 'node_map': dom_map}
             dom_map: Dictionary to map indices to nodes
             path: Current path in the DOM tree for debugging
         """
         if isinstance(node, Tag) and node.name in self.skip_tags:
             return
         if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
             return
         if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
             text = str(node).strip()
             if text:
                 index = len(text_fragments)
                 text_fragments.append(text)
                 dom_map[index] = node
         if isinstance(node, Tag):
             for child in node.children:
                 child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                 logger.error("Invalid DOM data for text replacement")
                 return ""
             for index, node in node_map.items():
                 if index < len(translated_fragments):
                     node.replace_with(NavigableString(translated_fragments[index]))
             return str(soup)
         except Exception as e:

app/models/text_chunker.py CHANGED Viewed

@@ -1,17 +1,15 @@
-import re
 import logging
 import os
-import nltk
-from typing import List, Optional
 from dataclasses import dataclass
 from nltk.tokenize import sent_tokenize
-# Set NLTK data path from environment variable if available
 nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
 nltk.data.path.append(nltk_data_path)
-# Ensure NLTK data is downloaded
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
@@ -19,7 +17,6 @@ except LookupError:
         nltk.download('punkt', download_dir=nltk_data_path)
     except Exception as e:
         logging.warning(f"Failed to download NLTK data: {e}")
-        # Fallback to not using NLTK if download fails
 logger = logging.getLogger(__name__)
@@ -62,22 +59,16 @@ class TextChunker:
         if not text:
             return ""
-        # Replace multiple newlines with single \n
         text = re.sub(r'\n\s*\n', '\n', text)
-        # Replace other whitespace characters with space
         text = re.sub(r'[\r\t\f\v]', ' ', text)
-        # Replace multiple spaces with single space
         text = re.sub(r' +', ' ', text)
-        # Clean up spaces around newlines
         text = re.sub(r' *\n *', '\n', text)
-        # Remove spaces at the start and end of the text
         text = text.strip()
-        # Handle bullet points and lists consistently
         text = re.sub(r'•\s*', '• ', text)
         text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
@@ -88,7 +79,6 @@ class TextChunker:
         Estimate the number of tokens in a text string.
         This is a rough approximation - actual token count may vary by tokenizer.
         """
-        # Split on whitespace and punctuation
         words = re.findall(r'\b\w+\b|[^\w\s]', text)
         return len(words)
@@ -98,7 +88,6 @@ class TextChunker:
             return sent_tokenize(text)
         except Exception as e:
             logger.warning(f"Error in sentence tokenization: {e}")
-            # Fallback to simple period-based splitting
             return [s.strip() + '.' for s in text.split('.') if s.strip()]
     def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
@@ -114,7 +103,6 @@ class TextChunker:
             sentence = sentences[i]
             sentence_tokens = self.estimate_tokens(sentence)
-            # If single sentence exceeds max tokens, split it
             if sentence_tokens > max_tokens:
                 if not current_sentences:  # First sentence
                     words = sentence.split()
@@ -134,7 +122,6 @@ class TextChunker:
                     return chunk_text, i, is_partial
                 break
-            # Check if adding this sentence would exceed the limit
             if current_tokens + sentence_tokens > max_tokens and current_sentences:
                 break
@@ -160,13 +147,11 @@ class TextChunker:
         chunks = []
         current_idx = 0
-        # Split into paragraphs if preserve_paragraphs is True
         if self.preserve_paragraphs:
             paragraphs = text.split('\n')
         else:
             paragraphs = [text]
-        # Process each paragraph
         for para in paragraphs:
             if not para.strip():
                 continue
@@ -182,7 +167,6 @@ class TextChunker:
                 if not chunk_text:
                     break
-                # Calculate original text positions
                 original_start = text.find(chunk_text)
                 original_end = original_start + len(chunk_text)
@@ -222,11 +206,9 @@ class TextChunker:
         if len(chunks) == 1:
             return translations[0]
-        # Combine translations, handling partial sentences
         result = []
         for i, (chunk, translation) in enumerate(zip(chunks, translations)):
             if i > 0 and chunk.is_partial_sentence:
-                # For partial sentences, try to find a clean break point
                 prev_translation = translations[i-1]
                 overlap = self._find_overlap(prev_translation, translation)
                 if overlap:
@@ -241,15 +223,14 @@ class TextChunker:
         if not text1 or not text2:
             return None
-        # Get the last part of text1 and first part of text2
         end_text = text1[-100:]  # Look at last 100 chars
         start_text = text2[:100]  # Look at first 100 chars
-        # Find the longest common substring
         overlap = None
         for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
             if end_text[-length:] == start_text[:length]:
                 overlap = start_text[:length]
                 break
-        return overlap

 import logging
 import os
+import re
 from dataclasses import dataclass
+from typing import List, Optional
+import nltk
 from nltk.tokenize import sent_tokenize
 nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
 nltk.data.path.append(nltk_data_path)
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
         nltk.download('punkt', download_dir=nltk_data_path)
     except Exception as e:
         logging.warning(f"Failed to download NLTK data: {e}")
 logger = logging.getLogger(__name__)
         if not text:
             return ""
         text = re.sub(r'\n\s*\n', '\n', text)
         text = re.sub(r'[\r\t\f\v]', ' ', text)
         text = re.sub(r' +', ' ', text)
         text = re.sub(r' *\n *', '\n', text)
         text = text.strip()
         text = re.sub(r'•\s*', '• ', text)
         text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
         Estimate the number of tokens in a text string.
         This is a rough approximation - actual token count may vary by tokenizer.
         """
         words = re.findall(r'\b\w+\b|[^\w\s]', text)
         return len(words)
             return sent_tokenize(text)
         except Exception as e:
             logger.warning(f"Error in sentence tokenization: {e}")
             return [s.strip() + '.' for s in text.split('.') if s.strip()]
     def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
             sentence = sentences[i]
             sentence_tokens = self.estimate_tokens(sentence)
             if sentence_tokens > max_tokens:
                 if not current_sentences:  # First sentence
                     words = sentence.split()
                     return chunk_text, i, is_partial
                 break
             if current_tokens + sentence_tokens > max_tokens and current_sentences:
                 break
         chunks = []
         current_idx = 0
         if self.preserve_paragraphs:
             paragraphs = text.split('\n')
         else:
             paragraphs = [text]
         for para in paragraphs:
             if not para.strip():
                 continue
                 if not chunk_text:
                     break
                 original_start = text.find(chunk_text)
                 original_end = original_start + len(chunk_text)
         if len(chunks) == 1:
             return translations[0]
         result = []
         for i, (chunk, translation) in enumerate(zip(chunks, translations)):
             if i > 0 and chunk.is_partial_sentence:
                 prev_translation = translations[i-1]
                 overlap = self._find_overlap(prev_translation, translation)
                 if overlap:
         if not text1 or not text2:
             return None
         end_text = text1[-100:]  # Look at last 100 chars
         start_text = text2[:100]  # Look at first 100 chars
         overlap = None
         for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
             if end_text[-length:] == start_text[:length]:
                 overlap = start_text[:length]
                 break
+        return overlap

app/models/translation_model.py CHANGED Viewed

@@ -1,10 +1,10 @@
-import torch
 import logging
-import re
 import os
-from typing import Optional, Dict, Any, List
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from tqdm import tqdm
 logger = logging.getLogger(__name__)

 import logging
 import os
+import re
+from typing import Optional
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 logger = logging.getLogger(__name__)