Updated comments
Browse files
- api_server.py +12 -27
- app/models/document_processor.py +2 -5
- app/models/html_processor.py +2 -12
- app/models/text_chunker.py +6 -25
- app/models/translation_model.py +5 -5
api_server.py
CHANGED
|
@@ -1,42 +1,38 @@
|
|
| 1 |
-
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
| 2 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
-
from pydantic import BaseModel
|
| 4 |
-
from typing import Optional, Dict, Any, List
|
| 5 |
-
import torch
|
| 6 |
-
import os
|
| 7 |
import logging
|
|
|
|
|
|
|
|
|
|
| 8 |
import uvicorn
|
| 9 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
from app.models.html_processor import HTMLProcessor
|
| 11 |
from app.models.text_chunker import TextChunker
|
| 12 |
-
from app.models.
|
| 13 |
|
| 14 |
-
# Configure logging
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
| 17 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 18 |
)
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
-
# Initialize FastAPI app
|
| 22 |
app = FastAPI(
|
| 23 |
title="Universal Translator API",
|
| 24 |
description="API for text, HTML, and document translation services",
|
| 25 |
version="1.0.0"
|
| 26 |
)
|
| 27 |
|
| 28 |
-
# Configure CORS
|
| 29 |
app.add_middleware(
|
| 30 |
CORSMiddleware,
|
| 31 |
-
allow_origins=["*"],
|
| 32 |
allow_credentials=True,
|
| 33 |
allow_methods=["*"],
|
| 34 |
allow_headers=["*"],
|
| 35 |
)
|
| 36 |
|
| 37 |
-
# Initialize model components
|
| 38 |
try:
|
| 39 |
-
# Use the CPU-optimized translation model
|
| 40 |
model = TranslationModel()
|
| 41 |
html_processor = HTMLProcessor()
|
| 42 |
text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
|
|
@@ -47,7 +43,6 @@ except Exception as e:
|
|
| 47 |
logger.error(f"Error initializing components: {str(e)}")
|
| 48 |
initialization_error = str(e)
|
| 49 |
|
| 50 |
-
# Define request/response models
|
| 51 |
class TranslationRequest(BaseModel):
|
| 52 |
text: str
|
| 53 |
source_lang_code: str
|
|
@@ -96,14 +91,11 @@ async def translate_text(request: TranslationRequest):
|
|
| 96 |
raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
|
| 97 |
|
| 98 |
try:
|
| 99 |
-
# Using the OPUS-MT/NLLB hybrid model for more efficient translation
|
| 100 |
logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
|
| 101 |
|
| 102 |
-
# Create chunks using TextChunker for long texts
|
| 103 |
chunks = text_chunker.create_chunks(request.text)
|
| 104 |
translated_chunks = []
|
| 105 |
|
| 106 |
-
# Translate each chunk
|
| 107 |
for chunk in chunks:
|
| 108 |
translated_text = model.translate(
|
| 109 |
chunk.text,
|
|
@@ -112,7 +104,6 @@ async def translate_text(request: TranslationRequest):
|
|
| 112 |
)
|
| 113 |
translated_chunks.append(translated_text)
|
| 114 |
|
| 115 |
-
# Combine translations
|
| 116 |
final_translation = text_chunker.combine_translations(
|
| 117 |
request.text, chunks, translated_chunks
|
| 118 |
)
|
|
@@ -129,16 +120,13 @@ async def translate_html(request: HTMLTranslationRequest):
|
|
| 129 |
raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
|
| 130 |
|
| 131 |
try:
|
| 132 |
-
# Extract text and maintain exact DOM structure
|
| 133 |
text_fragments, dom_data = html_processor.extract_text(request.html)
|
| 134 |
|
| 135 |
if not text_fragments:
|
| 136 |
return {"translated_html": request.html} # No text to translate
|
| 137 |
|
| 138 |
-
# Process each text fragment individually
|
| 139 |
translated_fragments = []
|
| 140 |
|
| 141 |
-
# Process in smaller batches to avoid timeouts
|
| 142 |
batch_size = 10
|
| 143 |
for i in range(0, len(text_fragments), batch_size):
|
| 144 |
batch = text_fragments[i:i+batch_size]
|
|
@@ -155,7 +143,6 @@ async def translate_html(request: HTMLTranslationRequest):
|
|
| 155 |
)
|
| 156 |
translated_fragments.append(translated_text)
|
| 157 |
|
| 158 |
-
# Replace the original text with translated text in the HTML structure
|
| 159 |
translated_html = html_processor.replace_text(dom_data, translated_fragments)
|
| 160 |
|
| 161 |
return {"translated_html": translated_html}
|
|
@@ -175,10 +162,8 @@ async def process_document(
|
|
| 175 |
raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
|
| 176 |
|
| 177 |
try:
|
| 178 |
-
# Read file content
|
| 179 |
file_content = await file.read()
|
| 180 |
|
| 181 |
-
# Process document to extract text
|
| 182 |
extracted_text = document_processor.process_document(
|
| 183 |
file_data=file_content,
|
| 184 |
filename=file.filename,
|
|
@@ -191,7 +176,6 @@ async def process_document(
|
|
| 191 |
detail="No text could be extracted from the document"
|
| 192 |
)
|
| 193 |
|
| 194 |
-
# Translate the extracted text using our more efficient model
|
| 195 |
translated_text = model.translate(
|
| 196 |
extracted_text,
|
| 197 |
source_lang_code,
|
|
@@ -207,4 +191,5 @@ async def process_document(
|
|
| 207 |
raise HTTPException(status_code=500, detail=str(e))
|
| 208 |
|
| 209 |
if __name__ == "__main__":
|
| 210 |
-
uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
import uvicorn
|
| 6 |
+
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
from app.models.document_processor import DocumentProcessor
|
| 11 |
from app.models.html_processor import HTMLProcessor
|
| 12 |
from app.models.text_chunker import TextChunker
|
| 13 |
+
from app.models.translation_model import TranslationModel
|
| 14 |
|
|
|
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
| 17 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 18 |
)
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
|
|
|
| 21 |
app = FastAPI(
|
| 22 |
title="Universal Translator API",
|
| 23 |
description="API for text, HTML, and document translation services",
|
| 24 |
version="1.0.0"
|
| 25 |
)
|
| 26 |
|
|
|
|
| 27 |
app.add_middleware(
|
| 28 |
CORSMiddleware,
|
| 29 |
+
allow_origins=["*"],
|
| 30 |
allow_credentials=True,
|
| 31 |
allow_methods=["*"],
|
| 32 |
allow_headers=["*"],
|
| 33 |
)
|
| 34 |
|
|
|
|
| 35 |
try:
|
|
|
|
| 36 |
model = TranslationModel()
|
| 37 |
html_processor = HTMLProcessor()
|
| 38 |
text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
|
|
|
|
| 43 |
logger.error(f"Error initializing components: {str(e)}")
|
| 44 |
initialization_error = str(e)
|
| 45 |
|
|
|
|
| 46 |
class TranslationRequest(BaseModel):
|
| 47 |
text: str
|
| 48 |
source_lang_code: str
|
|
|
|
| 91 |
raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
|
| 92 |
|
| 93 |
try:
|
|
|
|
| 94 |
logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
|
| 95 |
|
|
|
|
| 96 |
chunks = text_chunker.create_chunks(request.text)
|
| 97 |
translated_chunks = []
|
| 98 |
|
|
|
|
| 99 |
for chunk in chunks:
|
| 100 |
translated_text = model.translate(
|
| 101 |
chunk.text,
|
|
|
|
| 104 |
)
|
| 105 |
translated_chunks.append(translated_text)
|
| 106 |
|
|
|
|
| 107 |
final_translation = text_chunker.combine_translations(
|
| 108 |
request.text, chunks, translated_chunks
|
| 109 |
)
|
|
|
|
| 120 |
raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
|
| 121 |
|
| 122 |
try:
|
|
|
|
| 123 |
text_fragments, dom_data = html_processor.extract_text(request.html)
|
| 124 |
|
| 125 |
if not text_fragments:
|
| 126 |
return {"translated_html": request.html} # No text to translate
|
| 127 |
|
|
|
|
| 128 |
translated_fragments = []
|
| 129 |
|
|
|
|
| 130 |
batch_size = 10
|
| 131 |
for i in range(0, len(text_fragments), batch_size):
|
| 132 |
batch = text_fragments[i:i+batch_size]
|
|
|
|
| 143 |
)
|
| 144 |
translated_fragments.append(translated_text)
|
| 145 |
|
|
|
|
| 146 |
translated_html = html_processor.replace_text(dom_data, translated_fragments)
|
| 147 |
|
| 148 |
return {"translated_html": translated_html}
|
|
|
|
| 162 |
raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
|
| 163 |
|
| 164 |
try:
|
|
|
|
| 165 |
file_content = await file.read()
|
| 166 |
|
|
|
|
| 167 |
extracted_text = document_processor.process_document(
|
| 168 |
file_data=file_content,
|
| 169 |
filename=file.filename,
|
|
|
|
| 176 |
detail="No text could be extracted from the document"
|
| 177 |
)
|
| 178 |
|
|
|
|
| 179 |
translated_text = model.translate(
|
| 180 |
extracted_text,
|
| 181 |
source_lang_code,
|
|
|
|
| 191 |
raise HTTPException(status_code=500, detail=str(e))
|
| 192 |
|
| 193 |
if __name__ == "__main__":
|
| 194 |
+
uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
|
| 195 |
+
|
app/models/document_processor.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
-
import fitz # PyMuPDF
|
| 2 |
import logging
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
|
|
|
|
|
| 5 |
logger = logging.getLogger(__name__)
|
| 6 |
|
| 7 |
class DocumentProcessor:
|
|
@@ -35,15 +36,11 @@ class DocumentProcessor:
|
|
| 35 |
if file_ext not in self.supported_formats:
|
| 36 |
raise ValueError(f"Unsupported file format: {file_ext}")
|
| 37 |
|
| 38 |
-
# Process PDF using PyMuPDF
|
| 39 |
if file_ext == '.pdf':
|
| 40 |
return self._process_pdf(file_data)
|
| 41 |
|
| 42 |
-
# Process image (placeholder - would need OCR integration)
|
| 43 |
else:
|
| 44 |
if use_ocr:
|
| 45 |
-
# Placeholder for OCR implementation
|
| 46 |
-
# You would integrate with an OCR service here
|
| 47 |
raise NotImplementedError("OCR for images not implemented")
|
| 48 |
else:
|
| 49 |
return "Text extraction from images requires OCR to be enabled"
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
from pathlib import Path
|
| 3 |
|
| 4 |
+
import fitz # PyMuPDF
|
| 5 |
+
|
| 6 |
logger = logging.getLogger(__name__)
|
| 7 |
|
| 8 |
class DocumentProcessor:
|
|
|
|
| 36 |
if file_ext not in self.supported_formats:
|
| 37 |
raise ValueError(f"Unsupported file format: {file_ext}")
|
| 38 |
|
|
|
|
| 39 |
if file_ext == '.pdf':
|
| 40 |
return self._process_pdf(file_data)
|
| 41 |
|
|
|
|
| 42 |
else:
|
| 43 |
if use_ocr:
|
|
|
|
|
|
|
| 44 |
raise NotImplementedError("OCR for images not implemented")
|
| 45 |
else:
|
| 46 |
return "Text extraction from images requires OCR to be enabled"
|
app/models/html_processor.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import logging
|
|
|
|
|
|
|
| 2 |
from bs4 import BeautifulSoup, NavigableString, Tag
|
| 3 |
-
from typing import List, Tuple, Dict, Any
|
| 4 |
|
| 5 |
logger = logging.getLogger(__name__)
|
| 6 |
|
|
@@ -30,14 +31,11 @@ class HTMLProcessor:
|
|
| 30 |
- DOM map that maintains references to the exact nodes in the original structure
|
| 31 |
"""
|
| 32 |
try:
|
| 33 |
-
# Parse the HTML using 'html.parser' to ensure proper handling
|
| 34 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 35 |
|
| 36 |
-
# Use a list to store text fragments and their corresponding nodes
|
| 37 |
text_fragments = []
|
| 38 |
dom_map = {}
|
| 39 |
|
| 40 |
-
# Process the soup to find all text nodes
|
| 41 |
self._extract_text_from_node(soup, text_fragments, dom_map)
|
| 42 |
|
| 43 |
return text_fragments, {'soup': soup, 'node_map': dom_map}
|
|
@@ -56,24 +54,19 @@ class HTMLProcessor:
|
|
| 56 |
dom_map: Dictionary to map indices to nodes
|
| 57 |
path: Current path in the DOM tree for debugging
|
| 58 |
"""
|
| 59 |
-
# Skip processing for certain tags
|
| 60 |
if isinstance(node, Tag) and node.name in self.skip_tags:
|
| 61 |
return
|
| 62 |
|
| 63 |
-
# Skip elements with notranslate class
|
| 64 |
if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
|
| 65 |
return
|
| 66 |
|
| 67 |
-
# Process this node
|
| 68 |
if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
|
| 69 |
-
# Only process non-empty text
|
| 70 |
text = str(node).strip()
|
| 71 |
if text:
|
| 72 |
index = len(text_fragments)
|
| 73 |
text_fragments.append(text)
|
| 74 |
dom_map[index] = node
|
| 75 |
|
| 76 |
-
# Recursively process child nodes
|
| 77 |
if isinstance(node, Tag):
|
| 78 |
for child in node.children:
|
| 79 |
child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
|
|
@@ -98,13 +91,10 @@ class HTMLProcessor:
|
|
| 98 |
logger.error("Invalid DOM data for text replacement")
|
| 99 |
return ""
|
| 100 |
|
| 101 |
-
# Replace text in each node
|
| 102 |
for index, node in node_map.items():
|
| 103 |
if index < len(translated_fragments):
|
| 104 |
-
# Replace the original string with the translated string
|
| 105 |
node.replace_with(NavigableString(translated_fragments[index]))
|
| 106 |
|
| 107 |
-
# Return the HTML as a string
|
| 108 |
return str(soup)
|
| 109 |
|
| 110 |
except Exception as e:
|
|
|
|
| 1 |
import logging
|
| 2 |
+
from typing import Any, Dict, List, Tuple
|
| 3 |
+
|
| 4 |
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
|
|
| 5 |
|
| 6 |
logger = logging.getLogger(__name__)
|
| 7 |
|
|
|
|
| 31 |
- DOM map that maintains references to the exact nodes in the original structure
|
| 32 |
"""
|
| 33 |
try:
|
|
|
|
| 34 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 35 |
|
|
|
|
| 36 |
text_fragments = []
|
| 37 |
dom_map = {}
|
| 38 |
|
|
|
|
| 39 |
self._extract_text_from_node(soup, text_fragments, dom_map)
|
| 40 |
|
| 41 |
return text_fragments, {'soup': soup, 'node_map': dom_map}
|
|
|
|
| 54 |
dom_map: Dictionary to map indices to nodes
|
| 55 |
path: Current path in the DOM tree for debugging
|
| 56 |
"""
|
|
|
|
| 57 |
if isinstance(node, Tag) and node.name in self.skip_tags:
|
| 58 |
return
|
| 59 |
|
|
|
|
| 60 |
if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
|
| 61 |
return
|
| 62 |
|
|
|
|
| 63 |
if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
|
|
|
|
| 64 |
text = str(node).strip()
|
| 65 |
if text:
|
| 66 |
index = len(text_fragments)
|
| 67 |
text_fragments.append(text)
|
| 68 |
dom_map[index] = node
|
| 69 |
|
|
|
|
| 70 |
if isinstance(node, Tag):
|
| 71 |
for child in node.children:
|
| 72 |
child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
|
|
|
|
| 91 |
logger.error("Invalid DOM data for text replacement")
|
| 92 |
return ""
|
| 93 |
|
|
|
|
| 94 |
for index, node in node_map.items():
|
| 95 |
if index < len(translated_fragments):
|
|
|
|
| 96 |
node.replace_with(NavigableString(translated_fragments[index]))
|
| 97 |
|
|
|
|
| 98 |
return str(soup)
|
| 99 |
|
| 100 |
except Exception as e:
|
app/models/text_chunker.py
CHANGED
|
@@ -1,17 +1,15 @@
|
|
| 1 |
-
import re
|
| 2 |
import logging
|
| 3 |
import os
|
| 4 |
-
import nltk
|
| 5 |
-
|
| 6 |
-
from typing import List, Optional
|
| 7 |
from dataclasses import dataclass
|
|
|
|
|
|
|
|
|
|
| 8 |
from nltk.tokenize import sent_tokenize
|
| 9 |
|
| 10 |
-
# Set NLTK data path from environment variable if available
|
| 11 |
nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
|
| 12 |
nltk.data.path.append(nltk_data_path)
|
| 13 |
|
| 14 |
-
# Ensure NLTK data is downloaded
|
| 15 |
try:
|
| 16 |
nltk.data.find('tokenizers/punkt')
|
| 17 |
except LookupError:
|
|
@@ -19,7 +17,6 @@ except LookupError:
|
|
| 19 |
nltk.download('punkt', download_dir=nltk_data_path)
|
| 20 |
except Exception as e:
|
| 21 |
logging.warning(f"Failed to download NLTK data: {e}")
|
| 22 |
-
# Fallback to not using NLTK if download fails
|
| 23 |
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
@@ -62,22 +59,16 @@ class TextChunker:
|
|
| 62 |
if not text:
|
| 63 |
return ""
|
| 64 |
|
| 65 |
-
# Replace multiple newlines with single \n
|
| 66 |
text = re.sub(r'\n\s*\n', '\n', text)
|
| 67 |
|
| 68 |
-
# Replace other whitespace characters with space
|
| 69 |
text = re.sub(r'[\r\t\f\v]', ' ', text)
|
| 70 |
|
| 71 |
-
# Replace multiple spaces with single space
|
| 72 |
text = re.sub(r' +', ' ', text)
|
| 73 |
|
| 74 |
-
# Clean up spaces around newlines
|
| 75 |
text = re.sub(r' *\n *', '\n', text)
|
| 76 |
|
| 77 |
-
# Remove spaces at the start and end of the text
|
| 78 |
text = text.strip()
|
| 79 |
|
| 80 |
-
# Handle bullet points and lists consistently
|
| 81 |
text = re.sub(r'•\s*', '• ', text)
|
| 82 |
text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
|
| 83 |
|
|
@@ -88,7 +79,6 @@ class TextChunker:
|
|
| 88 |
Estimate the number of tokens in a text string.
|
| 89 |
This is a rough approximation - actual token count may vary by tokenizer.
|
| 90 |
"""
|
| 91 |
-
# Split on whitespace and punctuation
|
| 92 |
words = re.findall(r'\b\w+\b|[^\w\s]', text)
|
| 93 |
return len(words)
|
| 94 |
|
|
@@ -98,7 +88,6 @@ class TextChunker:
|
|
| 98 |
return sent_tokenize(text)
|
| 99 |
except Exception as e:
|
| 100 |
logger.warning(f"Error in sentence tokenization: {e}")
|
| 101 |
-
# Fallback to simple period-based splitting
|
| 102 |
return [s.strip() + '.' for s in text.split('.') if s.strip()]
|
| 103 |
|
| 104 |
def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
|
|
@@ -114,7 +103,6 @@ class TextChunker:
|
|
| 114 |
sentence = sentences[i]
|
| 115 |
sentence_tokens = self.estimate_tokens(sentence)
|
| 116 |
|
| 117 |
-
# If single sentence exceeds max tokens, split it
|
| 118 |
if sentence_tokens > max_tokens:
|
| 119 |
if not current_sentences: # First sentence
|
| 120 |
words = sentence.split()
|
|
@@ -134,7 +122,6 @@ class TextChunker:
|
|
| 134 |
return chunk_text, i, is_partial
|
| 135 |
break
|
| 136 |
|
| 137 |
-
# Check if adding this sentence would exceed the limit
|
| 138 |
if current_tokens + sentence_tokens > max_tokens and current_sentences:
|
| 139 |
break
|
| 140 |
|
|
@@ -160,13 +147,11 @@ class TextChunker:
|
|
| 160 |
chunks = []
|
| 161 |
current_idx = 0
|
| 162 |
|
| 163 |
-
# Split into paragraphs if preserve_paragraphs is True
|
| 164 |
if self.preserve_paragraphs:
|
| 165 |
paragraphs = text.split('\n')
|
| 166 |
else:
|
| 167 |
paragraphs = [text]
|
| 168 |
|
| 169 |
-
# Process each paragraph
|
| 170 |
for para in paragraphs:
|
| 171 |
if not para.strip():
|
| 172 |
continue
|
|
@@ -182,7 +167,6 @@ class TextChunker:
|
|
| 182 |
if not chunk_text:
|
| 183 |
break
|
| 184 |
|
| 185 |
-
# Calculate original text positions
|
| 186 |
original_start = text.find(chunk_text)
|
| 187 |
original_end = original_start + len(chunk_text)
|
| 188 |
|
|
@@ -222,11 +206,9 @@ class TextChunker:
|
|
| 222 |
if len(chunks) == 1:
|
| 223 |
return translations[0]
|
| 224 |
|
| 225 |
-
# Combine translations, handling partial sentences
|
| 226 |
result = []
|
| 227 |
for i, (chunk, translation) in enumerate(zip(chunks, translations)):
|
| 228 |
if i > 0 and chunk.is_partial_sentence:
|
| 229 |
-
# For partial sentences, try to find a clean break point
|
| 230 |
prev_translation = translations[i-1]
|
| 231 |
overlap = self._find_overlap(prev_translation, translation)
|
| 232 |
if overlap:
|
|
@@ -241,15 +223,14 @@ class TextChunker:
|
|
| 241 |
if not text1 or not text2:
|
| 242 |
return None
|
| 243 |
|
| 244 |
-
# Get the last part of text1 and first part of text2
|
| 245 |
end_text = text1[-100:] # Look at last 100 chars
|
| 246 |
start_text = text2[:100] # Look at first 100 chars
|
| 247 |
|
| 248 |
-
# Find the longest common substring
|
| 249 |
overlap = None
|
| 250 |
for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
|
| 251 |
if end_text[-length:] == start_text[:length]:
|
| 252 |
overlap = start_text[:length]
|
| 253 |
break
|
| 254 |
|
| 255 |
-
return overlap
|
|
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
import os
|
| 3 |
+
import re
|
|
|
|
|
|
|
| 4 |
from dataclasses import dataclass
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
import nltk
|
| 8 |
from nltk.tokenize import sent_tokenize
|
| 9 |
|
|
|
|
| 10 |
nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
|
| 11 |
nltk.data.path.append(nltk_data_path)
|
| 12 |
|
|
|
|
| 13 |
try:
|
| 14 |
nltk.data.find('tokenizers/punkt')
|
| 15 |
except LookupError:
|
|
|
|
| 17 |
nltk.download('punkt', download_dir=nltk_data_path)
|
| 18 |
except Exception as e:
|
| 19 |
logging.warning(f"Failed to download NLTK data: {e}")
|
|
|
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
|
|
|
| 59 |
if not text:
|
| 60 |
return ""
|
| 61 |
|
|
|
|
| 62 |
text = re.sub(r'\n\s*\n', '\n', text)
|
| 63 |
|
|
|
|
| 64 |
text = re.sub(r'[\r\t\f\v]', ' ', text)
|
| 65 |
|
|
|
|
| 66 |
text = re.sub(r' +', ' ', text)
|
| 67 |
|
|
|
|
| 68 |
text = re.sub(r' *\n *', '\n', text)
|
| 69 |
|
|
|
|
| 70 |
text = text.strip()
|
| 71 |
|
|
|
|
| 72 |
text = re.sub(r'•\s*', '• ', text)
|
| 73 |
text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
|
| 74 |
|
|
|
|
| 79 |
Estimate the number of tokens in a text string.
|
| 80 |
This is a rough approximation - actual token count may vary by tokenizer.
|
| 81 |
"""
|
|
|
|
| 82 |
words = re.findall(r'\b\w+\b|[^\w\s]', text)
|
| 83 |
return len(words)
|
| 84 |
|
|
|
|
| 88 |
return sent_tokenize(text)
|
| 89 |
except Exception as e:
|
| 90 |
logger.warning(f"Error in sentence tokenization: {e}")
|
|
|
|
| 91 |
return [s.strip() + '.' for s in text.split('.') if s.strip()]
|
| 92 |
|
| 93 |
def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
|
|
|
|
| 103 |
sentence = sentences[i]
|
| 104 |
sentence_tokens = self.estimate_tokens(sentence)
|
| 105 |
|
|
|
|
| 106 |
if sentence_tokens > max_tokens:
|
| 107 |
if not current_sentences: # First sentence
|
| 108 |
words = sentence.split()
|
|
|
|
| 122 |
return chunk_text, i, is_partial
|
| 123 |
break
|
| 124 |
|
|
|
|
| 125 |
if current_tokens + sentence_tokens > max_tokens and current_sentences:
|
| 126 |
break
|
| 127 |
|
|
|
|
| 147 |
chunks = []
|
| 148 |
current_idx = 0
|
| 149 |
|
|
|
|
| 150 |
if self.preserve_paragraphs:
|
| 151 |
paragraphs = text.split('\n')
|
| 152 |
else:
|
| 153 |
paragraphs = [text]
|
| 154 |
|
|
|
|
| 155 |
for para in paragraphs:
|
| 156 |
if not para.strip():
|
| 157 |
continue
|
|
|
|
| 167 |
if not chunk_text:
|
| 168 |
break
|
| 169 |
|
|
|
|
| 170 |
original_start = text.find(chunk_text)
|
| 171 |
original_end = original_start + len(chunk_text)
|
| 172 |
|
|
|
|
| 206 |
if len(chunks) == 1:
|
| 207 |
return translations[0]
|
| 208 |
|
|
|
|
| 209 |
result = []
|
| 210 |
for i, (chunk, translation) in enumerate(zip(chunks, translations)):
|
| 211 |
if i > 0 and chunk.is_partial_sentence:
|
|
|
|
| 212 |
prev_translation = translations[i-1]
|
| 213 |
overlap = self._find_overlap(prev_translation, translation)
|
| 214 |
if overlap:
|
|
|
|
| 223 |
if not text1 or not text2:
|
| 224 |
return None
|
| 225 |
|
|
|
|
| 226 |
end_text = text1[-100:] # Look at last 100 chars
|
| 227 |
start_text = text2[:100] # Look at first 100 chars
|
| 228 |
|
|
|
|
| 229 |
overlap = None
|
| 230 |
for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
|
| 231 |
if end_text[-length:] == start_text[:length]:
|
| 232 |
overlap = start_text[:length]
|
| 233 |
break
|
| 234 |
|
| 235 |
+
return overlap
|
| 236 |
+
|
app/models/translation_model.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
-
import torch
|
| 2 |
import logging
|
| 3 |
-
import re
|
| 4 |
import os
|
| 5 |
-
|
| 6 |
-
from
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|
|
|
|
|
|
|
| 1 |
import logging
|
|
|
|
| 2 |
import os
|
| 3 |
+
import re
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 8 |
|
| 9 |
logger = logging.getLogger(__name__)
|
| 10 |
|