Initial commit of Universal Translator API
- Dockerfile +33 -0
- README.md +88 -2
- api_server.py +160 -0
- app/models/document_processor.py +68 -0
- app/models/html_processor.py +112 -0
- app/models/text_chunker.py +246 -0
- app/models/translation_model.py +132 -0
- requirements.txt +13 -0
- setup.sh +28 -0
Dockerfile
ADDED
@@ -0,0 +1,33 @@
```dockerfile
FROM python:3.10-bullseye

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    libffi-dev \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install PyTorch with CUDA support
RUN pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html

# Copy requirements file
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Expose the port for the API
EXPOSE 7860

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV TRANSFORMERS_CACHE=/app/.cache
ENV HF_HOME=/app/.cache

# Run the API server
CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]
```
README.md
CHANGED
````diff
@@ -1,6 +1,6 @@
 ---
 title: Lt Space
-emoji:
+emoji: 🗣️
 colorFrom: yellow
 colorTo: purple
 sdk: docker
@@ -9,4 +9,90 @@ license: mit
 short_description: Language translation space
 ---
 
-
+# Universal Translator API
+
+This is a Hugging Face Spaces deployment of the Universal Translator API service, which provides translation capabilities using the MADLAD-400 3B model.
+
+## Features
+
+- Text translation across 450+ languages
+- HTML translation with structure preservation
+- Document translation (PDF, images) with optional OCR
+- Efficient chunking for long text translation
+- GPU-accelerated inference
+
+## API Endpoints
+
+### Health Check
+```
+GET /
+```
+Returns the status of the service and model information.
+
+### Text Translation
+```
+POST /translate
+```
+Translates text from one language to another.
+
+**Request Body:**
+```json
+{
+  "text": "Text to translate",
+  "source_lang_code": "en",
+  "target_lang_code": "fr"
+}
+```
+
+### HTML Translation
+```
+POST /translate-html
+```
+Translates HTML content while preserving the HTML structure.
+
+**Request Body:**
+```json
+{
+  "html": "<p>Text to translate</p>",
+  "source_lang_code": "en",
+  "target_lang_code": "fr"
+}
+```
+
+### Document Translation
+```
+POST /process-document
+```
+Processes and translates PDF or image files.
+
+**Form Data:**
+- `file`: The document file (PDF or image)
+- `source_lang_code`: Source language code (e.g., "en")
+- `target_lang_code`: Target language code (e.g., "fr")
+- `use_ocr`: Whether to use OCR (boolean)
+
+## Language Codes
+
+The API uses the following language codes (ISO 639-1):
+
+- `en`: English
+- `fr`: French
+- `es`: Spanish
+- `de`: German
+- And many more (450+ languages supported by MADLAD-400)
+
+## Deployment
+
+This API is deployed on Hugging Face Spaces using Docker. The web interface is hosted separately on Render.
+
+## Model
+
+The service uses the Google MADLAD-400 3B model for translation.
+
+```
+google/madlad400-3b-mt
+```
+
+## License
+
+This project is available under the MIT License.
````
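For quick manual testing of the endpoints documented above, here is a minimal client sketch. It assumes the `requests` package (not in requirements.txt), a hypothetical `BASE_URL`, and a local `sample.pdf`; point it at your own Space or a local `uvicorn` run on port 7860.

```python
import requests

BASE_URL = "http://localhost:7860"  # hypothetical; replace with your Space URL

# Health check
print(requests.get(f"{BASE_URL}/").json())

# Text translation
resp = requests.post(
    f"{BASE_URL}/translate",
    json={"text": "Hello, world!", "source_lang_code": "en", "target_lang_code": "fr"},
)
print(resp.json()["translated_text"])

# HTML translation
resp = requests.post(
    f"{BASE_URL}/translate-html",
    json={"html": "<p>Hello</p>", "source_lang_code": "en", "target_lang_code": "fr"},
)
print(resp.json()["translated_html"])

# Document translation (multipart form); assumes a local sample.pdf exists
with open("sample.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/process-document",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"source_lang_code": "en", "target_lang_code": "fr", "use_ocr": "false"},
    )
print(resp.json())
```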
api_server.py
ADDED
@@ -0,0 +1,160 @@
```python
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import logging
import uvicorn
from app.models.translation_model import TranslationModel
from app.models.html_processor import HTMLProcessor
from app.models.text_chunker import TextChunker

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize FastAPI app
app = FastAPI(
    title="Universal Translator API",
    description="API for text, HTML, and document translation services",
    version="1.0.0"
)

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize translation model
model = TranslationModel()
html_processor = HTMLProcessor()
text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)

# Define request/response models
class TranslationRequest(BaseModel):
    text: str
    source_lang_code: str
    target_lang_code: str

class TranslationResponse(BaseModel):
    translated_text: str

class HTMLTranslationRequest(BaseModel):
    html: str
    source_lang_code: str
    target_lang_code: str

class HTMLTranslationResponse(BaseModel):
    translated_html: str

@app.get("/")
async def root():
    """Health check endpoint"""
    return {"status": "ok", "model": "MADLAD-400", "version": "3B"}

@app.post("/translate", response_model=TranslationResponse)
async def translate_text(request: TranslationRequest):
    """Translate text from source to target language"""
    try:
        # Get chunks using TextChunker
        chunks = text_chunker.create_chunks(request.text)
        translated_chunks = []

        # Translate each chunk
        for chunk in chunks:
            translated_text = model.translate(
                chunk.text,
                request.source_lang_code,
                request.target_lang_code
            )
            translated_chunks.append(translated_text)

        # Combine translations
        final_translation = text_chunker.combine_translations(
            request.text, chunks, translated_chunks
        )

        return {"translated_text": final_translation}
    except Exception as e:
        logger.error(f"Translation error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/translate-html", response_model=HTMLTranslationResponse)
async def translate_html(request: HTMLTranslationRequest):
    """Translate HTML content while preserving structure"""
    try:
        # Extract text and maintain exact DOM structure
        text_fragments, dom_data = html_processor.extract_text(request.html)

        if not text_fragments:
            return {"translated_html": request.html}  # No text to translate

        # Process each text fragment individually
        translated_fragments = []
        for fragment in text_fragments:
            if not fragment.strip():
                translated_fragments.append(fragment)
                continue

            translated_text = model.translate(
                fragment,
                request.source_lang_code,
                request.target_lang_code
            )
            translated_fragments.append(translated_text)

        # Replace the original text with translated text in the HTML structure
        translated_html = html_processor.replace_text(dom_data, translated_fragments)

        return {"translated_html": translated_html}
    except Exception as e:
        logger.error(f"HTML translation error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/process-document")
async def process_document(
    file: UploadFile = File(...),
    source_lang_code: str = Form(...),
    target_lang_code: str = Form(...),
    use_ocr: bool = Form(False)
):
    """Process and translate document (PDF or image)"""
    try:
        # Read file content
        file_content = await file.read()

        # Process document to extract text
        extracted_text = model.process_document(
            file_content,
            file.filename,
            use_ocr=use_ocr
        )

        if not extracted_text:
            raise HTTPException(
                status_code=400,
                detail="No text could be extracted from the document"
            )

        # Translate the extracted text
        translated_text = model.translate(
            extracted_text,
            source_lang_code,
            target_lang_code
        )

        return {
            "extracted_text": extracted_text,
            "translated_text": translated_text
        }
    except HTTPException:
        # Propagate intentional HTTP errors (e.g., the 400 above) unchanged
        raise
    except Exception as e:
        logger.error(f"Document processing error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
```
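Because api_server.py instantiates `TranslationModel()` at import time, a lightweight smoke test has to stub that class out before the module is imported. Below is a sketch using `unittest.mock` and FastAPI's `TestClient`; the file name is hypothetical, it is not part of this commit, and it assumes the usual test-client HTTP dependency (requests/httpx, depending on the FastAPI/Starlette version) is installed.

```python
# test_api_sketch.py - illustrative only; not part of this commit.
# Stub out the heavy 3B model before api_server is imported.
from unittest.mock import patch, MagicMock

fake_model = MagicMock()
fake_model.translate.side_effect = lambda text, src, tgt: f"[{tgt}] {text}"

with patch("app.models.translation_model.TranslationModel", return_value=fake_model):
    import api_server  # module-level `model = TranslationModel()` now gets the stub

from fastapi.testclient import TestClient

client = TestClient(api_server.app)

# Health check
assert client.get("/").json()["status"] == "ok"

# /translate goes through the real TextChunker and the stubbed model
resp = client.post(
    "/translate",
    json={"text": "Hello world.", "source_lang_code": "en", "target_lang_code": "fr"},
)
print(resp.json())  # {"translated_text": "[fr] Hello world."}
```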
app/models/document_processor.py
ADDED
@@ -0,0 +1,68 @@
```python
import fitz  # PyMuPDF
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

class DocumentProcessor:
    """Simplified document processor for the API service"""

    def __init__(self):
        """Initialize the document processor"""
        self.supported_formats = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}

    def process_document(
        self,
        file_data: bytes,
        filename: str,
        use_ocr: bool = False
    ) -> str:
        """
        Extract text from document (PDF or image)

        Args:
            file_data: Raw file content
            filename: Original filename
            use_ocr: Whether to use OCR (not implemented in this simplified version)

        Returns:
            Extracted text as string
        """
        try:
            file_ext = Path(filename).suffix.lower()
            logger.info(f"Processing file: {filename} with extension: {file_ext}")

            if file_ext not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_ext}")

            # Process PDF using PyMuPDF
            if file_ext == '.pdf':
                return self._process_pdf(file_data)

            # Process image (placeholder - would need OCR integration)
            else:
                if use_ocr:
                    # Placeholder for OCR implementation
                    # You would integrate with an OCR service here
                    raise NotImplementedError("OCR for images not implemented")
                else:
                    return "Text extraction from images requires OCR to be enabled"

        except Exception as e:
            logger.error(f"Error processing document: {str(e)}")
            raise

    def _process_pdf(self, file_data: bytes) -> str:
        """Process PDF to extract text using PyMuPDF"""
        try:
            with fitz.open(stream=file_data, filetype="pdf") as pdf_doc:
                text_parts = []
                for page_num in range(len(pdf_doc)):
                    page = pdf_doc[page_num]
                    text = page.get_text()
                    text_parts.append(text)

                return "\n\n".join(text_parts)
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            raise
```
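The image branch above only raises or returns a placeholder message. If OCR were wired in, one plausible hook would use pytesseract with Pillow; Pillow is already in requirements.txt, but pytesseract and the system Tesseract binary are not, so this is purely an illustrative sketch.

```python
# Hypothetical OCR hook for the image branch above; pytesseract and the
# Tesseract binary are NOT dependencies of this repo.
import io

from PIL import Image
import pytesseract  # assumed to be installed separately


def ocr_image(file_data: bytes, lang: str = "eng") -> str:
    """Run Tesseract OCR over raw image bytes and return the extracted text."""
    image = Image.open(io.BytesIO(file_data))
    return pytesseract.image_to_string(image, lang=lang)
```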
app/models/html_processor.py
ADDED
@@ -0,0 +1,112 @@
```python
import logging
from bs4 import BeautifulSoup, NavigableString, Tag
from typing import List, Tuple, Dict, Any

logger = logging.getLogger(__name__)

class HTMLProcessor:
    """
    A processor for HTML content that preserves exact HTML structure
    while only translating text content.
    """

    def __init__(self):
        self.skip_translation_class = 'notranslate'
        self.skip_tags = {
            'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
            'link', 'iframe', 'noscript', 'svg', 'path', 'img'
        }

    def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
        """
        Extract translatable text nodes from HTML content while preserving exact structure.

        Args:
            html_content: HTML content as a string

        Returns:
            A tuple containing:
            - List of text fragments to translate
            - DOM map that maintains references to the exact nodes in the original structure
        """
        try:
            # Parse the HTML using 'html.parser' to ensure proper handling
            soup = BeautifulSoup(html_content, 'html.parser')

            # Use a list to store text fragments and their corresponding nodes
            text_fragments = []
            dom_map = {}

            # Process the soup to find all text nodes
            self._extract_text_from_node(soup, text_fragments, dom_map)

            return text_fragments, {'soup': soup, 'node_map': dom_map}

        except Exception as e:
            logger.error(f"Error extracting text from HTML: {str(e)}")
            return [], {}

    def _extract_text_from_node(self, node, text_fragments: List[str], dom_map: Dict[int, Any], path: str = ""):
        """
        Recursively extract text from nodes while maintaining exact structure.

        Args:
            node: The current BeautifulSoup node
            text_fragments: List to store extracted text
            dom_map: Dictionary to map indices to nodes
            path: Current path in the DOM tree for debugging
        """
        # Skip processing for certain tags
        if isinstance(node, Tag) and node.name in self.skip_tags:
            return

        # Skip elements with notranslate class
        if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
            return

        # Process this node
        if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
            # Only process non-empty text
            text = str(node).strip()
            if text:
                index = len(text_fragments)
                text_fragments.append(text)
                dom_map[index] = node

        # Recursively process child nodes
        if isinstance(node, Tag):
            for child in node.children:
                child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
                self._extract_text_from_node(child, text_fragments, dom_map, child_path)

    def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
        """
        Replace the original text with translated text while keeping exact HTML structure.

        Args:
            dom_data: DOM data containing soup and node map
            translated_fragments: List of translated text fragments

        Returns:
            HTML content with translated text and preserved structure
        """
        try:
            soup = dom_data.get('soup')
            node_map = dom_data.get('node_map', {})

            if not soup or not node_map:
                logger.error("Invalid DOM data for text replacement")
                return ""

            # Replace text in each node
            for index, node in node_map.items():
                if index < len(translated_fragments):
                    # Replace the original string with the translated string
                    node.replace_with(NavigableString(translated_fragments[index]))

            # Return the HTML as a string
            return str(soup)

        except Exception as e:
            logger.error(f"Error replacing text in HTML: {str(e)}")
            return ""
```
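A quick round trip showing how `extract_text` and `replace_text` cooperate, with uppercasing as a stand-in for translation so no model is needed (illustrative only):

```python
from app.models.html_processor import HTMLProcessor

processor = HTMLProcessor()
html = '<div><p>Hello</p><p>world</p><pre>left alone</pre></div>'

# Pull out only the translatable text nodes; <pre> content is skipped
fragments, dom_data = processor.extract_text(html)
print(fragments)  # ['Hello', 'world']

# Stand-in "translation": uppercase each fragment
translated = [f.upper() for f in fragments]

# Write the fragments back into the original DOM structure
print(processor.replace_text(dom_data, translated))
# <div><p>HELLO</p><p>WORLD</p><pre>left alone</pre></div>
```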
app/models/text_chunker.py
ADDED
@@ -0,0 +1,246 @@
```python
import re
import logging
import nltk

from typing import List, Optional
from dataclasses import dataclass
from nltk.tokenize import sent_tokenize

# Ensure NLTK data is downloaded
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

logger = logging.getLogger(__name__)

@dataclass
class TextChunk:
    """Class to represent a chunk of text with metadata"""
    text: str
    index: int
    token_count: int
    is_partial_sentence: bool = False
    original_start: int = 0
    original_end: int = 0

class TextChunker:
    """
    A utility class for chunking large texts into smaller pieces while preserving
    sentence boundaries and context where possible.
    """

    def __init__(
        self,
        max_tokens: int = 450,
        overlap_tokens: int = 50,
        preserve_paragraphs: bool = True
    ):
        """
        Initialize the TextChunker.

        Args:
            max_tokens: Maximum number of tokens per chunk
            overlap_tokens: Number of tokens to overlap between chunks
            preserve_paragraphs: Whether to try to preserve paragraph boundaries
        """
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens
        self.preserve_paragraphs = preserve_paragraphs

    def preprocess_text(self, text: str) -> str:
        """Clean and normalize text before chunking."""
        if not text:
            return ""

        # Replace multiple newlines with single \n
        text = re.sub(r'\n\s*\n', '\n', text)

        # Replace other whitespace characters with space
        text = re.sub(r'[\r\t\f\v]', ' ', text)

        # Replace multiple spaces with single space
        text = re.sub(r' +', ' ', text)

        # Clean up spaces around newlines
        text = re.sub(r' *\n *', '\n', text)

        # Remove spaces at the start and end of the text
        text = text.strip()

        # Handle bullet points and lists consistently
        text = re.sub(r'•\s*', '• ', text)
        text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)

        return text

    def estimate_tokens(self, text: str) -> int:
        """
        Estimate the number of tokens in a text string.
        This is a rough approximation - actual token count may vary by tokenizer.
        """
        # Split on whitespace and punctuation
        words = re.findall(r'\b\w+\b|[^\w\s]', text)
        return len(words)

    def split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences using NLTK."""
        try:
            return sent_tokenize(text)
        except Exception as e:
            logger.warning(f"Error in sentence tokenization: {e}")
            # Fallback to simple period-based splitting
            return [s.strip() + '.' for s in text.split('.') if s.strip()]

    def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
        """
        Get chunk text starting from start_idx that fits within max_tokens.
        Returns tuple of (chunk_text, end_idx, is_partial_sentence).
        """
        current_tokens = 0
        current_sentences = []
        is_partial = False

        for i in range(start_idx, len(sentences)):
            sentence = sentences[i]
            sentence_tokens = self.estimate_tokens(sentence)

            # If single sentence exceeds max tokens, split it
            if sentence_tokens > max_tokens:
                if not current_sentences:  # First sentence
                    words = sentence.split()
                    current_chunk = []
                    word_count = 0

                    for word in words:
                        word_tokens = self.estimate_tokens(word)
                        if word_count + word_tokens <= max_tokens:
                            current_chunk.append(word)
                            word_count += word_tokens
                        else:
                            break

                    chunk_text = ' '.join(current_chunk)
                    is_partial = True
                    return chunk_text, i, is_partial
                break

            # Check if adding this sentence would exceed the limit
            if current_tokens + sentence_tokens > max_tokens and current_sentences:
                break

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        return ' '.join(current_sentences), start_idx + len(current_sentences), is_partial

    def create_chunks(self, text: str) -> List[TextChunk]:
        """
        Split text into chunks that respect sentence boundaries where possible.

        Args:
            text: Input text to be chunked

        Returns:
            List of TextChunk objects
        """
        text = self.preprocess_text(text)
        if not text:
            return []

        chunks = []
        current_idx = 0

        # Split into paragraphs if preserve_paragraphs is True
        if self.preserve_paragraphs:
            paragraphs = text.split('\n')
        else:
            paragraphs = [text]

        # Process each paragraph
        for para in paragraphs:
            if not para.strip():
                continue

            sentences = self.split_into_sentences(para)
            para_start = 0

            while para_start < len(sentences):
                chunk_text, next_start, is_partial = self.get_chunk_text(
                    sentences, para_start, self.max_tokens
                )

                if not chunk_text:
                    break

                # Calculate original text positions
                original_start = text.find(chunk_text)
                original_end = original_start + len(chunk_text)

                chunks.append(TextChunk(
                    text=chunk_text,
                    index=current_idx,
                    token_count=self.estimate_tokens(chunk_text),
                    is_partial_sentence=is_partial,
                    original_start=original_start,
                    original_end=original_end
                ))

                current_idx += 1
                para_start = next_start if not is_partial else next_start + 1

        return chunks

    def combine_translations(self, original_text: str, chunks: List[TextChunk],
                             translations: List[str]) -> str:
        """
        Combine translated chunks back into a single text, handling overlaps.

        Args:
            original_text: Original input text
            chunks: List of TextChunk objects
            translations: List of translated text chunks

        Returns:
            Combined translated text
        """
        if len(chunks) != len(translations):
            raise ValueError("Number of chunks and translations must match")

        if len(chunks) == 0:
            return ""

        if len(chunks) == 1:
            return translations[0]

        # Combine translations, handling partial sentences
        result = []
        for i, (chunk, translation) in enumerate(zip(chunks, translations)):
            if i > 0 and chunk.is_partial_sentence:
                # For partial sentences, try to find a clean break point
                prev_translation = translations[i-1]
                overlap = self._find_overlap(prev_translation, translation)
                if overlap:
                    translation = translation[len(overlap):]

            result.append(translation)

        return ' '.join(result)

    def _find_overlap(self, text1: str, text2: str, min_length: int = 10) -> Optional[str]:
        """Find overlapping text between two strings."""
        if not text1 or not text2:
            return None

        # Get the last part of text1 and first part of text2
        end_text = text1[-100:]  # Look at last 100 chars
        start_text = text2[:100]  # Look at first 100 chars

        # Find the longest common substring
        overlap = None
        for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
            if end_text[-length:] == start_text[:length]:
                overlap = start_text[:length]
                break

        return overlap
```
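To see the chunking behaviour without loading a model, here is a small sketch with an artificially low `max_tokens` (the API server uses `max_tokens=250, overlap_tokens=30`); the uppercase step stands in for real translations:

```python
from app.models.text_chunker import TextChunker

chunker = TextChunker(max_tokens=12, overlap_tokens=3)

text = (
    "The quick brown fox jumps over the lazy dog. "
    "Pack my box with five dozen liquor jugs. "
    "How vexingly quick daft zebras jump!"
)

# Each sentence is ~7-10 estimated tokens, so each becomes its own chunk here
chunks = chunker.create_chunks(text)
for chunk in chunks:
    print(chunk.index, chunk.token_count, repr(chunk.text))

# Chunks are translated independently, then stitched back together
translations = [c.text.upper() for c in chunks]  # stand-in "translations"
print(chunker.combine_translations(text, chunks, translations))
```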
app/models/translation_model.py
ADDED
@@ -0,0 +1,132 @@
```python
import torch
import logging
import re
from transformers import T5ForConditionalGeneration, T5Tokenizer

logger = logging.getLogger(__name__)

class TranslationModel:
    """
    Model class for handling the translation functionality using MADLAD-400 model
    """

    def __init__(self, model_name: str = "google/madlad400-3b-mt"):
        """
        Initialize the translation model.

        Args:
            model_name: Name of the Hugging Face model to use
        """
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.device = self._get_device()
        self._load_model()

    def _get_device(self):
        """Get the best available device for model inference."""
        if torch.cuda.is_available():
            logger.info("Using CUDA GPU for translation")
            return torch.device("cuda")
        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
            logger.info("Using Apple MPS (Metal) for translation")
            return torch.device("mps")
        else:
            logger.info("Using CPU for translation")
            return torch.device("cpu")

    def _load_model(self):
        """Load the MADLAD-400 3B translation model."""
        try:
            logger.info(f"Loading translation model: {self.model_name}")
            self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)

            # Use torch_dtype=torch.bfloat16 if available for faster inference
            if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
                logger.info("Using bfloat16 precision for model loading")
                self.model = T5ForConditionalGeneration.from_pretrained(
                    self.model_name,
                    torch_dtype=torch.bfloat16
                )
            else:
                dtype = torch.float16 if torch.cuda.is_available() else torch.float32
                logger.info(f"Using {dtype} precision for model loading")
                self.model = T5ForConditionalGeneration.from_pretrained(
                    self.model_name,
                    torch_dtype=dtype
                )

            self.model.to(self.device)
            logger.info(f"Model loaded successfully on {self.device}")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def translate(self, text: str, source_lang_code: str, target_lang_code: str) -> str:
        """
        Translate text from source language to target language.

        Args:
            text: Text to translate
            source_lang_code: Source language code
            target_lang_code: Target language code

        Returns:
            Translated text
        """
        try:
            if self.model is None or self.tokenizer is None:
                raise ValueError("Translation model not loaded")

            # Prepare input with MADLAD-400 format: <2{target_lang}> {source_text}
            input_text = f"<2{target_lang_code}> {text}"

            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )

            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                translated = self.model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=5,
                    early_stopping=True
                )

            translated_text = self.tokenizer.batch_decode(
                translated,
                skip_special_tokens=True
            )[0]

            return re.sub(r'\s+', ' ', translated_text).strip()

        except Exception as e:
            logger.error(f"Translation error: {str(e)}")
            raise

    def process_document(self, file_data: bytes, filename: str, use_ocr: bool = False) -> str:
        """
        Process document to extract text using PyMuPDF and optional OCR.
        This is a simplified version for the API that only returns the extracted text.

        Args:
            file_data: Raw file content
            filename: Original filename
            use_ocr: Whether to use OCR for text extraction

        Returns:
            Extracted text as string
        """
        from app.models.document_processor import DocumentProcessor

        # Initialize document processor
        doc_processor = DocumentProcessor()

        # Process document and extract text
        return doc_processor.process_document(file_data, filename, use_ocr)
```
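The key detail in `translate` is MADLAD-400's prompt convention: the target language is selected by prefixing a `<2xx>` token to the source text, and the `source_lang_code` argument is never passed to the model. A minimal usage sketch (the 3B checkpoint is downloaded on first use and needs several GB of memory):

```python
from app.models.translation_model import TranslationModel

# Downloads google/madlad400-3b-mt on first use; expect a long cold start.
model = TranslationModel()

# Internally this becomes the prompt "<2fr> The weather is nice today."
print(model.translate("The weather is nice today.", "en", "fr"))
print(model.translate("The weather is nice today.", "en", "de"))
```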
requirements.txt
ADDED
@@ -0,0 +1,13 @@
```text
fastapi==0.95.0
uvicorn==0.21.1
pydantic==1.10.7
transformers==4.30.2
sentencepiece==0.1.99
accelerate==0.20.3
python-multipart==0.0.6
pillow==9.5.0
nltk==3.8.1
tqdm==4.65.0
beautifulsoup4==4.12.2
PyMuPDF==1.22.5
protobuf==3.20.3
```
setup.sh
ADDED
@@ -0,0 +1,28 @@
```bash
#!/bin/bash

# Script to set up the HF Spaces environment
set -e

echo "Setting up Universal Translator API on Hugging Face Spaces..."

# Create directories
mkdir -p app/models
mkdir -p app/utils
mkdir -p config

# Move Python files to their correct locations
mv api_server.py ./
mv app/models/translation_model.py app/models/
mv app/models/document_processor.py app/models/
mv app/models/html_processor.py app/models/
mv app/models/text_chunker.py app/models/

# Initialize __init__.py files
touch app/__init__.py
touch app/models/__init__.py
touch app/utils/__init__.py

# Download NLTK data
python -c "import nltk; nltk.download('punkt')"

echo "Setup complete!"
```