Arsive2 committed
Commit 4d48d5a · 1 Parent(s): c8e757d

Initial commit of Universal Translator API

Dockerfile ADDED
@@ -0,0 +1,33 @@
+ FROM python:3.10-bullseye
+ 
+ WORKDIR /app
+ 
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     libffi-dev \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+ 
+ # Install PyTorch with CUDA support
+ RUN pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html
+ 
+ # Copy requirements file
+ COPY requirements.txt .
+ 
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+ 
+ # Copy application code
+ COPY . .
+ 
+ # Expose the port for the API
+ EXPOSE 7860
+ 
+ # Set environment variables
+ ENV PYTHONUNBUFFERED=1
+ ENV TRANSFORMERS_CACHE=/app/.cache
+ ENV HF_HOME=/app/.cache
+ 
+ # Run the API server
+ CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Lt Space
- emoji: 🐠
+ emoji: 🗣️
 colorFrom: yellow
 colorTo: purple
 sdk: docker
@@ -9,4 +9,90 @@ license: mit
 short_description: Language translation space
 ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Universal Translator API
+ 
+ This is a Hugging Face Spaces deployment of the Universal Translator API service, which provides translation capabilities using the MADLAD-400 3B model.
+ 
+ ## Features
+ 
+ - Text translation across 400+ languages
+ - HTML translation with structure preservation
+ - Document translation (PDF, images) with optional OCR
+ - Efficient chunking for long text translation
+ - GPU-accelerated inference
+ 
+ ## API Endpoints
+ 
+ ### Health Check
+ ```
+ GET /
+ ```
+ Returns the status of the service and model information.
+ 
+ ### Text Translation
+ ```
+ POST /translate
+ ```
+ Translates text from one language to another.
+ 
+ **Request Body:**
+ ```json
+ {
+   "text": "Text to translate",
+   "source_lang_code": "en",
+   "target_lang_code": "fr"
+ }
+ ```
+ 
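+ For example, a minimal client sketch using Python `requests` (the base URL is a placeholder; substitute your Space's URL):
+ 
+ ```python
+ import requests
+ 
+ API_URL = "http://localhost:7860"  # placeholder; use your deployment's URL
+ 
+ resp = requests.post(
+     f"{API_URL}/translate",
+     json={
+         "text": "Hello, world!",
+         "source_lang_code": "en",
+         "target_lang_code": "fr",
+     },
+ )
+ resp.raise_for_status()
+ print(resp.json()["translated_text"])
+ ```
+ 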
+ ### HTML Translation
+ ```
+ POST /translate-html
+ ```
+ Translates HTML content while preserving the HTML structure.
+ 
+ **Request Body:**
+ ```json
+ {
+   "html": "<p>Text to translate</p>",
+   "source_lang_code": "en",
+   "target_lang_code": "fr"
+ }
+ ```
+ 
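+ The call pattern matches `/translate`; only the payload and response keys differ:
+ 
+ ```python
+ resp = requests.post(
+     f"{API_URL}/translate-html",  # API_URL as defined above
+     json={
+         "html": "<p>Hello, world!</p>",
+         "source_lang_code": "en",
+         "target_lang_code": "fr",
+     },
+ )
+ print(resp.json()["translated_html"])
+ ```
+ 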
+ ### Document Translation
+ ```
+ POST /process-document
+ ```
+ Processes and translates PDF or image files.
+ 
+ **Form Data:**
+ - `file`: The document file (PDF or image)
+ - `source_lang_code`: Source language code (e.g., "en")
+ - `target_lang_code`: Target language code (e.g., "fr")
+ - `use_ocr`: Whether to use OCR (boolean)
+ 
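+ For example, uploading a PDF with Python `requests` (file name and base URL are placeholders):
+ 
+ ```python
+ import requests
+ 
+ API_URL = "http://localhost:7860"  # placeholder; use your deployment's URL
+ 
+ # Send the document as multipart form data, with language codes as form fields.
+ with open("example.pdf", "rb") as f:
+     resp = requests.post(
+         f"{API_URL}/process-document",
+         files={"file": ("example.pdf", f, "application/pdf")},
+         data={
+             "source_lang_code": "en",
+             "target_lang_code": "fr",
+             "use_ocr": "false",
+         },
+     )
+ resp.raise_for_status()
+ result = resp.json()
+ print(result["extracted_text"])
+ print(result["translated_text"])
+ ```
+ 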
+ ## Language Codes
+ 
+ The API uses the following language codes (ISO 639-1):
+ 
+ - `en`: English
+ - `fr`: French
+ - `es`: Spanish
+ - `de`: German
+ - And many more (400+ languages supported by MADLAD-400)
+ 
+ ## Deployment
+ 
+ This API is deployed on Hugging Face Spaces using Docker. The web interface is hosted separately on Render.
+ 
+ ## Model
+ 
+ The service uses the Google MADLAD-400 3B model for translation:
+ 
+ ```
+ google/madlad400-3b-mt
+ ```
+ 
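+ For reference, a minimal sketch of calling the model directly with `transformers`, mirroring what `app/models/translation_model.py` does internally (the example text and target language are arbitrary):
+ 
+ ```python
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ 
+ tokenizer = T5Tokenizer.from_pretrained("google/madlad400-3b-mt")
+ model = T5ForConditionalGeneration.from_pretrained("google/madlad400-3b-mt")
+ 
+ # MADLAD-400 expects the target language as a <2xx> prefix on the input.
+ inputs = tokenizer("<2fr> Hello, world!", return_tensors="pt")
+ outputs = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
+ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+ ```
+ 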
+ ## License
+ 
+ This project is available under the MIT License.
api_server.py ADDED
@@ -0,0 +1,160 @@
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ import logging
+ import uvicorn
+ from app.models.translation_model import TranslationModel
+ from app.models.html_processor import HTMLProcessor
+ from app.models.text_chunker import TextChunker
+ 
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+ 
+ # Initialize FastAPI app
+ app = FastAPI(
+     title="Universal Translator API",
+     description="API for text, HTML, and document translation services",
+     version="1.0.0"
+ )
+ 
+ # Configure CORS
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Adjust in production
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ 
+ # Initialize translation model
+ model = TranslationModel()
+ html_processor = HTMLProcessor()
+ text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
+ 
+ # Define request/response models
+ class TranslationRequest(BaseModel):
+     text: str
+     source_lang_code: str
+     target_lang_code: str
+ 
+ class TranslationResponse(BaseModel):
+     translated_text: str
+ 
+ class HTMLTranslationRequest(BaseModel):
+     html: str
+     source_lang_code: str
+     target_lang_code: str
+ 
+ class HTMLTranslationResponse(BaseModel):
+     translated_html: str
+ 
+ @app.get("/")
+ async def root():
+     """Health check endpoint"""
+     return {"status": "ok", "model": "MADLAD-400", "version": "3B"}
+ 
+ @app.post("/translate", response_model=TranslationResponse)
+ async def translate_text(request: TranslationRequest):
+     """Translate text from source to target language"""
+     try:
+         # Get chunks using TextChunker
+         chunks = text_chunker.create_chunks(request.text)
+         translated_chunks = []
+ 
+         # Translate each chunk
+         for chunk in chunks:
+             translated_text = model.translate(
+                 chunk.text,
+                 request.source_lang_code,
+                 request.target_lang_code
+             )
+             translated_chunks.append(translated_text)
+ 
+         # Combine translations
+         final_translation = text_chunker.combine_translations(
+             request.text, chunks, translated_chunks
+         )
+ 
+         return {"translated_text": final_translation}
+     except Exception as e:
+         logger.error(f"Translation error: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ @app.post("/translate-html", response_model=HTMLTranslationResponse)
+ async def translate_html(request: HTMLTranslationRequest):
+     """Translate HTML content while preserving structure"""
+     try:
+         # Extract text and maintain exact DOM structure
+         text_fragments, dom_data = html_processor.extract_text(request.html)
+ 
+         if not text_fragments:
+             return {"translated_html": request.html}  # No text to translate
+ 
+         # Process each text fragment individually
+         translated_fragments = []
+         for fragment in text_fragments:
+             if not fragment.strip():
+                 translated_fragments.append(fragment)
+                 continue
+ 
+             translated_text = model.translate(
+                 fragment,
+                 request.source_lang_code,
+                 request.target_lang_code
+             )
+             translated_fragments.append(translated_text)
+ 
+         # Replace the original text with translated text in the HTML structure
+         translated_html = html_processor.replace_text(dom_data, translated_fragments)
+ 
+         return {"translated_html": translated_html}
+     except Exception as e:
+         logger.error(f"HTML translation error: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ @app.post("/process-document")
+ async def process_document(
+     file: UploadFile = File(...),
+     source_lang_code: str = Form(...),
+     target_lang_code: str = Form(...),
+     use_ocr: bool = Form(False)
+ ):
+     """Process and translate document (PDF or image)"""
+     try:
+         # Read file content
+         file_content = await file.read()
+ 
+         # Process document to extract text
+         extracted_text = model.process_document(
+             file_content,
+             file.filename,
+             use_ocr=use_ocr
+         )
+ 
+         if not extracted_text:
+             raise HTTPException(
+                 status_code=400,
+                 detail="No text could be extracted from the document"
+             )
+ 
+         # Translate the extracted text
+         translated_text = model.translate(
+             extracted_text,
+             source_lang_code,
+             target_lang_code
+         )
+ 
+         return {
+             "extracted_text": extracted_text,
+             "translated_text": translated_text
+         }
+     except HTTPException:
+         # Let deliberate HTTP errors (e.g., the 400 above) pass through unchanged
+         raise
+     except Exception as e:
+         logger.error(f"Document processing error: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+ 
+ if __name__ == "__main__":
+     uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
app/models/document_processor.py ADDED
@@ -0,0 +1,68 @@
+ import fitz  # PyMuPDF
+ import logging
+ from pathlib import Path
+ 
+ logger = logging.getLogger(__name__)
+ 
+ class DocumentProcessor:
+     """Simplified document processor for the API service"""
+ 
+     def __init__(self):
+         """Initialize the document processor"""
+         self.supported_formats = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}
+ 
+     def process_document(
+         self,
+         file_data: bytes,
+         filename: str,
+         use_ocr: bool = False
+     ) -> str:
+         """
+         Extract text from document (PDF or image)
+ 
+         Args:
+             file_data: Raw file content
+             filename: Original filename
+             use_ocr: Whether to use OCR (not implemented in this simplified version)
+ 
+         Returns:
+             Extracted text as string
+         """
+         try:
+             file_ext = Path(filename).suffix.lower()
+             logger.info(f"Processing file: {filename} with extension: {file_ext}")
+ 
+             if file_ext not in self.supported_formats:
+                 raise ValueError(f"Unsupported file format: {file_ext}")
+ 
+             # Process PDF using PyMuPDF
+             if file_ext == '.pdf':
+                 return self._process_pdf(file_data)
+ 
+             # Process image (placeholder - would need OCR integration)
+             else:
+                 if use_ocr:
+                     # Placeholder for OCR implementation
+                     # You would integrate with an OCR service here
+                     raise NotImplementedError("OCR for images not implemented")
+                 else:
+                     return "Text extraction from images requires OCR to be enabled"
+ 
+         except Exception as e:
+             logger.error(f"Error processing document: {str(e)}")
+             raise
+ 
+     def _process_pdf(self, file_data: bytes) -> str:
+         """Process PDF to extract text using PyMuPDF"""
+         try:
+             with fitz.open(stream=file_data, filetype="pdf") as pdf_doc:
+                 text_parts = []
+                 for page_num in range(len(pdf_doc)):
+                     page = pdf_doc[page_num]
+                     text = page.get_text()
+                     text_parts.append(text)
+ 
+                 return "\n\n".join(text_parts)
+         except Exception as e:
+             logger.error(f"Error processing PDF: {str(e)}")
+             raise
app/models/html_processor.py ADDED
@@ -0,0 +1,112 @@
+ import logging
+ from bs4 import BeautifulSoup, NavigableString, Tag
+ from typing import List, Tuple, Dict, Any
+ 
+ logger = logging.getLogger(__name__)
+ 
+ class HTMLProcessor:
+     """
+     A processor for HTML content that preserves exact HTML structure
+     while only translating text content.
+     """
+ 
+     def __init__(self):
+         self.skip_translation_class = 'notranslate'
+         self.skip_tags = {
+             'script', 'style', 'pre', 'code', 'head', 'title', 'meta',
+             'link', 'iframe', 'noscript', 'svg', 'path', 'img'
+         }
+ 
+     def extract_text(self, html_content: str) -> Tuple[List[str], Dict[str, Any]]:
+         """
+         Extract translatable text nodes from HTML content while preserving exact structure.
+ 
+         Args:
+             html_content: HTML content as a string
+ 
+         Returns:
+             A tuple containing:
+             - List of text fragments to translate
+             - DOM map that maintains references to the exact nodes in the original structure
+         """
+         try:
+             # Parse the HTML using 'html.parser' to ensure proper handling
+             soup = BeautifulSoup(html_content, 'html.parser')
+ 
+             # Use a list to store text fragments and their corresponding nodes
+             text_fragments = []
+             dom_map = {}
+ 
+             # Process the soup to find all text nodes
+             self._extract_text_from_node(soup, text_fragments, dom_map)
+ 
+             return text_fragments, {'soup': soup, 'node_map': dom_map}
+ 
+         except Exception as e:
+             logger.error(f"Error extracting text from HTML: {str(e)}")
+             return [], {}
+ 
+     def _extract_text_from_node(self, node, text_fragments: List[str], dom_map: Dict[int, Any], path: str = ""):
+         """
+         Recursively extract text from nodes while maintaining exact structure.
+ 
+         Args:
+             node: The current BeautifulSoup node
+             text_fragments: List to store extracted text
+             dom_map: Dictionary to map indices to nodes
+             path: Current path in the DOM tree for debugging
+         """
+         # Skip processing for certain tags
+         if isinstance(node, Tag) and node.name in self.skip_tags:
+             return
+ 
+         # Skip elements with the notranslate class
+         if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
+             return
+ 
+         # Process this node
+         if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
+             # Only process non-empty text
+             text = str(node).strip()
+             if text:
+                 index = len(text_fragments)
+                 text_fragments.append(text)
+                 dom_map[index] = node
+ 
+         # Recursively process child nodes
+         if isinstance(node, Tag):
+             for child in node.children:
+                 child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
+                 self._extract_text_from_node(child, text_fragments, dom_map, child_path)
+ 
+     def replace_text(self, dom_data: Dict[str, Any], translated_fragments: List[str]) -> str:
+         """
+         Replace the original text with translated text while keeping exact HTML structure.
+ 
+         Args:
+             dom_data: DOM data containing soup and node map
+             translated_fragments: List of translated text fragments
+ 
+         Returns:
+             HTML content with translated text and preserved structure
+         """
+         try:
+             soup = dom_data.get('soup')
+             node_map = dom_data.get('node_map', {})
+ 
+             if not soup or not node_map:
+                 logger.error("Invalid DOM data for text replacement")
+                 return ""
+ 
+             # Replace text in each node
+             for index, node in node_map.items():
+                 if index < len(translated_fragments):
+                     # Replace the original string with the translated string
+                     node.replace_with(NavigableString(translated_fragments[index]))
+ 
+             # Return the HTML as a string
+             return str(soup)
+ 
+         except Exception as e:
+             logger.error(f"Error replacing text in HTML: {str(e)}")
+             return ""
app/models/text_chunker.py ADDED
@@ -0,0 +1,246 @@
+ import re
+ import logging
+ import nltk
+ 
+ from typing import List, Optional
+ from dataclasses import dataclass
+ from nltk.tokenize import sent_tokenize
+ 
+ # Ensure NLTK data is downloaded
+ try:
+     nltk.data.find('tokenizers/punkt')
+ except LookupError:
+     nltk.download('punkt')
+ 
+ logger = logging.getLogger(__name__)
+ 
+ @dataclass
+ class TextChunk:
+     """Class to represent a chunk of text with metadata"""
+     text: str
+     index: int
+     token_count: int
+     is_partial_sentence: bool = False
+     original_start: int = 0
+     original_end: int = 0
+ 
+ class TextChunker:
+     """
+     A utility class for chunking large texts into smaller pieces while preserving
+     sentence boundaries and context where possible.
+     """
+ 
+     def __init__(
+         self,
+         max_tokens: int = 450,
+         overlap_tokens: int = 50,
+         preserve_paragraphs: bool = True
+     ):
+         """
+         Initialize the TextChunker.
+ 
+         Args:
+             max_tokens: Maximum number of tokens per chunk
+             overlap_tokens: Number of tokens to overlap between chunks
+             preserve_paragraphs: Whether to try to preserve paragraph boundaries
+         """
+         self.max_tokens = max_tokens
+         self.overlap_tokens = overlap_tokens
+         self.preserve_paragraphs = preserve_paragraphs
+ 
+     def preprocess_text(self, text: str) -> str:
+         """Clean and normalize text before chunking."""
+         if not text:
+             return ""
+ 
+         # Replace multiple newlines with a single \n
+         text = re.sub(r'\n\s*\n', '\n', text)
+ 
+         # Replace other whitespace characters with spaces
+         text = re.sub(r'[\r\t\f\v]', ' ', text)
+ 
+         # Replace multiple spaces with a single space
+         text = re.sub(r' +', ' ', text)
+ 
+         # Clean up spaces around newlines
+         text = re.sub(r' *\n *', '\n', text)
+ 
+         # Remove spaces at the start and end of the text
+         text = text.strip()
+ 
+         # Handle bullet points and lists consistently
+         text = re.sub(r'•\s*', '• ', text)
+         text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
+ 
+         return text
+ 
+     def estimate_tokens(self, text: str) -> int:
+         """
+         Estimate the number of tokens in a text string.
+         This is a rough approximation - actual token count may vary by tokenizer.
+         """
+         # Split on whitespace and punctuation
+         words = re.findall(r'\b\w+\b|[^\w\s]', text)
+         return len(words)
+ 
+     def split_into_sentences(self, text: str) -> List[str]:
+         """Split text into sentences using NLTK."""
+         try:
+             return sent_tokenize(text)
+         except Exception as e:
+             logger.warning(f"Error in sentence tokenization: {e}")
+             # Fall back to simple period-based splitting
+             return [s.strip() + '.' for s in text.split('.') if s.strip()]
+ 
+     def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
+         """
+         Get chunk text starting from start_idx that fits within max_tokens.
+         Returns a tuple of (chunk_text, end_idx, is_partial_sentence).
+         """
+         current_tokens = 0
+         current_sentences = []
+         is_partial = False
+ 
+         for i in range(start_idx, len(sentences)):
+             sentence = sentences[i]
+             sentence_tokens = self.estimate_tokens(sentence)
+ 
+             # If a single sentence exceeds max_tokens, split it on word boundaries
+             if sentence_tokens > max_tokens:
+                 if not current_sentences:  # First sentence of the chunk
+                     words = sentence.split()
+                     current_chunk = []
+                     word_count = 0
+ 
+                     for word in words:
+                         word_tokens = self.estimate_tokens(word)
+                         if word_count + word_tokens <= max_tokens:
+                             current_chunk.append(word)
+                             word_count += word_tokens
+                         else:
+                             break
+ 
+                     # Note: only the words that fit within max_tokens are kept here;
+                     # the caller advances past this sentence, so its remainder is dropped.
+                     chunk_text = ' '.join(current_chunk)
+                     is_partial = True
+                     return chunk_text, i, is_partial
+                 # An oversized sentence follows the accumulated ones; end the chunk here
+                 break
+ 
+             # Check if adding this sentence would exceed the limit
+             if current_tokens + sentence_tokens > max_tokens and current_sentences:
+                 break
+ 
+             current_sentences.append(sentence)
+             current_tokens += sentence_tokens
+ 
+         return ' '.join(current_sentences), start_idx + len(current_sentences), is_partial
+ 
+     def create_chunks(self, text: str) -> List[TextChunk]:
+         """
+         Split text into chunks that respect sentence boundaries where possible.
+ 
+         Args:
+             text: Input text to be chunked
+ 
+         Returns:
+             List of TextChunk objects
+         """
+         text = self.preprocess_text(text)
+         if not text:
+             return []
+ 
+         chunks = []
+         current_idx = 0
+ 
+         # Split into paragraphs if preserve_paragraphs is True
+         if self.preserve_paragraphs:
+             paragraphs = text.split('\n')
+         else:
+             paragraphs = [text]
+ 
+         # Process each paragraph
+         for para in paragraphs:
+             if not para.strip():
+                 continue
+ 
+             sentences = self.split_into_sentences(para)
+             para_start = 0
+ 
+             while para_start < len(sentences):
+                 chunk_text, next_start, is_partial = self.get_chunk_text(
+                     sentences, para_start, self.max_tokens
+                 )
+ 
+                 if not chunk_text:
+                     break
+ 
+                 # Calculate original text positions
+                 original_start = text.find(chunk_text)
+                 original_end = original_start + len(chunk_text)
+ 
+                 chunks.append(TextChunk(
+                     text=chunk_text,
+                     index=current_idx,
+                     token_count=self.estimate_tokens(chunk_text),
+                     is_partial_sentence=is_partial,
+                     original_start=original_start,
+                     original_end=original_end
+                 ))
+ 
+                 current_idx += 1
+                 para_start = next_start if not is_partial else next_start + 1
+ 
+         return chunks
+ 
+     def combine_translations(self, original_text: str, chunks: List[TextChunk],
+                              translations: List[str]) -> str:
+         """
+         Combine translated chunks back into a single text, handling overlaps.
+ 
+         Args:
+             original_text: Original input text
+             chunks: List of TextChunk objects
+             translations: List of translated text chunks
+ 
+         Returns:
+             Combined translated text
+         """
+         if len(chunks) != len(translations):
+             raise ValueError("Number of chunks and translations must match")
+ 
+         if len(chunks) == 0:
+             return ""
+ 
+         if len(chunks) == 1:
+             return translations[0]
+ 
+         # Combine translations, handling partial sentences
+         result = []
+         for i, (chunk, translation) in enumerate(zip(chunks, translations)):
+             if i > 0 and chunk.is_partial_sentence:
+                 # For partial sentences, try to find a clean break point
+                 prev_translation = translations[i-1]
+                 overlap = self._find_overlap(prev_translation, translation)
+                 if overlap:
+                     translation = translation[len(overlap):]
+ 
+             result.append(translation)
+ 
+         return ' '.join(result)
+ 
+     def _find_overlap(self, text1: str, text2: str, min_length: int = 10) -> Optional[str]:
+         """Find overlapping text between two strings."""
+         if not text1 or not text2:
+             return None
+ 
+         # Get the last part of text1 and the first part of text2
+         end_text = text1[-100:]  # Look at the last 100 chars
+         start_text = text2[:100]  # Look at the first 100 chars
+ 
+         # Find the longest common substring
+         overlap = None
+         for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
+             if end_text[-length:] == start_text[:length]:
+                 overlap = start_text[:length]
+                 break
+ 
+         return overlap
app/models/translation_model.py ADDED
@@ -0,0 +1,132 @@
+ import torch
+ import logging
+ import re
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ 
+ logger = logging.getLogger(__name__)
+ 
+ class TranslationModel:
+     """
+     Model class for handling the translation functionality using the MADLAD-400 model
+     """
+ 
+     def __init__(self, model_name: str = "google/madlad400-3b-mt"):
+         """
+         Initialize the translation model.
+ 
+         Args:
+             model_name: Name of the Hugging Face model to use
+         """
+         self.model_name = model_name
+         self.model = None
+         self.tokenizer = None
+         self.device = self._get_device()
+         self._load_model()
+ 
+     def _get_device(self):
+         """Get the best available device for model inference."""
+         if torch.cuda.is_available():
+             logger.info("Using CUDA GPU for translation")
+             return torch.device("cuda")
+         elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+             logger.info("Using Apple MPS (Metal) for translation")
+             return torch.device("mps")
+         else:
+             logger.info("Using CPU for translation")
+             return torch.device("cpu")
+ 
+     def _load_model(self):
+         """Load the MADLAD-400 3B translation model."""
+         try:
+             logger.info(f"Loading translation model: {self.model_name}")
+             self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
+ 
+             # Use torch_dtype=torch.bfloat16 if available for faster inference
+             if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
+                 logger.info("Using bfloat16 precision for model loading")
+                 self.model = T5ForConditionalGeneration.from_pretrained(
+                     self.model_name,
+                     torch_dtype=torch.bfloat16
+                 )
+             else:
+                 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+                 logger.info(f"Using {dtype} precision for model loading")
+                 self.model = T5ForConditionalGeneration.from_pretrained(
+                     self.model_name,
+                     torch_dtype=dtype
+                 )
+ 
+             self.model.to(self.device)
+             logger.info(f"Model loaded successfully on {self.device}")
+         except Exception as e:
+             logger.error(f"Error loading model: {str(e)}")
+             raise
+ 
+     def translate(self, text: str, source_lang_code: str, target_lang_code: str) -> str:
+         """
+         Translate text from source language to target language.
+ 
+         Args:
+             text: Text to translate
+             source_lang_code: Source language code
+             target_lang_code: Target language code
+ 
+         Returns:
+             Translated text
+         """
+         try:
+             if self.model is None or self.tokenizer is None:
+                 raise ValueError("Translation model not loaded")
+ 
+             # Prepare input in the MADLAD-400 format: <2{target_lang}> {source_text}
+             # (the model infers the source language, so source_lang_code is
+             # accepted only for API symmetry)
+             input_text = f"<2{target_lang_code}> {text}"
+ 
+             inputs = self.tokenizer(
+                 input_text,
+                 return_tensors="pt",
+                 padding=True,
+                 truncation=True,
+                 max_length=512
+             )
+ 
+             inputs = {k: v.to(self.device) for k, v in inputs.items()}
+ 
+             with torch.no_grad():
+                 translated = self.model.generate(
+                     **inputs,
+                     max_length=512,
+                     num_beams=5,
+                     early_stopping=True
+                 )
+ 
+             translated_text = self.tokenizer.batch_decode(
+                 translated,
+                 skip_special_tokens=True
+             )[0]
+ 
+             return re.sub(r'\s+', ' ', translated_text).strip()
+ 
+         except Exception as e:
+             logger.error(f"Translation error: {str(e)}")
+             raise
+ 
+     def process_document(self, file_data: bytes, filename: str, use_ocr: bool = False) -> str:
+         """
+         Process a document to extract text using PyMuPDF and optional OCR.
+         This is a simplified version for the API that only returns the extracted text.
+ 
+         Args:
+             file_data: Raw file content
+             filename: Original filename
+             use_ocr: Whether to use OCR for text extraction
+ 
+         Returns:
+             Extracted text as string
+         """
+         from app.models.document_processor import DocumentProcessor
+ 
+         # Initialize document processor
+         doc_processor = DocumentProcessor()
+ 
+         # Process document and extract text
+         return doc_processor.process_document(file_data, filename, use_ocr)
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ fastapi==0.95.0
+ uvicorn==0.21.1
+ pydantic==1.10.7
+ transformers==4.30.2
+ sentencepiece==0.1.99
+ accelerate==0.20.3
+ python-multipart==0.0.6
+ pillow==9.5.0
+ nltk==3.8.1
+ tqdm==4.65.0
+ beautifulsoup4==4.12.2
+ PyMuPDF==1.22.5
+ protobuf==3.20.3
setup.sh ADDED
@@ -0,0 +1,28 @@
+ #!/bin/bash
+ 
+ # Script to set up the HF Spaces environment
+ set -e
+ 
+ echo "Setting up Universal Translator API on Hugging Face Spaces..."
+ 
+ # Create directories
+ mkdir -p app/models
+ mkdir -p app/utils
+ mkdir -p config
+ 
+ # The Python files are already at their destination paths; with `set -e`,
+ # same-file `mv` commands would abort the script, so no moves are needed here.
+ 
+ # Initialize __init__.py files
+ touch app/__init__.py
+ touch app/models/__init__.py
+ touch app/utils/__init__.py
+ 
+ # Download NLTK data
+ python -c "import nltk; nltk.download('punkt')"
+ 
+ echo "Setup complete!"