Spaces:

Arsive
/

lt_space

Build error

App Files Community

Arsive2 committed on Apr 14, 2025

Commit

fb3dfc3

1 Parent(s): 4d48d5a

Updated permissions

Browse files

Files changed (5) hide show

Dockerfile +11 -5
api_server.py +58 -8
app/models/text_chunker.py +11 -2
app/models/translation_model.py +29 -8
fix_permissions.sh +21 -0

Dockerfile CHANGED Viewed

@@ -1,5 +1,4 @@
 FROM python:3.10-bullseye
 WORKDIR /app
 # Install system dependencies
@@ -9,8 +8,14 @@ RUN apt-get update && apt-get install -y \
     git \
     && rm -rf /var/lib/apt/lists/*
-# Install PyTorch with CUDA support
-RUN pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html
 # Copy requirements file
 COPY requirements.txt .
@@ -18,6 +23,9 @@ COPY requirements.txt .
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application code
 COPY . .
@@ -26,8 +34,6 @@ EXPOSE 7860
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
-ENV TRANSFORMERS_CACHE=/app/.cache
-ENV HF_HOME=/app/.cache
 # Run the API server
 CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.10-bullseye
 WORKDIR /app
 # Install system dependencies
     git \
     && rm -rf /var/lib/apt/lists/*
+# Set up directories with proper permissions
+RUN mkdir -p /app/.cache /app/nltk_data && \
+    chmod 777 /app/.cache /app/nltk_data
+# Set environment variables for cache directories
+ENV TRANSFORMERS_CACHE=/app/.cache
+ENV HF_HOME=/app/.cache
+ENV NLTK_DATA=/app/nltk_data
 # Copy requirements file
 COPY requirements.txt .
 # Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+# Pre-download NLTK data before copying application code
+RUN python -c "import nltk; nltk.download('punkt', download_dir='/app/nltk_data')"
 # Copy application code
 COPY . .
 # Set environment variables
 ENV PYTHONUNBUFFERED=1
 # Run the API server
 CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]

api_server.py CHANGED Viewed

@@ -1,11 +1,11 @@
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 import logging
 import uvicorn
-from app.models.translation_model import TranslationModel
-from app.models.html_processor import HTMLProcessor
-from app.models.text_chunker import TextChunker
 # Configure logging
 logging.basicConfig(
@@ -30,10 +30,29 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Initialize translation model
-model = TranslationModel()
-html_processor = HTMLProcessor()
-text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
 # Define request/response models
 class TranslationRequest(BaseModel):
@@ -55,11 +74,36 @@ class HTMLTranslationResponse(BaseModel):
 @app.get("/")
 async def root():
     """Health check endpoint"""
     return {"status": "ok", "model": "MADLAD-400", "version": "3B"}
 @app.post("/translate", response_model=TranslationResponse)
 async def translate_text(request: TranslationRequest):
     """Translate text from source to target language"""
     try:
         # Get chunks using TextChunker
         chunks = text_chunker.create_chunks(request.text)
@@ -87,6 +131,9 @@ async def translate_text(request: TranslationRequest):
 @app.post("/translate-html", response_model=HTMLTranslationResponse)
 async def translate_html(request: HTMLTranslationRequest):
     """Translate HTML content while preserving structure"""
     try:
         # Extract text and maintain exact DOM structure
         text_fragments, dom_data = html_processor.extract_text(request.html)
@@ -124,6 +171,9 @@ async def process_document(
     use_ocr: bool = Form(False)
 ):
     """Process and translate document (PDF or image)"""
     try:
         # Read file content
         file_content = await file.read()
@@ -157,4 +207,4 @@ async def process_document(
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
-    uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)

 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+import torch
+import os
 import logging
 import uvicorn
 # Configure logging
 logging.basicConfig(
     allow_headers=["*"],
 )
+# Set environment variables if not already set
+os.environ.setdefault('TRANSFORMERS_CACHE', '/app/.cache')
+os.environ.setdefault('HF_HOME', '/app/.cache')
+os.environ.setdefault('NLTK_DATA', '/app/nltk_data')
+# Create necessary directories with proper permissions
+os.makedirs(os.environ.get('TRANSFORMERS_CACHE'), exist_ok=True)
+os.makedirs(os.environ.get('NLTK_DATA'), exist_ok=True)
+try:
+    from app.models.text_chunker import TextChunker
+    from app.models.html_processor import HTMLProcessor
+    from app.models.translation_model import TranslationModel
+    # Initialize components
+    text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
+    html_processor = HTMLProcessor()
+    model = TranslationModel()
+    initialization_error = None
+except Exception as e:
+    logger.error(f"Error initializing components: {str(e)}")
+    initialization_error = str(e)
 # Define request/response models
 class TranslationRequest(BaseModel):
 @app.get("/")
 async def root():
     """Health check endpoint"""
+    if initialization_error:
+        return {
+            "status": "error",
+            "message": "Service initialization failed",
+            "error": initialization_error
+        }
     return {"status": "ok", "model": "MADLAD-400", "version": "3B"}
+@app.get("/health")
+async def health_check():
+    """Extended health check with environment information"""
+    return {
+        "status": "ok" if not initialization_error else "error",
+        "error": initialization_error,
+        "environment": {
+            "transformers_cache": os.environ.get('TRANSFORMERS_CACHE'),
+            "hf_home": os.environ.get('HF_HOME'),
+            "nltk_data": os.environ.get('NLTK_DATA'),
+            "python_version": os.environ.get('PYTHON_VERSION'),
+            "cuda_available": torch.cuda.is_available() if 'torch' in globals() else "Unknown",
+            "device": str(model.device) if 'model' in globals() and hasattr(model, 'device') else "Unknown"
+        }
+    }
 @app.post("/translate", response_model=TranslationResponse)
 async def translate_text(request: TranslationRequest):
     """Translate text from source to target language"""
+    if initialization_error:
+        raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
         # Get chunks using TextChunker
         chunks = text_chunker.create_chunks(request.text)
 @app.post("/translate-html", response_model=HTMLTranslationResponse)
 async def translate_html(request: HTMLTranslationRequest):
     """Translate HTML content while preserving structure"""
+    if initialization_error:
+        raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
         # Extract text and maintain exact DOM structure
         text_fragments, dom_data = html_processor.extract_text(request.html)
     use_ocr: bool = Form(False)
 ):
     """Process and translate document (PDF or image)"""
+    if initialization_error:
+        raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
     try:
         # Read file content
         file_content = await file.read()
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
+    uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)

app/models/text_chunker.py CHANGED Viewed

@@ -1,16 +1,25 @@
 import re
 import logging
 import nltk
 from typing import List, Optional
 from dataclasses import dataclass
 from nltk.tokenize import sent_tokenize
 # Ensure NLTK data is downloaded
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
-    nltk.download('punkt')
 logger = logging.getLogger(__name__)
@@ -243,4 +252,4 @@ class TextChunker:
                 overlap = start_text[:length]
                 break
-        return overlap

 import re
 import logging
+import os
 import nltk
 from typing import List, Optional
 from dataclasses import dataclass
 from nltk.tokenize import sent_tokenize
+# Set NLTK data path from environment variable if available
+nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
+nltk.data.path.append(nltk_data_path)
 # Ensure NLTK data is downloaded
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
+    try:
+        nltk.download('punkt', download_dir=nltk_data_path)
+    except Exception as e:
+        logging.warning(f"Failed to download NLTK data: {e}")
+        # Fallback to not using NLTK if download fails
 logger = logging.getLogger(__name__)
                 overlap = start_text[:length]
                 break
+        return overlap

app/models/translation_model.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import torch
 import logging
 import re
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 logger = logging.getLogger(__name__)
@@ -21,7 +23,19 @@ class TranslationModel:
         self.model = None
         self.tokenizer = None
         self.device = self._get_device()
-        self._load_model()
     def _get_device(self):
         """Get the best available device for model inference."""
@@ -39,21 +53,26 @@ class TranslationModel:
         """Load the MADLAD-400 3B translation model."""
         try:
             logger.info(f"Loading translation model: {self.model_name}")
-            self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
             # Use torch_dtype=torch.bfloat16 if available for faster inference
             if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
                 logger.info("Using bfloat16 precision for model loading")
                 self.model = T5ForConditionalGeneration.from_pretrained(
                     self.model_name,
-                    torch_dtype=torch.bfloat16
                 )
             else:
                 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
                 logger.info(f"Using {dtype} precision for model loading")
                 self.model = T5ForConditionalGeneration.from_pretrained(
                     self.model_name,
-                    torch_dtype=dtype
                 )
             self.model.to(self.device)
@@ -75,8 +94,8 @@ class TranslationModel:
             Translated text
         """
         try:
-            if self.model is None or self.tokenizer is None:
-                raise ValueError("Translation model not loaded")
             # Prepare input with MADLAD-400 format: <2{target_lang}> {source_text}
             input_text = f"<2{target_lang_code}> {text}"
@@ -113,7 +132,6 @@ class TranslationModel:
     def process_document(self, file_data: bytes, filename: str, use_ocr: bool = False) -> str:
         """
         Process document to extract text using PyMuPDF and optional OCR.
-        This is a simplified version for the API that only returns the extracted text.
         Args:
             file_data: Raw file content
@@ -123,10 +141,13 @@ class TranslationModel:
         Returns:
             Extracted text as string
         """
         from app.models.document_processor import DocumentProcessor
         # Initialize document processor
         doc_processor = DocumentProcessor()
         # Process document and extract text
-        return doc_processor.process_document(file_data, filename, use_ocr)

 import torch
 import logging
 import re
+import os
+from typing import Optional, Dict, Any, List
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 logger = logging.getLogger(__name__)
         self.model = None
         self.tokenizer = None
         self.device = self._get_device()
+        self.initialized = False
+        self.initialization_error = None
+        # Ensure cache directory exists and is writable
+        cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
+        os.makedirs(cache_dir, exist_ok=True)
+        try:
+            self._load_model()
+            self.initialized = True
+        except Exception as e:
+            self.initialization_error = str(e)
+            logger.error(f"Failed to initialize translation model: {str(e)}")
     def _get_device(self):
         """Get the best available device for model inference."""
         """Load the MADLAD-400 3B translation model."""
         try:
             logger.info(f"Loading translation model: {self.model_name}")
+            self.tokenizer = T5Tokenizer.from_pretrained(
+                self.model_name,
+                cache_dir=os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
+            )
             # Use torch_dtype=torch.bfloat16 if available for faster inference
             if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
                 logger.info("Using bfloat16 precision for model loading")
                 self.model = T5ForConditionalGeneration.from_pretrained(
                     self.model_name,
+                    torch_dtype=torch.bfloat16,
+                    cache_dir=os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
                 )
             else:
                 dtype = torch.float16 if torch.cuda.is_available() else torch.float32
                 logger.info(f"Using {dtype} precision for model loading")
                 self.model = T5ForConditionalGeneration.from_pretrained(
                     self.model_name,
+                    torch_dtype=dtype,
+                    cache_dir=os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
                 )
             self.model.to(self.device)
             Translated text
         """
         try:
+            if not self.initialized:
+                raise ValueError(f"Translation model not properly initialized: {self.initialization_error}")
             # Prepare input with MADLAD-400 format: <2{target_lang}> {source_text}
             input_text = f"<2{target_lang_code}> {text}"
     def process_document(self, file_data: bytes, filename: str, use_ocr: bool = False) -> str:
         """
         Process document to extract text using PyMuPDF and optional OCR.
         Args:
             file_data: Raw file content
         Returns:
             Extracted text as string
         """
+        if not self.initialized:
+            raise ValueError(f"Translation model not properly initialized: {self.initialization_error}")
         from app.models.document_processor import DocumentProcessor
         # Initialize document processor
         doc_processor = DocumentProcessor()
         # Process document and extract text
+        return doc_processor.process_document(file_data, filename, use_ocr)

fix_permissions.sh ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/bin/bash
+# Script to fix permissions in the container
+set -e
+echo "Setting up permissions for Universal Translator API..."
+# Ensure directories exist
+mkdir -p /app/.cache
+mkdir -p /app/nltk_data
+# Set permissions
+chmod -R 777 /app/.cache
+chmod -R 777 /app/nltk_data
+echo "Permissions setup complete!"
+# Verify NLTK data
+python -c "import nltk; nltk.download('punkt', download_dir='/app/nltk_data')"
+echo "NLTK data verification complete!"