Arsive2 committed on
Commit
fb3dfc3
·
1 Parent(s): 4d48d5a

Updated permissions

Browse files
Dockerfile CHANGED
@@ -1,5 +1,4 @@
1
  FROM python:3.10-bullseye
2
-
3
  WORKDIR /app
4
 
5
  # Install system dependencies
@@ -9,8 +8,14 @@ RUN apt-get update && apt-get install -y \
9
  git \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Install PyTorch with CUDA support
13
- RUN pip install --no-cache-dir torch==2.0.1+cu118 torchvision==0.15.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html
 
 
 
 
 
 
14
 
15
  # Copy requirements file
16
  COPY requirements.txt .
@@ -18,6 +23,9 @@ COPY requirements.txt .
18
  # Install Python dependencies
19
  RUN pip install --no-cache-dir -r requirements.txt
20
 
 
 
 
21
  # Copy application code
22
  COPY . .
23
 
@@ -26,8 +34,6 @@ EXPOSE 7860
26
 
27
  # Set environment variables
28
  ENV PYTHONUNBUFFERED=1
29
- ENV TRANSFORMERS_CACHE=/app/.cache
30
- ENV HF_HOME=/app/.cache
31
 
32
  # Run the API server
33
  CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.10-bullseye
 
2
  WORKDIR /app
3
 
4
  # Install system dependencies
 
8
  git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
+ # Set up directories with proper permissions
12
+ RUN mkdir -p /app/.cache /app/nltk_data && \
13
+ chmod 777 /app/.cache /app/nltk_data
14
+
15
+ # Set environment variables for cache directories
16
+ ENV TRANSFORMERS_CACHE=/app/.cache
17
+ ENV HF_HOME=/app/.cache
18
+ ENV NLTK_DATA=/app/nltk_data
19
 
20
  # Copy requirements file
21
  COPY requirements.txt .
 
23
  # Install Python dependencies
24
  RUN pip install --no-cache-dir -r requirements.txt
25
 
26
+ # Pre-download NLTK data before copying application code
27
+ RUN python -c "import nltk; nltk.download('punkt', download_dir='/app/nltk_data')"
28
+
29
  # Copy application code
30
  COPY . .
31
 
 
34
 
35
  # Set environment variables
36
  ENV PYTHONUNBUFFERED=1
 
 
37
 
38
  # Run the API server
39
  CMD ["uvicorn", "api_server:app", "--host", "0.0.0.0", "--port", "7860"]
api_server.py CHANGED
@@ -1,11 +1,11 @@
1
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
 
 
 
4
  import logging
5
  import uvicorn
6
- from app.models.translation_model import TranslationModel
7
- from app.models.html_processor import HTMLProcessor
8
- from app.models.text_chunker import TextChunker
9
 
10
  # Configure logging
11
  logging.basicConfig(
@@ -30,10 +30,29 @@ app.add_middleware(
30
  allow_headers=["*"],
31
  )
32
 
33
- # Initialize translation model
34
- model = TranslationModel()
35
- html_processor = HTMLProcessor()
36
- text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # Define request/response models
39
  class TranslationRequest(BaseModel):
@@ -55,11 +74,36 @@ class HTMLTranslationResponse(BaseModel):
55
  @app.get("/")
56
  async def root():
57
  """Health check endpoint"""
 
 
 
 
 
 
58
  return {"status": "ok", "model": "MADLAD-400", "version": "3B"}
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  @app.post("/translate", response_model=TranslationResponse)
61
  async def translate_text(request: TranslationRequest):
62
  """Translate text from source to target language"""
 
 
 
63
  try:
64
  # Get chunks using TextChunker
65
  chunks = text_chunker.create_chunks(request.text)
@@ -87,6 +131,9 @@ async def translate_text(request: TranslationRequest):
87
  @app.post("/translate-html", response_model=HTMLTranslationResponse)
88
  async def translate_html(request: HTMLTranslationRequest):
89
  """Translate HTML content while preserving structure"""
 
 
 
90
  try:
91
  # Extract text and maintain exact DOM structure
92
  text_fragments, dom_data = html_processor.extract_text(request.html)
@@ -124,6 +171,9 @@ async def process_document(
124
  use_ocr: bool = Form(False)
125
  ):
126
  """Process and translate document (PDF or image)"""
 
 
 
127
  try:
128
  # Read file content
129
  file_content = await file.read()
@@ -157,4 +207,4 @@ async def process_document(
157
  raise HTTPException(status_code=500, detail=str(e))
158
 
159
  if __name__ == "__main__":
160
- uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
 
1
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from pydantic import BaseModel
4
+ from typing import Optional, Dict, Any, List
5
+ import torch
6
+ import os
7
  import logging
8
  import uvicorn
 
 
 
9
 
10
  # Configure logging
11
  logging.basicConfig(
 
30
  allow_headers=["*"],
31
  )
32
 
33
+ # Set environment variables if not already set
34
+ os.environ.setdefault('TRANSFORMERS_CACHE', '/app/.cache')
35
+ os.environ.setdefault('HF_HOME', '/app/.cache')
36
+ os.environ.setdefault('NLTK_DATA', '/app/nltk_data')
37
+
38
+ # Create the cache directories if they do not already exist (no permission bits are changed here)
39
+ os.makedirs(os.environ.get('TRANSFORMERS_CACHE'), exist_ok=True)
40
+ os.makedirs(os.environ.get('NLTK_DATA'), exist_ok=True)
41
+
42
+ try:
43
+ from app.models.text_chunker import TextChunker
44
+ from app.models.html_processor import HTMLProcessor
45
+ from app.models.translation_model import TranslationModel
46
+
47
+ # Initialize components
48
+ text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
49
+ html_processor = HTMLProcessor()
50
+ model = TranslationModel()
51
+
52
+ initialization_error = None
53
+ except Exception as e:
54
+ logger.error(f"Error initializing components: {str(e)}")
55
+ initialization_error = str(e)
56
 
57
  # Define request/response models
58
  class TranslationRequest(BaseModel):
 
74
  @app.get("/")
75
  async def root():
76
  """Health check endpoint"""
77
+ if initialization_error:
78
+ return {
79
+ "status": "error",
80
+ "message": "Service initialization failed",
81
+ "error": initialization_error
82
+ }
83
  return {"status": "ok", "model": "MADLAD-400", "version": "3B"}
84
 
85
+ @app.get("/health")
86
+ async def health_check():
87
+ """Extended health check with environment information"""
88
+ return {
89
+ "status": "ok" if not initialization_error else "error",
90
+ "error": initialization_error,
91
+ "environment": {
92
+ "transformers_cache": os.environ.get('TRANSFORMERS_CACHE'),
93
+ "hf_home": os.environ.get('HF_HOME'),
94
+ "nltk_data": os.environ.get('NLTK_DATA'),
95
+ "python_version": os.environ.get('PYTHON_VERSION'),
96
+ "cuda_available": torch.cuda.is_available() if 'torch' in globals() else "Unknown",
97
+ "device": str(model.device) if 'model' in globals() and hasattr(model, 'device') else "Unknown"
98
+ }
99
+ }
100
+
101
  @app.post("/translate", response_model=TranslationResponse)
102
  async def translate_text(request: TranslationRequest):
103
  """Translate text from source to target language"""
104
+ if initialization_error:
105
+ raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
106
+
107
  try:
108
  # Get chunks using TextChunker
109
  chunks = text_chunker.create_chunks(request.text)
 
131
  @app.post("/translate-html", response_model=HTMLTranslationResponse)
132
  async def translate_html(request: HTMLTranslationRequest):
133
  """Translate HTML content while preserving structure"""
134
+ if initialization_error:
135
+ raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
136
+
137
  try:
138
  # Extract text and maintain exact DOM structure
139
  text_fragments, dom_data = html_processor.extract_text(request.html)
 
171
  use_ocr: bool = Form(False)
172
  ):
173
  """Process and translate document (PDF or image)"""
174
+ if initialization_error:
175
+ raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
176
+
177
  try:
178
  # Read file content
179
  file_content = await file.read()
 
207
  raise HTTPException(status_code=500, detail=str(e))
208
 
209
  if __name__ == "__main__":
210
+ uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
app/models/text_chunker.py CHANGED
@@ -1,16 +1,25 @@
1
  import re
2
  import logging
 
3
  import nltk
4
 
5
  from typing import List, Optional
6
  from dataclasses import dataclass
7
  from nltk.tokenize import sent_tokenize
8
 
 
 
 
 
9
  # Ensure NLTK data is downloaded
10
  try:
11
  nltk.data.find('tokenizers/punkt')
12
  except LookupError:
13
- nltk.download('punkt')
 
 
 
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
@@ -243,4 +252,4 @@ class TextChunker:
243
  overlap = start_text[:length]
244
  break
245
 
246
- return overlap
 
1
  import re
2
  import logging
3
+ import os
4
  import nltk
5
 
6
  from typing import List, Optional
7
  from dataclasses import dataclass
8
  from nltk.tokenize import sent_tokenize
9
 
10
+ # Set NLTK data path from environment variable if available
11
+ nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
12
+ nltk.data.path.append(nltk_data_path)
13
+
14
  # Ensure NLTK data is downloaded
15
  try:
16
  nltk.data.find('tokenizers/punkt')
17
  except LookupError:
18
+ try:
19
+ nltk.download('punkt', download_dir=nltk_data_path)
20
+ except Exception as e:
21
+ logging.warning(f"Failed to download NLTK data: {e}")
22
+ # Fallback to not using NLTK if download fails
23
 
24
  logger = logging.getLogger(__name__)
25
 
 
252
  overlap = start_text[:length]
253
  break
254
 
255
+ return overlap
app/models/translation_model.py CHANGED
@@ -1,6 +1,8 @@
1
  import torch
2
  import logging
3
  import re
 
 
4
  from transformers import T5ForConditionalGeneration, T5Tokenizer
5
 
6
  logger = logging.getLogger(__name__)
@@ -21,7 +23,19 @@ class TranslationModel:
21
  self.model = None
22
  self.tokenizer = None
23
  self.device = self._get_device()
24
- self._load_model()
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def _get_device(self):
27
  """Get the best available device for model inference."""
@@ -39,21 +53,26 @@ class TranslationModel:
39
  """Load the MADLAD-400 3B translation model."""
40
  try:
41
  logger.info(f"Loading translation model: {self.model_name}")
42
- self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
 
 
 
43
 
44
  # Use torch_dtype=torch.bfloat16 if available for faster inference
45
  if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
46
  logger.info("Using bfloat16 precision for model loading")
47
  self.model = T5ForConditionalGeneration.from_pretrained(
48
  self.model_name,
49
- torch_dtype=torch.bfloat16
 
50
  )
51
  else:
52
  dtype = torch.float16 if torch.cuda.is_available() else torch.float32
53
  logger.info(f"Using {dtype} precision for model loading")
54
  self.model = T5ForConditionalGeneration.from_pretrained(
55
  self.model_name,
56
- torch_dtype=dtype
 
57
  )
58
 
59
  self.model.to(self.device)
@@ -75,8 +94,8 @@ class TranslationModel:
75
  Translated text
76
  """
77
  try:
78
- if self.model is None or self.tokenizer is None:
79
- raise ValueError("Translation model not loaded")
80
 
81
  # Prepare input with MADLAD-400 format: <2{target_lang}> {source_text}
82
  input_text = f"<2{target_lang_code}> {text}"
@@ -113,7 +132,6 @@ class TranslationModel:
113
  def process_document(self, file_data: bytes, filename: str, use_ocr: bool = False) -> str:
114
  """
115
  Process document to extract text using PyMuPDF and optional OCR.
116
- This is a simplified version for the API that only returns the extracted text.
117
 
118
  Args:
119
  file_data: Raw file content
@@ -123,10 +141,13 @@ class TranslationModel:
123
  Returns:
124
  Extracted text as string
125
  """
 
 
 
126
  from app.models.document_processor import DocumentProcessor
127
 
128
  # Initialize document processor
129
  doc_processor = DocumentProcessor()
130
 
131
  # Process document and extract text
132
- return doc_processor.process_document(file_data, filename, use_ocr)
 
1
  import torch
2
  import logging
3
  import re
4
+ import os
5
+ from typing import Optional, Dict, Any, List
6
  from transformers import T5ForConditionalGeneration, T5Tokenizer
7
 
8
  logger = logging.getLogger(__name__)
 
23
  self.model = None
24
  self.tokenizer = None
25
  self.device = self._get_device()
26
+ self.initialized = False
27
+ self.initialization_error = None
28
+
29
+ # Ensure cache directory exists and is writable
30
+ cache_dir = os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
31
+ os.makedirs(cache_dir, exist_ok=True)
32
+
33
+ try:
34
+ self._load_model()
35
+ self.initialized = True
36
+ except Exception as e:
37
+ self.initialization_error = str(e)
38
+ logger.error(f"Failed to initialize translation model: {str(e)}")
39
 
40
  def _get_device(self):
41
  """Get the best available device for model inference."""
 
53
  """Load the MADLAD-400 3B translation model."""
54
  try:
55
  logger.info(f"Loading translation model: {self.model_name}")
56
+ self.tokenizer = T5Tokenizer.from_pretrained(
57
+ self.model_name,
58
+ cache_dir=os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
59
+ )
60
 
61
  # Use torch_dtype=torch.bfloat16 if available for faster inference
62
  if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
63
  logger.info("Using bfloat16 precision for model loading")
64
  self.model = T5ForConditionalGeneration.from_pretrained(
65
  self.model_name,
66
+ torch_dtype=torch.bfloat16,
67
+ cache_dir=os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
68
  )
69
  else:
70
  dtype = torch.float16 if torch.cuda.is_available() else torch.float32
71
  logger.info(f"Using {dtype} precision for model loading")
72
  self.model = T5ForConditionalGeneration.from_pretrained(
73
  self.model_name,
74
+ torch_dtype=dtype,
75
+ cache_dir=os.environ.get('TRANSFORMERS_CACHE', '/app/.cache')
76
  )
77
 
78
  self.model.to(self.device)
 
94
  Translated text
95
  """
96
  try:
97
+ if not self.initialized:
98
+ raise ValueError(f"Translation model not properly initialized: {self.initialization_error}")
99
 
100
  # Prepare input with MADLAD-400 format: <2{target_lang}> {source_text}
101
  input_text = f"<2{target_lang_code}> {text}"
 
132
  def process_document(self, file_data: bytes, filename: str, use_ocr: bool = False) -> str:
133
  """
134
  Process document to extract text using PyMuPDF and optional OCR.
 
135
 
136
  Args:
137
  file_data: Raw file content
 
141
  Returns:
142
  Extracted text as string
143
  """
144
+ if not self.initialized:
145
+ raise ValueError(f"Translation model not properly initialized: {self.initialization_error}")
146
+
147
  from app.models.document_processor import DocumentProcessor
148
 
149
  # Initialize document processor
150
  doc_processor = DocumentProcessor()
151
 
152
  # Process document and extract text
153
+ return doc_processor.process_document(file_data, filename, use_ocr)
fix_permissions.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Script to fix permissions in the container
4
+ set -e
5
+
6
+ echo "Setting up permissions for Universal Translator API..."
7
+
8
+ # Ensure directories exist
9
+ mkdir -p /app/.cache
10
+ mkdir -p /app/nltk_data
11
+
12
+ # Set permissions
13
+ chmod -R 777 /app/.cache
14
+ chmod -R 777 /app/nltk_data
15
+
16
+ echo "Permissions setup complete!"
17
+
18
+ # Download the NLTK punkt data into /app/nltk_data
19
+ python -c "import nltk; nltk.download('punkt', download_dir='/app/nltk_data')"
20
+
21
+ echo "NLTK data verification complete!"