Soumik Bose committed on
Commit
b76793f
·
0 Parent(s):

Initial commit

Browse files
Files changed (3) hide show
  1. Dockerfile +45 -0
  2. main.py +322 -0
  3. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# UPDATE: Changed from 3.9 to 3.11 to support newer NumPy/Pandas versions
FROM python:3.11-slim

# Set the working directory in the container
WORKDIR /app

# Install system dependencies
#   - tesseract-ocr + language packs: OCR engine driven by pytesseract
#   - poppler-utils: required for pdf2image
#   - libgl1 & libglib2.0-0: required for OpenCV headless
# curl is not installed; the healthcheck below uses python requests instead.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-deu \
    tesseract-ocr-fra \
    tesseract-ocr-spa \
    tesseract-ocr-por \
    tesseract-ocr-ita \
    tesseract-ocr-rus \
    tesseract-ocr-chi-sim \
    tesseract-ocr-jpn \
    tesseract-ocr-kor \
    poppler-utils \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy only requirements.txt first so the dependency layer stays cached
# until the requirements change.
COPY requirements.txt .
# Upgrade pip, setuptools, and wheel before installing deps
RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container
# NOTE(review): this also copies a local .env (API_BEARER_TOKEN) into the image
# unless a .dockerignore excludes it - confirm one exists.
COPY . .

# Create a non-root user for security (Production Best Practice)
RUN useradd -m appuser && chown -R appuser /app
USER appuser

# Healthcheck as requested
# raise_for_status() makes the probe fail on HTTP 4xx/5xx; a bare requests.get()
# only fails on connection errors, so an erroring app would still look healthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:8000/api/v1/ping', timeout=5).raise_for_status()" || exit 1

# Run the application
CMD ["python", "main.py"]
main.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import contextvars
import hmac
import logging
import os
import shutil
import sys
import tempfile
import time
import uuid
from enum import Enum
from pathlib import Path
from typing import Optional, List

# Third-party imports
import pytesseract
import uvicorn
from dotenv import load_dotenv
from fastapi import (
    FastAPI, File, UploadFile, Depends,
    HTTPException, Request, status
)
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pdf2image import convert_from_path
from PIL import Image
from pydantic import BaseModel
27
+
28
+ # ==========================================
29
+ # 1. CONFIGURATION & ENV LOADING
30
+ # ==========================================
31
load_dotenv()


class Config:
    """Application settings resolved from environment variables at import time."""

    APP_NAME = os.getenv("APP_NAME", "OCR API")
    # Bearer token clients must present; None when the variable is unset.
    API_TOKEN = os.getenv("API_BEARER_TOKEN")
    # Upload size ceiling in bytes (the default 52428800 equals 50 * 1024 * 1024).
    MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800))

    # ALLOWED_ORIGINS is a comma-separated string; entries are stripped and
    # empty entries are dropped. An unset or empty variable yields [].
    allowed_origins_raw = os.getenv("ALLOWED_ORIGINS")
    ALLOWED_ORIGINS = (
        list(filter(None, map(str.strip, allowed_origins_raw.split(","))))
        if allowed_origins_raw
        else []
    )
    # Content types accepted by the upload endpoint.
    ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]


# Startup validation: flag a missing token as soon as the module is imported.
if not Config.API_TOKEN:
    print("CRITICAL WARNING: API_BEARER_TOKEN is not set in .env")
46
+
47
+ # ==========================================
48
+ # 2. LOGGING SETUP
49
+ # ==========================================
50
class RequestIdFilter(logging.Filter):
    """Logging filter that guarantees every record has a ``request_id``.

    The log format references ``%(request_id)s``; records that do not already
    carry the attribute are tagged with ``'system'``.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        """Set a default ``request_id`` when absent; never drops the record."""
        if not hasattr(record, 'request_id'):
            record.request_id = 'system'
        return True
55
+
56
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

_request_id_filter = RequestIdFilter()

# A filter attached to a logger only sees records logged on that logger.
# Records from other loggers still reach the root handlers, whose format needs
# %(request_id)s, so the filter is attached to those handlers as well.
for _root_handler in logging.getLogger().handlers:
    _root_handler.addFilter(_request_id_filter)

logger = logging.getLogger("ocr_api")
logger.addFilter(_request_id_filter)
63
+
64
+ # ==========================================
65
+ # 3. PYDANTIC MODELS
66
+ # ==========================================
67
class StatusEnum(str, Enum):
    """Outcome marker used in API response bodies.

    Members are also ``str`` instances, so they compare equal to their raw
    string values.
    """

    SUCCESS = "success"
    ERROR = "error"
70
+
71
class BaseResponse(BaseModel):
    """Fields shared by every response body."""

    # Identifier assigned to the request by the HTTP middleware.
    request_id: str
    # Handling time in milliseconds.
    process_time_ms: float
    # Success/error marker.
    status: StatusEnum
    # Optional human-readable note.
    message: Optional[str] = None
76
+
77
class OCRResult(BaseModel):
    """Extraction result for a single uploaded document."""

    # Name of the uploaded file as sent by the client.
    filename: str
    # Content type declared for the upload.
    content_type: str
    # Number of pages processed.
    pages: int
    # Extracted text.
    text: str
82
+
83
class APIResponse(BaseResponse):
    """Response body of the OCR endpoint: common fields plus result or error."""

    # Populated when extraction succeeds.
    data: Optional[OCRResult] = None
    # Populated when extraction fails.
    error_message: Optional[str] = None
86
+
87
+ # ==========================================
88
+ # 4. BUSINESS LOGIC SERVICES
89
+ # ==========================================
90
+
91
class SecurityService:
    """Bearer-token authentication used as a FastAPI dependency."""

    security_scheme = HTTPBearer()

    @staticmethod
    async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
        """
        Validates the Bearer token against ``Config.API_TOKEN``.

        Returns:
            The presented token string when it matches.

        Raises:
            HTTPException: 401 when the token does not match, or when no
                token is configured on the server (fail closed).
        """
        expected = Config.API_TOKEN
        presented = credentials.credentials

        # compare_digest avoids leaking how many leading characters matched
        # through response timing. Both sides are encoded to bytes because the
        # str form only accepts ASCII input.
        authorized = bool(expected) and hmac.compare_digest(
            presented.encode("utf-8"), expected.encode("utf-8")
        )
        if not authorized:
            # No part of the presented token is logged: a near-miss would
            # otherwise put a prefix of the real secret into the logs.
            logger.warning("Auth Failed. Invalid bearer token presented.")
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid Bearer Token",
                headers={"WWW-Authenticate": "Bearer"},
            )
        return presented
107
+
108
class FileValidator:
    """Upload checks: content-type allow-list and size-capped save to disk."""

    # Bytes read per iteration while copying the upload to disk.
    _CHUNK_SIZE = 1024 * 1024

    @staticmethod
    def validate(file: UploadFile):
        """Reject uploads whose declared content type is not allowed.

        Raises:
            HTTPException: 400 when ``file.content_type`` is not listed in
                ``Config.ALLOWED_TYPES``.
        """
        if file.content_type not in Config.ALLOWED_TYPES:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid file type. Allowed: {Config.ALLOWED_TYPES}"
            )

    @staticmethod
    def _discard(path: Optional[str]) -> None:
        """Best-effort removal of a partially written temp file."""
        if not path:
            return
        try:
            os.remove(path)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove temp file {path}: {cleanup_error}")

    @staticmethod
    def check_size_and_save(file: UploadFile) -> str:
        """
        Stream file to disk in chunks, enforcing the size limit while copying.
        Returns path to temp file; the caller is responsible for deleting it.

        The copy stops as soon as more than ``Config.MAX_SIZE`` bytes have been
        read, so an oversized upload is never written to disk in full.

        Raises:
            HTTPException: 413 when the upload exceeds ``Config.MAX_SIZE``;
                500 when the file cannot be written.
        """
        tmp_path = None
        try:
            # filename may be None; fall back to no suffix instead of failing.
            suffix = Path(file.filename or "").suffix
            # Create a named temp file that persists so Tesseract can read it
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
                tmp_path = buffer.name
                bytes_written = 0
                for chunk in iter(lambda: file.file.read(FileValidator._CHUNK_SIZE), b""):
                    bytes_written += len(chunk)
                    if bytes_written > Config.MAX_SIZE:
                        raise HTTPException(
                            status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                            detail=f"File size exceeds limit of {Config.MAX_SIZE / (1024*1024)}MB"
                        )
                    buffer.write(chunk)

            return tmp_path
        except HTTPException:
            # The temp file is closed by now (the with-block has exited).
            FileValidator._discard(tmp_path)
            raise
        except Exception as e:
            FileValidator._discard(tmp_path)
            logger.error(f"File save error: {e}")
            raise HTTPException(status.HTTP_500_INTERNAL_SERVER_ERROR, "File upload failed") from e
145
+
146
class OCRProcessor:
    """Runs Tesseract over an image file or over each page of a PDF."""

    @classmethod
    def process_file(cls, file_path: str, content_type: str) -> dict:
        """
        Heavy CPU Logic: extract text from the file at ``file_path``.

        Args:
            file_path: Path of the saved upload on disk.
            content_type: ``"application/pdf"`` selects the PDF branch; any
                other value is opened as a single image.

        Returns:
            ``{"pages": <int>, "text": <str>}``. For PDFs the text of each page
            is prefixed with a ``--- Page N ---`` header.

        Raises:
            ValueError: when conversion or OCR fails for any reason; the
                original exception is chained as the cause.
        """
        start = time.perf_counter()
        text = ""
        pages = 1

        try:
            if content_type == "application/pdf":
                # Convert PDF to images
                images = convert_from_path(file_path)
                pages = len(images)
                # Extract text from each page
                page_texts = []
                for page_number, page_image in enumerate(images, start=1):
                    try:
                        page_text = pytesseract.image_to_string(page_image)
                    finally:
                        # Release the page bitmap as soon as it has been read.
                        page_image.close()
                    page_texts.append(f"--- Page {page_number} ---\n{page_text}")
                text = "\n\n".join(page_texts)
            else:
                # Standard Image; the context manager closes the file handle.
                with Image.open(file_path) as image:
                    text = pytesseract.image_to_string(image)

            duration = (time.perf_counter() - start) * 1000
            logger.info(f"OCR CPU Engine finished in {duration:.2f}ms")

            return {"pages": pages, "text": text}
        except Exception as e:
            logger.error(f"OCR Extraction Error: {str(e)}")
            raise ValueError("Failed to extract text from document") from e
177
+
178
+ # ==========================================
179
+ # 5. FASTAPI APP INIT
180
+ # ==========================================
181
+
182
# Interactive docs stay at /docs (set docs_url=None to disable them in prod);
# ReDoc is turned off.
app = FastAPI(
    redoc_url=None,
    docs_url="/docs",
    version="1.0.0",
    title=Config.APP_NAME,
)

# STRICT CORS CONFIGURATION
# Origins come from the environment; only GET/POST and the listed headers
# are permitted.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
    allow_methods=["GET", "POST"],
    allow_credentials=True,
    allow_origins=Config.ALLOWED_ORIGINS,
)
197
+
198
+ # Middleware: Request ID & Logging
199
# Request id for the request currently being handled. A ContextVar keeps
# concurrent requests from seeing each other's value; outside any request the
# default 'system' applies.
_request_id_ctx = contextvars.ContextVar("request_id", default="system")


def _install_request_id_record_factory():
    """Wrap the log-record factory once so every record carries ``request_id``.

    Installing the wrapper per request would nest one more wrapper each time,
    growing without bound until record creation hits the recursion limit.
    """
    base_factory = logging.getLogRecordFactory()

    def record_factory(*args, **kwargs):
        record = base_factory(*args, **kwargs)
        record.request_id = _request_id_ctx.get()
        return record

    logging.setLogRecordFactory(record_factory)


_install_request_id_record_factory()


@app.middleware("http")
async def request_context_middleware(request: Request, call_next):
    """Assign a request id, log the request/response, and add timing headers.

    Sets ``request.state.request_id`` for handlers and adds ``X-Request-ID``
    and ``X-Process-Time`` headers to the response. Any exception escaping the
    downstream app is logged and turned into a JSON 500 response.
    """
    req_id = str(uuid.uuid4())
    request.state.request_id = req_id

    # Inject ID into logger for the duration of this request only
    ctx_token = _request_id_ctx.set(req_id)

    start_time = time.perf_counter()
    try:
        logger.info(f"Incoming Request: {request.method} {request.url.path} | Origin: {request.headers.get('origin', 'unknown')}")

        response = await call_next(request)
        process_time = (time.perf_counter() - start_time) * 1000
        response.headers["X-Request-ID"] = req_id
        response.headers["X-Process-Time"] = f"{process_time:.2f}ms"
        logger.info(f"Response: {response.status_code} | Time: {process_time:.2f}ms")
        return response
    except Exception:
        logger.exception("Unhandled Exception in Middleware")
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "Internal Server Error", "request_id": req_id},
            headers={"X-Request-ID": req_id},
        )
    finally:
        # Restore the previous value so later log lines are not mislabeled.
        _request_id_ctx.reset(ctx_token)
228
+
229
+ # ==========================================
230
+ # 6. ENDPOINTS
231
+ # ==========================================
232
+
233
@app.get("/", response_model=BaseResponse)
async def root(request: Request):
    """Simple connectivity check."""
    payload = dict(
        request_id=request.state.request_id,
        process_time_ms=0,
        status=StatusEnum.SUCCESS,
        message="OCR API is running.",
    )
    return payload
242
+
243
@app.get("/api/v1/ping", response_model=BaseResponse)
async def health_check(request: Request):
    """Docker Healthcheck Endpoint."""
    payload = dict(
        request_id=request.state.request_id,
        process_time_ms=0,
        status=StatusEnum.SUCCESS,
        message="OCR API is healthy.",
    )
    return payload
252
+
253
@app.post("/api/v1/get_data", response_model=APIResponse)
async def extract_data(
    request: Request,
    file: UploadFile = File(...),
    token: str = Depends(SecurityService.validate_token)
):
    """
    Main OCR Endpoint.
    Non-blocking: offloads both the file save and the OCR to the threadpool.

    Returns an ``APIResponse`` payload. OCR failures (``ValueError``) and
    unexpected errors are reported in the body with ``status == "error"``;
    ``HTTPException`` raised by validation propagates unchanged.
    """
    start_ts = time.perf_counter()
    tmp_file_path = None

    def error_payload(detail: str) -> dict:
        """Build the error-shaped response body with the elapsed time."""
        return {
            "request_id": request.state.request_id,
            "process_time_ms": (time.perf_counter() - start_ts) * 1000,
            "status": StatusEnum.ERROR,
            "error_message": detail
        }

    try:
        # 1. Validate File Type
        FileValidator.validate(file)

        # 2. Save File (IO Bound) - blocking disk writes must not run on the
        #    event loop, so they are moved to the threadpool as well.
        tmp_file_path = await run_in_threadpool(
            FileValidator.check_size_and_save,
            file
        )

        # 3. Process (CPU Bound) - Run in ThreadPool for Non-Blocking
        result = await run_in_threadpool(
            OCRProcessor.process_file,
            tmp_file_path,
            file.content_type
        )

        return {
            "request_id": request.state.request_id,
            "process_time_ms": (time.perf_counter() - start_ts) * 1000,
            "status": StatusEnum.SUCCESS,
            "message": "OCR Extraction Successful",
            "data": {
                "filename": file.filename,
                "content_type": file.content_type,
                "pages": result["pages"],
                "text": result["text"]
            }
        }

    except HTTPException:
        raise
    except ValueError as ve:
        # OCR logic errors
        return error_payload(str(ve))
    except Exception as e:
        # logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Unexpected API Error: {e}")
        return error_payload("An unexpected error occurred.")
    finally:
        # Cleanup temp file
        if tmp_file_path and os.path.exists(tmp_file_path):
            try:
                os.remove(tmp_file_path)
                logger.info("Temp file deleted.")
            except OSError as cleanup_error:
                # Still best-effort, but leave a trace so leaks are visible.
                logger.warning(f"Temp file cleanup failed: {cleanup_error}")
319
+
320
if __name__ == "__main__":
    # Single in-process worker; scaling out is left to Gunicorn or the
    # container orchestrator.
    uvicorn.run(
        "main:app",
        workers=1,
        port=8000,
        host="0.0.0.0",
    )
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Web framework, ASGI server, multipart upload parsing, .env loading
fastapi>=0.100
uvicorn[standard]>=0.30
python-multipart>=0.0.12
python-dotenv>=1.0
# NOTE(review): aiohttp, pypdf, opencv-python-headless and numpy are not
# imported by main.py in this commit - confirm they are still needed.
aiohttp==3.11.13
# requests is used by the Dockerfile HEALTHCHECK command
requests==2.32.3
pypdf==5.1.0
# OCR and PDF rendering
pytesseract==0.3.13
opencv-python-headless==4.12.0.88
numpy<2.3.0
pdf2image==1.17.0
Pillow==11.2.1