import logging
import os
import secrets
import shutil
import tempfile
import time
import uuid
from contextvars import ContextVar
from enum import Enum
from pathlib import Path
from typing import Optional, List

import pytesseract
import uvicorn
from dotenv import load_dotenv
from fastapi import (
    FastAPI, File, UploadFile, Depends,
    HTTPException, Request, status
)
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pdf2image import convert_from_path
from PIL import Image
from pydantic import BaseModel
| |
|
| | |
| | |
| | |
# Load variables from a .env file (if any) before Config reads the environment.
load_dotenv()

# Correlation id for the request currently being handled; "system" is the
# value seen by log records emitted outside of any request.
request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")
| |
|
class Config:
    """Application settings, resolved from environment variables at import time."""

    # Title passed to the FastAPI application.
    APP_NAME = os.getenv("APP_NAME", "OCR API")
    # Bearer token clients must present; None when the variable is unset.
    API_TOKEN = os.getenv("API_BEARER_TOKEN")
    # Largest accepted upload in bytes (default 52428800 = 50 MiB).
    MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800))
    # Comma-separated origins, trimmed, with empty entries discarded.
    ALLOWED_ORIGINS = list(
        filter(None, map(str.strip, os.getenv("ALLOWED_ORIGINS", "").split(",")))
    )
    # Content types accepted for OCR.
    ALLOWED_TYPES = [
        "image/jpeg",
        "image/png",
        "image/bmp",
        "image/webp",
        "application/pdf",
    ]
| |
|
class RequestIdFilter(logging.Filter):
    """Logging filter that stamps each record with the current request id."""

    def filter(self, record):
        """Set ``record.request_id`` from the context variable; never drops a record."""
        current_id = request_id_ctx.get()
        record.request_id = current_id
        return True
| |
|
# The root handler's format references %(request_id)s, so every record that
# reaches it must carry that attribute.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True
)

# A filter attached to a logger only runs for records created on that exact
# logger. Records from other loggers that propagate to the root handlers would
# lack `request_id` and fail to format, so the filter is installed on the
# handlers themselves.
_request_id_filter = RequestIdFilter()
for _handler in logging.getLogger().handlers:
    _handler.addFilter(_request_id_filter)

logger = logging.getLogger("ocr_api")
# Kept on the application logger too, so its records are tagged even if
# additional handlers are attached later.
logger.addFilter(_request_id_filter)
| |
|
| | |
| | |
| | |
# Outcome marker carried in the "status" field of API responses.
# Documented with comments rather than a docstring so the generated schema
# description is unchanged.
class StatusEnum(str, Enum):
    SUCCESS = "success"  # request handled without error
    ERROR = "error"      # request failed
| |
|
# Envelope fields shared by API responses.
# Documented with comments rather than a docstring so the generated schema
# description is unchanged.
class BaseResponse(BaseModel):
    # Correlation id of the request.
    request_id: str
    # Handling time in milliseconds.
    process_time_ms: float
    # Overall outcome.
    status: StatusEnum
    # Optional human-readable note.
    message: Optional[str] = None
| |
|
# OCR output for a single page.
# Documented with comments rather than a docstring so the generated schema
# description is unchanged.
class PageResult(BaseModel):
    # Zero-based position of the page.
    index: int
    # One-based page number.
    page_number: int
    # Extracted text.
    text: str
| |
|
# OCR output for one uploaded file.
# Documented with comments rather than a docstring so the generated schema
# description is unchanged.
class OCRResult(BaseModel):
    # Name of the uploaded file as sent by the client.
    filename: str
    # Content type declared for the upload.
    content_type: str
    # Path of the temporary copy the server worked on.
    saved_file_path: str
    # Number of entries in pages_content.
    total_pages: int
    # Per-page extraction results.
    pages_content: List[PageResult]
| |
|
# Response model of the OCR endpoint: the shared envelope plus either a result
# payload or an error description.
# Documented with comments rather than a docstring so the generated schema
# description is unchanged.
class APIResponse(BaseResponse):
    # OCR result; absent on failure.
    data: Optional[OCRResult] = None
    # Failure description; absent on success.
    error_message: Optional[str] = None
| |
|
| | |
| | |
| | |
| |
|
class SecurityService:
    """Bearer-token authentication for protected endpoints."""

    # Extracts the credentials from the Authorization header.
    security_scheme = HTTPBearer()

    @staticmethod
    async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
        """Check the presented bearer token against ``Config.API_TOKEN``.

        Args:
            credentials: Parsed Authorization header supplied by the
                ``HTTPBearer`` dependency.

        Returns:
            The presented token string when it matches the configured one.

        Raises:
            HTTPException: 401 when the token does not match, or when no
                token is configured on the server (fails closed).
        """
        expected = Config.API_TOKEN
        presented = credentials.credentials

        # Compare in constant time so response latency does not reveal how
        # much of the token was correct. Bytes are used because
        # compare_digest only accepts ASCII-only str arguments.
        token_ok = bool(expected) and secrets.compare_digest(
            presented.encode("utf-8"), expected.encode("utf-8")
        )
        if not token_ok:
            logger.warning("Auth Failed: Invalid Token")
            raise HTTPException(status_code=401, detail="Invalid Bearer Token")
        return presented
| |
|
class FileValidator:
    """Upload checks: content-type allow-list and a size-capped copy to disk."""

    @staticmethod
    def validate(file: UploadFile):
        """Reject uploads whose declared content type is not allowed.

        Raises:
            HTTPException: 400 when ``file.content_type`` is not listed in
                ``Config.ALLOWED_TYPES``.
        """
        if file.content_type not in Config.ALLOWED_TYPES:
            raise HTTPException(400, f"Invalid file type: {file.content_type}")

    @staticmethod
    def check_size_and_save(file: UploadFile) -> str:
        """Copy the upload to a temporary file, enforcing ``Config.MAX_SIZE``.

        The copy is streamed in chunks and aborted as soon as the limit is
        exceeded, so an oversized upload never occupies more than roughly
        ``Config.MAX_SIZE`` bytes of disk.

        Returns:
            Absolute path of the temporary file. It is created with
            ``delete=False``; the caller is responsible for removing it.

        Raises:
            HTTPException: 413 when the upload is larger than
                ``Config.MAX_SIZE`` bytes.
        """
        chunk_size = 1024 * 1024
        # The client may omit the filename; fall back to no suffix.
        suffix = Path(file.filename or "").suffix

        buffer = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp_path = os.path.abspath(buffer.name)
        try:
            with buffer:
                written = 0
                while chunk := file.file.read(chunk_size):
                    written += len(chunk)
                    if written > Config.MAX_SIZE:
                        raise HTTPException(413, "File too large")
                    buffer.write(chunk)
        except Exception:
            # The with-block has already closed the handle; remove the partial
            # file so a failed or rejected upload leaves nothing behind.
            os.remove(tmp_path)
            raise
        return tmp_path
| |
|
class OCRProcessor:
    """Runs Tesseract OCR over an image file or over each page of a PDF."""

    @classmethod
    def process_file(cls, file_path: str, content_type: str) -> dict:
        """Extract text from the file at ``file_path``.

        No request id is passed in; the logger picks it up from the request
        context variable.

        Args:
            file_path: Path of the file to read.
            content_type: ``"application/pdf"`` selects page-by-page PDF
                handling; any other value is treated as a single image.

        Returns:
            A dict with ``total_pages`` (int) and ``pages_content`` (a list of
            dicts with ``index``, ``page_number`` and ``text``).

        Raises:
            ValueError: When conversion or OCR fails; the original exception
                is attached as ``__cause__``.
        """
        start = time.perf_counter()
        pages_content = []

        try:
            logger.info(f"Processing File: {file_path}")

            if content_type == "application/pdf":
                logger.info("Converting PDF to Images...")
                images = convert_from_path(file_path)
                total = len(images)

                for page_num, img in enumerate(images, start=1):
                    logger.info(f"Scanning Page {page_num}/{total}")
                    text = pytesseract.image_to_string(img).strip()
                    pages_content.append(
                        {"index": page_num - 1, "page_number": page_num, "text": text}
                    )
            else:
                logger.info("Scanning Single Image...")
                # Image.open keeps the file handle open until the image is
                # closed; the context manager releases it once OCR is done.
                with Image.open(file_path) as img:
                    text = pytesseract.image_to_string(img).strip()
                pages_content.append({"index": 0, "page_number": 1, "text": text})

            logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms")
            return {"total_pages": len(pages_content), "pages_content": pages_content}

        except Exception as e:
            # logger.exception records the traceback; chaining keeps the root
            # cause available to callers that inspect the ValueError.
            logger.exception(f"OCR Logic Failure: {str(e)}")
            raise ValueError(str(e)) from e
| |
|
| | |
| | |
| | |
app = FastAPI(title=Config.APP_NAME)

# CORS: use the configured origins, or allow every origin when none are set.
app.add_middleware(
    CORSMiddleware,
    allow_origins=Config.ALLOWED_ORIGINS or ["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
| |
|
@app.middleware("http")
async def request_context_middleware(request: Request, call_next):
    """Assign a request id, log start/finish, and convert crashes into a 500.

    A fresh UUID is stored in ``request_id_ctx`` and on
    ``request.state.request_id`` for the duration of the request, and is
    returned to the client in the ``X-Request-ID`` header on both successful
    and error responses.
    """
    req_id = str(uuid.uuid4())
    token = request_id_ctx.set(req_id)
    request.state.request_id = req_id

    start_time = time.perf_counter()
    logger.info(f"Start: {request.method} {request.url.path}")

    try:
        response = await call_next(request)
        duration = (time.perf_counter() - start_time) * 1000
        logger.info(f"Finish: {response.status_code} in {duration:.2f}ms")
    except Exception:
        logger.exception("Middleware caught crash")
        response = JSONResponse(
            status_code=500,
            content={"status": "error", "message": "Internal Server Error", "request_id": req_id},
        )
    finally:
        # Restore the previous context value even when the request fails.
        request_id_ctx.reset(token)

    # Set on every response, including the 500 above, so clients can always
    # correlate a reply with the server logs.
    response.headers["X-Request-ID"] = req_id
    return response
| |
|
| | |
| | |
| | |
| |
|
@app.get("/")
async def root(request: Request):
    # Status endpoint: reports that the API is up, echoing the request id.
    # Documented with a comment rather than a docstring so the generated
    # OpenAPI description is unchanged.
    payload = dict(
        request_id=request.state.request_id,
        process_time_ms=0,
        status=StatusEnum.SUCCESS,
        message="OCR API Active",
    )
    return payload
| |
|
@app.post("/api/v1/get_data", response_model=APIResponse)
async def extract_data(
    request: Request,
    file: UploadFile = File(...),
    token: str = Depends(SecurityService.validate_token)
):
    """Run OCR on an uploaded image or PDF and return the text of each page."""
    start_ts = time.perf_counter()
    tmp_path = None
    req_id = request.state.request_id

    def error_response(status_code, detail):
        # Error envelope shared by the rejection and failure paths.
        return JSONResponse(
            status_code=status_code,
            content={
                "request_id": req_id,
                "process_time_ms": (time.perf_counter() - start_ts) * 1000,
                "status": StatusEnum.ERROR.value,
                "error_message": detail,
            },
        )

    try:
        FileValidator.validate(file)
        # Copying the upload to disk is blocking I/O; run it in the thread
        # pool so the event loop stays responsive.
        tmp_path = await run_in_threadpool(FileValidator.check_size_and_save, file)

        # OCR is slow and blocking as well, so it is offloaded the same way.
        result = await run_in_threadpool(
            OCRProcessor.process_file,
            tmp_path,
            file.content_type
        )

        return {
            "request_id": req_id,
            "process_time_ms": (time.perf_counter() - start_ts) * 1000,
            "status": StatusEnum.SUCCESS,
            "message": "OCR Extraction Successful",
            "data": {
                "filename": file.filename,
                "content_type": file.content_type,
                # NOTE: the temporary file is deleted in the finally block
                # below, so this path no longer exists once the client
                # receives it. Kept because the response model requires it.
                "saved_file_path": tmp_path,
                "total_pages": result["total_pages"],
                "pages_content": result["pages_content"]
            }
        }

    except HTTPException as e:
        # Validation failures (bad type, too large) keep their own status.
        logger.error(f"Request failed: {str(e.detail)}")
        return error_response(e.status_code, e.detail)
    except Exception as e:
        logger.exception(f"Request failed: {str(e)}")
        return error_response(500, str(e))
    finally:
        if tmp_path:
            try:
                os.remove(tmp_path)
                logger.info(f"Temporary file deleted: {tmp_path}")
            except OSError as e:
                logger.warning(f"Failed to delete temp file: {str(e)}")