Spaces:

xce009
/

ocr-api

Running

ocr-api / main.py

Soumik Bose

8d66792 about 1 month ago

8.62 kB

	import os
	import uuid
	import time
	import logging
	import shutil
	import tempfile
	from typing import Optional, List
	from enum import Enum
	from pathlib import Path
	from contextvars import ContextVar
	import uvicorn
	import pytesseract
	from fastapi import (
	FastAPI, File, UploadFile, Depends,
	HTTPException, Request, status
	)
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
	from fastapi.responses import JSONResponse
	from fastapi.concurrency import run_in_threadpool
	from pydantic import BaseModel
	from dotenv import load_dotenv
	from PIL import Image
	from pdf2image import convert_from_path

	# ==========================================
	# 1. CONFIGURATION & LOGGING SETUP
	# ==========================================
	# Load settings from a .env file (when present) into the environment before
	# the Config class below reads them.
	load_dotenv()

	# Context variable carrying the ID of the request currently being handled.
	# Log records read it through RequestIdFilter; the value is "system" whenever
	# no request has set one.
	request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")

	class Config:
	    """Application settings, read once from environment variables at import time."""

	    APP_NAME = os.environ.get("APP_NAME", "OCR API")
	    # Expected bearer token; remains None when API_BEARER_TOKEN is not set.
	    API_TOKEN = os.environ.get("API_BEARER_TOKEN")
	    # Upload size ceiling in bytes; the default is 50 MiB.
	    MAX_SIZE = int(os.environ.get("MAX_FILE_SIZE", 52428800))
	    # Comma-separated origins with surrounding whitespace and empty entries removed.
	    ALLOWED_ORIGINS = [
	        origin
	        for origin in map(str.strip, os.environ.get("ALLOWED_ORIGINS", "").split(","))
	        if origin
	    ]
	    # Content types accepted by the upload validator.
	    ALLOWED_TYPES = [
	        "image/jpeg",
	        "image/png",
	        "image/bmp",
	        "image/webp",
	        "application/pdf",
	    ]

	class RequestIdFilter(logging.Filter):
	    """Logging filter that stamps every record with the active request ID."""

	    def filter(self, record):
	        # The ID comes from the context variable, so call sites never pass it.
	        current_id = request_id_ctx.get()
	        record.request_id = current_id
	        # Always accept: this filter annotates records, it never drops them.
	        return True

	# Configure the root logger. force=True replaces any handlers installed before
	# this module was imported, so this format is the one actually used.
	logging.basicConfig(
	    level=logging.INFO,
	    # Plain "|" separators: "\|" is an invalid escape sequence and would print
	    # a literal backslash in every log line.
	    format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
	    datefmt='%Y-%m-%d %H:%M:%S',
	    force=True,
	)

	# The format above needs `request_id` on EVERY record that reaches the root
	# handlers, including records emitted by third-party loggers. A filter attached
	# only to the "ocr_api" logger leaves those records without the attribute and
	# makes their formatting fail, so the filter is attached to the handlers too.
	_request_id_filter = RequestIdFilter()
	for _handler in logging.getLogger().handlers:
	    _handler.addFilter(_request_id_filter)

	logger = logging.getLogger("ocr_api")
	logger.addFilter(_request_id_filter)

	# ==========================================
	# 2. MODELS
	# ==========================================
	# Closed set of outcome markers used in the `status` field of responses.
	class StatusEnum(str, Enum):
	    # The str mixin makes members compare equal to their plain string values.
	    SUCCESS = "success"
	    ERROR = "error"

	# Envelope fields shared by the API's response model.
	class BaseResponse(BaseModel):
	    # ID assigned to the request by the HTTP middleware.
	    request_id: str
	    # Elapsed handling time in milliseconds.
	    process_time_ms: float
	    # Outcome marker: success or error.
	    status: StatusEnum
	    # Optional human-readable note about the outcome.
	    message: Optional[str] = None

	# OCR output for one page.
	class PageResult(BaseModel):
	    # Zero-based position of the page in the document.
	    index: int
	    # One-based page number (index + 1).
	    page_number: int
	    # Recognized text with surrounding whitespace stripped.
	    text: str

	# Payload returned for a successfully processed upload.
	class OCRResult(BaseModel):
	    # Client-supplied name of the uploaded file.
	    filename: str
	    # Content type declared by the client for the upload.
	    content_type: str
	    # Path of the temporary copy that was used for OCR.
	    saved_file_path: str
	    # Number of entries in pages_content.
	    total_pages: int
	    # Per-page OCR text, in document order.
	    pages_content: List[PageResult]

	# Full response envelope: the shared base fields plus a result or an error text.
	class APIResponse(BaseResponse):
	    # OCR result; left as None when there is nothing to report.
	    data: Optional[OCRResult] = None
	    # Failure description; left as None on success.
	    error_message: Optional[str] = None

	# ==========================================
	# 3. SERVICES
	# ==========================================

	class SecurityService:
	    """Bearer-token authentication for protected endpoints."""

	    # Extracts the `Authorization: Bearer <token>` credentials for the dependency below.
	    security_scheme = HTTPBearer()

	    @staticmethod
	    async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
	        """Return the presented token when it matches ``Config.API_TOKEN``.

	        Raises:
	            HTTPException: 401 when the token does not match, including the
	                case where no token is configured on the server.
	        """
	        import secrets  # function-scope import keeps this block self-contained

	        presented = credentials.credentials
	        expected = Config.API_TOKEN

	        # Constant-time comparison so response timing does not reveal how much of
	        # the token matched. Both sides are encoded first because compare_digest
	        # raises TypeError for non-ASCII str arguments. An unset or empty server
	        # token never matches, as before.
	        token_ok = bool(expected) and secrets.compare_digest(
	            presented.encode("utf-8"), expected.encode("utf-8")
	        )
	        if not token_ok:
	            logger.warning("Auth Failed: Invalid Token")
	            raise HTTPException(
	                status_code=401,
	                detail="Invalid Bearer Token",
	                # Tells the client which authentication scheme is expected.
	                headers={"WWW-Authenticate": "Bearer"},
	            )
	        return presented

	class FileValidator:
	    """Upload checks: declared content type and size limit."""

	    @staticmethod
	    def validate(file: UploadFile):
	        """Reject uploads whose declared content type is not allowed.

	        Raises:
	            HTTPException: 400 when ``file.content_type`` is not listed in
	                ``Config.ALLOWED_TYPES``.
	        """
	        if file.content_type not in Config.ALLOWED_TYPES:
	            raise HTTPException(400, f"Invalid file type: {file.content_type}")

	    @staticmethod
	    def check_size_and_save(file: UploadFile) -> str:
	        """Copy the upload to a temporary file and return its absolute path.

	        The copy is streamed in chunks and aborted as soon as it exceeds
	        ``Config.MAX_SIZE``, so an oversized upload is never written out in
	        full. On success the caller owns the returned file and must delete it;
	        on any failure the partial file is removed here.

	        Raises:
	            HTTPException: 413 when the upload is larger than ``Config.MAX_SIZE``.
	        """
	        chunk_size = 1024 * 1024
	        # The filename may be missing; fall back to a temp file with no suffix.
	        suffix = Path(file.filename or "").suffix
	        tmp_path = None
	        written = 0

	        try:
	            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
	                tmp_path = os.path.abspath(buffer.name)
	                for chunk in iter(lambda: file.file.read(chunk_size), b""):
	                    written += len(chunk)
	                    if written > Config.MAX_SIZE:
	                        raise HTTPException(413, "File too large")
	                    buffer.write(chunk)
	        except Exception:
	            # delete=False means nothing else cleans up a partial file.
	            if tmp_path is not None:
	                try:
	                    os.remove(tmp_path)
	                except OSError:
	                    pass
	            raise

	        return tmp_path

	class OCRProcessor:
	    """Runs Tesseract OCR over a single image or over each page of a PDF."""

	    @classmethod
	    def process_file(cls, file_path: str, content_type: str) -> dict:
	        """Extract text from the file at ``file_path``.

	        A ``content_type`` of ``application/pdf`` is rendered to one image per
	        page and every page is scanned; any other type is opened as one image.
	        No request ID is passed in: the logger takes it from the context variable.

	        Returns:
	            A dict with ``total_pages`` and ``pages_content``; each page entry
	            holds ``index`` (zero-based), ``page_number`` (one-based) and ``text``.

	        Raises:
	            ValueError: wraps any failure during conversion or OCR; the original
	                exception is chained as the cause.
	        """
	        start = time.perf_counter()
	        pages_content = []

	        try:
	            logger.info(f"Processing File: {file_path}")

	            if content_type == "application/pdf":
	                logger.info("Converting PDF to Images...")
	                images = convert_from_path(file_path)
	                total = len(images)

	                for page_num, img in enumerate(images, start=1):
	                    logger.info(f"Scanning Page {page_num}/{total}")
	                    text = pytesseract.image_to_string(img).strip()
	                    pages_content.append(
	                        {"index": page_num - 1, "page_number": page_num, "text": text}
	                    )
	            else:
	                logger.info("Scanning Single Image...")
	                # Context manager closes the underlying file handle after OCR.
	                with Image.open(file_path) as img:
	                    text = pytesseract.image_to_string(img).strip()
	                pages_content.append({"index": 0, "page_number": 1, "text": text})

	            logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms")
	            return {"total_pages": len(pages_content), "pages_content": pages_content}

	        except Exception as e:
	            # logger.exception records the traceback; `from e` keeps the cause chain.
	            logger.exception(f"OCR Logic Failure: {str(e)}")
	            raise ValueError(str(e)) from e

	# ==========================================
	# 4. APP & MIDDLEWARE
	# ==========================================
	# Application instance; the title comes from configuration.
	app = FastAPI(title=Config.APP_NAME)

	# CORS: use the configured origins, or allow every origin when none are configured.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=Config.ALLOWED_ORIGINS or ["*"],
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	@app.middleware("http")
	async def request_context_middleware(request: Request, call_next):
	    """Assign a request ID, expose it to logging, and time the request.

	    The ID is stored in the ``request_id_ctx`` context variable and on
	    ``request.state.request_id``, and is returned in the ``X-Request-ID``
	    response header. An exception raised further down the stack is logged and
	    turned into a JSON 500 response that carries the same header.
	    """
	    req_id = str(uuid.uuid4())
	    # Set the context variable so log records pick up the ID automatically.
	    token = request_id_ctx.set(req_id)
	    request.state.request_id = req_id

	    start_time = time.perf_counter()
	    logger.info(f"Start: {request.method} {request.url.path}")

	    try:
	        try:
	            response = await call_next(request)
	        except Exception:
	            logger.exception("Middleware caught crash")
	            response = JSONResponse(
	                status_code=500,
	                content={"status": "error", "message": "Internal Server Error", "request_id": req_id},
	            )

	        # Applied to the fallback response too, so failures stay traceable by ID.
	        duration = (time.perf_counter() - start_time) * 1000
	        response.headers["X-Request-ID"] = req_id
	        logger.info(f"Finish: {response.status_code} in {duration:.2f}ms")
	        return response
	    finally:
	        # Restore the previous context value once the request is done.
	        request_id_ctx.reset(token)

	# ==========================================
	# 5. ENDPOINTS
	# ==========================================

	@app.get("/")
	async def root(request: Request):
	    # Status probe: echoes the request ID with a fixed success payload.
	    payload = dict(
	        request_id=request.state.request_id,
	        process_time_ms=0,
	        status=StatusEnum.SUCCESS,
	        message="OCR API Active",
	    )
	    return payload

	@app.post("/api/v1/get_data", response_model=APIResponse)
	async def extract_data(
	    request: Request,
	    file: UploadFile = File(...),
	    token: str = Depends(SecurityService.validate_token)
	):
	    """Run OCR on an uploaded image or PDF and return the text of each page.

	    Failures are reported in the same JSON envelope with ``status`` set to
	    ``error``; the HTTP status is taken from the raised exception, or 500 when
	    it carries none.
	    """
	    start_ts = time.perf_counter()
	    tmp_path = None
	    req_id = request.state.request_id

	    try:
	        FileValidator.validate(file)

	        # Copying the upload to disk is blocking I/O, so it runs in the thread
	        # pool instead of stalling the event loop.
	        tmp_path = await run_in_threadpool(FileValidator.check_size_and_save, file)

	        # OCR is CPU heavy and also runs in the thread pool.
	        result = await run_in_threadpool(
	            OCRProcessor.process_file,
	            tmp_path,
	            file.content_type
	        )

	        return {
	            "request_id": req_id,
	            "process_time_ms": (time.perf_counter() - start_ts) * 1000,
	            "status": StatusEnum.SUCCESS,
	            "message": "OCR Extraction Successful",
	            "data": {
	                "filename": file.filename,
	                "content_type": file.content_type,
	                # Kept for response compatibility; the file itself is removed
	                # in the finally block below.
	                "saved_file_path": tmp_path,
	                "total_pages": result["total_pages"],
	                "pages_content": result["pages_content"]
	            }
	        }

	    except Exception as e:
	        if isinstance(e, HTTPException):
	            # Expected client-side failure (bad type, too large): no traceback needed.
	            logger.warning(f"Request failed: {str(e)}")
	        else:
	            # Unexpected failure: keep the traceback in the log.
	            logger.exception(f"Request failed: {str(e)}")

	        failure_status = getattr(e, "status_code", 500)
	        return JSONResponse(
	            status_code=failure_status,
	            content={
	                "request_id": req_id,
	                "process_time_ms": (time.perf_counter() - start_ts) * 1000,
	                "status": StatusEnum.ERROR.value,
	                "error_message": getattr(e, "detail", str(e))
	            }
	        )
	    finally:
	        # The temporary copy is only needed for the duration of the OCR run.
	        if tmp_path:
	            try:
	                os.remove(tmp_path)
	                logger.info(f"Temporary file deleted: {tmp_path}")
	            except Exception as e:
	                logger.warning(f"Failed to delete temp file: {str(e)}")