"""FastAPI service that exposes EasyOCR over HTTP for images and PDF files."""

import io
import logging
import os
import warnings
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional

import easyocr
import numpy as np
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image

logger = logging.getLogger(__name__)

# =========================
# EasyOCR config
# =========================
warnings.filterwarnings("ignore", message=".*pin_memory.*")

MODEL_DIR = "/app/.EasyOCR"
USER_NET_DIR = os.path.join(MODEL_DIR, "user_network")
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(USER_NET_DIR, exist_ok=True)

# Rendering resolution for PDF pages (kept low for speed).
PDF_RENDER_DPI = 150
# Upper bound on parallel OCR worker processes for multi-page PDFs.
MAX_OCR_WORKERS = 4
# PIL modes whose numpy form is passed to EasyOCR unchanged (2-D grayscale,
# 3-channel, 4-channel). Every other mode is converted to RGB first.
_PASSTHROUGH_MODES = frozenset({"L", "RGB", "RGBA"})

# Preload the reader once at import time (CPU only).
# download_enabled=False: the model files are expected to already be present
# under MODEL_DIR; nothing is fetched at runtime.
reader = easyocr.Reader(
    ["en", "hi"],
    model_storage_directory=MODEL_DIR,
    user_network_directory=USER_NET_DIR,
    download_enabled=False,
    gpu=False,
)

# =========================
# FastAPI app
# =========================
app = FastAPI()


@app.get("/")
async def root():
    """Health-check endpoint; returns a static status message."""
    # NOTE(review): the trailing characters look like a mis-decoded emoji.
    # Left untouched because this string is runtime output.
    return {"message": "OCR API is running on Hugging Face πŸš€"}


def run_ocr_on_image(image_bytes: bytes) -> str:
    """Run OCR on one encoded image and return the recognised text.

    Args:
        image_bytes: Raw bytes of an image file in any format PIL can open.

    Returns:
        All recognised text fragments joined by single spaces, stripped.

    Raises:
        PIL.UnidentifiedImageError: If the bytes are not a readable image.
    """
    with Image.open(io.BytesIO(image_bytes)) as image:
        # Palette, bilevel, CMYK, "LA", ... images produce arrays (index
        # maps, bool arrays, 2-channel data) that are not plain pixel data,
        # so normalise them to RGB before handing them to EasyOCR.
        if image.mode in _PASSTHROUGH_MODES:
            pixels = image
        else:
            pixels = image.convert("RGB")
        # Materialise the array while the underlying file is still open.
        image_np = np.array(pixels)

    results = reader.readtext(image_np)
    return " ".join(str(text) for _, text, _ in results).strip()


def _is_pdf(filename: Optional[str], contents: bytes) -> bool:
    """Return True if the upload is a PDF, judged by extension or magic bytes."""
    # UploadFile.filename may be None when the client sends no filename.
    name = (filename or "").lower()
    return name.endswith(".pdf") or contents.startswith(b"%PDF-")


def _encode_png(page) -> bytes:
    """Serialise a rendered PDF page (PIL image) to PNG bytes."""
    buf = io.BytesIO()
    page.save(buf, format="PNG")
    return buf.getvalue()


def _ocr_pdf(pdf_bytes: bytes) -> List[Dict[str, object]]:
    """OCR every page of a PDF and return ``[{"page": n, "text": ...}, ...]``.

    Pages are numbered from 1 in document order.
    """
    # Convert PDF -> images (lower DPI for speed).
    pages = convert_from_bytes(pdf_bytes, dpi=PDF_RENDER_DPI)

    # Convert each page to raw bytes so workers don't share PIL objects.
    page_bytes = [_encode_png(page) for page in pages]

    if len(page_bytes) <= 1:
        # Starting worker processes for zero or one page is pure overhead.
        page_texts = [run_ocr_on_image(data) for data in page_bytes]
    else:
        # Multiprocessing (parallel CPU workers), never more than pages.
        workers = min(MAX_OCR_WORKERS, len(page_bytes))
        with ProcessPoolExecutor(max_workers=workers) as executor:
            page_texts = list(executor.map(run_ocr_on_image, page_bytes))

    return [
        {"page": number, "text": text}
        for number, text in enumerate(page_texts, start=1)
    ]


@app.post("/ocr")
async def ocr(file: UploadFile = File(...)):
    """OCR an uploaded image or PDF.

    Returns:
        ``{"pdf_results": [{"page": n, "text": ...}, ...]}`` for PDFs,
        ``{"text": ...}`` for single images, or ``{"error": ...}`` with
        HTTP status 500 if anything fails.
    """
    # NOTE(review): the OCR work below runs synchronously inside this
    # coroutine, so the event loop is blocked for the duration of a request.
    # Offloading it to a thread would introduce concurrent use of the shared
    # `reader` and process creation from a non-main thread; confirm both are
    # safe before changing the execution model.
    try:
        contents = await file.read()

        if _is_pdf(file.filename, contents):
            return JSONResponse(content={"pdf_results": _ocr_pdf(contents)})

        # Single image case.
        return JSONResponse(content={"text": run_ocr_on_image(contents)})
    except Exception as e:
        # Top-level boundary: record the traceback server-side, then report
        # the failure to the client in the same shape as before.
        logger.exception("OCR request failed")
        return JSONResponse(content={"error": str(e)}, status_code=500)