"""FastAPI service that runs PaddleOCR over uploaded PDFs and images.

PDF pages are rasterized with PyMuPDF, every image is downscaled so its
longest side is at most ``MAX_DIMENSION`` pixels, and the recognized text
lines are returned together with their confidence scores.
"""

import os
import uuid
from typing import List

import fitz
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from PIL import Image

# -------------------------------------------------------------------
# FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
# -------------------------------------------------------------------
os.environ["PADDLE_HOME"] = "/app/paddle_home"
os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"

os.makedirs("/app/paddle_home", exist_ok=True)
os.makedirs("/app/xdg_cache", exist_ok=True)

# now safe to import paddlex/paddleocr (must stay below the env setup above)
from paddleocr import PaddleOCR  # noqa: E402

# -------------------------------------------------------------------
# CONFIGURATION
# -------------------------------------------------------------------
MAX_DIMENSION = 1024  # Max width or height for OCR processing
PDF_DPI = 150  # Lower DPI = faster (was 220)

_PDF_SUFFIX = ".pdf"
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")
_UPLOAD_CHUNK_SIZE = 1024 * 1024  # bytes copied per read when saving an upload


def _remove_if_exists(path: str) -> None:
    """Delete *path*, doing nothing if the file is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


# -------------------------------------------------------------------
# IMAGE OPTIMIZATION
# -------------------------------------------------------------------
def optimize_image_for_ocr(input_path: str, output_path: str) -> str:
    """Write an RGB JPEG copy of an image, downscaled if it is too large.

    The image is converted to RGB when it is in any other mode. If its width
    or height exceeds ``MAX_DIMENSION``, it is resized so the longer side
    equals ``MAX_DIMENSION`` while keeping the aspect ratio.

    Args:
        input_path: Path of the image to read.
        output_path: Path the JPEG (quality 85) is written to.

    Returns:
        ``output_path``.
    """
    with Image.open(input_path) as img:
        if img.mode != "RGB":
            img = img.convert("RGB")

        width, height = img.size

        # Only resize if larger than MAX_DIMENSION
        if width > MAX_DIMENSION or height > MAX_DIMENSION:
            if width > height:
                new_width = MAX_DIMENSION
                # Clamp to 1 px: a very elongated image would otherwise round
                # its short side down to 0, which resize() rejects.
                new_height = max(1, int(height * (MAX_DIMENSION / width)))
            else:
                new_height = MAX_DIMENSION
                new_width = max(1, int(width * (MAX_DIMENSION / height)))
            img = img.resize((new_width, new_height), Image.LANCZOS)

        img.save(output_path, "JPEG", quality=85)

    return output_path


# -------------------------------------------------------------------
# PDF → IMAGE (optimized)
# -------------------------------------------------------------------
def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
    """Render the leading pages of a PDF to optimized JPEG files.

    Args:
        pdf_path: Path of the PDF to render.
        max_pages: Maximum number of pages to render, starting from the first
            page. ``None`` renders every page.

    Returns:
        Paths of the generated JPEGs under ``/app/pdf_images``, in page order.
        The caller is responsible for deleting them.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(pdf_path)

    out_dir = "/app/pdf_images"
    os.makedirs(out_dir, exist_ok=True)

    output_paths: List[str] = []
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        limit = page_count if max_pages is None else min(max_pages, page_count)

        for i in range(limit):
            page = doc.load_page(i)
            pix = page.get_pixmap(dpi=PDF_DPI)  # Lower DPI for speed

            img_path = os.path.join(out_dir, f"{uuid.uuid4()}.jpg")
            temp_path = img_path + ".tmp.jpg"
            # Track the path before writing so the failure cleanup below also
            # covers a partially written output file.
            output_paths.append(img_path)

            try:
                pix.save(temp_path)
                # Optimize (resize if needed)
                optimize_image_for_ocr(temp_path, img_path)
            finally:
                _remove_if_exists(temp_path)
    except Exception:
        # Nothing is returned on failure, so nobody else could clean these up.
        for path in output_paths:
            _remove_if_exists(path)
        raise
    finally:
        doc.close()

    return output_paths


# -------------------------------------------------------------------
# OCR ENGINE
# -------------------------------------------------------------------
ocr_engine = PaddleOCR(
    lang="mr",
    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)


def extract_text(image_path: str) -> List[dict]:
    """Run the shared OCR engine on one image.

    Args:
        image_path: Path of the image to recognize.

    Returns:
        One ``{"text": ..., "confidence": ...}`` dict per recognized text
        entry, in the order the engine reports them.
    """
    result = ocr_engine.predict(input=image_path)

    output: List[dict] = []
    for block in result:
        output.extend(
            {"text": text, "confidence": float(score)}
            for text, score in zip(block["rec_texts"], block["rec_scores"])
        )
    return output


# -------------------------------------------------------------------
# FASTAPI
# -------------------------------------------------------------------
app = FastAPI()

UPLOAD_DIR = "/app/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)


def _supported_suffix(filename: str) -> str | None:
    """Return the supported extension *filename* ends with, or ``None``."""
    for suffix in (_PDF_SUFFIX, *_IMAGE_SUFFIXES):
        if filename.endswith(suffix):
            return suffix
    return None


async def _save_upload(file: UploadFile, dest_path: str) -> None:
    """Copy an upload to *dest_path* in chunks instead of buffering it whole."""
    with open(dest_path, "wb") as out:
        while chunk := await file.read(_UPLOAD_CHUNK_SIZE):
            out.write(chunk)


def _ocr_pdf(pdf_path: str, max_pages: int | None) -> List[dict]:
    """OCR the rendered pages of a PDF and delete the rendered images."""
    img_paths = pdf_to_images(pdf_path, max_pages=max_pages)
    try:
        return [
            {"page_index": page_idx, "results": extract_text(img_path)}
            for page_idx, img_path in enumerate(img_paths)
        ]
    finally:
        # Cleanup processed images, including when OCR fails part-way through
        for img_path in img_paths:
            _remove_if_exists(img_path)


def _ocr_image(image_path: str) -> dict:
    """OCR a single image via a temporary optimized copy."""
    optimized_path = os.path.join(UPLOAD_DIR, f"opt_{uuid.uuid4()}.jpg")
    try:
        # Optimize image before OCR
        optimize_image_for_ocr(image_path, optimized_path)
        return {"page_index": 0, "results": extract_text(optimized_path)}
    finally:
        _remove_if_exists(optimized_path)


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """OCR up to 15 uploaded PDF / JPEG / PNG files.

    Args:
        files: Uploaded files. The type is decided from the lower-cased
            filename extension (``.pdf``, ``.jpg``, ``.jpeg``, ``.png``).
        max_pages: Maximum number of pages processed per PDF; ``None``
            processes every page.

    Returns:
        A JSON body ``{"files": [...]}`` with one record per upload holding
        ``file_id``, the lower-cased ``filename`` and a ``pages`` list of
        ``{"page_index", "results"}`` entries.

    Raises:
        HTTPException: 400 if more than 15 files are sent, or if any file has
            an unsupported (or missing) filename extension.

    NOTE(review): rendering and OCR run synchronously inside this coroutine,
    so the event loop is blocked while a request is processed. Offloading to a
    worker thread would first need confirmation that the shared PaddleOCR
    engine is safe to call concurrently.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    # Validate every upload before touching the disk or running OCR, so a bad
    # file neither leaves a stray temp file nor wastes work on earlier files.
    uploads = []
    for file in files:
        filename = (file.filename or "").lower()
        suffix = _supported_suffix(filename)
        if suffix is None:
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        uploads.append((file, filename, suffix))

    structured_output = {"files": []}

    for index, (file, filename, suffix) in enumerate(uploads, start=1):
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}{suffix}")
        try:
            await _save_upload(file, temp_path)
            if suffix == _PDF_SUFFIX:
                pages = _ocr_pdf(temp_path, max_pages)
            else:
                pages = [_ocr_image(temp_path)]
        finally:
            # Cleanup uploaded file, whether or not processing succeeded
            _remove_if_exists(temp_path)

        structured_output["files"].append({
            "file_id": f"file_{index}",
            "filename": filename,
            "pages": pages,
        })

    return JSONResponse(structured_output)