Spaces:
Running
Running
| """ | |
Receipt OCR Service — FastAPI entry point.

Endpoints:
    POST /ocr     Upload a receipt image, get structured JSON back.
    GET  /health  Liveness check.

Usage:
    uvicorn main:app --host 0.0.0.0 --port 8000 --reload
| """ | |
| import asyncio | |
| import logging | |
| from contextlib import asynccontextmanager | |
| from fastapi import FastAPI, File, HTTPException, Query, UploadFile | |
| from fastapi.responses import JSONResponse | |
| from ocr.reader import PaddleOCRReader | |
| from ocr.parser import parse_blocks | |
| from utils.image_prep import preprocess_image | |
# Root logging is configured once, at import time, at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Process-wide OCR reader. ``lifespan`` assigns it on startup and resets it to
# None on shutdown, so request handlers must treat None as "not ready".
_reader: PaddleOCRReader | None = None
async def _background_warmup():
    """Run one OCR pass on a blank image to exercise the inference engine.

    Scheduled from ``lifespan`` as a background task. The OCR call is
    blocking, so it is executed in a worker thread to keep the event loop
    free. Warm-up is best-effort: any failure is logged as a warning and
    swallowed so it can never take the server down.
    """
    import numpy as np

    # Snapshot the global. The worker thread may only start after shutdown
    # has reset ``_reader`` to None, and a late lookup would then fail.
    reader = _reader
    if reader is None:
        logger.warning("Background warm-up skipped: OCR reader not loaded.")
        return

    logger.info("Background warm-up starting...")
    # All-white 600x200 RGB image (uint8).
    blank = np.full((200, 600, 3), 255, dtype=np.uint8)
    try:
        # to_thread() uses the running loop's default executor; unlike
        # get_event_loop(), it cannot pick up or create the wrong loop.
        await asyncio.to_thread(reader.extract, blank)
    except Exception as exc:
        # Deliberately broad: warm-up must stay non-fatal. Task cancellation
        # (CancelledError) is not an Exception subclass and still propagates.
        logger.warning("Background warm-up failed (non-fatal): %s", exc)
    else:
        logger.info("Background warm-up complete.")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the shared OCR reader for the lifetime of the application.

    Startup: build the ``PaddleOCRReader`` and schedule a background warm-up.
    Shutdown: cancel the warm-up if it is still pending and drop the reader.
    """
    global _reader
    _reader = PaddleOCRReader()
    # Run warm-up in background so the server starts accepting connections
    # immediately. First real request after restart may still be slow (JIT),
    # but the server won't appear unresponsive during startup.
    #
    # Keep a reference to the task: the event loop only holds weak references
    # to tasks, so an unreferenced one may be garbage-collected mid-flight.
    warmup_task = asyncio.create_task(_background_warmup())
    try:
        yield
    finally:
        # cancel() is a no-op when the task has already finished.
        warmup_task.cancel()
        _reader = None
# ASGI application object. ``lifespan`` is handed to FastAPI so that the OCR
# reader is created on startup and released on shutdown.
app = FastAPI(
    lifespan=lifespan,
    title="Receipt OCR API",
    version="1.0.0",
    description="Extract structured data from receipt images using PaddleOCR.",
)
# Registered at GET /health, the path documented in the module docstring.
@app.get("/health")
async def health():
    """Liveness check. Also reports whether the OCR reader has been loaded."""
    return {"status": "ok", "model_loaded": _reader is not None}
# NOTE(review): no route registration for this handler is visible in this
# file, and the module docstring lists only /ocr and /health. Confirm how the
# keepalive cron is meant to reach it before relying on it.
async def warmup():
    """Run OCR on a blank image to exercise the inference engine. Used by keepalive cron."""
    # Snapshot the global so a concurrent shutdown cannot swap it to None
    # between the readiness check and the call below.
    reader = _reader
    if reader is None:
        raise HTTPException(status_code=503, detail="OCR model not loaded yet")
    import numpy as np

    # All-white 600x200 RGB image (uint8).
    blank = np.full((200, 600, 3), 255, dtype=np.uint8)

    # extract() is blocking, so it runs in a worker thread to keep the event
    # loop responsive. The lock, created lazily on ``app.state``, keeps
    # handler-driven inference one-at-a-time, as it was when this call ran
    # directly on the event loop.
    lock = getattr(app.state, "ocr_lock", None)
    if lock is None:
        lock = app.state.ocr_lock = asyncio.Lock()
    async with lock:
        await asyncio.to_thread(reader.extract, blank)
    return {"status": "warm"}
# Registered at POST /ocr, the path documented in the module docstring.
@app.post("/ocr")
async def ocr_receipt(
    file: UploadFile = File(..., description="Receipt image (JPEG, PNG, etc.)"),
    debug: bool = Query(False, description="Include raw OCR blocks in response"),
):
    """
    Process a receipt image and return structured JSON.

    - **file**: multipart image upload
    - **debug**: if true, includes raw OCR text blocks in the response

    Error responses:

    - **400**: empty upload, or image preprocessing failed
    - **415**: content type is present but not `image/*`
    - **500**: OCR extraction failed
    - **503**: OCR model not loaded yet
    """
    import time

    # Snapshot the global: this handler awaits several times, and shutdown
    # may reset ``_reader`` to None in between.
    reader = _reader
    if reader is None:
        raise HTTPException(status_code=503, detail="OCR model not loaded yet")

    # Validate content type loosely: a missing or empty content type is
    # allowed through; only an explicit non-image type is rejected.
    content_type = file.content_type or ""
    if content_type and not content_type.startswith("image/"):
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported media type: {content_type}. Upload an image file.",
        )

    t0 = time.perf_counter()
    raw_bytes = await file.read()
    if not raw_bytes:
        raise HTTPException(status_code=400, detail="Empty file uploaded")

    # Preprocessing and OCR are blocking, so both run in worker threads to
    # keep the event loop (and other endpoints) responsive while a receipt
    # is being processed.
    try:
        image = await asyncio.to_thread(preprocess_image, raw_bytes)
    except (ValueError, FileNotFoundError) as exc:
        raise HTTPException(
            status_code=400, detail=f"Image preprocessing failed: {exc}"
        ) from exc
    t1 = time.perf_counter()
    logger.info(
        "TIMING preprocess: %.2fs | image size: %sx%s | upload: %sKB",
        t1 - t0,
        image.shape[1],
        image.shape[0],
        len(raw_bytes) // 1024,
    )

    # The lock, created lazily on ``app.state``, keeps handler-driven
    # inference one-at-a-time, as it was when extract() ran directly on the
    # event loop. Time spent waiting for the lock is included in the
    # "ocr extract" figure below.
    lock = getattr(app.state, "ocr_lock", None)
    if lock is None:
        lock = app.state.ocr_lock = asyncio.Lock()
    try:
        async with lock:
            blocks = await asyncio.to_thread(reader.extract, image)
    except Exception as exc:
        # Broad on purpose: any OCR backend failure becomes a 500, with the
        # full traceback kept in the server log.
        logger.exception("OCR extraction failed")
        raise HTTPException(status_code=500, detail=f"OCR failed: {exc}") from exc
    t2 = time.perf_counter()
    logger.info(
        "TIMING ocr extract: %.2fs | blocks found: %s", t2 - t1, len(blocks)
    )

    result = parse_blocks(blocks)
    t3 = time.perf_counter()
    logger.info(
        "TIMING parse: %.2fs | items: %s | total: %.2fs",
        t3 - t2,
        len(result.get("line_items", [])),
        t3 - t0,
    )

    if debug:
        result["_raw_blocks"] = blocks
    return JSONResponse(content=result)