Spaces:
Sleeping
Sleeping
| """ | |
OpenSkill OCR Service — v4.0
FastAPI application for Hugging Face Docker Space (CPU / pipeline backend)

───────────────────────────────────────────────────────────────────────────────
ARCHITECTURE (v4.0 — OCR-only, AI-first)
───────────────────────────────────────────────────────────────────────────────
This service is an extraction layer only. It does NOT:
  - classify documents
  - extract named entities
  - validate fields
  - generate summaries
  - perform board/marksheet/JEE-specific logic
All document understanding is delegated to the AI layer downstream.

PATH A — Fast OCR (images: jpg / png / webp / bmp / heic / heif / avif)
  Engine  : rapidocr-onnxruntime ≥ 1.3.22
  Models  : Bundled in pip wheel — zero first-use download, ~50 MB
  Resize  : images capped at MAX_OCR_SIDE px (default 1600) before inference
  Target  : 1–4 s (acceptable < 8 s)
  Fallback: if confidence < FAST_CONFIDENCE_THRESHOLD → MinerU fallback

PATH B — Full pipeline (PDFs, multi-page, layout-sensitive docs)
  Engine  : MinerU magic-pdf pipeline backend
  Models  : opendatalab/PDF-Extract-Kit-1.0 (downloaded at build time)
  Target  : 5–20 s (acceptable < 30 s)

───────────────────────────────────────────────────────────────────────────────
RESPONSE FORMAT (v4.0)
───────────────────────────────────────────────────────────────────────────────
{
  "success": true,
  "filename": "scan.jpg",
  "engine": "rapidocr",
  "confidence": 0.91,
  "text": "...",
  "markdown": "...",
  "pageCount": 1,
  "cached": false,
  "processingTimeMs": 1840,
  "timings": {
    "uploadMs": 12,
    "hashMs": 4,
    "memCheckMs": 8,
    "decodeMs": 55,
    "resizeMs": 18,
    "detectMs": 610,
    "recognizeMs": 980,
    "postProcessMs": 14,
    "totalMs": 1840
  },
  "metadata": {
    "imgW": 3024,
    "imgH": 4032,
    "imgWResized": 1200,
    "imgHResized": 1600,
    "textBlocks": 47,
    "passesUsed": 1,
    "backend": "rapidocr"
  }
}

───────────────────────────────────────────────────────────────────────────────
API ENDPOINTS
───────────────────────────────────────────────────────────────────────────────
GET  /health       Liveness (always fast)
GET  /status       Node status: memory, uptime, cache, engine state
GET  /warmup       Pre-load both OCR engines (also called at startup)
GET  /diagnostics  Full environment + model inventory
POST /benchmark    Multi-size RapidOCR timing benchmark (small/medium/large)
POST /extract      Single file — PDF or image — with SHA256 cache
POST /batch        Up to 8 files, sequential, per-file error isolation
| """ | |
| import hashlib | |
| import io | |
| import os | |
| import re | |
| import shutil | |
| import sys | |
| import tempfile | |
| import threading | |
| import time | |
| import traceback | |
| import logging | |
| from importlib.metadata import version as pkg_version | |
| from typing import Any, Optional | |
| import fitz # PyMuPDF | |
| import numpy as np | |
| from PIL import Image | |
| from fastapi import FastAPI, File, UploadFile | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import JSONResponse | |
| # ββ Logging βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
)
logger = logging.getLogger("ocr-service")

# ── Start time ────────────────────────────────────────────────────────────────
# Wall-clock time at import; used to report uptime.
_START_TIME: float = time.time()

# ── Upload / batch limits ─────────────────────────────────────────────────────
MAX_UPLOAD_BYTES = 30 * 1024 * 1024 # 30 MB
BATCH_MAX_FILES = 8

# ── File type sets ────────────────────────────────────────────────────────────
# Extensions are compared lower-case and without the leading dot.
PDF_EXTENSIONS = {"pdf"}
NATIVE_IMAGE_EXTENSIONS = {"jpg", "jpeg", "png"}
PILLOW_IMAGE_EXTENSIONS = {"webp", "bmp", "tiff", "tif", "gif", "heic", "heif", "avif"}
IMAGE_EXTENSIONS = NATIVE_IMAGE_EXTENSIONS | PILLOW_IMAGE_EXTENSIONS
OFFICE_EXTENSIONS = {"docx", "pptx", "xlsx"}
ALLOWED_EXTENSIONS = PDF_EXTENSIONS | IMAGE_EXTENSIONS | OFFICE_EXTENSIONS

# ── OCR tuning ────────────────────────────────────────────────────────────────
FAST_CONFIDENCE_THRESHOLD = 0.65 # below this -> MinerU fallback
MAX_OCR_SIDE = 1600 # pixels - longest side cap before OCR
# General-purpose safe value. Lowering to 1280 gains ~20%
# speed but risks losing small text in UI/code screenshots:
# a 1913px-wide screen at 1280px canvas -> 11 px fonts scale
# to ~8 px, which is the CRNN recognition floor.
# Performance table (119 blocks, measured calibration 967 ms/batch):
#   1600 px / batch=6  (pre-optimisation):   ~19 300 ms
#   1600 px / batch=24 (v4.1, this build):   ~4 800 ms (-75%)
#   1280 px / batch=24 (marksheet-only):     ~3 900 ms (-80%)
# Set to 1280 only if all inputs are printed A4 documents.
REC_BATCH_NUM = 24 # recognition batch size (default in RapidOCR wheel: 6)
# Higher -> fewer sequential ONNX calls -> faster.
# 119 blocks / 6 = 20 calls -> 119 / 24 = 5 calls
# Accuracy impact: NONE - same model, same crops, same CTC decode.
# Memory impact: negligible on 16 GB HF free tier.
DET_BOX_THRESH = 0.50 # detection confidence threshold (RapidOCR default: 0.50)
# Keep at 0.50 for general-purpose use. Raising to 0.60 drops
# ~15% of blocks (noise) and saves one ONNX call on dense docs,
# but risks missing low-contrast text in UI/code screenshots
# (dark-background text can score in the 0.50-0.65 range).
# Safe to raise to 0.60-0.65 only for printed-document pipelines.

# ── Memory safety ─────────────────────────────────────────────────────────────
BYTES_PER_OCR_PAGE = 100 * 1024 * 1024
IMAGE_MEMORY_FACTOR = 4
# 100 MB floor - was 1024. psutil reads HOST RAM on HF Spaces (not the
# container cgroup), so the floor must be small enough to pass on a busy
# host that has only a few hundred MB of host-level free memory. The
# per-file estimate already encodes the request's working-memory cost;
# this floor is purely a last-resort guard against near-empty headroom.
MEM_SAFETY_FLOOR_MB = 100

# ── SHA256 extraction cache ───────────────────────────────────────────────────
# Maps the SHA-256 hex digest of an upload to its stored extraction result.
# Guard every access with _cache_lock.
_cache: dict[str, dict[str, Any]] = {}
_cache_lock = threading.Lock()

# ── Active-request counter ────────────────────────────────────────────────────
# Number of extractions currently in progress; mutate under _active_lock.
_active_requests: int = 0
_active_lock = threading.Lock()

# ── Engine state ──────────────────────────────────────────────────────────────
# Lazily created RapidOCR singleton plus its load time and readiness flag.
_rapidocr_engine: Any = None
_rapidocr_lock = threading.Lock()
_rapidocr_load_ms: int = 0
_rapidocr_ready: bool = False
# Readiness flag and import time for the MinerU (magic_pdf) pipeline.
_pipeline_ready: bool = False
_pipeline_lock = threading.Lock()
_pipeline_load_ms: int = 0

# ── Startup issues ────────────────────────────────────────────────────────────
# Human-readable problems collected during startup; a non-empty list makes
# the status report "degraded".
_startup_issues: list[str] = []
_startup_done: bool = False
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Structured error | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
class ExtractionError(Exception):
    """Failure raised by an extraction stage, carrying a structured payload.

    Attributes:
        stage: Pipeline stage that failed (e.g. "upload", "ocr").
        code: Machine-readable error code.
        message: Human-readable description; also the exception text.
        http_status: HTTP status the API layer should answer with.
        root_cause: Underlying cause; falls back to ``message`` when empty.
        recommendation: Suggested remedy for the caller or operator.
    """

    def __init__(
        self,
        stage: str,
        code: str,
        message: str,
        http_status: int = 422,
        root_cause: str = "",
        recommendation: str = "",
    ) -> None:
        super().__init__(message)
        self.stage = stage
        self.code = code
        self.message = message
        self.http_status = http_status
        # An empty root cause is replaced by the message itself.
        self.root_cause = root_cause if root_cause else message
        self.recommendation = recommendation

    def to_dict(self) -> dict[str, Any]:
        """Return the JSON-serialisable error body sent to clients."""
        fields = (
            ("success", False),
            ("stage", self.stage),
            ("errorCode", self.code),
            ("rootCause", self.root_cause),
            ("recommendation", self.recommendation),
            ("message", self.message),
        )
        return dict(fields)
def _err(
    stage: str,
    code: str,
    msg: str,
    status: int = 422,
    root_cause: str = "",
    recommendation: str = "",
) -> ExtractionError:
    """Build (but do not raise) an ExtractionError from short argument names.

    Callers write ``raise _err(...)`` so the raise site stays visible.
    """
    return ExtractionError(
        stage=stage,
        code=code,
        message=msg,
        http_status=status,
        root_cause=root_cause,
        recommendation=recommendation,
    )
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Active-request helpers | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _inc_active() -> None:
    """Record that one more extraction is in progress."""
    global _active_requests
    with _active_lock:
        _active_requests = _active_requests + 1
def _dec_active() -> None:
    """Record that one extraction finished; the counter never drops below zero."""
    global _active_requests
    with _active_lock:
        # Same result as max(0, n - 1): anything at or below 1 collapses to 0.
        _active_requests = _active_requests - 1 if _active_requests > 1 else 0
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Engine loaders | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _ensure_rapidocr() -> Any:
    """Return the shared RapidOCR engine, constructing it on first use.

    The readiness flag is checked once without the lock (fast path) and
    again under ``_rapidocr_lock`` before construction.

    Raises:
        ExtractionError: ``RAPIDOCR_LOAD_FAILED`` (HTTP 503) if the import or
            the engine construction raises.
    """
    global _rapidocr_engine, _rapidocr_ready, _rapidocr_load_ms

    if not _rapidocr_ready:
        with _rapidocr_lock:
            if not _rapidocr_ready:
                engine_options = {
                    "det_limit_side_len": MAX_OCR_SIDE,
                    "det_limit_type": "max",
                    # Recognition batch size: the wheel default is 6; 24 cuts
                    # the number of ONNX calls roughly 4x on typical documents
                    # (76 blocks -> 4 calls instead of 13). Same CRNN model,
                    # same crops, same CTC decode, so accuracy is unaffected.
                    "rec_batch_num": REC_BATCH_NUM,
                    # Angle classifier disabled: it runs an ONNX pass on every
                    # crop to detect 180-degree rotation, which is pure
                    # overhead for straight document scans (about 1 300 ms on
                    # 119 blocks). Re-enable if upside-down images arrive.
                    "use_cls": False,
                }
                started = time.perf_counter()
                try:
                    from rapidocr_onnxruntime import RapidOCR

                    _rapidocr_engine = RapidOCR(**engine_options)
                    elapsed = time.perf_counter() - started
                    _rapidocr_load_ms = int(elapsed * 1000)
                    # Publish readiness only after the engine is assigned.
                    _rapidocr_ready = True
                    logger.info("RapidOCR engine ready load_ms=%d", _rapidocr_load_ms)
                except Exception as exc:
                    raise _err(
                        "model_load", "RAPIDOCR_LOAD_FAILED",
                        f"RapidOCR failed to load: {exc}", 503,
                        root_cause=str(exc),
                        recommendation="Check that rapidocr-onnxruntime is installed.",
                    ) from exc

    return _rapidocr_engine
def _ensure_pipeline() -> None:
    """Verify once that the MinerU pipeline is usable.

    Checks that ``~/magic-pdf.json`` exists and that the ``magic_pdf`` dataset
    and reader/writer modules import, then marks the pipeline ready.

    Raises:
        ExtractionError: ``CONFIG_MISSING`` or ``IMPORT_FAILED`` (both HTTP 503).
    """
    global _pipeline_ready, _pipeline_load_ms

    if not _pipeline_ready:
        with _pipeline_lock:
            # Re-check under the lock: another thread may have finished first.
            if not _pipeline_ready:
                config_path = os.path.expanduser("~/magic-pdf.json")
                config_present = os.path.exists(config_path)
                if not config_present:
                    raise _err(
                        "model_load", "CONFIG_MISSING",
                        f"magic-pdf.json not found at {config_path}.", 503,
                        root_cause="download_models.py did not run or /root was wiped.",
                        recommendation="Check Docker build log for download_models.py output.",
                    )

                started = time.perf_counter()
                try:
                    # Imported only to prove the modules load; names are unused.
                    from magic_pdf.data.dataset import PymuDocDataset, ImageDataset  # noqa
                    from magic_pdf.data.data_reader_writer import (  # noqa
                        FileBasedDataReader, FileBasedDataWriter)
                except ImportError as exc:
                    raise _err(
                        "model_load", "IMPORT_FAILED",
                        f"magic_pdf not importable: {exc}", 503,
                        root_cause=str(exc),
                        recommendation="Check that magic-pdf[full]==1.3.12 is installed.",
                    ) from exc

                elapsed = time.perf_counter() - started
                _pipeline_load_ms = int(elapsed * 1000)
                _pipeline_ready = True
                logger.info("MinerU pipeline ready load_ms=%d", _pipeline_load_ms)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # FastAPI app | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# FastAPI application instance; title, description and version are the
# metadata FastAPI publishes for the API.
app = FastAPI(
    title="OpenSkill OCR Service",
    description="OCR-only text extraction. Document understanding is handled by the AI layer.",
    version="4.0.0",
)
# CORS: any origin and any request header are accepted, for GET and POST only.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Startup β pre-load RapidOCR so first request has zero cold-start cost | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
async def startup_warmup() -> None:
    """Smoke-check dependencies and pre-load the RapidOCR engine.

    Without the pre-load, the first /extract request pays 600-2 500 ms for
    ONNX model loading on top of normal inference time. Loading here moves
    that cost to startup.

    Every problem found is logged and appended to ``_startup_issues``;
    nothing is raised. ``_startup_done`` is set to True at the end either way.
    """
    global _startup_done
    issues: list[str] = []

    def _magic_pdf_version() -> str:
        # Import to prove the package loads, but take the version from the
        # distribution metadata: the package is not guaranteed to expose
        # ``__version__``, and an AttributeError here would wrongly report a
        # working install as unavailable.
        __import__("magic_pdf")
        return pkg_version("magic-pdf")

    # ── Dependency smoke-check ────────────────────────────────────────────────
    checks = [
        ("cv2", lambda: __import__("cv2").__version__),
        ("torch", lambda: __import__("torch").__version__),
        ("rapidocr", lambda: pkg_version("rapidocr-onnxruntime")),
        ("magic_pdf", _magic_pdf_version),
    ]
    for name, fn in checks:
        try:
            ver = fn()
            # The original status glyph was lost to mis-encoding; plain ASCII
            # is used so the line survives any log pipeline.
            logger.info("startup OK   %-12s %s", name, ver)
        except Exception as exc:
            msg = f"{name} unavailable: {exc}"
            issues.append(msg)
            logger.critical("startup FAIL %s", msg)

    # ── Files the MinerU pipeline needs ───────────────────────────────────────
    if not os.path.exists(os.path.expanduser("~/magic-pdf.json")):
        issues.append("magic-pdf.json missing")
    if not os.path.isdir("/app/models/PDF-Extract-Kit-1.0/models"):
        issues.append("Models directory missing: /app/models/PDF-Extract-Kit-1.0/models")

    # ── Pre-load RapidOCR ─────────────────────────────────────────────────────
    try:
        _ensure_rapidocr()
        logger.info("startup: RapidOCR pre-loaded load_ms=%d", _rapidocr_load_ms)
    except Exception as exc:
        msg = f"RapidOCR warmup failed: {exc}"
        issues.append(msg)
        logger.error("startup: %s", msg)

    _startup_issues.extend(issues)
    _startup_done = True
    if issues:
        logger.error("Startup completed with %d issue(s): %s", len(issues), issues)
    else:
        logger.info("Startup complete \u2014 all systems ready.")
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GET /health | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def health() -> dict[str, Any]:
    """Liveness probe: a constant payload with no I/O and no engine access."""
    payload: dict[str, Any] = dict(status="healthy", version="4.0.0")
    return payload
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GET /status | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def status() -> dict[str, Any]:
    """Report node status: engine readiness, config, memory, uptime, cache size.

    ``status`` is "degraded" when any startup issue was recorded, otherwise
    "healthy". Memory figures come from ``_mem_mb()``.
    """
    used_mb, total_mb = _mem_mb()
    return {
        "status": "healthy" if not _startup_issues else "degraded",
        "version": "4.0.0",
        "architecture": "ocr-only",
        "engines": {
            "rapidocr": {
                "ready": _rapidocr_ready,
                "loadMs": _rapidocr_load_ms,
                # En dash written as an escape so the emitted value cannot be
                # corrupted by a source-encoding round trip.
                "purpose": "images (1\u20134 s)",
            },
            "mineru": {
                "ready": _pipeline_ready,
                "loadMs": _pipeline_load_ms,
                "purpose": "PDFs + fallback",
            },
        },
        "config": {
            "maxOcrSidePx": MAX_OCR_SIDE,
            "confidenceThreshold": FAST_CONFIDENCE_THRESHOLD,
            "maxUploadMb": MAX_UPLOAD_BYTES // (1024 * 1024),
        },
        "startupIssues": _startup_issues,
        "uptimeSeconds": int(time.time() - _START_TIME),
        "memoryUsedMB": used_mb,
        "memoryTotalMB": total_mb,
        "activeRequests": _active_requests,
        "cacheEntries": len(_cache),
    }
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GET /warmup | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def warmup() -> dict[str, Any]:
    """Load both engines on demand and report the outcome of each.

    Safe to call repeatedly: both loaders return immediately once their
    engine is ready. A loader failure is reported in the payload, not raised.
    """
    report: dict[str, Any] = {}
    started = time.perf_counter()

    # (payload key, loader, reader for the load time recorded by the loader)
    loaders = (
        ("rapidocr", _ensure_rapidocr, lambda: _rapidocr_load_ms),
        ("mineru", _ensure_pipeline, lambda: _pipeline_load_ms),
    )
    for key, load, read_load_ms in loaders:
        try:
            load()
            report[key] = {"status": "ready", "loadMs": read_load_ms()}
        except Exception as exc:
            report[key] = {"status": "failed", "error": str(exc)}

    report["totalElapsedMs"] = int((time.perf_counter() - started) * 1000)
    report["allReady"] = _rapidocr_ready and _pipeline_ready
    return report
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GET /diagnostics | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def diagnostics() -> dict[str, Any]:
    """Return an environment inventory: Python, packages, model files, memory.

    Packages that cannot be resolved are reported as "not found"; model files
    absent from disk are reported as "MISSING".
    """
    import platform

    def _installed_version(dist_name: str) -> str:
        try:
            return pkg_version(dist_name)
        except Exception:
            return "not found"

    def _size_label(path: str) -> str:
        if not os.path.isfile(path):
            return "MISSING"
        return f"{os.path.getsize(path) / (1024 * 1024):.1f} MB"

    package_names = (
        "magic-pdf", "rapidocr-onnxruntime", "torch", "torchvision",
        "ultralytics", "doclayout-yolo", "rapid-table", "onnxruntime",
        "opencv-python-headless", "Pillow", "fastapi", "uvicorn",
    )
    pkgs: dict[str, str] = {name: _installed_version(name) for name in package_names}

    models_root = "/app/models/PDF-Extract-Kit-1.0/models"
    expected_models = [
        "OCR/paddleocr_torch/ch_PP-OCRv5_det_infer.pth",
        "OCR/paddleocr_torch/ch_PP-OCRv5_rec_infer.pth",
        "Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt",
    ]
    model_files: dict[str, str] = {
        rel: _size_label(os.path.join(models_root, rel)) for rel in expected_models
    }

    used_mb, total_mb = _mem_mb()
    engines = {
        "rapidocr": {"ready": _rapidocr_ready, "loadMs": _rapidocr_load_ms},
        "mineru": {"ready": _pipeline_ready, "loadMs": _pipeline_load_ms},
    }
    config = {
        "maxOcrSidePx": MAX_OCR_SIDE,
        "confidenceThreshold": FAST_CONFIDENCE_THRESHOLD,
    }
    return {
        "python": platform.python_version(),
        "packages": pkgs,
        "modelFiles": model_files,
        "memory": {"usedMB": used_mb, "totalMB": total_mb},
        "engines": engines,
        "config": config,
        "uptime": int(time.time() - _START_TIME),
        "cacheEntries": len(_cache),
    }
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # GET /benchmark | |
| # Runs RapidOCR on three synthetic images (small / medium / large) and returns | |
| # full stage timings for each. Use this to measure the resize optimisation. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
async def benchmark() -> JSONResponse:
    """Time RapidOCR on three synthetic marksheet-style images.

    For each size (small / medium / large) a white canvas with fixed text
    lines is drawn, resized with ``_resize_for_ocr`` and passed through the
    engine; resize, detection and recognition timings are reported per size.

    Returns:
        JSONResponse with per-size results, memory usage, ``MAX_OCR_SIDE`` and
        a UTC timestamp. If the engine cannot be loaded, the structured
        ExtractionError payload is returned with its HTTP status, in the same
        shape /extract uses.

    NOTE(review): OCR runs synchronously inside this coroutine, so it appears
    to hold the event loop for the duration - confirm that is acceptable.
    """
    import cv2

    def _make_test_image(width: int, height: int) -> "np.ndarray":
        """Draw the fixed sample lines in black on a white BGR canvas."""
        img = np.ones((height, width, 3), dtype=np.uint8) * 255
        lines = [
            "184 ENGLISH LNG & LIT. 073 020 093",
            "085 HINDI COURSE-B 075 020 095",
            "041 MATHEMATICS STD 063 020 083",
            "086 SCIENCE 065 020 085",
            "087 SOCIAL SCIENCE 057 020 077",
            "Roll No: 28169763 Name: TEST STUDENT",
            "Total: 433 / 500 Percentage: 86.6%",
        ]
        line_h = max(20, height // (len(lines) + 2))
        scale = max(0.5, min(1.5, width / 900))
        for i, text in enumerate(lines):
            y = line_h * (i + 1)
            if y < height - 10:
                cv2.putText(img, text, (20, y),
                            cv2.FONT_HERSHEY_SIMPLEX, scale, (0, 0, 0), 2)
        return img

    SIZES = [
        ("small", 800, 1200),
        ("medium", 1600, 2400),
        ("large", 3000, 4000),
    ]
    results: dict[str, Any] = {}

    # Engine-load failures carry a stage/code/recommendation; surface them
    # the way /extract does instead of letting them become a bare 500.
    try:
        engine = _ensure_rapidocr()
    except ExtractionError as exc:
        logger.warning("/benchmark [%s/%s]: %s", exc.stage, exc.code, exc.message)
        return JSONResponse(status_code=exc.http_status, content=exc.to_dict())

    for label, w, h in SIZES:
        img = _make_test_image(w, h)
        orig_h, orig_w = img.shape[:2]

        # Resize
        t_resize = time.perf_counter()
        img_resized, was_resized = _resize_for_ocr(img)
        resize_ms = int((time.perf_counter() - t_resize) * 1000)
        new_h, new_w = img_resized.shape[:2]

        # OCR
        t_ocr = time.perf_counter()
        ocr_result, elapse = engine(img_resized, box_thresh=DET_BOX_THRESH)
        ocr_ms = int((time.perf_counter() - t_ocr) * 1000)
        det_ms, rec_ms = _split_elapse(elapse, ocr_ms)

        rows = ocr_result or []
        text_blocks = sum(1 for item in rows if len(item) > 1)
        scores = [item[2] for item in rows if len(item) > 2 and item[2] is not None]
        conf = round(sum(scores) / len(scores), 4) if scores else 0.0

        results[label] = {
            # Multiplication sign written as an escape so the emitted value
            # cannot be corrupted by a source-encoding round trip.
            "originalDimensions": f"{orig_w}\u00d7{orig_h}",
            "resizedDimensions": f"{new_w}\u00d7{new_h}",
            "wasResized": was_resized,
            "resizeMs": resize_ms,
            "detectMs": det_ms,
            "recognizeMs": rec_ms,
            "ocrTotalMs": ocr_ms,
            "textBlocks": text_blocks,
            "confidence": conf,
        }

    used_mb, total_mb = _mem_mb()
    return JSONResponse(content={
        "results": results,
        "memory": {"usedMB": used_mb, "totalMB": total_mb},
        "maxOcrSide": MAX_OCR_SIDE,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    })
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # POST /extract | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
async def extract(file: UploadFile = File(...)) -> JSONResponse:
    """Extract text from one uploaded PDF, image or office file.

    Returns:
        200 with the extraction result on success; the ExtractionError payload
        with its own HTTP status on a handled failure; a generic 500 payload
        for anything else.

    The 500 payload includes a ``traceback`` field only when the environment
    variable ``OCR_DEBUG_TRACEBACK`` is "1". The traceback is always written
    to the log, so returning it to every client by default only exposed
    internal paths and library details on a publicly reachable endpoint.

    NOTE(review): ``_run_extraction`` is synchronous and is called directly
    from this coroutine, so it appears to hold the event loop while OCR
    runs - confirm whether that serialisation is intended.
    """
    t_upload_start = time.perf_counter()
    try:
        raw, filename, ext = await _read_upload(file)
        upload_ms = int((time.perf_counter() - t_upload_start) * 1000)
        result = _run_extraction(raw, filename, ext, upload_ms=upload_ms)
        return JSONResponse(content=result)
    except ExtractionError as exc:
        logger.warning("/extract [%s/%s]: %s", exc.stage, exc.code, exc.message)
        return JSONResponse(status_code=exc.http_status, content=exc.to_dict())
    except Exception as exc:
        logger.exception("/extract unhandled error")
        body: dict[str, Any] = {
            "success": False,
            "stage": "unknown",
            "errorCode": "INTERNAL_ERROR",
            "rootCause": str(exc),
            "recommendation": "Check HF Space logs for full traceback.",
            "message": str(exc),
        }
        if os.environ.get("OCR_DEBUG_TRACEBACK", "") == "1":
            body["traceback"] = traceback.format_exc()[-3000:]
        return JSONResponse(status_code=500, content=body)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # POST /batch | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
async def batch(files: list[UploadFile] = File(...)) -> JSONResponse:
    """Extract text from up to ``BATCH_MAX_FILES`` uploads, one after another.

    A failure in one file becomes that file's entry in ``results`` and does
    not stop the batch, so the top-level ``success`` is always True.

    Files beyond the limit are not processed. That truncation is now logged
    and reported through the added ``received`` and ``skipped`` fields, so a
    client can tell that part of its upload was ignored.
    """
    candidates = files[:BATCH_MAX_FILES]
    skipped = len(files) - len(candidates)
    if skipped:
        logger.warning(
            "/batch received %d files; only the first %d are processed",
            len(files), BATCH_MAX_FILES,
        )

    results: list[dict[str, Any]] = []
    for upload in candidates:
        t0 = time.perf_counter()
        try:
            raw, filename, ext = await _read_upload(upload)
            result = _run_extraction(
                raw, filename, ext,
                upload_ms=int((time.perf_counter() - t0) * 1000),
            )
        except ExtractionError as exc:
            result = exc.to_dict()
            result["filename"] = _sanitize_filename(upload.filename or "upload")
        except Exception as exc:
            fname = _sanitize_filename(upload.filename or "upload")
            logger.exception("Batch item failed: %s", fname)
            result = {
                "success": False,
                "filename": fname,
                "stage": "unknown",
                "errorCode": "INTERNAL_ERROR",
                "rootCause": str(exc),
                "recommendation": "Check HF Space logs.",
                "message": str(exc),
            }
        results.append(result)

    return JSONResponse(content={
        "success": True,
        "processed": len(results),
        "received": len(files),
        "skipped": skipped,
        "results": results,
    })
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Upload reader | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
async def _read_upload(upload: UploadFile) -> tuple[bytes, str, str]:
    """Validate an upload and read it into memory.

    Returns:
        ``(raw_bytes, sanitized_filename, lowercase_extension)``.

    Raises:
        ExtractionError: ``UNSUPPORTED_TYPE`` (415) for an extension outside
            ``ALLOWED_EXTENSIONS``, ``FILE_TOO_LARGE`` (413) above
            ``MAX_UPLOAD_BYTES``, ``EMPTY_FILE`` (400) for zero bytes.
    """
    filename = _sanitize_filename(upload.filename or "upload")
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    if ext not in ALLOWED_EXTENSIONS:
        raise _err(
            "validation", "UNSUPPORTED_TYPE",
            f"Unsupported file type '.{ext}'. "
            f"Supported: {sorted(ALLOWED_EXTENSIONS)}",
            415,
            root_cause=f"Extension '{ext}' is not in the allowed set.",
            recommendation="Convert to PDF, JPG, PNG, or WEBP before uploading.",
        )

    limit_mb = MAX_UPLOAD_BYTES // 1024 // 1024
    # Read one byte past the limit: enough to detect an oversized file
    # without buffering all of it.
    raw = await upload.read(MAX_UPLOAD_BYTES + 1)
    if len(raw) > MAX_UPLOAD_BYTES:
        # len(raw) is capped at limit + 1, so it says nothing about the real
        # size. Use the size the framework recorded when available; otherwise
        # state only what is known.
        declared = getattr(upload, "size", None)
        if isinstance(declared, int) and declared > MAX_UPLOAD_BYTES:
            cause = f"File is {declared / (1024 * 1024):.1f} MB."
        else:
            cause = f"File is larger than {limit_mb} MB."
        raise _err(
            "upload", "FILE_TOO_LARGE",
            f"'{filename}' exceeds {limit_mb} MB.", 413,
            root_cause=cause,
            recommendation="Compress or split the file.",
        )
    if not raw:
        raise _err("upload", "EMPTY_FILE", f"'{filename}' is empty.", 400,
                   root_cause="Zero bytes received.",
                   recommendation="Check the file before uploading.")
    return raw, filename, ext
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Extraction dispatcher | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _run_extraction(
    raw: bytes, filename: str, ext: str, upload_ms: int = 0
) -> dict[str, Any]:
    """Run one extraction: cache lookup, memory check, dispatch, cache store.

    Dispatch is by extension: PDFs go to the MinerU pipeline, office files to
    ``_process_office``, everything else to ``_process_image``.

    Returns:
        The result dict, with ``cached``, ``processingTimeMs`` and the
        upload/hash/memory/total timings filled in.

    Raises:
        ExtractionError: passed through from the processors, or
            ``INTERNAL_ERROR`` (500) wrapping any other exception.

    Changes from the previous behaviour:
      * A cache hit reports the filename of the current upload instead of the
        filename stored by whoever uploaded the same bytes first.
      * The cache is capped; the oldest entries are evicted first, so a
        long-running process no longer grows without limit.
      * The working directory is created before the active-request counter is
        incremented, so a failing ``mkdtemp`` cannot leave the counter high.
    """
    cache_max_entries = 256  # upper bound on stored results (oldest evicted)

    logger.info("request_received file=%s size=%d ext=%s", filename, len(raw), ext)

    # ── Hash + cache lookup ───────────────────────────────────────────────────
    t_hash = time.perf_counter()
    file_hash = hashlib.sha256(raw).hexdigest()
    hash_ms = int((time.perf_counter() - t_hash) * 1000)
    # The ellipsis is written as an escape so the log text cannot be
    # corrupted by a source-encoding round trip.
    logger.info("cache_lookup sha256=%.12s\u2026 hash_ms=%d", file_hash, hash_ms)
    with _cache_lock:
        cached = _cache.get(file_hash)
    if cached is not None:
        logger.info("cache_hit sha256=%.12s\u2026 file=%s", file_hash, filename)
        out = {**cached}
        out["filename"] = filename  # identical bytes may arrive under a new name
        out["cached"] = True
        out["processingTimeMs"] = 0
        out["timings"] = {**cached.get("timings", {}), "totalMs": 0}
        return out
    logger.info("cache_miss sha256=%.12s\u2026", file_hash)

    # ── Memory safety ─────────────────────────────────────────────────────────
    t_mem = time.perf_counter()
    _assert_memory_safe(raw, ext)
    mem_check_ms = int((time.perf_counter() - t_mem) * 1000)

    work_dir = tempfile.mkdtemp(prefix="ocr_")
    _inc_active()
    t0 = time.perf_counter()
    try:
        if ext in PDF_EXTENSIONS:
            logger.info("engine_selected engine=mineru file=%s", filename)
            _ensure_pipeline()
            result = _process_pdf(raw, filename, work_dir, upload_ms=upload_ms)
        elif ext in OFFICE_EXTENSIONS:
            logger.info("engine_selected engine=office_text file=%s ext=%s", filename, ext)
            result = _process_office(raw, filename, ext, upload_ms=upload_ms)
        else:
            logger.info("engine_selected engine=rapidocr file=%s", filename)
            result = _process_image(raw, filename, ext, work_dir, upload_ms=upload_ms)

        total_ms = int((time.perf_counter() - t0) * 1000)
        result["timings"]["uploadMs"] = upload_ms
        result["timings"]["hashMs"] = hash_ms
        result["timings"]["memCheckMs"] = mem_check_ms
        result["timings"]["totalMs"] = total_ms
        result["processingTimeMs"] = total_ms
        result["cached"] = False

        # Store in cache (strip per-request fields that change on replay)
        entry = {k: v for k, v in result.items()
                 if k not in ("cached", "processingTimeMs", "timings")}
        entry["timings"] = {k: v for k, v in result["timings"].items()
                            if k not in ("totalMs", "hashMs", "memCheckMs", "uploadMs")}
        with _cache_lock:
            # dicts keep insertion order, so the first key is the oldest.
            _cache.pop(file_hash, None)
            while len(_cache) >= cache_max_entries:
                _cache.pop(next(iter(_cache)))
            _cache[file_hash] = entry

        logger.info(
            "response_sent file=%s engine=%s conf=%.3f total_ms=%d",
            filename, result.get("engine", "?"), result.get("confidence", 0), total_ms,
        )
        return result
    except ExtractionError:
        raise
    except Exception as exc:
        logger.exception("extraction_failed file=%s", filename)
        raise _err(
            "unknown", "INTERNAL_ERROR", f"Unexpected error: {exc}", 500,
            root_cause=str(exc),
            recommendation="Check HF Space logs for full traceback.",
        ) from exc
    finally:
        _dec_active()
        shutil.rmtree(work_dir, ignore_errors=True)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Image processor β RapidOCR fast path + MinerU fallback | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _process_image(
    raw: bytes, filename: str, ext: str, work_dir: str, upload_ms: int = 0
) -> dict[str, Any]:
    """OCR one image with RapidOCR, falling back to MinerU on low confidence.

    Steps: decode to BGR, resize for OCR, run RapidOCR, parse the output.
    When confidence is below ``FAST_CONFIDENCE_THRESHOLD`` and some text was
    found, MinerU is tried; its result is returned only if its text is longer
    than 80% of the RapidOCR text. Any fallback failure is logged and the
    RapidOCR result is returned.

    Raises:
        ExtractionError: ``OCR_ENGINE_FAILED`` (500) if RapidOCR raises, or
            whatever ``_ensure_rapidocr`` raises.
    """
    import cv2

    def _elapsed_ms(since: float) -> int:
        return int((time.perf_counter() - since) * 1000)

    # ── Decode ────────────────────────────────────────────────────────────────
    mark = time.perf_counter()
    img_bgr = _decode_image_to_bgr(raw, ext)
    decode_ms = _elapsed_ms(mark)
    orig_h, orig_w = img_bgr.shape[:2]
    logger.info("image_decoded file=%s dims=%dx%d decode_ms=%d",
                filename, orig_w, orig_h, decode_ms)

    # ── Resize ────────────────────────────────────────────────────────────────
    mark = time.perf_counter()
    img_ocr, was_resized = _resize_for_ocr(img_bgr)
    resize_ms = _elapsed_ms(mark)
    new_h, new_w = img_ocr.shape[:2]
    logger.info(
        "image_resized file=%s original=%dx%d resized=%dx%d was_resized=%s resize_ms=%d",
        filename, orig_w, orig_h, new_w, new_h, was_resized, resize_ms,
    )

    # ── RapidOCR ──────────────────────────────────────────────────────────────
    logger.info("ocr_started file=%s engine=rapidocr dims=%dx%d",
                filename, new_w, new_h)
    mark = time.perf_counter()
    try:
        ocr_engine = _ensure_rapidocr()
        # box_thresh drops detection boxes below this confidence before
        # recognition, so dropped boxes cost no recognition time.
        ocr_result, elapse = ocr_engine(img_ocr, box_thresh=DET_BOX_THRESH)
    except ExtractionError:
        raise
    except Exception as exc:
        raise _err(
            "ocr", "OCR_ENGINE_FAILED", f"RapidOCR failed: {exc}", 500,
            root_cause=str(exc),
            recommendation="Check rapidocr-onnxruntime in Dockerfile Layer 1.",
        ) from exc
    ocr_ms = _elapsed_ms(mark)
    det_ms, rec_ms = _split_elapse(elapse, ocr_ms)
    logger.info("ocr_finished file=%s engine=rapidocr ocr_ms=%d det_ms=%d rec_ms=%d",
                filename, ocr_ms, det_ms, rec_ms)

    # ── Parse output ──────────────────────────────────────────────────────────
    mark = time.perf_counter()
    plain_text, confidence = _parse_rapidocr_output(ocr_result)
    post_ms = _elapsed_ms(mark)
    block_count = len(ocr_result) if ocr_result else 0
    logger.info("post_process file=%s conf=%.3f text_len=%d blocks=%d post_ms=%d",
                filename, confidence, len(plain_text), block_count, post_ms)

    # ── MinerU fallback if confidence is low ──────────────────────────────────
    passes_used = 1
    if not (confidence < FAST_CONFIDENCE_THRESHOLD and plain_text.strip()):
        logger.info("fallback_not_needed conf=%.3f file=%s", confidence, filename)
    else:
        logger.info(
            "fallback_triggered conf=%.3f < %.2f file=%s trying mineru",
            confidence, FAST_CONFIDENCE_THRESHOLD, filename,
        )
        try:
            _ensure_pipeline()
            mr = _process_image_mineru(raw, filename, ext, work_dir)
            if len(mr.get("text", "")) > len(plain_text) * 0.8:
                mr["engine"] = "mineru_fallback"
                mr["metadata"]["passesUsed"] = 2
                mr["timings"]["pass1RapidOCRMs"] = ocr_ms
                mr["timings"]["decodeMs"] = decode_ms
                mr["timings"]["resizeMs"] = resize_ms
                logger.info("fallback_used file=%s mineru result accepted", filename)
                return mr
        except Exception as exc:
            logger.warning("fallback_failed file=%s error=%s using rapidocr result", filename, exc)
        # Reached when the fallback failed or its result was rejected.
        passes_used = 2

    timings = {
        "uploadMs": upload_ms,
        "hashMs": 0,
        "memCheckMs": 0,
        "decodeMs": decode_ms,
        "resizeMs": resize_ms,
        "detectMs": det_ms,
        "recognizeMs": rec_ms,
        "postProcessMs": post_ms,
        "totalMs": 0,
    }
    metadata = {
        "imgW": orig_w,
        "imgH": orig_h,
        "imgWResized": new_w,
        "imgHResized": new_h,
        "wasResized": was_resized,
        "textBlocks": block_count,
        "passesUsed": passes_used,
        "backend": "rapidocr",
    }
    return {
        "success": True,
        "filename": filename,
        "engine": "rapidocr",
        "confidence": confidence,
        "text": plain_text,
        "markdown": plain_text,
        "pageCount": 1,
        "timings": timings,
        "metadata": metadata,
    }
def _process_image_mineru(
    raw: bytes, filename: str, ext: str, work_dir: str
) -> dict[str, Any]:
    """
    Run one image through the MinerU OCR pipeline.

    The image bytes are converted to PNG first when ``ext`` is listed in
    PILLOW_IMAGE_EXTENSIONS, written to ``work_dir`` as ``input_mineru.<ext>``,
    read back through MinerU's file reader and analysed in OCR mode.  Images
    emitted by the pipeline are written to ``<work_dir>/images_mineru``.

    Returns the standard response dict with engine "mineru" and a fixed
    confidence of 0.85.  Upload/decode/resize timings and image dimensions are
    reported as zero; callers may overwrite them.

    Raises the error built by ``_err`` with code OCR_PIPELINE_FAILED (HTTP 500)
    when inference fails, or MARKDOWN_FAILED when markdown rendering fails.
    """
    from magic_pdf.data.data_reader_writer import (
        FileBasedDataReader,
        FileBasedDataWriter,
    )
    from magic_pdf.data.dataset import ImageDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    output_images = os.path.join(work_dir, "images_mineru")
    os.makedirs(output_images, exist_ok=True)

    # Formats handled via Pillow are normalised to PNG before being stored.
    via_pillow = ext in PILLOW_IMAGE_EXTENSIONS
    if via_pillow:
        raw = _convert_to_png(raw, ext)
    suffix = "png" if via_pillow else ext
    stored_name = f"input_mineru.{suffix}"

    with open(os.path.join(work_dir, stored_name), "wb") as sink:
        sink.write(raw)

    ocr_started = time.perf_counter()
    try:
        payload = FileBasedDataReader(work_dir).read(stored_name)
        analysis = ImageDataset(payload).apply(doc_analyze, ocr=True)
        pipe_result = analysis.pipe_ocr_mode(FileBasedDataWriter(output_images))
    except Exception as exc:
        raise _err(
            "ocr", "OCR_PIPELINE_FAILED",
            f"MinerU image pipeline failed: {exc}", 500,
            root_cause=str(exc),
            recommendation="Check magic-pdf installation and model files.",
        ) from exc
    ocr_ms = int((time.perf_counter() - ocr_started) * 1000)

    md_started = time.perf_counter()
    try:
        markdown = pipe_result.get_markdown(output_images)
    except Exception as exc:
        raise _err("markdown", "MARKDOWN_FAILED", f"get_markdown failed: {exc}") from exc
    md_ms = int((time.perf_counter() - md_started) * 1000)

    timings = {
        "uploadMs": 0,
        "hashMs": 0,
        "memCheckMs": 0,
        "decodeMs": 0,
        "resizeMs": 0,
        "detectMs": 0,
        "recognizeMs": ocr_ms,
        "postProcessMs": md_ms,
        "totalMs": 0,
    }
    metadata = {
        "imgW": 0, "imgH": 0,
        "imgWResized": 0, "imgHResized": 0,
        "wasResized": False,
        "textBlocks": 0,
        "passesUsed": 1,
        "backend": "pipeline",
    }
    return {
        "success": True,
        "filename": filename,
        "engine": "mineru",
        "confidence": 0.85,
        "text": _markdown_to_plain(markdown),
        "markdown": markdown,
        "pageCount": 1,
        "timings": timings,
        "metadata": metadata,
    }
# ─────────────────────────────────────────────────────────────────────────────
# Office document processor — DOCX / PPTX / XLSX (text extraction, no OCR)
# No image rendering or OCR is performed. Text is read directly from the
# structured XML inside the Office Open XML container.
# ─────────────────────────────────────────────────────────────────────────────
def _process_office(
    raw: bytes, filename: str, ext: str, upload_ms: int = 0
) -> dict[str, Any]:
    """
    Extract text from a DOCX / PPTX / XLSX upload without any OCR.

    Dispatches on ``ext`` to the matching ``_extract_*`` helper and wraps the
    result in the standard response dict (confidence fixed at 1.0, the whole
    extraction time reported as ``recognizeMs``).

    Raises (via ``_err``) UNSUPPORTED_OFFICE_TYPE (415) for any other
    extension and OFFICE_EXTRACT_FAILED (422) when the helper raises something
    that is not already an ExtractionError.
    """
    started = time.perf_counter()
    logger.info("ocr_started file=%s engine=office_text ext=%s", filename, ext)

    try:
        extractors = {
            "docx": _extract_docx,
            "pptx": _extract_pptx,
            "xlsx": _extract_xlsx,
        }
        extractor = extractors.get(ext)
        if extractor is None:
            raise _err("decode", "UNSUPPORTED_OFFICE_TYPE",
                       f"Unrecognised office extension: {ext}", 415)
        plain_text, page_count = extractor(raw)
    except ExtractionError:
        # Already a structured error: pass it through untouched.
        raise
    except Exception as exc:
        raise _err(
            "ocr", "OFFICE_EXTRACT_FAILED",
            f"Could not extract text from {ext.upper()}: {exc}", 422,
            root_cause=str(exc),
            recommendation=f"Ensure the file is a valid, non-password-protected {ext.upper()}.",
        ) from exc

    extract_ms = int((time.perf_counter() - started) * 1000)
    logger.info("ocr_finished file=%s engine=office_text extract_ms=%d text_len=%d",
                filename, extract_ms, len(plain_text))

    backend_label = f"office_text_{ext}"
    timings = {
        "uploadMs": upload_ms,
        "hashMs": 0,
        "memCheckMs": 0,
        "decodeMs": 0,
        "resizeMs": 0,
        "detectMs": 0,
        "recognizeMs": extract_ms,
        "postProcessMs": 0,
        "totalMs": 0,
    }
    metadata = {
        "imgW": 0, "imgH": 0,
        "imgWResized": 0, "imgHResized": 0,
        "wasResized": False,
        # One block per output line.
        "textBlocks": plain_text.count("\n") + 1,
        "passesUsed": 1,
        "backend": backend_label,
    }
    return {
        "success": True,
        "filename": filename,
        "engine": backend_label,
        "confidence": 1.0,
        "text": plain_text,
        "markdown": plain_text,
        "pageCount": page_count,
        "timings": timings,
        "metadata": metadata,
    }
def _extract_docx(raw: bytes) -> tuple[str, int]:
    """
    Extract plain text from a DOCX file.

    Non-empty body paragraphs come first, followed by one line per non-empty
    table row with cell texts joined by " | ".

    Returns (text, page_estimate), where the estimate is ``len(text) // 3000``
    with a minimum of 1.

    Raises the error built by ``_err`` with code DOCX_DEPS_MISSING (503) when
    python-docx is not installed.
    """
    try:
        import docx as _docx
    except ImportError as exc:
        raise _err("decode", "DOCX_DEPS_MISSING",
                   "python-docx is not installed.", 503,
                   recommendation="Add python-docx to Dockerfile Layer 1.") from exc

    doc = _docx.Document(io.BytesIO(raw))
    chunks = [p.text for p in doc.paragraphs if p.text.strip()]

    # Tables
    for table in doc.tables:
        for row in table.rows:
            cell_texts: list[str] = []
            previous_tc = None
            for cell in row.cells:
                # python-docx yields a horizontally merged cell once per grid
                # column it spans; every copy wraps the same <w:tc> element.
                # Skipping adjacent repeats stops merged text being emitted
                # several times.  ``_tc`` is a private attribute, hence the
                # getattr fallback to the cell object itself.
                tc = getattr(cell, "_tc", cell)
                if tc is previous_tc:
                    continue
                previous_tc = tc
                stripped = cell.text.strip()
                if stripped:
                    cell_texts.append(stripped)
            if cell_texts:
                chunks.append(" | ".join(cell_texts))

    text = "\n".join(chunks)
    # Rough page estimate: ~3 000 chars per page
    pages = max(1, len(text) // 3000)
    return text, pages
def _extract_pptx(raw: bytes) -> tuple[str, int]:
    """
    Extract plain text from a PPTX file.

    Each slide contributes a ``--- Slide N ---`` marker line followed by the
    stripped text of every shape that exposes a non-blank ``text`` attribute.

    Returns (text, slide_count).  Raises the error built by ``_err`` with code
    PPTX_DEPS_MISSING (503) when python-pptx is not installed.
    """
    try:
        from pptx import Presentation as _Presentation
    except ImportError as exc:
        raise _err("decode", "PPTX_DEPS_MISSING",
                   "python-pptx is not installed.", 503,
                   recommendation="Add python-pptx to Dockerfile Layer 1.") from exc

    deck = _Presentation(io.BytesIO(raw))
    collected: list[str] = []
    for index, slide in enumerate(deck.slides, start=1):
        collected.append(f"--- Slide {index} ---")
        collected.extend(
            shape.text.strip()
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        )
    return "\n".join(collected), len(deck.slides)
def _extract_xlsx(raw: bytes) -> tuple[str, int]:
    """
    Extract plain text from an XLSX file.

    The workbook is opened read-only with ``data_only=True``.  Each worksheet
    contributes a ``--- Sheet: <title> ---`` marker line followed by one line
    per non-empty row, with non-blank cell values joined by " | ".

    Returns (text, sheet_count).  Raises the error built by ``_err`` with code
    XLSX_DEPS_MISSING (503) when openpyxl is not installed.
    """
    try:
        import openpyxl as _openpyxl
    except ImportError as exc:
        raise _err("decode", "XLSX_DEPS_MISSING",
                   "openpyxl is not installed.", 503,
                   recommendation="Add openpyxl to Dockerfile Layer 1.") from exc

    wb = _openpyxl.load_workbook(io.BytesIO(raw), read_only=True, data_only=True)
    try:
        sheets = wb.worksheets
        lines: list[str] = []
        for sheet in sheets:
            lines.append(f"--- Sheet: {sheet.title} ---")
            for row in sheet.iter_rows(values_only=True):
                # Stringify each cell once; drop None and whitespace-only values.
                rendered = (str(cell) for cell in row if cell is not None)
                row_text = " | ".join(value for value in rendered if value.strip())
                if row_text:
                    lines.append(row_text)
        return "\n".join(lines), len(sheets)
    finally:
        # Close the workbook even if a row fails to iterate.
        wb.close()
# ─────────────────────────────────────────────────────────────────────────────
# PDF processor — MinerU
# ─────────────────────────────────────────────────────────────────────────────
def _process_pdf(
    raw: bytes, filename: str, work_dir: str, upload_ms: int = 0
) -> dict[str, Any]:
    """
    Run a PDF through the MinerU pipeline and return the standard response.

    Steps: count pages, let MinerU classify the document, run ``doc_analyze``
    in text mode when the classification is ``SupportedPdfParseMethod.TXT``
    and in OCR mode otherwise, then render markdown and derive plain text.
    Pipeline images are written to ``<work_dir>/images``.

    The reported confidence is fixed: 0.9 for the text path, 0.85 for OCR.
    Classification time is reported as ``decodeMs``.

    Raises (via ``_err``) PDF_PARSE_FAILED (422), OCR_PIPELINE_FAILED (500) or
    MARKDOWN_FAILED depending on the failing stage.
    """
    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod

    images_dir = os.path.join(work_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    page_count = _pdf_page_count(raw)
    logger.info("pdf_classify file=%s pages=%d", filename, page_count)

    # Stage 1: open and classify.
    classify_started = time.perf_counter()
    try:
        dataset = PymuDocDataset(raw)
        method = dataset.classify()
    except Exception as exc:
        raise _err(
            "decode", "PDF_PARSE_FAILED", f"Could not parse PDF: {exc}", 422,
            root_cause=str(exc),
            recommendation="Ensure the file is a valid, non-encrypted PDF.",
        ) from exc
    classify_ms = int((time.perf_counter() - classify_started) * 1000)

    # Stage 2: layout analysis, with or without OCR.
    logger.info("ocr_started file=%s engine=mineru method=%s", filename, method)
    ocr_started = time.perf_counter()
    try:
        writer = FileBasedDataWriter(images_dir)
        has_text_layer = method == SupportedPdfParseMethod.TXT
        if has_text_layer:
            pipe_result = dataset.apply(doc_analyze, ocr=False).pipe_txt_mode(writer)
        else:
            pipe_result = dataset.apply(doc_analyze, ocr=True).pipe_ocr_mode(writer)
        parse_method = "txt" if has_text_layer else "ocr"
    except Exception as exc:
        raise _err(
            "ocr", "OCR_PIPELINE_FAILED", f"doc_analyze/pipe failed: {exc}", 500,
            root_cause=str(exc),
            recommendation="Check model files in /app/models and validate.py output.",
        ) from exc
    ocr_ms = int((time.perf_counter() - ocr_started) * 1000)
    logger.info("ocr_finished file=%s engine=mineru ocr_ms=%d", filename, ocr_ms)

    # Stage 3: markdown rendering.
    md_started = time.perf_counter()
    try:
        markdown = pipe_result.get_markdown(images_dir)
    except Exception as exc:
        raise _err("markdown", "MARKDOWN_FAILED", f"get_markdown failed: {exc}") from exc
    md_ms = int((time.perf_counter() - md_started) * 1000)

    timings = {
        "uploadMs": upload_ms,
        "hashMs": 0,
        "memCheckMs": 0,
        "decodeMs": classify_ms,
        "resizeMs": 0,
        "detectMs": 0,
        "recognizeMs": ocr_ms,
        "postProcessMs": md_ms,
        "totalMs": 0,
    }
    metadata = {
        "imgW": 0, "imgH": 0,
        "imgWResized": 0, "imgHResized": 0,
        "wasResized": False,
        "textBlocks": 0,
        "passesUsed": 1,
        "backend": "pipeline",
        "parseMethod": parse_method,
        "pages": page_count,
    }
    return {
        "success": True,
        "filename": filename,
        "engine": "mineru",
        "confidence": 0.9 if parse_method == "txt" else 0.85,
        "text": _markdown_to_plain(markdown),
        "markdown": markdown,
        "pageCount": page_count,
        "timings": timings,
        "metadata": metadata,
    }
# ─────────────────────────────────────────────────────────────────────────────
# Image helpers
# ─────────────────────────────────────────────────────────────────────────────
def _resize_for_ocr(img: "np.ndarray") -> tuple["np.ndarray", bool]:
    """
    Resize image so the longest side is at most MAX_OCR_SIDE pixels.

    Returns (resized_img, was_resized).

    Uses cv2.INTER_AREA, which averages source pixels when shrinking and so
    keeps text legible at smaller sizes.  No upscaling: images whose longest
    side is already <= MAX_OCR_SIDE are returned unchanged.
    """
    import cv2

    h, w = img.shape[:2]
    longest = max(h, w)
    if longest <= MAX_OCR_SIDE:
        return img, False

    scale = MAX_OCR_SIDE / longest
    # Truncation can reach 0 on the short side of an extremely elongated image
    # (e.g. 1 x 100000); cv2.resize rejects a zero dimension, so clamp to 1.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return resized, True
def _decode_image_to_bgr(raw: bytes, ext: str) -> "np.ndarray":
    """
    Decode uploaded image bytes into an OpenCV BGR array.

    HEIC/HEIF input is first transcoded to PNG through Pillow (with the
    pillow-heif opener registered).  The bytes are then decoded with
    ``cv2.imdecode``; if OpenCV cannot decode them, Pillow is tried as a
    fallback and its RGB output is converted to BGR.

    Raises (via ``_err``):
      HEIF_NOT_SUPPORTED (415)  pillow-heif is not installed.
      HEIF_DECODE_FAILED        Pillow could not read the HEIC/HEIF data.
      IMAGE_DECODE_FAILED (422) neither OpenCV nor Pillow could decode.
    """
    import cv2

    if ext in {"heic", "heif"}:
        try:
            from pillow_heif import register_heif_opener
            register_heif_opener()
        except ImportError as exc:
            raise _err(
                "decode", "HEIF_NOT_SUPPORTED",
                "HEIC/HEIF requires pillow-heif.", 415,
                recommendation="Add pillow-heif to Dockerfile Layer 1.",
            ) from exc
        try:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
            buf = io.BytesIO()
            pil_img.save(buf, format="PNG")
            raw = buf.getvalue()
        except Exception as exc:
            raise _err("decode", "HEIF_DECODE_FAILED",
                       f"HEIF decode error: {exc}") from exc

    arr = np.frombuffer(raw, np.uint8)
    try:
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    except cv2.error:
        # imdecode raises (rather than returning None) on an empty buffer.
        # Treat that like any other undecodable input so the caller gets the
        # structured IMAGE_DECODE_FAILED error below.
        img = None

    if img is None:
        try:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
            img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
        except Exception as exc:
            raise _err(
                "decode", "IMAGE_DECODE_FAILED",
                f"Could not decode image: {exc}", 422,
                root_cause=str(exc),
                recommendation="Ensure the file is a valid, non-corrupted image.",
            ) from exc
    return img
def _convert_to_png(raw: bytes, ext: str) -> bytes:
    """
    Re-encode image bytes as PNG using Pillow.

    For HEIC/HEIF input the pillow-heif opener is registered first.  The image
    is converted to RGB before saving.

    Raises (via ``_err``) HEIF_NOT_SUPPORTED (415) when pillow-heif is missing
    and IMAGE_DECODE_FAILED (422) when Pillow cannot open or save the image.
    """
    needs_heif_plugin = ext in ("heic", "heif")
    if needs_heif_plugin:
        try:
            from pillow_heif import register_heif_opener
            register_heif_opener()
        except ImportError:
            raise _err("decode", "HEIF_NOT_SUPPORTED",
                       "HEIC/HEIF requires pillow-heif.", 415)

    try:
        rgb_image = Image.open(io.BytesIO(raw)).convert("RGB")
        png_buffer = io.BytesIO()
        rgb_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    except Exception as exc:
        raise _err("decode", "IMAGE_DECODE_FAILED",
                   f"Pillow could not open image: {exc}", 422) from exc
    return png_bytes
# ─────────────────────────────────────────────────────────────────────────────
# RapidOCR output parser
# Returns (plain_text, mean_confidence)
# ─────────────────────────────────────────────────────────────────────────────
| def _parse_rapidocr_output(result: Any) -> tuple[str, float]: | |
| if not result: | |
| return "", 0.0 | |
| def _avg_y(item: Any) -> float: | |
| box = item[0] | |
| try: | |
| return sum(pt[1] for pt in box) / 4 | |
| except Exception: | |
| return 0.0 | |
| def _avg_x(item: Any) -> float: | |
| box = item[0] | |
| try: | |
| return sum(pt[0] for pt in box) / 4 | |
| except Exception: | |
| return 0.0 | |
| sorted_items = sorted(result, key=_avg_y) | |
| LINE_GAP = 20 | |
| lines: list[list[Any]] = [] | |
| if sorted_items: | |
| current: list[Any] = [sorted_items[0]] | |
| for item in sorted_items[1:]: | |
| if abs(_avg_y(item) - _avg_y(current[-1])) < LINE_GAP: | |
| current.append(item) | |
| else: | |
| lines.append(current) | |
| current = [item] | |
| lines.append(current) | |
| text_lines: list[str] = [] | |
| for line in lines: | |
| words = sorted(line, key=_avg_x) | |
| text_lines.append(" ".join(str(item[1]) for item in words if len(item) > 1)) | |
| plain_text = "\n".join(text_lines) | |
| scores = [item[2] for item in result if len(item) > 2 and item[2] is not None] | |
| mean_conf = float(sum(scores) / len(scores)) if scores else 0.5 | |
| return plain_text, round(mean_conf, 4) | |
| def _split_elapse(elapse: Any, total_ms: int) -> tuple[int, int]: | |
| """ | |
| Extract det_ms / rec_ms from RapidOCR's elapse return value. | |
| rapidocr-onnxruntime β₯ 1.3 returns a dict: {"det": s, "rec": s, "cls": s}. | |
| Older versions return a scalar total. We handle both. | |
| """ | |
| if isinstance(elapse, dict): | |
| det_ms = int(elapse.get("det", 0) * 1000) | |
| rec_ms = int(elapse.get("rec", 0) * 1000) | |
| return det_ms, rec_ms | |
| # Scalar fallback β measured total, no reliable split available | |
| return 0, total_ms | |
# ─────────────────────────────────────────────────────────────────────────────
# Misc helpers
# ─────────────────────────────────────────────────────────────────────────────
| def _sanitize_filename(name: str) -> str: | |
| name = os.path.basename(name) | |
| name = re.sub(r"[^\w.\-]", "_", name) | |
| return name[:200] or "upload" | |
| def _markdown_to_plain(markdown: str) -> str: | |
| text = re.sub(r"!\[.*?\]\(.*?\)", "", markdown) | |
| text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text) | |
| text = re.sub(r"#{1,6}\s*", "", text) | |
| text = re.sub(r"\*{1,2}([^*]+)\*{1,2}", r"\1", text) | |
| text = re.sub(r"`{1,3}[^`]*`{1,3}", "", text) | |
| text = re.sub(r"\|", " ", text) | |
| text = re.sub(r"-{3,}", "", text) | |
| text = re.sub(r"\n{3,}", "\n\n", text) | |
| return text.strip() | |
| def _pdf_page_count(raw: bytes) -> int: | |
| try: | |
| doc = fitz.open(stream=raw, filetype="pdf") | |
| count = doc.page_count | |
| doc.close() | |
| return count | |
| except Exception: | |
| return 1 | |
| def _mem_mb() -> tuple[int, int]: | |
| try: | |
| import psutil | |
| vm = psutil.virtual_memory() | |
| return (vm.total - vm.available) // (1024 * 1024), vm.total // (1024 * 1024) | |
| except Exception: | |
| pass | |
| try: | |
| info: dict[str, int] = {} | |
| with open("/proc/meminfo") as f: | |
| for line in f: | |
| parts = line.split() | |
| if len(parts) >= 2: | |
| info[parts[0].rstrip(":")] = int(parts[1]) | |
| total_kb = info.get("MemTotal", 0) | |
| avail_kb = info.get("MemAvailable", 0) | |
| return (total_kb - avail_kb) // 1024, total_kb // 1024 | |
| except Exception: | |
| return 0, 0 | |
def _assert_memory_safe(raw: bytes, ext: str) -> None:
    """
    Reject requests that would likely exhaust available RAM.

    The check is skipped when memory statistics are unavailable (total
    reported as 0).  PDFs are costed at BYTES_PER_OCR_PAGE per page.  Anything
    else is costed from its compressed size alone, with no image decode:
    ``len(raw) * 20 * IMAGE_MEMORY_FACTOR``, where 20x is the assumed
    compressed-to-raw expansion and the factor covers working memory.

    Raises the error built by ``_err`` with code LOW_MEMORY (507) when the
    memory left after the estimated allocation would fall below
    MEM_SAFETY_FLOOR_MB.
    """
    used_mb, total_mb = _mem_mb()
    if total_mb == 0:
        # No memory statistics available: nothing to check against.
        return

    available_mb = total_mb - used_mb
    is_pdf = ext in PDF_EXTENSIONS
    if is_pdf:
        pages = max(1, _pdf_page_count(raw))
        estimated_mb = (pages * BYTES_PER_OCR_PAGE) // (1024 * 1024)
    else:
        # Estimate from compressed size — no Pillow decode required.
        estimated_mb = len(raw) * 20 * IMAGE_MEMORY_FACTOR // (1024 * 1024)

    free_after = available_mb - estimated_mb
    logger.info(
        "memory_check avail_mb=%d est_mb=%d free_after_mb=%d",
        available_mb, estimated_mb, free_after,
    )
    if free_after >= MEM_SAFETY_FLOOR_MB:
        return

    raise _err(
        "validation", "LOW_MEMORY",
        f"Insufficient memory. Available: {available_mb} MB, "
        f"Estimated needed: {estimated_mb} MB.", 507,
        root_cause=f"Container has {available_mb} MB free; "
                   f"pipeline needs ~{estimated_mb} MB.",
        recommendation="Wait for active requests to complete, "
                       "or use a smaller file.",
    )