""" HuggingFace Inference Endpoint custom handler for PaddleOCR. Conforms to the HF Inference Toolkit EndpointHandler contract: - __init__(path="") called once at startup - __call__(data) called per request; data always contains "inputs" key Supports two call modes determined by the shape of `data["inputs"]`: Single image: { "inputs": "" } Returns: { "results": [["text", confidence], ...] } Batch images (send ALL tiles for a page in one call for maximum GPU throughput): { "inputs": [{"id": "", "image_base64": ""}, ...] } Returns: { "results": {"": [["text", confidence], ...], ...} } Performance note: PaddleOCR 3.x predict() accepts a list of numpy arrays and processes them as a single GPU batch — dramatically faster than calling it per-image. Always prefer one batch call per page over multiple single calls. """ from __future__ import annotations import base64 import io import logging from typing import Any, Dict, List, Tuple # Install langchain shim BEFORE paddleocr / paddlex ever gets imported. import _shim # noqa: F401 import numpy as np from PIL import Image logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") logger = logging.getLogger("paddleocr-handler") def _decode_image(image_base64: str) -> np.ndarray: """Decode a base64-encoded PNG/JPEG string into an RGB numpy array.""" if "," in image_base64 and image_base64.strip().startswith("data:"): image_base64 = image_base64.split(",", 1)[1] raw = base64.b64decode(image_base64) img = Image.open(io.BytesIO(raw)).convert("RGB") return np.array(img) def _parse_single_result(page_result) -> List[Tuple[str, float]]: """ Normalize one page's PaddleOCR output to a flat list of (text, confidence). Handles both v2.x nested-list format and v3.x dict format. """ results: List[Tuple[str, float]] = [] if not page_result: return results # PaddleOCR 3.x predict() per-image result: {'rec_texts': [...], 'rec_scores': [...]} if isinstance(page_result, dict): texts = page_result.get("rec_texts") or [] scores = page_result.get("rec_scores") or [] for t, s in zip(texts, scores): results.append((str(t), float(s))) return results # Legacy: page_result is a list [[box, (text, conf)], ...] # When called via ocr(), raw = [[...]] so caller passes raw[0] for line in page_result or []: try: text_part = line[1] results.append((str(text_part[0]), float(text_part[1]))) except (IndexError, TypeError, ValueError): continue return results class EndpointHandler: def __init__(self, path: str = ""): """Called once when the endpoint starts. Loads the PaddleOCR engine.""" from paddleocr import PaddleOCR logger.info("Initializing PaddleOCR engine (GPU)...") # use_gpu is picked up automatically when a CUDA device is present; # setting it explicitly ensures it is not silently skipped. self._ocr = PaddleOCR(lang="en", use_textline_orientation=True, use_gpu=True) logger.info("PaddleOCR engine ready.") def _run_batch(self, images: List[np.ndarray]) -> List[List[Tuple[str, float]]]: """ Run OCR on a list of images in one GPU call via predict(). Falls back to per-image ocr() if predict() is unavailable. Returns a list of results, one per input image. """ ocr = self._ocr # --- Fast path: predict() accepts a list and batches on GPU --- if hasattr(ocr, "predict"): try: raw_batch = ocr.predict(images) # returns list, one result per image return [_parse_single_result(r) for r in raw_batch] except Exception as exc: logger.warning(f"predict() batch failed, falling back to per-image ocr(): {exc}") # --- Fallback: ocr() called per image --- all_results = [] for img in images: raw = None try: raw = ocr.ocr(img, cls=True) except TypeError: raw = ocr.ocr(img) except Exception: raw = None page = raw[0] if raw else None all_results.append(_parse_single_result(page)) return all_results def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """ Route to single or batch processing based on the shape of `inputs`. Single: data = {"inputs": ""} Batch: data = {"inputs": [{"id": "...", "image_base64": "..."}, ...]} """ inputs = data.get("inputs", data) # ---- Batch mode: decode all images then run as one GPU batch ---- if isinstance(inputs, list): ids: List[str] = [] arrays: List[np.ndarray] = [] decode_errors: Dict[str, str] = {} for i, item in enumerate(inputs): item_id = item.get("id", str(i)) ids.append(item_id) try: arrays.append(_decode_image(item["image_base64"])) except Exception as exc: logger.warning(f"Decode error for id={item_id}: {exc}") decode_errors[item_id] = str(exc) arrays.append(None) # placeholder to keep index alignment # Filter out failed decodes, run batch, then re-align results valid_indices = [i for i, a in enumerate(arrays) if a is not None] valid_arrays = [arrays[i] for i in valid_indices] batch_results = self._run_batch(valid_arrays) if valid_arrays else [] out: Dict[str, Any] = {} result_iter = iter(batch_results) for i, item_id in enumerate(ids): if arrays[i] is None: out[item_id] = [] else: out[item_id] = next(result_iter, []) return {"results": out} # ---- Single mode ---- try: img_array = _decode_image(str(inputs)) results = self._run_batch([img_array]) return {"results": results[0] if results else []} except Exception as exc: logger.exception(f"OCR error: {exc}") return {"error": str(exc), "results": []}