import gradio as gr
import numpy as np
import cv2
from paddleocr import PaddleOCR

# Shared PaddleOCR engine; built on first use by get_ocr() because
# constructing it is expensive.
ocr = None  # lazy init


def get_ocr():
    """Return the shared PaddleOCR engine, creating it on the first call."""
    global ocr
    if ocr is None:
        ocr = PaddleOCR(use_angle_cls=True, lang="en")
    return ocr


def preprocess_for_ocr(pil_img, scale=1.8):
    """Convert an input image into an upscaled, contrast-boosted BGR array.

    Args:
        pil_img: A PIL image, or anything ``np.array`` can turn into an
            H x W (grayscale) or H x W x C (RGB/RGBA) array.
        scale: Upscaling factor applied to both dimensions.

    Returns:
        A uint8 BGR ``numpy`` array ready to hand to the OCR engine.
    """
    # PIL images in non-RGB modes (grayscale, palette, RGBA, ...) would
    # otherwise produce arrays that the RGB->BGR conversion rejects.
    if hasattr(pil_img, "convert"):
        pil_img = pil_img.convert("RGB")

    rgb = np.array(pil_img)
    if rgb.dtype != np.uint8:
        rgb = rgb.astype(np.uint8)

    if rgb.ndim == 2:
        # Plain 2-D array input: treat it as grayscale.
        bgr = cv2.cvtColor(rgb, cv2.COLOR_GRAY2BGR)
    else:
        bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # upscale helps thin kid-book text
    h, w = bgr.shape[:2]
    bgr = cv2.resize(
        bgr,
        (int(w * scale), int(h * scale)),
        interpolation=cv2.INTER_CUBIC,
    )

    # contrast boost: apply CLAHE to the lightness channel only, leaving
    # the two colour channels of the LAB image untouched
    lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l2 = clahe.apply(l)
    lab2 = cv2.merge([l2, a, b])
    return cv2.cvtColor(lab2, cv2.COLOR_LAB2BGR)


def crop_bottom_text_region(bgr, top_fraction=0.65):
    """Kids books often place text at the bottom. Crop bottom ~35%.

    Args:
        bgr: Image array whose first two axes are height and width.
        top_fraction: Fraction of the height at which the crop starts;
            the default of 0.65 keeps the bottom ~35% of the image.

    Returns:
        A view of ``bgr`` covering rows from ``int(h * top_fraction)`` to
        the bottom, at full width.
    """
    h, w = bgr.shape[:2]
    return bgr[int(h * top_fraction):h, 0:w]


def cloud_ocr_stub(_pil_img):
    """
    Placeholder for GPU/VLM OCR (olmOCR-2 / Nanonets OCR2).
    Later this will call an API endpoint.

    The image argument is currently ignored; a fixed message is returned.
    """
    return "[Cloud OCR placeholder] PaddleOCR confidence was low. Next: call olmOCR-2 / OCR2 via API."
# Below either threshold the local OCR output is considered unreliable and
# the cloud fallback is flagged.
_MIN_AVG_CONFIDENCE = 0.45
_MIN_TEXT_LENGTH = 15

# Errors that indicate a malformed OCR entry (wrong type, too short, missing
# key, unparseable score); such entries are skipped rather than fatal.
_MALFORMED_ITEM_ERRORS = (TypeError, ValueError, IndexError, KeyError)


def _parse_ocr_result(result):
    """Flatten an OCR engine result into text lines and confidences.

    Handles three shapes:
      * ``None`` -- treated as "nothing recognised";
      * nested or flat lists whose items look like ``[box, (text, conf)]``;
      * dict-like page results carrying ``rec_texts`` / ``rec_scores``
        (the PaddleOCR 3.x layout -- NOTE(review): confirm against the
        installed PaddleOCR version).

    Returns:
        ``(lines, confs)``: a list of strings and a list of floats.
    """
    lines = []
    confs = []
    if result is None:
        return lines, confs

    # Some versions wrap the detections of each page in an outer list.
    if isinstance(result, list) and result and isinstance(result[0], list):
        blocks = result[0]
    else:
        blocks = result

    for item in blocks:
        try:
            if isinstance(item, dict):
                pairs = list(zip(item["rec_texts"], item["rec_scores"]))
            else:
                pairs = [item[1]]
            for text, conf in pairs:
                lines.append(str(text))
                confs.append(float(conf))
        except _MALFORMED_ITEM_ERRORS:
            # Best effort: skip entries (e.g. ``None`` for an empty page)
            # that do not have the expected structure.
            continue
    return lines, confs


def run_ocr(img):
    """Run local OCR on the bottom text region of a page image.

    Args:
        img: The uploaded page image, or ``None`` when nothing was uploaded.

    Returns:
        A tuple ``(text, average_confidence, fallback_flag)`` where
        ``fallback_flag`` is ``"Yes"`` when the average confidence or the
        amount of extracted text is below the thresholds, else ``"No"``.
    """
    if img is None:
        return "(No image)", 0.0, "No"

    bgr_crop = crop_bottom_text_region(preprocess_for_ocr(img))
    result = get_ocr().ocr(bgr_crop)

    lines, confs = _parse_ocr_result(result)
    extracted = "\n".join(lines).strip()
    avg_conf = float(sum(confs) / len(confs)) if confs else 0.0

    # ---- fallback decision (simple + effective) ----
    needs_cloud = (avg_conf < _MIN_AVG_CONFIDENCE) or (len(extracted) < _MIN_TEXT_LENGTH)
    if not needs_cloud:
        return extracted, avg_conf, "No"

    # Keep whatever text was found; only substitute the placeholder message
    # when the local engine produced nothing at all.
    if not extracted:
        extracted = "(PaddleOCR found no text)\n\n" + cloud_ocr_stub(img)
    return extracted, avg_conf, "Yes"


# Gradio UI: one image input, three outputs matching run_ocr's return tuple.
demo = gr.Interface(
    fn=run_ocr,
    inputs=gr.Image(type="pil", label="Upload a page photo"),
    outputs=[
        gr.Textbox(label="Extracted text", lines=12),
        gr.Number(label="Average confidence (0–1)"),
        gr.Textbox(label="Cloud fallback needed?", interactive=False),
    ],
    title="BookReader × Reachy Mini",
    description="CPU PaddleOCR + smart fallback (VLM OCR stub). Crops bottom text region for kid books.",
)

demo.launch(ssr_mode=False)