# Hugging Face Space page residue preserved as a comment so the file is valid Python:
# commit 1ee4116 (verified) by chitrark — "Add bottom text cropping + cloud OCR fallback stub"
import gradio as gr
import numpy as np
import cv2
from paddleocr import PaddleOCR
# Shared PaddleOCR engine; built lazily so app startup stays fast.
ocr = None


def get_ocr():
    """Return the process-wide PaddleOCR instance, constructing it on first use."""
    global ocr
    if ocr is not None:
        return ocr
    ocr = PaddleOCR(use_angle_cls=True, lang="en")
    return ocr
def preprocess_for_ocr(pil_img):
    """Convert a PIL image to an upscaled, contrast-boosted BGR array for OCR.

    Args:
        pil_img: PIL image (any mode — RGBA, palette, grayscale all accepted).

    Returns:
        uint8 BGR numpy array, 1.8x upscaled with CLAHE contrast enhancement
        applied to the lightness channel.
    """
    # BUG FIX: force 3-channel RGB first. Gradio can hand us RGBA (PNG with
    # alpha), palette, or grayscale images, whose numpy arrays are 4-channel
    # or 2-D and make cv2.COLOR_RGB2BGR raise.
    rgb = np.array(pil_img.convert("RGB"))
    if rgb.dtype != np.uint8:
        rgb = rgb.astype(np.uint8)
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

    # Upscale: helps thin kid-book text survive detection/recognition.
    h, w = bgr.shape[:2]
    scale = 1.8
    bgr = cv2.resize(bgr, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_CUBIC)

    # Contrast boost: CLAHE on the L channel only, so colors are preserved.
    lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l2 = clahe.apply(l)
    lab2 = cv2.merge([l2, a, b])
    bgr = cv2.cvtColor(lab2, cv2.COLOR_LAB2BGR)
    return bgr
def crop_bottom_text_region(bgr):
    """Kids books often place text at the bottom. Crop bottom ~35%."""
    height = bgr.shape[0]
    top = int(height * 0.65)
    return bgr[top:, :]
def cloud_ocr_stub(_pil_img):
    """Stand-in for a remote GPU/VLM OCR call (olmOCR-2 / Nanonets OCR2).

    Will eventually POST the image to an API endpoint; for now it returns a
    fixed placeholder message.
    """
    message = "[Cloud OCR placeholder] PaddleOCR confidence was low. Next: call olmOCR-2 / OCR2 via API."
    return message
def run_ocr(img):
    """OCR the bottom text region of a page photo and flag low-confidence results.

    Args:
        img: PIL image from the Gradio input, or None when nothing was uploaded.

    Returns:
        Tuple (extracted_text, average_confidence, fallback_needed) where
        fallback_needed is the display string "Yes" or "No".
    """
    if img is None:
        return "(No image)", 0.0, "No"

    bgr = preprocess_for_ocr(img)
    bgr_crop = crop_bottom_text_region(bgr)

    ocr_engine = get_ocr()
    result = ocr_engine.ocr(bgr_crop)

    # PaddleOCR wraps per-image results in an outer list; unwrap when present.
    blocks = result[0] if isinstance(result, list) and result and isinstance(result[0], list) else result

    lines = []
    confs = []
    # BUG FIX: `blocks` is None when PaddleOCR detects no text at all;
    # iterating None would raise TypeError.
    for item in blocks or []:
        try:
            text, conf = item[1]  # each item is [box, (text, confidence)]
            lines.append(str(text))
            confs.append(float(conf))
        except Exception:
            # Skip entries that don't match the expected shape.
            continue

    extracted = "\n".join(lines).strip()
    avg_conf = float(sum(confs) / len(confs)) if confs else 0.0

    # ---- fallback decision (simple + effective) ----
    needs_cloud = (avg_conf < 0.45) or (len(extracted) < 15)
    if needs_cloud:
        # BUG FIX: the original conditional expression parsed as
        # `extracted if extracted else ("(no text)\n\n" + stub)`, so whenever
        # any text was extracted (even at low confidence) the cloud-stub
        # message was silently dropped. Now the stub note is always appended
        # when the fallback triggers.
        header = extracted if extracted else "(PaddleOCR found no text)"
        extracted = header + "\n\n" + cloud_ocr_stub(img)
        return extracted, avg_conf, "Yes"
    return extracted, avg_conf, "No"
# Gradio UI: one page photo in; extracted text, confidence, and fallback flag out.
output_components = [
    gr.Textbox(label="Extracted text", lines=12),
    gr.Number(label="Average confidence (0–1)"),
    gr.Textbox(label="Cloud fallback needed?", interactive=False),
]
demo = gr.Interface(
    fn=run_ocr,
    inputs=gr.Image(type="pil", label="Upload a page photo"),
    outputs=output_components,
    title="BookReader × Reachy Mini",
    description="CPU PaddleOCR + smart fallback (VLM OCR stub). Crops bottom text region for kid books.",
)
demo.launch(ssr_mode=False)