"""
RapidOCR (ONNX Runtime, PP-OCRv4) — Standalone OCR Benchmark Space

Gradio app that:

* runs RapidOCR over a few public OCR datasets and reports CER / WER,
  per-image latency and mean recognition confidence;
* measures the real on-disk size of the installed inference stack;
* offers a single-image playground.
"""

import ast
import functools
import importlib
import importlib.metadata
import json
import os
import tempfile
import time
from collections import OrderedDict
from pathlib import Path

import gradio as gr
import numpy as np
from PIL import Image
from jiwer import cer, wer
from datasets import load_dataset

# ---------------------------------------------------------------------------
# Dataset registry
# ---------------------------------------------------------------------------

# UI label -> how to load the dataset and which ground-truth extractor to use.
# "gt_fn" is a key into GT_EXTRACTORS below.
DATASETS = OrderedDict(
    {
        "FUNSD — Forms (50 test docs)": {
            "hf_id": "nielsr/funsd",
            "split": "test",
            "image_col": "image",
            "gt_fn": "funsd",
            "description": "Form Understanding in Noisy Scanned Documents. 50 test documents with word-level GT.",
        },
        "IAM — Handwriting lines (test set, 50 samples)": {
            "hf_id": "Teklia/IAM-line",
            "split": "test",
            "image_col": "image",
            "gt_fn": "iam",
            "description": "IAM handwriting database, line-level images with transcriptions.",
        },
        "CORD-v2 — Receipts (50 samples)": {
            "hf_id": "naver-clova-ix/cord-v2",
            "split": "test",
            "image_col": "image",
            "gt_fn": "cord",
            "description": "Consolidated Receipt Dataset v2. Complex receipt images with structured GT.",
        },
        "Invoices & Receipts (50 samples)": {
            "hf_id": "mychen76/invoices-and-receipts_ocr_v1",
            "split": "test",
            "image_col": "image",
            "gt_fn": "invoices",
            "description": "Invoices and receipts with OCR ground truth text.",
        },
    }
)

# Hard upper bound on samples per benchmark run (also the slider maximum).
MAX_SAMPLES = 50

# Column layouts shared by the UI tables and the rows run_benchmark returns.
SUMMARY_HEADERS = ["Metric", "Value"]
SIZE_HEADERS = ["Package", "Size (MB)"]
DETAIL_HEADERS = ["#", "Ground Truth", "OCR Text", "CER", "WER", "Confidence", "Time (s)"]

# Longest text preview shown in the per-sample table before truncation.
_PREVIEW_CHARS = 120


# ---------------------------------------------------------------------------
# Ground-truth extraction helpers
# ---------------------------------------------------------------------------

# Menu-item fields, in the order they are concatenated into the GT string.
_CORD_MENU_KEYS = ("nm", "cnt", "price", "unitprice", "itemsubtotal", "sub", "etc")
# Receipt-level sections appended after the menu items.
_CORD_SECTIONS = ("subtotal", "total", "tax")


def _collect_strings(value):
    """Yield every non-empty string inside nested dicts/lists, depth-first."""
    if isinstance(value, str):
        if value:
            yield value
    elif isinstance(value, dict):
        for child in value.values():
            yield from _collect_strings(child)
    elif isinstance(value, list):
        for child in value:
            yield from _collect_strings(child)


def _gt_funsd(row):
    """Return the FUNSD ground truth: the row's ``words`` joined by spaces."""
    words = row.get("words") or []
    return " ".join(str(word) for word in words)


def _gt_iam(row):
    """Return the IAM line transcription stored in ``text`` ("" if absent)."""
    return row.get("text") or ""


def _gt_cord(row):
    """Flatten a CORD-v2 ``ground_truth`` JSON blob into one string.

    Menu-item fields come first (in ``_CORD_MENU_KEYS`` order), followed by
    the ``subtotal`` / ``total`` / ``tax`` sections. Returns "" when the blob
    is missing, is not valid JSON, or has no ``gt_parse`` mapping.
    """
    try:
        gt = json.loads(row.get("ground_truth") or "{}")
    except (TypeError, ValueError):
        return ""

    parse = gt.get("gt_parse") if isinstance(gt, dict) else None
    if not isinstance(parse, dict):
        return ""

    menu = parse.get("menu") or []
    if isinstance(menu, dict):
        # A lone menu item stored as a bare dict instead of a one-element
        # list: iterating it would walk its keys and lose the whole sample.
        menu = [menu]

    parts = []
    for item in menu:
        if not isinstance(item, dict):
            continue
        for key in _CORD_MENU_KEYS:
            parts.extend(_collect_strings(item.get(key)))
    for section in _CORD_SECTIONS:
        parts.extend(_collect_strings(parse.get(section)))
    return " ".join(parts)


def _gt_invoices(row):
    """Return the OCR word list stored in the row's ``raw_data`` JSON.

    ``ocr_words`` is handled both as a stringified Python list (parsed with
    ``ast.literal_eval``, which only evaluates literals) and as plain text.
    Any parsing problem yields "" so the sample is skipped, not fatal.
    """
    try:
        raw = json.loads(row.get("raw_data") or "{}")
    except (TypeError, ValueError):
        return ""
    if not isinstance(raw, dict):
        return ""

    words = raw.get("ocr_words", "")
    if words is None:
        return ""
    if isinstance(words, str) and words.startswith("["):
        try:
            words = ast.literal_eval(words)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return ""
    if isinstance(words, (list, tuple)):
        return " ".join(str(word) for word in words)
    return str(words)


GT_EXTRACTORS = {
    "funsd": _gt_funsd,
    "iam": _gt_iam,
    "cord": _gt_cord,
    "invoices": _gt_invoices,
}


# ---------------------------------------------------------------------------
# OCR engine
# ---------------------------------------------------------------------------

class RapidOCREngine:
    """Thin wrapper around ``rapidocr_onnxruntime.RapidOCR``."""

    def __init__(self):
        # Imported lazily so the UI can start (and report a readable error)
        # even when the OCR package is missing.
        from rapidocr_onnxruntime import RapidOCR

        self.ocr = RapidOCR()

    def run(self, image: Image.Image):
        """Run OCR on a PIL image.

        Returns:
            ``(texts, scores, elapsed)``: recognized strings, their
            confidences as floats, and wall-clock seconds spent inside the
            OCR call (image conversion is not timed).
        """
        rgb = np.array(image.convert("RGB"))
        bgr = rgb[:, :, ::-1].copy()  # reverse the channel axis: RGB -> BGR

        started = time.perf_counter()
        result, _ = self.ocr(bgr)
        elapsed = time.perf_counter() - started

        texts, scores = [], []
        for item in result or ():
            # Each item is indexed as (_, text, score).
            texts.append(item[1])
            scores.append(float(item[2]))
        return texts, scores, elapsed


@functools.lru_cache(maxsize=1)
def _get_engine() -> RapidOCREngine:
    """Return a process-wide engine so models are loaded once, not per click.

    ``lru_cache`` does not cache exceptions, so a failed initialization is
    retried on the next call.
    """
    return RapidOCREngine()


# ---------------------------------------------------------------------------
# Deployment size — REAL on-disk measurement
# ---------------------------------------------------------------------------

# RECORD top-level entries that are never owned by a single distribution.
_SKIPPED_TOP_LEVEL = frozenset({".", "..", "__pycache__"})

# (label shown in the table, pip distribution name)
_DEPLOYMENT_PACKAGES = (
    ("rapidocr-onnxruntime", "rapidocr-onnxruntime"),
    ("onnxruntime", "onnxruntime"),
    ("opencv-python-headless", "opencv-python-headless"),
    ("opencv-python", "opencv-python"),
    ("opencv-contrib-python", "opencv-contrib-python"),
    ("numpy", "numpy"),
    ("Pillow", "Pillow"),
    ("shapely", "shapely"),
    ("pyclipper", "pyclipper"),
)


def _get_dist_dirs(dist_name: str) -> list[str]:
    """Find all directories on disk belonging to a pip distribution.

    Combines the metadata directory, the names in ``top_level.txt`` and the
    top-level entries of RECORD. Returns a sorted list of paths (directories
    or single files); empty when the distribution is not installed.
    """
    try:
        dist = importlib.metadata.distribution(dist_name)
    except importlib.metadata.PackageNotFoundError:
        return []

    site_packages = Path(dist.locate_file(""))
    found: set[str] = set()

    # The *.dist-info directory is only reachable through a private
    # attribute; tolerate distribution types that do not expose it.
    dist_info = getattr(dist, "_path", None)
    if dist_info is not None:
        found.add(str(dist_info))

    # top_level.txt lists the importable package names
    try:
        top_level = dist.read_text("top_level.txt")
    except (OSError, UnicodeDecodeError):
        top_level = None
    for line in (top_level or "").splitlines():
        name = line.strip()
        if not name:
            # A blank name would resolve to site-packages itself and count
            # the whole environment.
            continue
        candidate = site_packages / name
        module_file = candidate.with_suffix(".py")
        if candidate.is_dir():
            found.add(str(candidate))
        elif module_file.is_file():
            found.add(str(module_file))

    # Also check RECORD for top-level dirs we may have missed
    checked: set[str] = set()
    for entry in dist.files or ():
        parts = entry.parts
        if not parts or entry.is_absolute():
            continue
        top = parts[0]
        if top in checked:
            continue  # one filesystem probe per top-level name is enough
        checked.add(top)
        if top in _SKIPPED_TOP_LEVEL or top.endswith((".dist-info", ".egg-info")):
            continue
        candidate = site_packages / top
        if candidate.is_dir() or candidate.is_file():
            found.add(str(candidate))

    return sorted(found)


def _size_bytes(path: str) -> int:
    """Recursively sum real file sizes under a path (a file or a directory)."""
    root = Path(path)
    if root.is_file():
        return root.stat().st_size

    total = 0
    for dirpath, _, filenames in os.walk(root):
        for fname in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, fname))
            except OSError:
                # Unreadable entry (e.g. a broken symlink): skip it rather
                # than abort the measurement.
                pass
    return total


def get_package_real_size_mb(dist_name: str) -> float | None:
    """Measure the REAL on-disk installed size of a package in MB.

    Returns ``None`` when the distribution is not installed.
    """
    dirs = _get_dist_dirs(dist_name)
    if not dirs:
        return None
    return sum(_size_bytes(d) for d in dirs) / (1024 * 1024)


def estimate_deployment_size():
    """Measure real installed sizes for the RapidOCR deployment stack.

    Returns:
        ``(total_mb, details)`` where ``details`` maps package label to size
        in MB (rounded to 0.1). Packages that are missing or contribute
        0.1 MB or less are left out.
    """
    counted: set[str] = set()
    total = 0.0
    details = {}
    for label, dist_name in _DEPLOYMENT_PACKAGES:
        # Several distributions can own the same directory (every
        # opencv-python* variant installs into cv2/); count each path once.
        fresh = [p for p in _get_dist_dirs(dist_name) if p not in counted]
        if not fresh:
            continue
        counted.update(fresh)
        size = sum(_size_bytes(p) for p in fresh) / (1024 * 1024)
        if size > 0.1:  # skip trivially small
            total += size
            details[label] = round(size, 1)
    return round(total, 1), details


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------

def compute_metrics(gt_text: str, ocr_text: str):
    """Return ``{"CER": ..., "WER": ...}`` for a reference/hypothesis pair.

    Both values are ``None`` when either text is blank; an individual value
    is ``None`` when jiwer fails to compute it.
    """
    reference = gt_text.strip()
    hypothesis = ocr_text.strip()
    if not reference or not hypothesis:
        return {"CER": None, "WER": None}

    scores = {}
    for label, metric in (("CER", cer), ("WER", wer)):
        try:
            scores[label] = metric(reference, hypothesis)
        except Exception:
            # Best effort: one failing metric must not abort the benchmark.
            scores[label] = None
    return scores


# ---------------------------------------------------------------------------
# Benchmark runner
# ---------------------------------------------------------------------------

def _truncate(text, limit=_PREVIEW_CHARS):
    """Shorten ``text`` to ``limit`` characters, appending "..." when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _as_rows(records, headers):
    """Convert a list of dicts into row lists ordered like ``headers``."""
    return [[record.get(column, "N/A") for column in headers] for record in records]


def _fmt_stat(values, reducer, spec):
    """Format ``reducer(values)`` with ``spec``, or "N/A" for an empty list."""
    return format(reducer(values), spec) if values else "N/A"


def run_benchmark(dataset_name, num_samples, progress=gr.Progress()):
    """Benchmark RapidOCR on the first ``num_samples`` rows of a dataset.

    Args:
        dataset_name: A key of ``DATASETS``.
        num_samples: Requested sample count; capped by the dataset length
            and ``MAX_SAMPLES``.
        progress: Gradio progress tracker (the default is Gradio's own
            injection convention).

    Returns:
        ``(status, summary_rows, detail_rows, size_rows, verdict_markdown)``.
        Tables are lists of rows matching ``SUMMARY_HEADERS``,
        ``DETAIL_HEADERS`` and ``SIZE_HEADERS``. On failure only the status
        (and, when available, the per-sample rows) is set; the rest is
        ``None``.
    """
    if dataset_name not in DATASETS:
        return "❌ Unknown dataset", None, None, None, None

    ds_info = DATASETS[dataset_name]
    progress(0, desc=f"Loading dataset: {ds_info['hf_id']}...")
    try:
        # NOTE(review): trust_remote_code=True executes the dataset repo's
        # loading script. Kept as-is; confirm it is still needed for each
        # dataset and supported by the installed `datasets` version.
        ds = load_dataset(ds_info["hf_id"], split=ds_info["split"], trust_remote_code=True)
    except Exception as e:
        return f"❌ Failed to load dataset: {e}", None, None, None, None

    n = max(0, min(int(num_samples), len(ds), MAX_SAMPLES))
    ds = ds.select(range(n))
    gt_fn = GT_EXTRACTORS[ds_info["gt_fn"]]

    progress(0.05, desc="Initializing RapidOCR (PP-OCRv4 ONNX) engine...")
    try:
        engine = _get_engine()
    except Exception as e:
        return f"❌ Failed to init RapidOCR: {e}", None, None, None, None

    results = []
    per_sample = []
    for i, row in enumerate(ds):
        progress((0.1 + 0.85 * i / n), desc=f"Processing sample {i+1}/{n}...")

        image = row[ds_info["image_col"]]
        if not isinstance(image, Image.Image):
            continue
        gt_text = gt_fn(row)
        if not gt_text.strip():
            continue

        sample = {"#": i, "Ground Truth": _truncate(gt_text)}
        try:
            texts, scores, elapsed = engine.run(image)
        except Exception as e:
            # Keep the failure visible in the per-sample table.
            sample["OCR Text"] = f"ERROR: {e}"
            sample["CER"] = "N/A"
            sample["WER"] = "N/A"
            sample["Confidence"] = "N/A"
            sample["Time (s)"] = "N/A"
        else:
            ocr_text = " ".join(texts)
            metrics = compute_metrics(gt_text, ocr_text)
            mean_conf = float(np.mean(scores)) if scores else None
            results.append({
                "elapsed": elapsed,
                "cer": metrics["CER"],
                "wer": metrics["WER"],
                "num_detections": len(texts),
                "mean_confidence": mean_conf if mean_conf is not None else 0,
            })
            sample["OCR Text"] = _truncate(ocr_text)
            sample["CER"] = round(metrics["CER"], 4) if metrics["CER"] is not None else "N/A"
            sample["WER"] = round(metrics["WER"], 4) if metrics["WER"] is not None else "N/A"
            sample["Confidence"] = round(mean_conf, 4) if mean_conf is not None else "N/A"
            sample["Time (s)"] = round(elapsed, 3)
        per_sample.append(sample)

    detail_rows = _as_rows(per_sample, DETAIL_HEADERS)

    progress(0.97, desc="Computing summary...")
    if not results:
        # Still return the per-sample rows so OCR errors are not lost.
        return "❌ No valid results", None, detail_rows or None, None, None

    cers = [r["cer"] for r in results if r["cer"] is not None]
    wers = [r["wer"] for r in results if r["wer"] is not None]
    times = [r["elapsed"] for r in results]
    confs = [r["mean_confidence"] for r in results]

    summary_rows = [
        ["Mean CER ↓", _fmt_stat(cers, np.mean, ".4f")],
        ["Median CER ↓", _fmt_stat(cers, np.median, ".4f")],
        ["Mean WER ↓", _fmt_stat(wers, np.mean, ".4f")],
        ["Median WER ↓", _fmt_stat(wers, np.median, ".4f")],
        ["Mean inference time (s) ↓", f"{np.mean(times):.3f}"],
        ["Median inference time (s) ↓", f"{np.median(times):.3f}"],
        ["Total time (s)", f"{sum(times):.2f}"],
        ["Mean confidence", _fmt_stat(confs, np.mean, ".4f")],
        ["Samples processed", str(len(results))],
    ]

    progress(0.99, desc="Measuring deployment size (real on-disk)...")
    total_mb, pkg_details = estimate_deployment_size()
    size_rows = [[pkg, sz] for pkg, sz in pkg_details.items()]
    size_rows.append(["📦 TOTAL (installed)", total_mb])

    lambda_fits = total_mb < 250

    if cers or wers:
        # Format each metric on its own so an empty list never reaches
        # np.mean (which would yield NaN plus a RuntimeWarning).
        accuracy_line = (
            f"\n**Accuracy:** Mean CER = {_fmt_stat(cers, np.mean, '.4f')}, "
            f"Mean WER = {_fmt_stat(wers, np.mean, '.4f')}"
        )
    else:
        accuracy_line = "\n**Accuracy:** N/A"

    verdict_lines = [
        "## 📊 Summary\n",
        "**Engine:** RapidOCR PP-OCRv4 (ONNX Runtime)",
        accuracy_line,
        f"\n**Speed:** {np.mean(times):.3f}s avg per image ({len(results)} samples)",
        f"\n**Deployment footprint:** ~{total_mb} MB installed on disk",
        f"\n**AWS Lambda 250 MB zip limit:** {'Fits ✅' if lambda_fits else 'Exceeds ❌ — use Lambda container image (10 GB limit) or strip/slim dependencies'}",
        "\n\n> ⚠️ Sizes are measured from **actual installed files** on disk via `os.walk`, not from pip metadata. "
        "Zip-compressed deployment packages will be smaller (~40-60% of installed size), "
        "but the Lambda unzipped limit of 250 MB applies to the installed footprint.",
    ]

    return (
        f"✅ Benchmark complete — {len(results)} samples processed",
        summary_rows,
        detail_rows,
        size_rows,
        "\n".join(verdict_lines),
    )


# ---------------------------------------------------------------------------
# Single image
# ---------------------------------------------------------------------------

def run_single_image(image):
    """Run OCR on one uploaded image and return a markdown report.

    Accepts a PIL image or anything ``Image.fromarray`` understands; returns
    a prompt when nothing was uploaded and an error section on failure.
    """
    if image is None:
        return "Upload an image first"
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    try:
        texts, scores, elapsed = _get_engine().run(image)
    except Exception as e:
        return f"### ERROR\n{e}"

    lines = [f"[{s:.2f}] {t}" for t, s in zip(texts, scores)]
    header = f"### RapidOCR (PP-OCRv4 ONNX) — {elapsed:.3f}s — {len(texts)} detections\n"
    return header + ("\n".join(lines) if lines else "(no text detected)")


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

HEADER = """
# ⚡ RapidOCR Benchmark (PP-OCRv4 — ONNX Runtime)

Benchmark **RapidOCR** with the lightweight **ONNX Runtime** inference backend on public OCR datasets.

| Property | Value |
|---|---|
| **Engine** | RapidOCR (rapidocr-onnxruntime) |
| **Model version** | PP-OCRv4 (ONNX exported) |
| **Runtime** | ONNX Runtime |
| **AWS Lambda zip (250 MB)?** | ⚠️ Tight — run benchmark to see actual size |
| **AWS Lambda container (10 GB)?** | ✅ Fits easily |

> 📏 Deployment sizes are **measured from actual installed files** on disk — not pip metadata.
>
> 💡 Compare with the [PaddleOCR benchmark Space](https://huggingface.co/spaces/rbaks/paddleocr-benchmark) to see how the full PaddlePaddle runtime compares.
"""

# Kept at module level (column 0) so the markdown is not indented into a
# code block by the surrounding `with` nesting.
ABOUT_MD = """
## About this Space

This Space benchmarks **RapidOCR** — a lightweight wrapper that runs **PP-OCRv4 models** via **ONNX Runtime** instead of PaddlePaddle.

### Pipeline

```
Image → [Text Detection (DB-Net)] → [Text Classification] → [Text Recognition (SVTR)] → Text
```

Same 3-stage pipeline as PaddleOCR, same model architectures — just a different inference runtime.

### Why ONNX Runtime?

- **ONNX Runtime** is an inference-only engine (much smaller than PaddlePaddle)
- **PaddlePaddle** is a full ML framework (training + inference)
- The PP-OCR models are exported from PaddlePaddle native format to `.onnx` — same weights, same math
- However, shared dependencies (OpenCV, NumPy) still dominate the deployment footprint

### PP-OCRv4 ONNX models

| Model | File | Size |
|-------|------|------|
| Text detection | `ch_PP-OCRv4_det_infer.onnx` | ~4.5 MB |
| Text recognition | `ch_PP-OCRv4_rec_infer.onnx` | ~10.4 MB |
| Text classification | `ch_ppocr_mobile_v2.0_cls.onnx` | ~0.6 MB |
| **Total models** | | **~15.5 MB** |

### Size measurement methodology

Deployment sizes are measured by walking the **actual installed directories** on disk using `os.walk()` and summing file sizes. This is the real footprint you'd see on an EC2 instance or Lambda container — not the compressed wheel size from pip.

### Metrics

| Metric | Description | Good value |
|--------|-------------|------------|
| **CER** | Character Error Rate | Lower = better (0 = perfect) |
| **WER** | Word Error Rate | Lower = better (0 = perfect) |
| **Inference time** | Wall-clock time per image | Lower = better |
| **Confidence** | Mean OCR confidence score | Higher = better |

### Datasets

| Dataset | Type | Content |
|---------|------|---------|
| FUNSD | Forms | Noisy scanned business forms |
| IAM | Handwriting | English handwritten text lines |
| CORD-v2 | Receipts | Receipt images with structured GT |
| Invoices & Receipts | Documents | Synthetic invoices with OCR GT |
"""

with gr.Blocks(title="RapidOCR Benchmark") as demo:
    gr.Markdown(HEADER)

    with gr.Tabs():
        with gr.Tab("📊 Dataset Benchmark"):
            gr.Markdown("### Run RapidOCR on a benchmark dataset and measure accuracy, speed & deployment footprint.")
            with gr.Row():
                dataset_dd = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    value=list(DATASETS.keys())[0],
                    label="Select Benchmark Dataset",
                )
                num_slider = gr.Slider(minimum=5, maximum=MAX_SAMPLES, value=20, step=5, label="Number of samples")
            run_btn = gr.Button("🚀 Run Benchmark", variant="primary", size="lg")
            status_box = gr.Textbox(label="Status", interactive=False)

            with gr.Accordion("📈 Summary Metrics", open=True):
                summary_tbl = gr.Dataframe(headers=SUMMARY_HEADERS, label="Metrics", wrap=True)
                verdict_md = gr.Markdown("")

            with gr.Accordion("📦 Deployment Size Breakdown (real on-disk)", open=False):
                size_tbl = gr.Dataframe(headers=SIZE_HEADERS, label="Installed sizes (os.walk)", wrap=True)

            with gr.Accordion("🔎 Per-Sample Details", open=False):
                detail_tbl = gr.Dataframe(
                    headers=DETAIL_HEADERS,
                    label="Per-sample results",
                    wrap=True,
                )

            run_btn.click(
                fn=run_benchmark,
                inputs=[dataset_dd, num_slider],
                outputs=[status_box, summary_tbl, detail_tbl, size_tbl, verdict_md],
            )

        with gr.Tab("🖼️ Try Single Image"):
            gr.Markdown("### Upload an image to run RapidOCR.")
            img_input = gr.Image(type="pil", label="Upload Image")
            single_btn = gr.Button("🔍 Run OCR", variant="primary")
            single_out = gr.Markdown("")
            single_btn.click(fn=run_single_image, inputs=[img_input], outputs=[single_out])

        with gr.Tab("ℹ️ About"):
            gr.Markdown(ABOUT_MD)


if __name__ == "__main__":
    demo.launch()