Spaces:
Running
Running
| """ | |
| RapidOCR (ONNX Runtime, PP-OCRv4) — Standalone OCR Benchmark Space | |
| """ | |
| import os | |
| import time | |
| import json | |
| import importlib | |
| import importlib.metadata | |
| import tempfile | |
| from pathlib import Path | |
| from collections import OrderedDict | |
| import gradio as gr | |
| import numpy as np | |
| from PIL import Image | |
| from jiwer import cer, wer | |
| from datasets import load_dataset | |
| # --------------------------------------------------------------------------- | |
| # Dataset registry | |
| # --------------------------------------------------------------------------- | |
# Registry of benchmark datasets offered in the UI dropdown.
# Each key is the human-readable label shown to the user; each value holds:
#   hf_id       - dataset identifier passed to `load_dataset`
#   split       - split name passed to `load_dataset`
#   image_col   - name of the row field that holds the image
#   gt_fn       - key into GT_EXTRACTORS selecting the ground-truth extractor
#   description - free-text description of the dataset
DATASETS = OrderedDict(
    {
        "FUNSD — Forms (50 test docs)": {
            "hf_id": "nielsr/funsd",
            "split": "test",
            "image_col": "image",
            "gt_fn": "funsd",
            "description": "Form Understanding in Noisy Scanned Documents. 50 test documents with word-level GT.",
        },
        "IAM — Handwriting lines (test set, 50 samples)": {
            "hf_id": "Teklia/IAM-line",
            "split": "test",
            "image_col": "image",
            "gt_fn": "iam",
            "description": "IAM handwriting database, line-level images with transcriptions.",
        },
        "CORD-v2 — Receipts (50 samples)": {
            "hf_id": "naver-clova-ix/cord-v2",
            "split": "test",
            "image_col": "image",
            "gt_fn": "cord",
            "description": "Consolidated Receipt Dataset v2. Complex receipt images with structured GT.",
        },
        "Invoices & Receipts (50 samples)": {
            "hf_id": "mychen76/invoices-and-receipts_ocr_v1",
            "split": "test",
            "image_col": "image",
            "gt_fn": "invoices",
            "description": "Invoices and receipts with OCR ground truth text.",
        },
    }
)
# Hard upper bound on how many samples a single benchmark run processes;
# also used as the maximum of the sample-count slider.
MAX_SAMPLES = 50
| # --------------------------------------------------------------------------- | |
| # Ground-truth extraction helpers | |
| # --------------------------------------------------------------------------- | |
| def _gt_funsd(row): | |
| words = row.get("words", []) | |
| return " ".join(words) | |
| def _gt_iam(row): | |
| return row.get("text", "") | |
| def _gt_cord(row): | |
| try: | |
| gt = json.loads(row.get("ground_truth", "{}")) | |
| parse = gt.get("gt_parse", {}) | |
| parts = [] | |
| for menu_item in parse.get("menu", []): | |
| for key in ("nm", "cnt", "price", "unitprice", "itemsubtotal", "sub", "etc"): | |
| val = menu_item.get(key) | |
| if val and isinstance(val, str): | |
| parts.append(val) | |
| elif isinstance(val, dict): | |
| for v2 in val.values(): | |
| if isinstance(v2, str): | |
| parts.append(v2) | |
| for section in ("subtotal", "total", "tax"): | |
| sec_data = parse.get(section, {}) | |
| if isinstance(sec_data, dict): | |
| for v in sec_data.values(): | |
| if isinstance(v, str): | |
| parts.append(v) | |
| elif isinstance(sec_data, list): | |
| for item in sec_data: | |
| if isinstance(item, dict): | |
| for v in item.values(): | |
| if isinstance(v, str): | |
| parts.append(v) | |
| return " ".join(parts) if parts else "" | |
| except Exception: | |
| return "" | |
| def _gt_invoices(row): | |
| try: | |
| raw = json.loads(row.get("raw_data", "{}")) | |
| words_str = raw.get("ocr_words", "") | |
| if isinstance(words_str, str) and words_str.startswith("["): | |
| import ast | |
| words = ast.literal_eval(words_str) | |
| return " ".join(words) | |
| return str(words_str) | |
| except Exception: | |
| return "" | |
# Maps the "gt_fn" value of a DATASETS entry to the function that extracts
# the ground-truth text from one dataset row.
GT_EXTRACTORS = {
    "funsd": _gt_funsd,
    "iam": _gt_iam,
    "cord": _gt_cord,
    "invoices": _gt_invoices,
}
| # --------------------------------------------------------------------------- | |
| # OCR engine | |
| # --------------------------------------------------------------------------- | |
class RapidOCREngine:
    """Thin wrapper around a ``rapidocr_onnxruntime.RapidOCR`` instance."""

    def __init__(self):
        # Imported here rather than at module import time, so a missing
        # package only surfaces when an engine is actually constructed.
        from rapidocr_onnxruntime import RapidOCR

        self.ocr = RapidOCR()

    def run(self, image: Image.Image):
        """Run OCR on ``image`` and time the call.

        Returns:
            ``(texts, scores, elapsed)`` where ``texts`` and ``scores`` hold
            elements 1 and 2 of every result entry (scores cast to float) and
            ``elapsed`` is the wall-clock duration of the OCR call in seconds.
            Both lists are empty when the engine returns a falsy result.
        """
        rgb_pixels = np.array(image.convert("RGB"))
        # Reverse the channel axis (RGB -> BGR); copy() makes it contiguous.
        bgr_pixels = rgb_pixels[:, :, ::-1].copy()

        started = time.perf_counter()
        detections, _ = self.ocr(bgr_pixels)
        elapsed = time.perf_counter() - started

        if not detections:
            return [], [], elapsed
        texts = [entry[1] for entry in detections]
        scores = [float(entry[2]) for entry in detections]
        return texts, scores, elapsed
| # --------------------------------------------------------------------------- | |
| # Deployment size — REAL on-disk measurement | |
| # --------------------------------------------------------------------------- | |
| def _get_dist_dirs(dist_name: str) -> list[str]: | |
| """Find all directories on disk belonging to a pip distribution.""" | |
| try: | |
| dist = importlib.metadata.distribution(dist_name) | |
| except importlib.metadata.PackageNotFoundError: | |
| return [] | |
| dist_info_path = Path(dist._path) | |
| site_packages = dist_info_path.parent | |
| dirs: set[str] = set() | |
| dirs.add(str(dist_info_path)) | |
| # top_level.txt lists the importable package names | |
| try: | |
| top_level = dist.read_text("top_level.txt") | |
| if top_level: | |
| for name in top_level.strip().splitlines(): | |
| name = name.strip() | |
| candidate = site_packages / name | |
| if candidate.is_dir(): | |
| dirs.add(str(candidate)) | |
| elif candidate.with_suffix(".py").is_file(): | |
| dirs.add(str(candidate.with_suffix(".py"))) | |
| except Exception: | |
| pass | |
| # Also check RECORD for top-level dirs we may have missed | |
| if dist.files: | |
| for f in dist.files: | |
| parts = str(f).split("/") | |
| if parts and parts[0] not in (".", ".."): | |
| top = parts[0] | |
| if top.endswith((".dist-info", ".egg-info")): | |
| continue | |
| candidate = site_packages / top | |
| if candidate.is_dir(): | |
| dirs.add(str(candidate)) | |
| elif candidate.is_file(): | |
| dirs.add(str(candidate)) | |
| return list(dirs) | |
| def _size_bytes(path: str) -> int: | |
| """Recursively sum real file sizes under a path.""" | |
| p = Path(path) | |
| if p.is_file(): | |
| return p.stat().st_size | |
| total = 0 | |
| for dirpath, _, filenames in os.walk(p): | |
| for fname in filenames: | |
| try: | |
| total += os.path.getsize(os.path.join(dirpath, fname)) | |
| except OSError: | |
| pass | |
| return total | |
def get_package_real_size_mb(dist_name: str) -> float | None:
    """Return the on-disk size of an installed distribution in MiB.

    Sums ``_size_bytes`` over every path reported by ``_get_dist_dirs``.
    Returns ``None`` when no paths are found for ``dist_name``.
    """
    locations = _get_dist_dirs(dist_name)
    if not locations:
        return None
    byte_count = 0
    for location in locations:
        byte_count += _size_bytes(location)
    return byte_count / (1024 * 1024)
def estimate_deployment_size():
    """Measure real installed sizes for the RapidOCR deployment stack.

    Returns:
        ``(total_mb, details)`` where ``details`` maps each measured
        distribution name to its size in MB (one decimal) and ``total_mb``
        is the rounded sum. Distributions that are not installed, or whose
        not-yet-counted files total 0.1 MB or less, are left out.
    """
    packages = (
        "rapidocr-onnxruntime",
        "onnxruntime",
        "opencv-python-headless",
        "opencv-python",
        "opencv-contrib-python",
        "numpy",
        "Pillow",
        "shapely",
        "pyclipper",
    )
    total = 0.0
    details = {}
    # Several distributions can own the same directory (the OpenCV variants
    # are all listed because any of them may be the installed one), so every
    # path is attributed to the first distribution that reports it.
    counted: set[str] = set()
    for dist_name in packages:
        fresh = [path for path in _get_dist_dirs(dist_name) if path not in counted]
        if not fresh:
            continue
        counted.update(fresh)
        size = sum(_size_bytes(path) for path in fresh) / (1024 * 1024)
        if size > 0.1:  # skip trivially small
            total += size
            details[dist_name] = round(size, 1)
    return round(total, 1), details
| # --------------------------------------------------------------------------- | |
| # Metrics | |
| # --------------------------------------------------------------------------- | |
def compute_metrics(gt_text: str, ocr_text: str):
    """Compute character and word error rates of ``ocr_text`` against ``gt_text``.

    Both inputs are stripped of surrounding whitespace first.

    Returns:
        ``{"CER": ..., "WER": ...}``. Both are ``None`` when the ground truth
        is empty (no reference to score against). Both are ``1.0`` when the
        ground truth is non-empty but the OCR output is empty, so an image on
        which nothing was recognised counts as a full miss instead of being
        dropped from the averages. A metric is ``None`` when computing it
        raises.
    """
    reference = gt_text.strip()
    hypothesis = ocr_text.strip()
    if not reference:
        return {"CER": None, "WER": None}
    if not hypothesis:
        # Every reference character/word is a deletion: error rate N / N.
        return {"CER": 1.0, "WER": 1.0}

    # Best effort per metric: a failure in one must not discard the other.
    try:
        char_rate = cer(reference, hypothesis)
    except Exception:
        char_rate = None
    try:
        word_rate = wer(reference, hypothesis)
    except Exception:
        word_rate = None
    return {"CER": char_rate, "WER": word_rate}
| # --------------------------------------------------------------------------- | |
| # Benchmark runner | |
| # --------------------------------------------------------------------------- | |
def _clip_for_table(text, limit=120):
    """Return ``text`` cut to ``limit`` characters plus "..." when it is longer."""
    return text[:limit] + "..." if len(text) > limit else text


def _fmt_stat(values, reducer, spec):
    """Format ``reducer(values)`` with format ``spec``, or "N/A" for no values."""
    return format(reducer(values), spec) if values else "N/A"


def run_benchmark(dataset_name, num_samples, progress=gr.Progress()):
    """Run RapidOCR over a registered dataset and summarise the results.

    Args:
        dataset_name: Key into ``DATASETS``.
        num_samples: Requested sample count; capped by the dataset length and
            ``MAX_SAMPLES``.
        progress: Gradio progress callback.

    Returns:
        A 5-tuple ``(status, summary, per_sample, size_rows, verdict)``:
        a status message, summary metric rows, per-sample rows, package-size
        rows and a Markdown verdict. On failure the status carries the error
        and the other four items are ``None``.
    """
    if dataset_name not in DATASETS:
        return "❌ Unknown dataset", None, None, None, None
    ds_info = DATASETS[dataset_name]

    progress(0, desc=f"Loading dataset: {ds_info['hf_id']}...")
    try:
        # NOTE(review): trust_remote_code=True lets the dataset repository
        # run its own loading code. The id comes from the fixed DATASETS
        # registry, never from user input; keep it that way.
        ds = load_dataset(ds_info["hf_id"], split=ds_info["split"], trust_remote_code=True)
    except Exception as e:
        return f"❌ Failed to load dataset: {e}", None, None, None, None

    n = min(int(num_samples), len(ds), MAX_SAMPLES)
    ds = ds.select(range(n))
    gt_fn = GT_EXTRACTORS[ds_info["gt_fn"]]

    progress(0.05, desc="Initializing RapidOCR (PP-OCRv4 ONNX) engine...")
    try:
        engine = RapidOCREngine()
    except Exception as e:
        return f"❌ Failed to init RapidOCR: {e}", None, None, None, None

    results = []
    per_sample = []
    for i, row in enumerate(ds):
        progress((0.1 + 0.85 * i / n), desc=f"Processing sample {i+1}/{n}...")
        image = row[ds_info["image_col"]]
        if not isinstance(image, Image.Image):
            continue
        gt_text = gt_fn(row)
        if not gt_text.strip():
            continue

        sample = {"#": i, "Ground Truth": _clip_for_table(gt_text)}
        try:
            texts, scores, elapsed = engine.run(image)
            ocr_text = " ".join(texts)
            metrics = compute_metrics(gt_text, ocr_text)
            mean_conf = float(np.mean(scores)) if scores else None
            results.append({
                "elapsed": elapsed,
                "cer": metrics["CER"],
                "wer": metrics["WER"],
                "num_detections": len(texts),
                "mean_confidence": mean_conf if mean_conf is not None else 0,
            })
            sample["OCR Text"] = _clip_for_table(ocr_text)
            sample["CER"] = round(metrics["CER"], 4) if metrics["CER"] is not None else "N/A"
            sample["WER"] = round(metrics["WER"], 4) if metrics["WER"] is not None else "N/A"
            sample["Confidence"] = round(mean_conf, 4) if mean_conf is not None else "N/A"
            sample["Time (s)"] = round(elapsed, 3)
        except Exception as e:
            # A failing sample is listed in the table but kept out of the
            # aggregate statistics.
            sample["OCR Text"] = f"ERROR: {e}"
            sample["CER"] = "N/A"
            sample["WER"] = "N/A"
            sample["Confidence"] = "N/A"
            sample["Time (s)"] = "N/A"
        per_sample.append(sample)

    progress(0.97, desc="Computing summary...")
    if not results:
        return "❌ No valid results", None, None, None, None

    cers = [r["cer"] for r in results if r["cer"] is not None]
    wers = [r["wer"] for r in results if r["wer"] is not None]
    times = [r["elapsed"] for r in results]
    confs = [r["mean_confidence"] for r in results]

    summary = [
        {"Metric": "Mean CER ↓", "Value": _fmt_stat(cers, np.mean, ".4f")},
        {"Metric": "Median CER ↓", "Value": _fmt_stat(cers, np.median, ".4f")},
        {"Metric": "Mean WER ↓", "Value": _fmt_stat(wers, np.mean, ".4f")},
        {"Metric": "Median WER ↓", "Value": _fmt_stat(wers, np.median, ".4f")},
        {"Metric": "Mean inference time (s) ↓", "Value": f"{np.mean(times):.3f}"},
        {"Metric": "Median inference time (s) ↓", "Value": f"{np.median(times):.3f}"},
        {"Metric": "Total time (s)", "Value": f"{sum(times):.2f}"},
        {"Metric": "Mean confidence", "Value": _fmt_stat(confs, np.mean, ".4f")},
        {"Metric": "Samples processed", "Value": str(len(results))},
    ]

    progress(0.99, desc="Measuring deployment size (real on-disk)...")
    total_mb, pkg_details = estimate_deployment_size()
    size_rows = [{"Package": pkg, "Size (MB)": sz} for pkg, sz in pkg_details.items()]
    size_rows.append({"Package": "📦 TOTAL (installed)", "Size (MB)": total_mb})

    lambda_fits = total_mb < 250
    lambda_verdict = (
        "Fits ✅"
        if lambda_fits
        else "Exceeds ❌ — use Lambda container image (10 GB limit) or strip/slim dependencies"
    )
    # CER and WER are formatted independently so that an empty WER list
    # cannot put a NaN into the verdict.
    if cers or wers:
        accuracy_line = (
            f"\n**Accuracy:** Mean CER = {_fmt_stat(cers, np.mean, '.4f')}, "
            f"Mean WER = {_fmt_stat(wers, np.mean, '.4f')}"
        )
    else:
        accuracy_line = "\n**Accuracy:** N/A"

    verdict_lines = [
        "## 📊 Summary\n",
        "**Engine:** RapidOCR PP-OCRv4 (ONNX Runtime)",
        accuracy_line,
        f"\n**Speed:** {np.mean(times):.3f}s avg per image ({len(results)} samples)",
        f"\n**Deployment footprint:** ~{total_mb} MB installed on disk",
        f"\n**AWS Lambda 250 MB zip limit:** {lambda_verdict}",
        "\n\n> ⚠️ Sizes are measured from **actual installed files** on disk via `os.walk`, not from pip metadata. "
        "Zip-compressed deployment packages will be smaller (~40-60% of installed size), "
        "but the Lambda unzipped limit of 250 MB applies to the installed footprint.",
    ]
    return (
        f"✅ Benchmark complete — {len(results)} samples processed",
        summary,
        per_sample,
        size_rows,
        "\n".join(verdict_lines),
    )
| # --------------------------------------------------------------------------- | |
| # Single image | |
| # --------------------------------------------------------------------------- | |
def run_single_image(image):
    """Run RapidOCR on one uploaded image and return a Markdown report.

    Args:
        image: A PIL image, an array accepted by ``Image.fromarray``, or
            ``None`` when nothing was uploaded.

    Returns:
        A Markdown string: a prompt to upload when ``image`` is ``None``,
        an ``### ERROR`` section when conversion or OCR raises, otherwise a
        header followed by one ``[score] text`` line per detection.
    """
    if image is None:
        return "Upload an image first"
    try:
        # Conversion sits inside the try so an unsupported array is reported
        # in the output panel instead of raising out of the handler.
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        engine = RapidOCREngine()
        texts, scores, elapsed = engine.run(image)
        lines = [f"[{s:.2f}] {t}" for t, s in zip(texts, scores)]
        header = f"### RapidOCR (PP-OCRv4 ONNX) — {elapsed:.3f}s — {len(texts)} detections\n"
        # Two trailing spaces force a Markdown hard line break; a bare
        # newline would merge all detections into one paragraph.
        return header + ("  \n".join(lines) if lines else "(no text detected)")
    except Exception as e:
        return f"### ERROR\n{e}"
| # --------------------------------------------------------------------------- | |
| # UI | |
| # --------------------------------------------------------------------------- | |
# Markdown shown at the top of the app (rendered via gr.Markdown(HEADER)).
HEADER = """
# ⚡ RapidOCR Benchmark (PP-OCRv4 — ONNX Runtime)
Benchmark **RapidOCR** with the lightweight **ONNX Runtime** inference backend on public OCR datasets.
| Property | Value |
|---|---|
| **Engine** | RapidOCR (rapidocr-onnxruntime) |
| **Model version** | PP-OCRv4 (ONNX exported) |
| **Runtime** | ONNX Runtime |
| **AWS Lambda zip (250 MB)?** | ⚠️ Tight — run benchmark to see actual size |
| **AWS Lambda container (10 GB)?** | ✅ Fits easily |
> 📏 Deployment sizes are **measured from actual installed files** on disk — not pip metadata.
>
> 💡 Compare with the [PaddleOCR benchmark Space](https://huggingface.co/spaces/rbaks/paddleocr-benchmark) to see how the full PaddlePaddle runtime compares.
"""
# Build the Gradio UI: a header plus three tabs (dataset benchmark, single
# image, about).
# NOTE(review): the original indentation was not available; the nesting of
# the `with` blocks below is reconstructed from syntax — confirm the layout.
with gr.Blocks(title="RapidOCR Benchmark") as demo:
    gr.Markdown(HEADER)
    with gr.Tabs():
        # Tab 1: run the OCR engine over a registered dataset.
        with gr.Tab("📊 Dataset Benchmark"):
            gr.Markdown("### Run RapidOCR on a benchmark dataset and measure accuracy, speed & deployment footprint.")
            with gr.Row():
                dataset_dd = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    value=list(DATASETS.keys())[0],
                    label="Select Benchmark Dataset",
                )
                num_slider = gr.Slider(minimum=5, maximum=MAX_SAMPLES, value=20, step=5, label="Number of samples")
            run_btn = gr.Button("🚀 Run Benchmark", variant="primary", size="lg")
            status_box = gr.Textbox(label="Status", interactive=False)
            with gr.Accordion("📈 Summary Metrics", open=True):
                summary_tbl = gr.Dataframe(headers=["Metric", "Value"], label="Metrics", wrap=True)
                verdict_md = gr.Markdown("")
            with gr.Accordion("📦 Deployment Size Breakdown (real on-disk)", open=False):
                size_tbl = gr.Dataframe(headers=["Package", "Size (MB)"], label="Installed sizes (os.walk)", wrap=True)
            with gr.Accordion("🔎 Per-Sample Details", open=False):
                detail_tbl = gr.Dataframe(
                    headers=["#", "Ground Truth", "OCR Text", "CER", "WER", "Confidence", "Time (s)"],
                    label="Per-sample results",
                    wrap=True,
                )
            # The output order must match the 5-tuple returned by run_benchmark.
            run_btn.click(
                fn=run_benchmark,
                inputs=[dataset_dd, num_slider],
                outputs=[status_box, summary_tbl, detail_tbl, size_tbl, verdict_md],
            )
        # Tab 2: OCR a single uploaded image.
        with gr.Tab("🖼️ Try Single Image"):
            gr.Markdown("### Upload an image to run RapidOCR.")
            img_input = gr.Image(type="pil", label="Upload Image")
            single_btn = gr.Button("🔍 Run OCR", variant="primary")
            single_out = gr.Markdown("")
            single_btn.click(fn=run_single_image, inputs=[img_input], outputs=[single_out])
        # Tab 3: static background information.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## About this Space
This Space benchmarks **RapidOCR** — a lightweight wrapper that runs **PP-OCRv4 models** via **ONNX Runtime** instead of PaddlePaddle.
### Pipeline
```
Image → [Text Detection (DB-Net)] → [Text Classification] → [Text Recognition (SVTR)] → Text
```
Same 3-stage pipeline as PaddleOCR, same model architectures — just a different inference runtime.
### Why ONNX Runtime?
- **ONNX Runtime** is an inference-only engine (much smaller than PaddlePaddle)
- **PaddlePaddle** is a full ML framework (training + inference)
- The PP-OCR models are exported from PaddlePaddle native format to `.onnx` — same weights, same math
- However, shared dependencies (OpenCV, NumPy) still dominate the deployment footprint
### PP-OCRv4 ONNX models
| Model | File | Size |
|-------|------|------|
| Text detection | `ch_PP-OCRv4_det_infer.onnx` | ~4.5 MB |
| Text recognition | `ch_PP-OCRv4_rec_infer.onnx` | ~10.4 MB |
| Text classification | `ch_ppocr_mobile_v2.0_cls.onnx` | ~0.6 MB |
| **Total models** | | **~15.5 MB** |
### Size measurement methodology
Deployment sizes are measured by walking the **actual installed directories** on disk using `os.walk()` and summing file sizes.
This is the real footprint you'd see on an EC2 instance or Lambda container — not the compressed wheel size from pip.
### Metrics
| Metric | Description | Good value |
|--------|-------------|------------|
| **CER** | Character Error Rate | Lower = better (0 = perfect) |
| **WER** | Word Error Rate | Lower = better (0 = perfect) |
| **Inference time** | Wall-clock time per image | Lower = better |
| **Confidence** | Mean OCR confidence score | Higher = better |
### Datasets
| Dataset | Type | Content |
|---------|------|---------|
| FUNSD | Forms | Noisy scanned business forms |
| IAM | Handwriting | English handwritten text lines |
| CORD-v2 | Receipts | Receipt images with structured GT |
| Invoices & Receipts | Documents | Synthetic invoices with OCR GT |
""")
# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()