Spaces:

rbaks
/

rapidocr-benchmark

Running

App Files Files Community

rapidocr-benchmark / app.py

rbaks

fix: measure actual on-disk package sizes instead of pip metadata estimates

8df2b09 verified 2 months ago

Raw

History Blame Contribute Delete

19.3 kB

	"""
	RapidOCR (ONNX Runtime, PP-OCRv4) — Standalone OCR Benchmark Space
	"""

	import os
	import time
	import json
	import importlib
	import importlib.metadata
	import tempfile
	from pathlib import Path
	from collections import OrderedDict

	import gradio as gr
	import numpy as np
	from PIL import Image
	from jiwer import cer, wer
	from datasets import load_dataset

	# ---------------------------------------------------------------------------
	# Dataset registry
	# ---------------------------------------------------------------------------
	# Upper bound on how many samples a single benchmark run may process.
	MAX_SAMPLES = 50

	# Benchmark registry, keyed by the label shown in the dataset dropdown.
	# Every entry carries the same five fields:
	#   hf_id       - dataset identifier passed to ``load_dataset``
	#   split       - split name passed to ``load_dataset``
	#   image_col   - row column that holds the image
	#   gt_fn       - key into ``GT_EXTRACTORS`` selecting the ground-truth helper
	#   description - free-text summary of the dataset
	# Insertion order is kept; the first entry is the dropdown's default value.
	DATASETS = OrderedDict()

	DATASETS["FUNSD — Forms (50 test docs)"] = dict(
	    hf_id="nielsr/funsd",
	    split="test",
	    image_col="image",
	    gt_fn="funsd",
	    description="Form Understanding in Noisy Scanned Documents. 50 test documents with word-level GT.",
	)

	DATASETS["IAM — Handwriting lines (test set, 50 samples)"] = dict(
	    hf_id="Teklia/IAM-line",
	    split="test",
	    image_col="image",
	    gt_fn="iam",
	    description="IAM handwriting database, line-level images with transcriptions.",
	)

	DATASETS["CORD-v2 — Receipts (50 samples)"] = dict(
	    hf_id="naver-clova-ix/cord-v2",
	    split="test",
	    image_col="image",
	    gt_fn="cord",
	    description="Consolidated Receipt Dataset v2. Complex receipt images with structured GT.",
	)

	DATASETS["Invoices & Receipts (50 samples)"] = dict(
	    hf_id="mychen76/invoices-and-receipts_ocr_v1",
	    split="test",
	    image_col="image",
	    gt_fn="invoices",
	    description="Invoices and receipts with OCR ground truth text.",
	)

	# ---------------------------------------------------------------------------
	# Ground-truth extraction helpers
	# ---------------------------------------------------------------------------

	def _gt_funsd(row):
	    """Build the FUNSD reference text by joining the row's ``"words"`` entry.

	    Args:
	        row: Mapping-like dataset row; only ``row.get("words")`` is read.

	    Returns:
	        The words joined by single spaces. ``""`` when the entry is missing,
	        ``None`` or empty, so a bad row is skipped by the caller instead of
	        raising.
	    """
	    words = row.get("words") or []
	    # A bare string would otherwise be joined character by character.
	    if isinstance(words, str):
	        return words
	    # Skip ``None`` entries and stringify the rest: ``str.join`` raises
	    # TypeError on any non-string element.
	    return " ".join(str(word) for word in words if word is not None)

	def _gt_iam(row):
	    """Return the transcription stored in the row's ``"text"`` entry.

	    Args:
	        row: Mapping-like dataset row; only ``row.get("text")`` is read.

	    Returns:
	        The transcription as a string, or ``""`` when the entry is missing
	        or ``None``.
	    """
	    text = row.get("text")
	    # ``dict.get`` only applies its default to a missing key; a stored None
	    # would otherwise reach callers that call ``.strip()`` on the result.
	    if text is None:
	        return ""
	    return text if isinstance(text, str) else str(text)

	def _gt_cord(row):
	    """Flatten a CORD row's structured ground truth into one reference string.

	    The row's ``"ground_truth"`` entry is parsed as JSON (an already-decoded
	    dict is also accepted) and its ``"gt_parse"`` object is walked:

	    * for every menu item, the fields ``nm``, ``cnt``, ``price``,
	      ``unitprice``, ``itemsubtotal``, ``sub`` and ``etc`` are collected in
	      that order;
	    * then the summary sections are collected.

	    Nested dicts and lists are walked recursively and empty strings are
	    dropped.

	    NOTE(review): ``"sub_total"`` was added to the section names because the
	    CORD-v2 annotations appear to use that key; confirm against the dataset
	    card. The original names (``subtotal``, ``total``, ``tax``) are kept.

	    Args:
	        row: Mapping-like dataset row; only ``row.get("ground_truth")`` is read.

	    Returns:
	        The collected strings joined by single spaces, or ``""`` when the
	        ground truth is missing, malformed or contains no text.
	    """
	    raw = row.get("ground_truth", "{}")
	    if isinstance(raw, dict):
	        gt = raw
	    else:
	        try:
	            gt = json.loads(raw)
	        except (TypeError, ValueError):
	            # Not a JSON document: nothing usable as reference text.
	            return ""
	    if not isinstance(gt, dict):
	        return ""
	    parse = gt.get("gt_parse")
	    if not isinstance(parse, dict):
	        return ""

	    parts = []

	    def _collect(value):
	        """Append every non-empty string found in ``value`` to ``parts``."""
	        if isinstance(value, str):
	            if value:
	                parts.append(value)
	        elif isinstance(value, dict):
	            for inner in value.values():
	                _collect(inner)
	        elif isinstance(value, list):
	            for inner in value:
	                _collect(inner)

	    menu = parse.get("menu") or []
	    # A menu given as a single object instead of a list used to raise inside
	    # the loop and silently discard the whole receipt.
	    if isinstance(menu, dict):
	        menu = [menu]
	    for menu_item in menu:
	        if not isinstance(menu_item, dict):
	            continue
	        for key in ("nm", "cnt", "price", "unitprice", "itemsubtotal", "sub", "etc"):
	            _collect(menu_item.get(key))

	    for section in ("sub_total", "subtotal", "total", "tax"):
	        _collect(parse.get(section))

	    return " ".join(parts)

	def _gt_invoices(row):
	    """Extract the reference text from a row's ``"raw_data"`` JSON document.

	    The ``"ocr_words"`` field of that document is used. It may be a string
	    holding a Python list literal (parsed with ``ast.literal_eval``, which
	    evaluates literals only), a real list, or plain text.

	    Args:
	        row: Mapping-like dataset row; only ``row.get("raw_data")`` is read.

	    Returns:
	        The words joined by single spaces, the plain text unchanged, or
	        ``""`` when the data is missing or cannot be parsed.
	    """
	    import ast

	    try:
	        raw = json.loads(row.get("raw_data", "{}"))
	    except (TypeError, ValueError):
	        return ""
	    if not isinstance(raw, dict):
	        return ""

	    words = raw.get("ocr_words")
	    # A JSON null used to become the literal reference text "None".
	    if words is None:
	        return ""

	    if isinstance(words, str) and words.startswith("["):
	        try:
	            words = ast.literal_eval(words)
	        except (ValueError, SyntaxError, MemoryError, RecursionError):
	            return ""

	    if isinstance(words, (list, tuple)):
	        # Stringify the elements: ``str.join`` rejects non-string entries.
	        return " ".join(str(word) for word in words if word is not None)
	    return str(words)

	# Lookup from a registry entry's ``gt_fn`` value to the helper that turns a
	# dataset row into its reference text.
	GT_EXTRACTORS = dict(
	    funsd=_gt_funsd,
	    iam=_gt_iam,
	    cord=_gt_cord,
	    invoices=_gt_invoices,
	)

	# ---------------------------------------------------------------------------
	# OCR engine
	# ---------------------------------------------------------------------------

	class RapidOCREngine:
	    """Wrapper around ``rapidocr_onnxruntime.RapidOCR`` that times each OCR call."""

	    def __init__(self):
	        # Imported inside the constructor, so the package is only required
	        # once an engine is actually created.
	        from rapidocr_onnxruntime import RapidOCR

	        self.ocr = RapidOCR()

	    def run(self, image: Image.Image):
	        """Run OCR on one image.

	        Args:
	            image: Image to recognise; it is converted to RGB first.

	        Returns:
	            ``(texts, scores, elapsed)``: element 1 of every result item,
	            element 2 of every result item as ``float``, and the wall-clock
	            seconds spent inside the OCR call. Image conversion is not timed.
	        """
	        rgb = np.array(image.convert("RGB"))
	        # Reverse the channel axis (RGB -> BGR); ``copy`` materialises the view.
	        bgr = rgb[:, :, ::-1].copy()

	        started = time.perf_counter()
	        detections, _ = self.ocr(bgr)
	        elapsed = time.perf_counter() - started

	        # A falsy result (nothing detected) yields two empty lists.
	        if not detections:
	            return [], [], elapsed
	        texts = [entry[1] for entry in detections]
	        scores = [float(entry[2]) for entry in detections]
	        return texts, scores, elapsed

	# ---------------------------------------------------------------------------
	# Deployment size — REAL on-disk measurement
	# ---------------------------------------------------------------------------

	def _get_dist_dirs(dist_name: str) -> list[str]:
	    """Find the paths on disk that belong to an installed distribution.

	    Despite the name, the result can contain files as well as directories.
	    Three sources are combined:

	    1. the distribution's own metadata directory;
	    2. the names listed in ``top_level.txt`` (a package directory, or a
	       ``<name>.py`` module next to the metadata directory);
	    3. the first path component of every entry in ``dist.files``.

	    Entries under a top-level ``__pycache__`` are added file by file. That
	    directory is shared by every single-module distribution, so adding the
	    directory itself would count other packages' bytecode.

	    Args:
	        dist_name: Distribution name as understood by ``importlib.metadata``.

	    Returns:
	        Sorted list of path strings; ``[]`` when the distribution is not
	        installed or has no on-disk metadata directory.
	    """
	    try:
	        dist = importlib.metadata.distribution(dist_name)
	    except importlib.metadata.PackageNotFoundError:
	        return []

	    # ``_path`` is a private attribute; read it defensively so a distribution
	    # object without it yields "not found" instead of an AttributeError.
	    raw_path = getattr(dist, "_path", None)
	    if raw_path is None:
	        return []
	    dist_info_path = Path(raw_path)
	    site_packages = dist_info_path.parent

	    found: set[str] = {str(dist_info_path)}

	    # top_level.txt lists the importable package names. Reading it is
	    # best-effort: an unreadable file just means we rely on dist.files.
	    try:
	        top_level = dist.read_text("top_level.txt")
	    except (OSError, UnicodeError):
	        top_level = None
	    for line in (top_level or "").splitlines():
	        name = line.strip()
	        if not name:
	            # A blank line would resolve to site-packages itself.
	            continue
	        candidate = site_packages / name
	        if candidate.is_dir():
	            found.add(str(candidate))
	        elif candidate.with_suffix(".py").is_file():
	            found.add(str(candidate.with_suffix(".py")))

	    # Also check the recorded files for top-level entries we may have missed.
	    for record in dist.files or ():
	        # An absolute entry would resolve to the filesystem root.
	        if record.is_absolute():
	            continue
	        parts = record.parts
	        if not parts:
	            continue
	        top = parts[0]
	        if top in (".", "..") or top.endswith((".dist-info", ".egg-info")):
	            continue
	        if top == "__pycache__":
	            cached = site_packages.joinpath(*parts)
	            if cached.is_file():
	                found.add(str(cached))
	            continue
	        candidate = site_packages / top
	        if candidate.is_dir() or candidate.is_file():
	            found.add(str(candidate))

	    return sorted(found)


	def _size_bytes(path: str) -> int:
	    """Sum the sizes of the regular files at or below ``path``.

	    Args:
	        path: A file or directory path.

	    Returns:
	        The file's size when ``path`` is a file; otherwise the total size of
	        all files found by walking ``path``. ``0`` when nothing is found.
	        Symbolic links inside the tree are skipped, and files that cannot be
	        stat'ed are ignored.
	    """
	    p = Path(path)
	    if p.is_file():
	        return p.stat().st_size
	    total = 0
	    for dirpath, _, filenames in os.walk(p):
	        for fname in filenames:
	            full_path = os.path.join(dirpath, fname)
	            # ``getsize`` follows links, so an alias of a library would be
	            # counted at the full size of its target a second time.
	            if os.path.islink(full_path):
	                continue
	            try:
	                total += os.path.getsize(full_path)
	            except OSError:
	                # Best-effort: a vanished or unreadable file is left out.
	                pass
	    return total


	def get_package_real_size_mb(dist_name: str) -> float | None:
	    """Measure the on-disk installed size of a distribution in MB.

	    Args:
	        dist_name: Distribution name passed to ``_get_dist_dirs``.

	    Returns:
	        The summed size of every path reported by ``_get_dist_dirs`` divided
	        by ``1024 * 1024``, or ``None`` when no paths are reported.
	    """
	    paths = _get_dist_dirs(dist_name)
	    if not paths:
	        return None
	    return sum(map(_size_bytes, paths)) / (1024 * 1024)


	def estimate_deployment_size():
	    """Measure installed sizes for the RapidOCR deployment stack.

	    Each listed distribution is resolved to its on-disk paths and measured.
	    A path already attributed to an earlier distribution is not measured
	    again. The OpenCV variants in the list share one package directory, so
	    summing per-distribution sizes would count it once per installed
	    variant.

	    Returns:
	        ``(total_mb, details)``: the total rounded to one decimal, and a
	        dict mapping each reported label to its size in MB rounded to one
	        decimal. Distributions that are not installed, or whose newly
	        counted size is not above 0.1 MB, are left out of both.
	    """
	    packages = [
	        ("rapidocr-onnxruntime", "rapidocr-onnxruntime"),
	        ("onnxruntime", "onnxruntime"),
	        ("opencv-python-headless", "opencv-python-headless"),
	        ("opencv-python", "opencv-python"),
	        ("opencv-contrib-python", "opencv-contrib-python"),
	        ("numpy", "numpy"),
	        ("Pillow", "Pillow"),
	        ("shapely", "shapely"),
	        ("pyclipper", "pyclipper"),
	    ]
	    total = 0.0
	    details = {}
	    counted: set[str] = set()
	    for label, dist_name in packages:
	        fresh = [path for path in _get_dist_dirs(dist_name) if path not in counted]
	        if not fresh:
	            continue
	        counted.update(fresh)
	        size = sum(_size_bytes(path) for path in fresh) / (1024 * 1024)
	        if size > 0.1:  # skip trivially small
	            total += size
	            details[label] = round(size, 1)
	    return round(total, 1), details

	# ---------------------------------------------------------------------------
	# Metrics
	# ---------------------------------------------------------------------------

	def compute_metrics(gt_text: str, ocr_text: str):
	    """Compute character and word error rates of OCR output against a reference.

	    Both texts are stripped of surrounding whitespace first.

	    Args:
	        gt_text: Reference (ground-truth) text.
	        ocr_text: Text produced by the OCR engine.

	    Returns:
	        ``{"CER": ..., "WER": ...}``. Both values are ``None`` when the
	        reference is empty, because the rates are undefined. Both are ``1.0``
	        when the reference has text but the OCR output is empty: every
	        reference unit is an error. Previously that case returned ``None``
	        and dropped total misses from the averages. Otherwise the values come
	        from ``cer`` / ``wer``, each falling back to ``None`` if it raises.
	    """
	    reference = gt_text.strip()
	    hypothesis = ocr_text.strip()

	    if not reference:
	        return {"CER": None, "WER": None}
	    if not hypothesis:
	        return {"CER": 1.0, "WER": 1.0}

	    # Best-effort per metric: a failure in one must not discard the other.
	    try:
	        c = cer(reference, hypothesis)
	    except Exception:
	        c = None
	    try:
	        w = wer(reference, hypothesis)
	    except Exception:
	        w = None
	    return {"CER": c, "WER": w}

	# ---------------------------------------------------------------------------
	# Benchmark runner
	# ---------------------------------------------------------------------------

	def _clip_text(text, limit=120):
	    """Return ``text`` cut to ``limit`` characters plus ``...`` when it is longer."""
	    return text[:limit] + "..." if len(text) > limit else text


	def run_benchmark(dataset_name, num_samples, progress=gr.Progress()):
	    """Run RapidOCR over a registered dataset and summarise the results.

	    Args:
	        dataset_name: Key into ``DATASETS``.
	        num_samples: Requested sample count; capped by the dataset length and
	            by ``MAX_SAMPLES``.
	        progress: Progress callback, called with a fraction and a ``desc``.

	    Returns:
	        A 5-tuple ``(status, summary, per_sample, size_rows, verdict)``:
	        a status string, a list of metric/value dicts, a list of per-sample
	        dicts, a list of package/size dicts and a Markdown verdict. On
	        failure the status starts with ``❌`` and the other items are
	        ``None``. The one exception: when every sample failed, the
	        per-sample rows are still returned so the error messages stay
	        visible.
	    """
	    if dataset_name not in DATASETS:
	        return "❌ Unknown dataset", None, None, None, None

	    ds_info = DATASETS[dataset_name]
	    progress(0, desc=f"Loading dataset: {ds_info['hf_id']}...")

	    try:
	        # SECURITY NOTE(review): trust_remote_code=True allows the dataset
	        # repository to run its own loading code in this process. Confirm it
	        # is still needed for the registered datasets.
	        ds = load_dataset(ds_info["hf_id"], split=ds_info["split"], trust_remote_code=True)
	    except Exception as e:
	        return f"❌ Failed to load dataset: {e}", None, None, None, None

	    n = min(int(num_samples), len(ds), MAX_SAMPLES)
	    ds = ds.select(range(n))
	    gt_fn = GT_EXTRACTORS[ds_info["gt_fn"]]

	    progress(0.05, desc="Initializing RapidOCR (PP-OCRv4 ONNX) engine...")
	    try:
	        engine = RapidOCREngine()
	    except Exception as e:
	        return f"❌ Failed to init RapidOCR: {e}", None, None, None, None

	    results = []
	    per_sample = []

	    for i, row in enumerate(ds):
	        progress((0.1 + 0.85 * i / n), desc=f"Processing sample {i+1}/{n}...")
	        image = row[ds_info["image_col"]]
	        if not isinstance(image, Image.Image):
	            continue
	        gt_text = gt_fn(row)
	        # Rows without reference text cannot be scored. The ``not gt_text``
	        # test also covers an extractor that returns None.
	        if not gt_text or not gt_text.strip():
	            continue

	        sample = {"#": i, "Ground Truth": _clip_text(gt_text)}
	        try:
	            texts, scores, elapsed = engine.run(image)
	            ocr_text = " ".join(texts)
	            metrics = compute_metrics(gt_text, ocr_text)
	            results.append({
	                "elapsed": elapsed,
	                "cer": metrics["CER"],
	                "wer": metrics["WER"],
	                "num_detections": len(texts),
	                "mean_confidence": float(np.mean(scores)) if scores else 0,
	            })
	            sample["OCR Text"] = _clip_text(ocr_text)
	            sample["CER"] = round(metrics["CER"], 4) if metrics["CER"] is not None else "N/A"
	            sample["WER"] = round(metrics["WER"], 4) if metrics["WER"] is not None else "N/A"
	            sample["Confidence"] = round(float(np.mean(scores)), 4) if scores else "N/A"
	            sample["Time (s)"] = round(elapsed, 3)
	        except Exception as e:
	            # A failing sample is reported in the table, not in the summary.
	            sample["OCR Text"] = f"ERROR: {e}"
	            sample["CER"] = "N/A"
	            sample["WER"] = "N/A"
	            sample["Confidence"] = "N/A"
	            sample["Time (s)"] = "N/A"
	        per_sample.append(sample)

	    progress(0.97, desc="Computing summary...")

	    if not results:
	        # Keep the per-sample rows: they hold the error text of each failure.
	        return "❌ No valid results", None, per_sample or None, None, None

	    cers = [r["cer"] for r in results if r["cer"] is not None]
	    wers = [r["wer"] for r in results if r["wer"] is not None]
	    times = [r["elapsed"] for r in results]
	    confs = [r["mean_confidence"] for r in results]

	    summary = [
	        {"Metric": "Mean CER ↓", "Value": f"{np.mean(cers):.4f}" if cers else "N/A"},
	        {"Metric": "Median CER ↓", "Value": f"{np.median(cers):.4f}" if cers else "N/A"},
	        {"Metric": "Mean WER ↓", "Value": f"{np.mean(wers):.4f}" if wers else "N/A"},
	        {"Metric": "Median WER ↓", "Value": f"{np.median(wers):.4f}" if wers else "N/A"},
	        {"Metric": "Mean inference time (s) ↓", "Value": f"{np.mean(times):.3f}"},
	        {"Metric": "Median inference time (s) ↓", "Value": f"{np.median(times):.3f}"},
	        {"Metric": "Total time (s)", "Value": f"{sum(times):.2f}"},
	        {"Metric": "Mean confidence", "Value": f"{np.mean(confs):.4f}" if confs else "N/A"},
	        {"Metric": "Samples processed", "Value": str(len(results))},
	    ]

	    progress(0.99, desc="Measuring deployment size (real on-disk)...")
	    total_mb, pkg_details = estimate_deployment_size()
	    size_rows = [{"Package": pkg, "Size (MB)": sz} for pkg, sz in pkg_details.items()]
	    size_rows.append({"Package": "📦 TOTAL (installed)", "Size (MB)": total_mb})

	    # ``np.mean`` of an empty list is NaN (with a warning), so the WER part
	    # is only formatted when there is at least one WER value.
	    if cers and wers:
	        accuracy_line = f"\nAccuracy: Mean CER = {np.mean(cers):.4f}, Mean WER = {np.mean(wers):.4f}"
	    elif cers:
	        accuracy_line = f"\nAccuracy: Mean CER = {np.mean(cers):.4f}, Mean WER = N/A"
	    else:
	        accuracy_line = "\nAccuracy: N/A"

	    lambda_fits = total_mb < 250
	    # Each entry starts with "\n" so that joining with "\n" leaves a blank
	    # line between entries.
	    verdict_lines = [
	        "## 📊 Summary\n",
	        "Engine: RapidOCR PP-OCRv4 (ONNX Runtime)",
	        accuracy_line,
	        f"\nSpeed: {np.mean(times):.3f}s avg per image ({len(results)} samples)",
	        f"\nDeployment footprint: ~{total_mb} MB installed on disk",
	        f"\nAWS Lambda 250 MB zip limit: {'Fits ✅' if lambda_fits else 'Exceeds ❌ — use Lambda container image (10 GB limit) or strip/slim dependencies'}",
	        "\n\n> ⚠️ Sizes are measured from actual installed files on disk via `os.walk`, not from pip metadata. "
	        "Zip-compressed deployment packages will be smaller (~40-60% of installed size), "
	        "but the Lambda unzipped limit of 250 MB applies to the installed footprint.",
	    ]

	    return (
	        f"✅ Benchmark complete — {len(results)} samples processed",
	        summary,
	        per_sample,
	        size_rows,
	        "\n".join(verdict_lines),
	    )

	# ---------------------------------------------------------------------------
	# Single image
	# ---------------------------------------------------------------------------

	def run_single_image(image):
	    """Run OCR on one uploaded image and format the detections as Markdown.

	    Args:
	        image: The uploaded image, or ``None`` when nothing was uploaded.
	            Anything that is not a PIL image is passed to ``Image.fromarray``.

	    Returns:
	        A Markdown string: a prompt to upload an image, a heading followed
	        by one ``[score] text`` line per detection, or an ``### ERROR``
	        section carrying the exception message.
	    """
	    if image is None:
	        return "Upload an image first"
	    try:
	        # Converted inside the ``try`` so an unusable array is reported in the
	        # UI like any other failure.
	        if not isinstance(image, Image.Image):
	            image = Image.fromarray(image)
	        engine = RapidOCREngine()
	        texts, scores, elapsed = engine.run(image)
	    except Exception as e:
	        return f"### ERROR\n{e}"

	    header = f"### RapidOCR (PP-OCRv4 ONNX) — {elapsed:.3f}s — {len(texts)} detections\n"
	    lines = [f"[{s:.2f}] {t}" for t, s in zip(texts, scores)]
	    if not lines:
	        return header + "(no text detected)"
	    # Two trailing spaces force a Markdown hard line break; with a bare "\n"
	    # all detections are rendered as one run-on paragraph.
	    return header + "  \n".join(lines)

	# ---------------------------------------------------------------------------
	# UI
	# ---------------------------------------------------------------------------

	# Markdown shown at the top of the page. Table cells are delimited with
	# plain pipes: a backslash before a pipe is not a valid Python escape and
	# would reach the renderer as an escaped literal pipe, breaking the table.
	HEADER = """
	# ⚡ RapidOCR Benchmark (PP-OCRv4 — ONNX Runtime)

	Benchmark RapidOCR with the lightweight ONNX Runtime inference backend on public OCR datasets.

	| Property | Value |
	|---|---|
	| Engine | RapidOCR (rapidocr-onnxruntime) |
	| Model version | PP-OCRv4 (ONNX exported) |
	| Runtime | ONNX Runtime |
	| AWS Lambda zip (250 MB)? | ⚠️ Tight — run benchmark to see actual size |
	| AWS Lambda container (10 GB)? | ✅ Fits easily |

	> 📏 Deployment sizes are measured from actual installed files on disk — not pip metadata.
	>
	> 💡 Compare with the [PaddleOCR benchmark Space](https://huggingface.co/spaces/rbaks/paddleocr-benchmark) to see how the full PaddlePaddle runtime compares.
	"""

	# UI layout: a header plus three tabs (dataset benchmark, single image, about).
	with gr.Blocks(title="RapidOCR Benchmark") as demo:
	    gr.Markdown(HEADER)

	    with gr.Tabs():
	        with gr.Tab("📊 Dataset Benchmark"):
	            gr.Markdown("### Run RapidOCR on a benchmark dataset and measure accuracy, speed & deployment footprint.")
	            with gr.Row():
	                dataset_dd = gr.Dropdown(
	                    choices=list(DATASETS.keys()),
	                    value=list(DATASETS.keys())[0],
	                    label="Select Benchmark Dataset",
	                )
	                num_slider = gr.Slider(minimum=5, maximum=MAX_SAMPLES, value=20, step=5, label="Number of samples")

	            run_btn = gr.Button("🚀 Run Benchmark", variant="primary", size="lg")
	            status_box = gr.Textbox(label="Status", interactive=False)

	            with gr.Accordion("📈 Summary Metrics", open=True):
	                summary_tbl = gr.Dataframe(headers=["Metric", "Value"], label="Metrics", wrap=True)

	            verdict_md = gr.Markdown("")

	            with gr.Accordion("📦 Deployment Size Breakdown (real on-disk)", open=False):
	                size_tbl = gr.Dataframe(headers=["Package", "Size (MB)"], label="Installed sizes (os.walk)", wrap=True)

	            with gr.Accordion("🔎 Per-Sample Details", open=False):
	                detail_tbl = gr.Dataframe(
	                    headers=["#", "Ground Truth", "OCR Text", "CER", "WER", "Confidence", "Time (s)"],
	                    label="Per-sample results",
	                    wrap=True,
	                )

	            # The output order must match the tuple returned by run_benchmark:
	            # status, summary, per-sample rows, size rows, verdict.
	            run_btn.click(
	                fn=run_benchmark,
	                inputs=[dataset_dd, num_slider],
	                outputs=[status_box, summary_tbl, detail_tbl, size_tbl, verdict_md],
	            )

	        with gr.Tab("🖼️ Try Single Image"):
	            gr.Markdown("### Upload an image to run RapidOCR.")
	            img_input = gr.Image(type="pil", label="Upload Image")
	            single_btn = gr.Button("🔍 Run OCR", variant="primary")
	            single_out = gr.Markdown("")
	            single_btn.click(fn=run_single_image, inputs=[img_input], outputs=[single_out])

	        with gr.Tab("ℹ️ About"):
	            # Table cells use plain pipes; a backslash before a pipe would be
	            # kept in the string and stop the tables from rendering.
	            gr.Markdown("""
	## About this Space

	This Space benchmarks RapidOCR — a lightweight wrapper that runs PP-OCRv4 models via ONNX Runtime instead of PaddlePaddle.

	### Pipeline
	```
	Image → [Text Detection (DB-Net)] → [Text Classification] → [Text Recognition (SVTR)] → Text
	```
	Same 3-stage pipeline as PaddleOCR, same model architectures — just a different inference runtime.

	### Why ONNX Runtime?
	- ONNX Runtime is an inference-only engine (much smaller than PaddlePaddle)
	- PaddlePaddle is a full ML framework (training + inference)
	- The PP-OCR models are exported from PaddlePaddle native format to `.onnx` — same weights, same math
	- However, shared dependencies (OpenCV, NumPy) still dominate the deployment footprint

	### PP-OCRv4 ONNX models
	| Model | File | Size |
	|-------|------|------|
	| Text detection | `ch_PP-OCRv4_det_infer.onnx` | ~4.5 MB |
	| Text recognition | `ch_PP-OCRv4_rec_infer.onnx` | ~10.4 MB |
	| Text classification | `ch_ppocr_mobile_v2.0_cls.onnx` | ~0.6 MB |
	| Total models | | ~15.5 MB |

	### Size measurement methodology
	Deployment sizes are measured by walking the actual installed directories on disk using `os.walk()` and summing file sizes.
	This is the real footprint you'd see on an EC2 instance or Lambda container — not the compressed wheel size from pip.

	### Metrics
	| Metric | Description | Good value |
	|--------|-------------|------------|
	| CER | Character Error Rate | Lower = better (0 = perfect) |
	| WER | Word Error Rate | Lower = better (0 = perfect) |
	| Inference time | Wall-clock time per image | Lower = better |
	| Confidence | Mean OCR confidence score | Higher = better |

	### Datasets
	| Dataset | Type | Content |
	|---------|------|---------|
	| FUNSD | Forms | Noisy scanned business forms |
	| IAM | Handwriting | English handwritten text lines |
	| CORD-v2 | Receipts | Receipt images with structured GT |
	| Invoices & Receipts | Documents | Synthetic invoices with OCR GT |
	""")

	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()