# oneocr/_archive/debug_detector.py
# Source: OneOCR Dev repo, commit ce847d4
# ("OneOCR - reverse engineering complete, ONNX pipeline 53% match rate")
"""Debug detector output to understand word segmentation."""
import numpy as np
import onnxruntime as ort
from PIL import Image
from pathlib import Path
# --- Detector setup and inference --------------------------------------
models_dir = Path("oneocr_extracted/onnx_models")
img = Image.open("image.png").convert("RGB")
w, h = img.size

# Detector setup: model_00_* is the text-detection model in the dump.
detector = ort.InferenceSession(
    str(next(models_dir.glob("model_00_*"))),
    providers=["CPUExecutionProvider"],
)

# Scale so the longest side maps to 800, then round each side up to a
# multiple of 32 (presumably the detector's stride requirement — confirm).
scale = 800 / max(h, w)
det_h = (int(h * scale) + 31) // 32 * 32
det_w = (int(w * scale) + 31) // 32 * 32

# RGB -> BGR flip plus per-channel mean subtraction (these constants look
# like the classic Caffe BGR means — verify against the original pipeline).
resized = np.array(img.resize((det_w, det_h), Image.LANCZOS), dtype=np.float32)
mean_bgr = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)
chw = (resized[:, :, ::-1] - mean_bgr).transpose(2, 0, 1)
data = chw[np.newaxis].astype(np.float32)
im_info = np.array([[det_h, det_w, scale]], dtype=np.float32)

outputs = detector.run(None, {"data": data, "im_info": im_info})
# Map each output tensor to its declared name for lookup by key.
out_dict = dict(zip((o.name for o in detector.get_outputs()), outputs))
# --- Analyze FPN2 (the highest-resolution detector head) ----------------
pixel_scores = out_dict["scores_hori_fpn2"][0, 0]   # [56, 200]
link_scores = out_dict["link_scores_hori_fpn2"][0]  # [8, 56, 200]
print(f"FPN2 shape: {pixel_scores.shape}")
print(f"Pixel scores: min={pixel_scores.min():.4f} max={pixel_scores.max():.4f}")

# Binarize at a 0.6 confidence cut to locate the text region.
text_mask = pixel_scores > 0.6
print(f"Text pixels (>0.6): {text_mask.sum()}")

ys, xs = np.where(text_mask)
if len(ys) > 0:
    print(f"Text region: rows [{ys.min()}-{ys.max()}], cols [{xs.min()}-{xs.max()}]")

    # Do link scores separate words inside the text region?
    # Link channel 2 = East neighbor (right), channel 6 = West neighbor (left);
    # a low link between two words should let them split apart.
    row_mid = (ys.min() + ys.max()) // 2
    print(f"\nHorizontal link scores at row {row_mid} (East neighbor):")
    link_east = link_scores[2, row_mid, :]  # E neighbor
    for x in range(xs.min(), xs.max() + 1):
        ps = pixel_scores[row_mid, x]
        le = link_east[x]
        marker = "TEXT" if ps > 0.6 else "    "
        link_marker = "LINK" if le > 0.5 else "gap "
        if ps > 0.3:
            print(f"  col={x:3d}: pixel={ps:.3f} [{marker}] east_link={le:.3f} [{link_marker}]")

    # Also render the pixel scores along the middle row as a bar chart,
    # to reveal any distinct "gap" regions between words.
    print(f"\nPixel scores along row {row_mid}:")
    for x in range(max(0, xs.min() - 2), min(pixel_scores.shape[1], xs.max() + 3)):
        ps = pixel_scores[row_mid, x]
        bar = "█" * int(ps * 40)
        print(f"  col={x:3d}: {ps:.3f} {bar}")
# --- Sweep binarization thresholds and count connected components -------
# BUG FIX: the original did `from scipy import ndimage` *before* its
# `try`, so the `except ImportError` fallback could never fire (a missing
# scipy raised at the import line, outside the try; `ndimage.label` itself
# never raises ImportError). The import was also repeated on every loop
# iteration. Import once up front and guard explicitly instead.
try:
    from scipy import ndimage
except ImportError:
    ndimage = None  # Fallback without scipy: report pixel counts only.

for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]:
    mask = pixel_scores > thresh
    n = mask.sum()
    if ndimage is None:
        print(f"Threshold {thresh}: {n} pixels")
        continue
    # Connected components: each label is one candidate word/text blob.
    labels, n_comps = ndimage.label(mask)
    print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components")
    for c in range(1, min(n_comps + 1, 10)):  # cap output at 9 components
        comp_mask = labels == c
        area = comp_mask.sum()
        ys_c, xs_c = np.where(comp_mask)
        print(f"  Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]")