# oneocr/_archive/debug_detector.py
# Author: OneOCR Dev
# Commit ce847d4: "OneOCR - reverse engineering complete, ONNX pipeline 53% match rate"
"""Debug detector output to understand word segmentation."""
import numpy as np
import onnxruntime as ort
from PIL import Image
from pathlib import Path
# Locate the extracted ONNX models and load the test image.
models_dir = Path("oneocr_extracted/onnx_models")
img = Image.open("image.png").convert("RGB")
w, h = img.size

# --- Detector setup ---
# model_00_* is the detection network; run on CPU only.
sess = ort.InferenceSession(
    str(next(models_dir.glob("model_00_*"))),
    providers=["CPUExecutionProvider"],
)

# Scale the longer side toward 800 px, then round each dimension up to a
# multiple of 32 (stride alignment for the detector input).
scale = 800 / max(h, w)
dh = (int(h * scale) + 31) // 32 * 32
dw = (int(w * scale) + 31) // 32 * 32

# Preprocess: resize, flip RGB->BGR, subtract per-channel means, NCHW layout.
resized = img.resize((dw, dh), Image.LANCZOS)
channel_means = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)
img_d = np.array(resized, dtype=np.float32)[:, :, ::-1] - channel_means
data = img_d.transpose(2, 0, 1)[np.newaxis].astype(np.float32)

# im_info carries the padded input size plus the resize scale.
im_info = np.array([[dh, dw, scale]], dtype=np.float32)

# Run the detector and map each output tensor to its graph name.
outputs = sess.run(None, {"data": data, "im_info": im_info})
output_names = [o.name for o in sess.get_outputs()]
out_dict = dict(zip(output_names, outputs))
# --- Analyze FPN2 (highest-resolution feature level) ---
# pixel_scores: per-pixel text score map (e.g. [56, 200]);
# link_scores: 8 neighbor-direction link maps stacked on axis 0.
pixel_scores = out_dict["scores_hori_fpn2"][0][0]
link_scores = out_dict["link_scores_hori_fpn2"][0]
print(f"FPN2 shape: {pixel_scores.shape}")
print(f"Pixel scores: min={pixel_scores.min():.4f} max={pixel_scores.max():.4f}")

# Binarize the score map to pick out candidate text pixels.
text_mask = pixel_scores > 0.6
print(f"Text pixels (>0.6): {text_mask.sum()}")
# Get the row/column range of text pixels.
# NOTE: indentation of this section was lost; the whole analysis below must
# sit inside the `if len(ys) > 0:` guard, otherwise ys.min()/xs.min() raise
# on an empty mask.
ys, xs = np.where(text_mask)
if len(ys) > 0:
    print(f"Text region: rows [{ys.min()}-{ys.max()}], cols [{xs.min()}-{xs.max()}]")

    # Check link scores within text region - do they separate words?
    # Link 2 is East neighbor (right), Link 6 is West neighbor (left).
    # If the link between words is low, they should separate.
    row_mid = (ys.min() + ys.max()) // 2
    print(f"\nHorizontal link scores at row {row_mid} (East neighbor):")
    link_east = link_scores[2, row_mid, :]  # E neighbor
    for x in range(xs.min(), xs.max() + 1):
        ps = pixel_scores[row_mid, x]
        le = link_east[x]
        # Pad markers to 4 chars so the columns line up.
        marker = "TEXT" if ps > 0.6 else "    "
        link_marker = "LINK" if le > 0.5 else "gap "
        if ps > 0.3:  # only show columns with some text evidence
            print(f"  col={x:3d}: pixel={ps:.3f} [{marker}] east_link={le:.3f} [{link_marker}]")

    # Also check if there are distinct "gap" regions in pixel scores,
    # scanning 2 columns beyond the detected extent on each side.
    print(f"\nPixel scores along row {row_mid}:")
    for x in range(max(0, xs.min() - 2), min(pixel_scores.shape[1], xs.max() + 3)):
        ps = pixel_scores[row_mid, x]
        bar = "█" * int(ps * 40)  # crude text bar chart of the score
        print(f"  col={x:3d}: {ps:.3f} {bar}")
# Try different thresholds and count connected components at each.
# BUG FIX: the original did `from scipy import ndimage` *outside* the try,
# so the `except ImportError` fallback could never trigger (a missing scipy
# raised before the try was entered). Import once, up front, and branch.
try:
    from scipy import ndimage
except ImportError:
    ndimage = None

for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]:
    mask = pixel_scores > thresh
    n = mask.sum()
    if ndimage is not None:
        # Connected components of the binarized score map.
        labels, n_comps = ndimage.label(mask)
        print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components")
        for c in range(1, min(n_comps + 1, 10)):  # report at most 9 components
            comp_mask = labels == c
            area = comp_mask.sum()
            ys_c, xs_c = np.where(comp_mask)
            print(f"  Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]")
    else:
        # Fallback without scipy: pixel count only, no component analysis.
        print(f"Threshold {thresh}: {n} pixels")