# oneocr/_archive/temp/analyze_deltas.py
# OneOCR Dev — reverse engineering complete, ONNX pipeline 53% match rate
# (commit ce847d4)
"""Analyze bbox_deltas to understand their format and how they refine box positions."""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import cv2
import numpy as np
import onnxruntime as ort
# Use a small image where we know the text positions
img = cv2.imread('working_space/input/ocr_test (2).png')
if img is None:
    # cv2.imread returns None (no exception) on a missing/unreadable file;
    # fail loudly here instead of crashing later inside cvtColor.
    raise FileNotFoundError("could not read 'working_space/input/ocr_test (2).png'")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
h, w = img_rgb.shape[:2]
print(f"Image: {w}x{h}")

# Caffe-style per-channel pixel means (BGR order) subtracted before detection.
mean = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)

# Scale the longest side toward 800 px (capped at 6x upscale), then round
# both target dimensions up to a multiple of 32 for the detector.
scale = 800 / max(h, w)
scale = min(scale, 6.0)
dh = (int(h * scale) + 31) // 32 * 32
dw = (int(w * scale) + 31) // 32 * 32
print(f"Scale={scale:.3f}, Det size: {dw}x{dh}")

# Resize, flip channels back to BGR, subtract mean, HWC -> NCHW with batch dim.
img_det = cv2.resize(img_rgb, (dw, dh)).astype(np.float32)
det_data = (img_det[:, :, ::-1] - mean).transpose(2, 0, 1)[np.newaxis]
im_info = np.array([[dh, dw, scale]], dtype=np.float32)

sess = ort.InferenceSession(
    'oneocr_extracted/onnx_models/model_00_ir6_0.1.0_11282KB.onnx',
    providers=['CPUExecutionProvider']
)
outs = sess.run(None, {'data': det_data, 'im_info': im_info})
out_names = [o.name for o in sess.get_outputs()]
out_dict = dict(zip(out_names, outs))
print(f"\nOutputs: {out_names}")
for name in out_names:
    print(f" {name}: shape={out_dict[name].shape}, dtype={out_dict[name].dtype}")
# Analyze bbox_deltas for FPN3
from scipy import ndimage  # hoisted: was re-imported on every loop iteration

for level, stride in [("fpn3", 8)]:
    # Per-level detector heads: pixel text scores, 8-channel corner deltas,
    # and 8-channel link scores, all on a stride-downsampled grid.
    scores = out_dict[f'scores_hori_{level}'][0, 0]  # [H, W]
    deltas = out_dict[f'bbox_deltas_hori_{level}'][0]  # [8, H, W]
    links = out_dict[f'link_scores_hori_{level}'][0]  # [8, H, W]
    fh, fw = scores.shape
    print(f"\n{level} (stride={stride}): score map {fw}x{fh}")

    # Find text pixels
    text_mask = scores > 0.6
    text_ys, text_xs = np.where(text_mask)
    print(f" Text pixels: {len(text_ys)}")

    # Show deltas at text positions
    print("\n bbox_deltas stats at text pixels:")
    for ch in range(8):
        vals = deltas[ch][text_mask]
        print(f" ch{ch}: min={vals.min():.2f} max={vals.max():.2f} mean={vals.mean():.2f} std={vals.std():.2f}")

    # Show deltas for a specific text region (the first word cluster)
    labeled, n = ndimage.label(text_mask)
    print(f"\n Components: {n}")
    for comp_id in range(1, min(n + 1, 4)):
        ys, xs = np.where(labeled == comp_id)
        r_min, r_max = ys.min(), ys.max()
        c_min, c_max = xs.min(), xs.max()
        # Grid-based box (what we currently do)
        grid_x1 = c_min * stride
        grid_y1 = r_min * stride
        grid_x2 = (c_max + 1) * stride
        grid_y2 = (r_max + 1) * stride
        # Deltas at component pixels
        comp_mask = labeled == comp_id
        print(f"\n Component {comp_id}: grid box ({grid_x1},{grid_y1})-({grid_x2},{grid_y2}) in det coords")
        print(f" Original coords: ({grid_x1/scale:.0f},{grid_y1/scale:.0f})-({grid_x2/scale:.0f},{grid_y2/scale:.0f})")
        for ch in range(8):
            vals = deltas[ch][comp_mask]
            corner = ch // 2
            coord = 'x' if ch % 2 == 0 else 'y'
            print(f" delta[{ch}] (corner{corner}.{coord}): min={vals.min():.2f} max={vals.max():.2f} mean={vals.mean():.2f}")

        # Try interpreting: deltas are per-pixel corner offsets from (c*stride, r*stride)
        # Average over all pixels in component
        avg_deltas = [deltas[ch][comp_mask].mean() for ch in range(8)]

        # Hypothesis 1: deltas are absolute offsets from grid box corners
        #   TL = (grid_x1 + d0, grid_y1 + d1)
        #   TR = (grid_x2 + d2, grid_y1 + d3)
        #   BR = (grid_x2 + d4, grid_y2 + d5)
        #   BL = (grid_x1 + d6, grid_y2 + d7)
        print(f" H1 (offset from grid): TL=({grid_x1+avg_deltas[0]:.0f},{grid_y1+avg_deltas[1]:.0f}) "
              f"TR=({grid_x2+avg_deltas[2]:.0f},{grid_y1+avg_deltas[3]:.0f}) "
              f"BR=({grid_x2+avg_deltas[4]:.0f},{grid_y2+avg_deltas[5]:.0f}) "
              f"BL=({grid_x1+avg_deltas[6]:.0f},{grid_y2+avg_deltas[7]:.0f})")

        # Hypothesis 2: each pixel (r, c) predicts the 4 corners of its text
        # region as offsets from its own grid position:
        #   TL = (c*stride + d0, r*stride + d1)
        #   TR = (c*stride + d2, r*stride + d3)
        #   BR = (c*stride + d4, r*stride + d5)
        #   BL = (c*stride + d6, r*stride + d7)
        # Build all per-pixel predictions at once (vectorized, float64 to match
        # the previous list-of-Python-floats construction exactly):
        # [8, N] -> [N, 8] -> [N, 4, 2] where axis 1 is (TL, TR, BR, BL).
        corner_offsets = deltas[:, comp_mask].astype(np.float64).T.reshape(-1, 4, 2)
        bases = np.stack([xs, ys], axis=1).astype(np.float64) * stride  # [N, 2]
        all_corners = corner_offsets + bases[:, None, :]  # [N, 4, 2]

        # Average each corner across all pixels
        avg_corners = all_corners.mean(axis=0)
        print(f" H2 (per-pixel avg): TL=({avg_corners[0,0]:.0f},{avg_corners[0,1]:.0f}) "
              f"TR=({avg_corners[1,0]:.0f},{avg_corners[1,1]:.0f}) "
              f"BR=({avg_corners[2,0]:.0f},{avg_corners[2,1]:.0f}) "
              f"BL=({avg_corners[3,0]:.0f},{avg_corners[3,1]:.0f})")
        print(f" H2 original: TL=({avg_corners[0,0]/scale:.0f},{avg_corners[0,1]/scale:.0f}) "
              f"TR=({avg_corners[1,0]/scale:.0f},{avg_corners[1,1]/scale:.0f}) "
              f"BR=({avg_corners[2,0]/scale:.0f},{avg_corners[2,1]/scale:.0f}) "
              f"BL=({avg_corners[3,0]/scale:.0f},{avg_corners[3,1]/scale:.0f})")

        # Also try: TL = min of all TL predictions, BR = max of all BR predictions
        min_tl = all_corners[:, 0, :].min(axis=0)
        max_br = all_corners[:, 2, :].max(axis=0)
        print(f" H2 (min TL, max BR): ({min_tl[0]/scale:.0f},{min_tl[1]/scale:.0f})-({max_br[0]/scale:.0f},{max_br[1]/scale:.0f})")