# oneocr/_archive/temp/analyze_deltas.py
# OneOCR Dev — reverse engineering complete, ONNX pipeline 53% match rate
# (commit ce847d4)
"""Analyze bbox_deltas to understand their format and how they refine box positions."""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import cv2
import numpy as np
import onnxruntime as ort
# Use a small image where we know the text positions
img = cv2.imread('working_space/input/ocr_test (2).png')
if img is None:
    # cv2.imread returns None (no exception) on a missing/unreadable file;
    # fail loudly here instead of crashing later inside cvtColor.
    raise FileNotFoundError("could not read 'working_space/input/ocr_test (2).png'")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
h, w = img_rgb.shape[:2]
print(f"Image: {w}x{h}")

# Caffe-style per-channel pixel means (BGR order) subtracted before detection.
mean = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)

# Scale the longest side toward 800 px (capped at 6x upscale), then round
# both target dimensions up to a multiple of 32 for the detector.
scale = 800 / max(h, w)
scale = min(scale, 6.0)
dh = (int(h * scale) + 31) // 32 * 32
dw = (int(w * scale) + 31) // 32 * 32
print(f"Scale={scale:.3f}, Det size: {dw}x{dh}")

# Resize, flip channels back to BGR, subtract mean, HWC -> NCHW with batch dim.
img_det = cv2.resize(img_rgb, (dw, dh)).astype(np.float32)
det_data = (img_det[:, :, ::-1] - mean).transpose(2, 0, 1)[np.newaxis]
im_info = np.array([[dh, dw, scale]], dtype=np.float32)

sess = ort.InferenceSession(
    'oneocr_extracted/onnx_models/model_00_ir6_0.1.0_11282KB.onnx',
    providers=['CPUExecutionProvider']
)
outs = sess.run(None, {'data': det_data, 'im_info': im_info})
out_names = [o.name for o in sess.get_outputs()]
out_dict = dict(zip(out_names, outs))
print(f"\nOutputs: {out_names}")
for name in out_names:
    print(f" {name}: shape={out_dict[name].shape}, dtype={out_dict[name].dtype}")
# Analyze bbox_deltas for FPN3
from scipy import ndimage  # hoisted: was re-imported on every loop iteration

for level, stride in [("fpn3", 8)]:
    # Per-level detector heads: pixel text scores, 8-channel corner deltas,
    # and 8-channel link scores, all on a stride-downsampled grid.
    scores = out_dict[f'scores_hori_{level}'][0, 0]  # [H, W]
    deltas = out_dict[f'bbox_deltas_hori_{level}'][0]  # [8, H, W]
    links = out_dict[f'link_scores_hori_{level}'][0]  # [8, H, W]
    fh, fw = scores.shape
    print(f"\n{level} (stride={stride}): score map {fw}x{fh}")

    # Find text pixels
    text_mask = scores > 0.6
    text_ys, text_xs = np.where(text_mask)
    print(f" Text pixels: {len(text_ys)}")

    # Show deltas at text positions
    print("\n bbox_deltas stats at text pixels:")
    for ch in range(8):
        vals = deltas[ch][text_mask]
        print(f" ch{ch}: min={vals.min():.2f} max={vals.max():.2f} mean={vals.mean():.2f} std={vals.std():.2f}")

    # Show deltas for a specific text region (the first word cluster)
    labeled, n = ndimage.label(text_mask)
    print(f"\n Components: {n}")
    for comp_id in range(1, min(n + 1, 4)):
        ys, xs = np.where(labeled == comp_id)
        r_min, r_max = ys.min(), ys.max()
        c_min, c_max = xs.min(), xs.max()
        # Grid-based box (what we currently do)
        grid_x1 = c_min * stride
        grid_y1 = r_min * stride
        grid_x2 = (c_max + 1) * stride
        grid_y2 = (r_max + 1) * stride
        # Deltas at component pixels
        comp_mask = labeled == comp_id
        print(f"\n Component {comp_id}: grid box ({grid_x1},{grid_y1})-({grid_x2},{grid_y2}) in det coords")
        print(f" Original coords: ({grid_x1/scale:.0f},{grid_y1/scale:.0f})-({grid_x2/scale:.0f},{grid_y2/scale:.0f})")
        for ch in range(8):
            vals = deltas[ch][comp_mask]
            corner = ch // 2
            coord = 'x' if ch % 2 == 0 else 'y'
            print(f" delta[{ch}] (corner{corner}.{coord}): min={vals.min():.2f} max={vals.max():.2f} mean={vals.mean():.2f}")

        # Try interpreting: deltas are per-pixel corner offsets from (c*stride, r*stride)
        # Average over all pixels in component
        avg_deltas = [deltas[ch][comp_mask].mean() for ch in range(8)]

        # Hypothesis 1: deltas are absolute offsets from grid box corners
        #   TL = (grid_x1 + d0, grid_y1 + d1)
        #   TR = (grid_x2 + d2, grid_y1 + d3)
        #   BR = (grid_x2 + d4, grid_y2 + d5)
        #   BL = (grid_x1 + d6, grid_y2 + d7)
        print(f" H1 (offset from grid): TL=({grid_x1+avg_deltas[0]:.0f},{grid_y1+avg_deltas[1]:.0f}) "
              f"TR=({grid_x2+avg_deltas[2]:.0f},{grid_y1+avg_deltas[3]:.0f}) "
              f"BR=({grid_x2+avg_deltas[4]:.0f},{grid_y2+avg_deltas[5]:.0f}) "
              f"BL=({grid_x1+avg_deltas[6]:.0f},{grid_y2+avg_deltas[7]:.0f})")

        # Hypothesis 2: each pixel (r, c) predicts the 4 corners of its text
        # region as offsets from its own grid position:
        #   TL = (c*stride + d0, r*stride + d1)
        #   TR = (c*stride + d2, r*stride + d3)
        #   BR = (c*stride + d4, r*stride + d5)
        #   BL = (c*stride + d6, r*stride + d7)
        # Build all per-pixel predictions at once (vectorized, float64 to match
        # the previous list-of-Python-floats construction exactly):
        # [8, N] -> [N, 8] -> [N, 4, 2] where axis 1 is (TL, TR, BR, BL).
        corner_offsets = deltas[:, comp_mask].astype(np.float64).T.reshape(-1, 4, 2)
        bases = np.stack([xs, ys], axis=1).astype(np.float64) * stride  # [N, 2]
        all_corners = corner_offsets + bases[:, None, :]  # [N, 4, 2]

        # Average each corner across all pixels
        avg_corners = all_corners.mean(axis=0)
        print(f" H2 (per-pixel avg): TL=({avg_corners[0,0]:.0f},{avg_corners[0,1]:.0f}) "
              f"TR=({avg_corners[1,0]:.0f},{avg_corners[1,1]:.0f}) "
              f"BR=({avg_corners[2,0]:.0f},{avg_corners[2,1]:.0f}) "
              f"BL=({avg_corners[3,0]:.0f},{avg_corners[3,1]:.0f})")
        print(f" H2 original: TL=({avg_corners[0,0]/scale:.0f},{avg_corners[0,1]/scale:.0f}) "
              f"TR=({avg_corners[1,0]/scale:.0f},{avg_corners[1,1]/scale:.0f}) "
              f"BR=({avg_corners[2,0]/scale:.0f},{avg_corners[2,1]/scale:.0f}) "
              f"BL=({avg_corners[3,0]/scale:.0f},{avg_corners[3,1]/scale:.0f})")

        # Also try: TL = min of all TL predictions, BR = max of all BR predictions
        min_tl = all_corners[:, 0, :].min(axis=0)
        max_br = all_corners[:, 2, :].max(axis=0)
        print(f" H2 (min TL, max BR): ({min_tl[0]/scale:.0f},{min_tl[1]/scale:.0f})-({max_br[0]/scale:.0f},{max_br[1]/scale:.0f})")