"""Analyze bbox_deltas to understand their format and how they refine box positions.""" import sys, os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import cv2 import numpy as np import onnxruntime as ort # Use a small image where we know the text positions img = cv2.imread('working_space/input/ocr_test (2).png') img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) h, w = img_rgb.shape[:2] print(f"Image: {w}x{h}") mean = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32) scale = 800 / max(h, w) scale = min(scale, 6.0) dh = (int(h * scale) + 31) // 32 * 32 dw = (int(w * scale) + 31) // 32 * 32 print(f"Scale={scale:.3f}, Det size: {dw}x{dh}") img_det = cv2.resize(img_rgb, (dw, dh)).astype(np.float32) det_data = (img_det[:,:,::-1] - mean).transpose(2,0,1)[np.newaxis] im_info = np.array([[dh, dw, scale]], dtype=np.float32) sess = ort.InferenceSession( 'oneocr_extracted/onnx_models/model_00_ir6_0.1.0_11282KB.onnx', providers=['CPUExecutionProvider'] ) outs = sess.run(None, {'data': det_data, 'im_info': im_info}) out_names = [o.name for o in sess.get_outputs()] out_dict = dict(zip(out_names, outs)) print(f"\nOutputs: {out_names}") for name in out_names: print(f" {name}: shape={out_dict[name].shape}, dtype={out_dict[name].dtype}") # Analyze bbox_deltas for FPN3 for level, stride in [("fpn3", 8)]: scores = out_dict[f'scores_hori_{level}'][0, 0] # [H, W] deltas = out_dict[f'bbox_deltas_hori_{level}'][0] # [8, H, W] links = out_dict[f'link_scores_hori_{level}'][0] # [8, H, W] fh, fw = scores.shape print(f"\n{level} (stride={stride}): score map {fw}x{fh}") # Find text pixels text_mask = scores > 0.6 text_ys, text_xs = np.where(text_mask) print(f" Text pixels: {len(text_ys)}") # Show deltas at text positions print(f"\n bbox_deltas stats at text pixels:") for ch in range(8): vals = deltas[ch][text_mask] print(f" ch{ch}: min={vals.min():.2f} max={vals.max():.2f} mean={vals.mean():.2f} std={vals.std():.2f}") # Show deltas for a specific text region (the first word cluster) from scipy import ndimage labeled, n = ndimage.label(text_mask) print(f"\n Components: {n}") for comp_id in range(1, min(n+1, 4)): ys, xs = np.where(labeled == comp_id) r_min, r_max = ys.min(), ys.max() c_min, c_max = xs.min(), xs.max() # Grid-based box (what we currently do) grid_x1 = c_min * stride grid_y1 = r_min * stride grid_x2 = (c_max + 1) * stride grid_y2 = (r_max + 1) * stride # Deltas at component pixels comp_mask = labeled == comp_id print(f"\n Component {comp_id}: grid box ({grid_x1},{grid_y1})-({grid_x2},{grid_y2}) in det coords") print(f" Original coords: ({grid_x1/scale:.0f},{grid_y1/scale:.0f})-({grid_x2/scale:.0f},{grid_y2/scale:.0f})") for ch in range(8): vals = deltas[ch][comp_mask] corner = ch // 2 coord = 'x' if ch % 2 == 0 else 'y' print(f" delta[{ch}] (corner{corner}.{coord}): min={vals.min():.2f} max={vals.max():.2f} mean={vals.mean():.2f}") # Try interpreting: deltas are per-pixel corner offsets from (c*stride, r*stride) # Average over all pixels in component avg_deltas = [deltas[ch][comp_mask].mean() for ch in range(8)] # Hypothesis 1: deltas are absolute offsets from grid box corners # TL = (grid_x1 + d0, grid_y1 + d1) # TR = (grid_x2 + d2, grid_y2_top + d3) # BR = (grid_x2 + d4, grid_y2 + d5) # BL = (grid_x1 + d6, grid_y2 + d7) print(f" H1 (offset from grid): TL=({grid_x1+avg_deltas[0]:.0f},{grid_y1+avg_deltas[1]:.0f}) " f"TR=({grid_x2+avg_deltas[2]:.0f},{grid_y1+avg_deltas[3]:.0f}) " f"BR=({grid_x2+avg_deltas[4]:.0f},{grid_y2+avg_deltas[5]:.0f}) " f"BL=({grid_x1+avg_deltas[6]:.0f},{grid_y2+avg_deltas[7]:.0f})") # Hypothesis 2: deltas are per-pixel, use min/max of corner deltas # For TL corner, use pixels near top-left, etc. # Actually, let's try: each pixel predicts the 4 corners of its text region # So for pixel (r, c), the predicted box is: # TL = (c*stride + d0, r*stride + d1) # TR = (c*stride + d2, r*stride + d3) # BR = (c*stride + d4, r*stride + d5) # BL = (c*stride + d6, r*stride + d7) # For the component, take the pixel-wise predictions and average or extremes all_corners = [] for idx in range(len(ys)): r, c = ys[idx], xs[idx] d = [float(deltas[ch, r, c]) for ch in range(8)] cx, cy = c * stride, r * stride tl = (cx + d[0], cy + d[1]) tr = (cx + d[2], cy + d[3]) br = (cx + d[4], cy + d[5]) bl = (cx + d[6], cy + d[7]) all_corners.append([tl, tr, br, bl]) all_corners = np.array(all_corners) # [N, 4, 2] # Average each corner across all pixels avg_corners = all_corners.mean(axis=0) print(f" H2 (per-pixel avg): TL=({avg_corners[0,0]:.0f},{avg_corners[0,1]:.0f}) " f"TR=({avg_corners[1,0]:.0f},{avg_corners[1,1]:.0f}) " f"BR=({avg_corners[2,0]:.0f},{avg_corners[2,1]:.0f}) " f"BL=({avg_corners[3,0]:.0f},{avg_corners[3,1]:.0f})") print(f" H2 original: TL=({avg_corners[0,0]/scale:.0f},{avg_corners[0,1]/scale:.0f}) " f"TR=({avg_corners[1,0]/scale:.0f},{avg_corners[1,1]/scale:.0f}) " f"BR=({avg_corners[2,0]/scale:.0f},{avg_corners[2,1]/scale:.0f}) " f"BL=({avg_corners[3,0]/scale:.0f},{avg_corners[3,1]/scale:.0f})") # Also try: TL = min of all TL predictions, BR = max of all BR predictions min_tl = all_corners[:, 0, :].min(axis=0) max_br = all_corners[:, 2, :].max(axis=0) print(f" H2 (min TL, max BR): ({min_tl[0]/scale:.0f},{min_tl[1]/scale:.0f})-({max_br[0]/scale:.0f},{max_br[1]/scale:.0f})")