"""Debug detector output to understand word segmentation.""" import numpy as np import onnxruntime as ort from PIL import Image from pathlib import Path models_dir = Path("oneocr_extracted/onnx_models") img = Image.open("image.png").convert("RGB") w, h = img.size # Detector setup sess = ort.InferenceSession(str(next(models_dir.glob("model_00_*"))), providers=["CPUExecutionProvider"]) scale = 800 / max(h, w) dh = (int(h * scale) + 31) // 32 * 32 dw = (int(w * scale) + 31) // 32 * 32 img_d = np.array(img.resize((dw, dh), Image.LANCZOS), dtype=np.float32) img_d = img_d[:, :, ::-1] - np.array([102.9801, 115.9465, 122.7717], dtype=np.float32) data = img_d.transpose(2, 0, 1)[np.newaxis].astype(np.float32) im_info = np.array([[dh, dw, scale]], dtype=np.float32) outputs = sess.run(None, {"data": data, "im_info": im_info}) output_names = [o.name for o in sess.get_outputs()] out_dict = dict(zip(output_names, outputs)) # Analyze FPN2 (highest resolution) pixel_scores = out_dict["scores_hori_fpn2"][0, 0] # [56, 200] link_scores = out_dict["link_scores_hori_fpn2"][0] # [8, 56, 200] print(f"FPN2 shape: {pixel_scores.shape}") print(f"Pixel scores: min={pixel_scores.min():.4f} max={pixel_scores.max():.4f}") # Find text region text_mask = pixel_scores > 0.6 print(f"Text pixels (>0.6): {text_mask.sum()}") # Get the row/column range of text pixels ys, xs = np.where(text_mask) if len(ys) > 0: print(f"Text region: rows [{ys.min()}-{ys.max()}], cols [{xs.min()}-{xs.max()}]") # Check link scores within text region - do they separate words? # Link 2 is East neighbor (right), Link 6 is West neighbor (left) # If link between words is low, they should separate row_mid = (ys.min() + ys.max()) // 2 print(f"\nHorizontal link scores at row {row_mid} (East neighbor):") link_east = link_scores[2, row_mid, :] # E neighbor for x in range(xs.min(), xs.max()+1): ps = pixel_scores[row_mid, x] le = link_east[x] marker = "TEXT" if ps > 0.6 else " " link_marker = "LINK" if le > 0.5 else "gap " if ps > 0.3: print(f" col={x:3d}: pixel={ps:.3f} [{marker}] east_link={le:.3f} [{link_marker}]") # Also check if there are distinct "gap" regions in pixel scores print(f"\nPixel scores along row {row_mid}:") for x in range(max(0, xs.min()-2), min(pixel_scores.shape[1], xs.max()+3)): ps = pixel_scores[row_mid, x] bar = "█" * int(ps * 40) print(f" col={x:3d}: {ps:.3f} {bar}") # Try different thresholds for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]: mask = pixel_scores > thresh n = mask.sum() # Connected components using simple scan from scipy import ndimage try: labels, n_comps = ndimage.label(mask) print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components") for c in range(1, min(n_comps+1, 10)): comp_mask = labels == c area = comp_mask.sum() ys_c, xs_c = np.where(comp_mask) print(f" Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]") except ImportError: # Fallback without scipy print(f"Threshold {thresh}: {n} pixels")