# oneocr/_archive/debug_detector.py
# Author: OneOCR Dev
# Commit ce847d4: "OneOCR - reverse engineering complete, ONNX pipeline 53% match rate"
"""Debug detector output to understand word segmentation."""
import numpy as np
import onnxruntime as ort
from PIL import Image
from pathlib import Path
# Locate the extracted ONNX models and load the test image.
models_dir = Path("oneocr_extracted/onnx_models")
img = Image.open("image.png").convert("RGB")
w, h = img.size

# --- Detector setup ---
# model_00_* is the detection network; run on CPU only.
sess = ort.InferenceSession(
    str(next(models_dir.glob("model_00_*"))),
    providers=["CPUExecutionProvider"],
)

# Scale the longer side toward 800 px, then round each dimension up to a
# multiple of 32 (stride alignment for the detector input).
scale = 800 / max(h, w)
dh = (int(h * scale) + 31) // 32 * 32
dw = (int(w * scale) + 31) // 32 * 32

# Preprocess: resize, flip RGB->BGR, subtract per-channel means, NCHW layout.
resized = img.resize((dw, dh), Image.LANCZOS)
channel_means = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)
img_d = np.array(resized, dtype=np.float32)[:, :, ::-1] - channel_means
data = img_d.transpose(2, 0, 1)[np.newaxis].astype(np.float32)

# im_info carries the padded input size plus the resize scale.
im_info = np.array([[dh, dw, scale]], dtype=np.float32)

# Run the detector and map each output tensor to its graph name.
outputs = sess.run(None, {"data": data, "im_info": im_info})
output_names = [o.name for o in sess.get_outputs()]
out_dict = dict(zip(output_names, outputs))
# --- Analyze FPN2 (highest-resolution feature level) ---
# pixel_scores: per-pixel text score map (e.g. [56, 200]);
# link_scores: 8 neighbor-direction link maps stacked on axis 0.
pixel_scores = out_dict["scores_hori_fpn2"][0][0]
link_scores = out_dict["link_scores_hori_fpn2"][0]
print(f"FPN2 shape: {pixel_scores.shape}")
print(f"Pixel scores: min={pixel_scores.min():.4f} max={pixel_scores.max():.4f}")

# Binarize the score map to pick out candidate text pixels.
text_mask = pixel_scores > 0.6
print(f"Text pixels (>0.6): {text_mask.sum()}")
# Get the row/column range of text pixels.
# NOTE: indentation of this section was lost; the whole analysis below must
# sit inside the `if len(ys) > 0:` guard, otherwise ys.min()/xs.min() raise
# on an empty mask.
ys, xs = np.where(text_mask)
if len(ys) > 0:
    print(f"Text region: rows [{ys.min()}-{ys.max()}], cols [{xs.min()}-{xs.max()}]")

    # Check link scores within text region - do they separate words?
    # Link 2 is East neighbor (right), Link 6 is West neighbor (left).
    # If the link between words is low, they should separate.
    row_mid = (ys.min() + ys.max()) // 2
    print(f"\nHorizontal link scores at row {row_mid} (East neighbor):")
    link_east = link_scores[2, row_mid, :]  # E neighbor
    for x in range(xs.min(), xs.max() + 1):
        ps = pixel_scores[row_mid, x]
        le = link_east[x]
        # Pad markers to 4 chars so the columns line up.
        marker = "TEXT" if ps > 0.6 else "    "
        link_marker = "LINK" if le > 0.5 else "gap "
        if ps > 0.3:  # only show columns with some text evidence
            print(f"  col={x:3d}: pixel={ps:.3f} [{marker}] east_link={le:.3f} [{link_marker}]")

    # Also check if there are distinct "gap" regions in pixel scores,
    # scanning 2 columns beyond the detected extent on each side.
    print(f"\nPixel scores along row {row_mid}:")
    for x in range(max(0, xs.min() - 2), min(pixel_scores.shape[1], xs.max() + 3)):
        ps = pixel_scores[row_mid, x]
        bar = "█" * int(ps * 40)  # crude text bar chart of the score
        print(f"  col={x:3d}: {ps:.3f} {bar}")
# Try different thresholds and count connected components at each.
# BUG FIX: the original did `from scipy import ndimage` *outside* the try,
# so the `except ImportError` fallback could never trigger (a missing scipy
# raised before the try was entered). Import once, up front, and branch.
try:
    from scipy import ndimage
except ImportError:
    ndimage = None

for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]:
    mask = pixel_scores > thresh
    n = mask.sum()
    if ndimage is not None:
        # Connected components of the binarized score map.
        labels, n_comps = ndimage.label(mask)
        print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components")
        for c in range(1, min(n_comps + 1, 10)):  # report at most 9 components
            comp_mask = labels == c
            area = comp_mask.sum()
            ys_c, xs_c = np.where(comp_mask)
            print(f"  Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]")
    else:
        # Fallback without scipy: pixel count only, no component analysis.
        print(f"Threshold {thresh}: {n} pixels")