# oneocr/_archive/debug_detector.py
# Source: OneOCR Dev repo, commit ce847d4
# ("OneOCR - reverse engineering complete, ONNX pipeline 53% match rate")
"""Debug detector output to understand word segmentation."""
import numpy as np
import onnxruntime as ort
from PIL import Image
from pathlib import Path
# --- Detector setup and inference --------------------------------------
models_dir = Path("oneocr_extracted/onnx_models")
img = Image.open("image.png").convert("RGB")
w, h = img.size

# Detector setup: model_00_* is the text-detection model in the dump.
detector = ort.InferenceSession(
    str(next(models_dir.glob("model_00_*"))),
    providers=["CPUExecutionProvider"],
)

# Scale so the longest side maps to 800, then round each side up to a
# multiple of 32 (presumably the detector's stride requirement — confirm).
scale = 800 / max(h, w)
det_h = (int(h * scale) + 31) // 32 * 32
det_w = (int(w * scale) + 31) // 32 * 32

# RGB -> BGR flip plus per-channel mean subtraction (these constants look
# like the classic Caffe BGR means — verify against the original pipeline).
resized = np.array(img.resize((det_w, det_h), Image.LANCZOS), dtype=np.float32)
mean_bgr = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)
chw = (resized[:, :, ::-1] - mean_bgr).transpose(2, 0, 1)
data = chw[np.newaxis].astype(np.float32)
im_info = np.array([[det_h, det_w, scale]], dtype=np.float32)

outputs = detector.run(None, {"data": data, "im_info": im_info})
# Map each output tensor to its declared name for lookup by key.
out_dict = dict(zip((o.name for o in detector.get_outputs()), outputs))
# --- Analyze FPN2 (the highest-resolution detector head) ----------------
pixel_scores = out_dict["scores_hori_fpn2"][0, 0]   # [56, 200]
link_scores = out_dict["link_scores_hori_fpn2"][0]  # [8, 56, 200]
print(f"FPN2 shape: {pixel_scores.shape}")
print(f"Pixel scores: min={pixel_scores.min():.4f} max={pixel_scores.max():.4f}")

# Binarize at a 0.6 confidence cut to locate the text region.
text_mask = pixel_scores > 0.6
print(f"Text pixels (>0.6): {text_mask.sum()}")

ys, xs = np.where(text_mask)
if len(ys) > 0:
    print(f"Text region: rows [{ys.min()}-{ys.max()}], cols [{xs.min()}-{xs.max()}]")

    # Do link scores separate words inside the text region?
    # Link channel 2 = East neighbor (right), channel 6 = West neighbor (left);
    # a low link between two words should let them split apart.
    row_mid = (ys.min() + ys.max()) // 2
    print(f"\nHorizontal link scores at row {row_mid} (East neighbor):")
    link_east = link_scores[2, row_mid, :]  # E neighbor
    for x in range(xs.min(), xs.max() + 1):
        ps = pixel_scores[row_mid, x]
        le = link_east[x]
        marker = "TEXT" if ps > 0.6 else "    "
        link_marker = "LINK" if le > 0.5 else "gap "
        if ps > 0.3:
            print(f"  col={x:3d}: pixel={ps:.3f} [{marker}] east_link={le:.3f} [{link_marker}]")

    # Also render the pixel scores along the middle row as a bar chart,
    # to reveal any distinct "gap" regions between words.
    print(f"\nPixel scores along row {row_mid}:")
    for x in range(max(0, xs.min() - 2), min(pixel_scores.shape[1], xs.max() + 3)):
        ps = pixel_scores[row_mid, x]
        bar = "█" * int(ps * 40)
        print(f"  col={x:3d}: {ps:.3f} {bar}")
# --- Sweep binarization thresholds and count connected components -------
# BUG FIX: the original did `from scipy import ndimage` *before* its
# `try`, so the `except ImportError` fallback could never fire (a missing
# scipy raised at the import line, outside the try; `ndimage.label` itself
# never raises ImportError). The import was also repeated on every loop
# iteration. Import once up front and guard explicitly instead.
try:
    from scipy import ndimage
except ImportError:
    ndimage = None  # Fallback without scipy: report pixel counts only.

for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]:
    mask = pixel_scores > thresh
    n = mask.sum()
    if ndimage is None:
        print(f"Threshold {thresh}: {n} pixels")
        continue
    # Connected components: each label is one candidate word/text blob.
    labels, n_comps = ndimage.label(mask)
    print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components")
    for c in range(1, min(n_comps + 1, 10)):  # cap output at 9 components
        comp_mask = labels == c
        area = comp_mask.sum()
        ys_c, xs_c = np.where(comp_mask)
        print(f"  Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]")