oneocr

File size: 3,215 Bytes

ce847d4

"""Debug detector output to understand word segmentation."""
import numpy as np
import onnxruntime as ort
from PIL import Image
from pathlib import Path

# Inputs: the directory of extracted ONNX models and the image to probe.
models_dir = Path("oneocr_extracted/onnx_models")
img = Image.open("image.png").convert("RGB")
w, h = img.size  # PIL reports size as (width, height)

# Detector setup
# The detector file is located by its "model_00_" filename prefix. Giving
# next() a default avoids a bare StopIteration when nothing matches, so the
# failure names the directory and pattern that were searched.
detector_path = next(models_dir.glob("model_00_*"), None)
if detector_path is None:
    raise FileNotFoundError(
        f"No detector model matching 'model_00_*' found in {models_dir}"
    )
sess = ort.InferenceSession(str(detector_path),
                            providers=["CPUExecutionProvider"])

def _ceil_to_multiple(value, base=32):
    """Round an integer up to the nearest multiple of ``base``."""
    return -(-value // base) * base


# Scale so the longer image side maps to 800, then round each target
# dimension up to a multiple of 32.
scale = 800 / max(h, w)
dh = _ceil_to_multiple(int(h * scale))
dw = _ceil_to_multiple(int(w * scale))

# PIL's resize takes (width, height); the array comes back height-first.
resized = img.resize((dw, dh), Image.LANCZOS)
rgb = np.array(resized, dtype=np.float32)

# Reverse the channel axis and subtract a fixed per-channel offset.
channel_offsets = np.array([102.9801, 115.9465, 122.7717], dtype=np.float32)
img_d = rgb[:, :, ::-1] - channel_offsets

# Move channels first and add a leading batch axis of size 1.
data = np.expand_dims(img_d.transpose(2, 0, 1), axis=0).astype(np.float32)
im_info = np.array([[dh, dw, scale]], dtype=np.float32)

# Run the detector once; None as the first argument requests every output.
feeds = {"data": data, "im_info": im_info}
outputs = sess.run(None, feeds)

# Index the returned arrays by output name for the analysis below.
output_names = [meta.name for meta in sess.get_outputs()]
out_dict = dict(zip(output_names, outputs))

# Analyze FPN2 (highest resolution)
# Take the first batch entry of each output; for the score map also the
# first channel. Shapes noted by the original author for their test image:
# scores [56, 200], links [8, 56, 200] -- presumably these vary with the
# input size, so treat them as an example only.
fpn2_scores = out_dict["scores_hori_fpn2"]
fpn2_links = out_dict["link_scores_hori_fpn2"]
pixel_scores = fpn2_scores[0, 0]
link_scores = fpn2_links[0]

score_lo = pixel_scores.min()
score_hi = pixel_scores.max()
print(f"FPN2 shape: {pixel_scores.shape}")
print(f"Pixel scores: min={score_lo:.4f} max={score_hi:.4f}")

# Find text region
# A cell counts as text when its score is strictly above 0.6.
text_mask = pixel_scores > 0.6
text_pixel_count = text_mask.sum()
print(f"Text pixels (>0.6): {text_pixel_count}")

# Get the row/column range of text pixels
ys, xs = np.nonzero(text_mask)
if ys.size > 0:
    top, bottom = ys.min(), ys.max()
    left, right = xs.min(), xs.max()
    print(f"Text region: rows [{top}-{bottom}], cols [{left}-{right}]")

    # Check link scores within text region - do they separate words?
    # Author's note: link channel 2 is the East neighbor (right) and channel 6
    # the West neighbor (left); a low link score between two words should
    # split them. (Channel meaning is not verifiable from this script.)
    row_mid = (top + bottom) // 2  # middle row of the text bounding box
    score_row = pixel_scores[row_mid]
    link_east = link_scores[2, row_mid, :]  # E neighbor

    print(f"\nHorizontal link scores at row {row_mid} (East neighbor):")
    for col in range(left, right + 1):
        score = score_row[col]
        east = link_east[col]
        # Skip weak cells. Written as "not >" so NaN scores are skipped too.
        if not score > 0.3:
            continue
        text_tag = "TEXT" if score > 0.6 else "    "
        link_tag = "LINK" if east > 0.5 else "gap "
        print(f"  col={col:3d}: pixel={score:.3f} [{text_tag}] east_link={east:.3f} [{link_tag}]")

    # Also check if there are distinct "gap" regions in pixel scores
    # The window extends two columns past each side, clamped to the map width.
    first_col = max(0, left - 2)
    stop_col = min(pixel_scores.shape[1], right + 3)
    print(f"\nPixel scores along row {row_mid}:")
    for col in range(first_col, stop_col):
        score = score_row[col]
        bar = "█" * int(score * 40)  # bar length is 40x the score
        print(f"  col={col:3d}: {score:.3f} {bar}")

# Try different thresholds
# scipy is optional here. The import must sit inside the try for the
# pixel-count-only fallback to be reachable; it is done once, not per
# threshold.
try:
    from scipy import ndimage
except ImportError:
    ndimage = None

for thresh in [0.5, 0.6, 0.7, 0.8, 0.9]:
    mask = pixel_scores > thresh
    n = mask.sum()
    if ndimage is None:
        # Fallback without scipy
        print(f"Threshold {thresh}: {n} pixels")
        continue

    # Connected components. ndimage.label's default structuring element
    # joins only edge-adjacent cells (no diagonals).
    labels, n_comps = ndimage.label(mask)
    print(f"\nThreshold {thresh}: {n} pixels, {n_comps} components")
    # At most the first 9 components are listed to keep the output short.
    for c in range(1, min(n_comps+1, 10)):
        comp_mask = labels == c
        area = comp_mask.sum()
        xs_c = np.where(comp_mask)[1]  # column indices of this component
        print(f"  Component {c}: area={area}, cols=[{xs_c.min()}-{xs_c.max()}]")