File size: 3,617 Bytes
ce847d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""Crack the OneOCRFeatureExtract config blob — find the hidden weight matrix."""
import onnx
import numpy as np
from pathlib import Path

models_dir = Path("oneocr_extracted/onnx_models")

# Load model_11 — fail with a clear message if the extracted model is missing
# (the original bare [0] index would raise an opaque IndexError).
model_paths = list(models_dir.glob("model_11_*"))
if not model_paths:
    raise FileNotFoundError(f"No model_11_* file found in {models_dir}")
model = onnx.load(str(model_paths[0]))

# Get the opaque feature/config blob, stored as a string-typed initializer.
config_blob = None
for init in model.graph.initializer:
    if init.name == "feature/config":
        config_blob = bytes(init.string_data[0])
        break
# Guard before use: len(None) below would raise a confusing TypeError.
if config_blob is None:
    raise RuntimeError("Initializer 'feature/config' not found in model_11 graph")

print(f"Config blob size: {len(config_blob)} bytes")
print(f"As float32 count: {len(config_blob) // 4} = {len(config_blob) / 4}")

# Interpret the whole blob as little-endian float32 and summarize it.
all_floats = np.frombuffer(config_blob, dtype=np.float32)
finite_count = np.isfinite(all_floats).sum()
# Plausible weight values cluster in a small range; count them as a sanity check.
in_range_count = np.sum(np.abs(all_floats) < 10)
print("\nFull blob as float32:")
print(f"  Count: {len(all_floats)}")
print(f"  Finite: {finite_count}")
print(f"  In [-10,10]: {in_range_count}")
print(f"  Range: [{all_floats.min():.4f}, {all_floats.max():.4f}]")
print(f"  Mean: {all_floats.mean():.4f}, Std: {all_floats.std():.4f}")
print(f"  First 20: {all_floats[:20]}")

# 4492 bytes / 4 = 1123 floats
# Hypothesis: some header + 21×50 weight matrix + 50 bias
# 1123 - 1050 - 50 = 23 extra floats (92 bytes header)

# Scan candidate header sizes and layer shapes for an exact count match.
for header_floats in range(40):
    remaining = len(all_floats) - header_floats
    # An exact fit means remaining == in_dim*out_dim (weights) + out_dim (bias).
    for in_dim in (20, 21, 22):
        for out_dim in (48, 49, 50, 51, 52):
            needed = in_dim * out_dim + out_dim
            if remaining != needed:
                continue
            print(f"\n  *** MATCH: header={header_floats} ({header_floats*4}B) + "
                  f"W[{in_dim}×{out_dim}] + b[{out_dim}] = {needed} floats")
            w_end = header_floats + in_dim * out_dim
            W = all_floats[header_floats:w_end].reshape(in_dim, out_dim)
            b = all_floats[w_end:header_floats + needed]
            print(f"      W range: [{W.min():.4f}, {W.max():.4f}], mean={W.mean():.4f}")
            print(f"      b range: [{b.min():.4f}, {b.max():.4f}], mean={b.mean():.4f}")

            if header_floats > 0:
                header = all_floats[:header_floats]
                print(f"      Header values: {header}")

# Also try: the blob might encode multiple layers
# Or maybe it's quantized (int8/uint8)?
print(f"\n--- Trying int8 interpretation ---")
as_i8 = np.frombuffer(config_blob, dtype=np.int8)
print(f"  int8 range: [{as_i8.min()}, {as_i8.max()}]")

as_u8 = np.frombuffer(config_blob, dtype=np.uint8)
print(f"  uint8 range: [{as_u8.min()}, {as_u8.max()}]")

# Maybe float16? (only meaningful when the byte count is even)
if len(config_blob) % 2 == 0:
    as_f16 = np.frombuffer(config_blob, dtype=np.float16)
    finite_mask = np.isfinite(as_f16)
    finite_f16 = finite_mask.sum()
    print(f"  float16 count: {len(as_f16)}, finite: {finite_f16}")
    # Mostly-finite values suggest a plausible float16 encoding.
    if finite_f16 > len(as_f16) * 0.9:
        finite_vals = as_f16[finite_mask]
        print(f"  float16 could work! range=[{finite_vals.min():.4f}, {finite_vals.max():.4f}]")

# Check the Slice in model_11 to understand input dimensions
print(f"\n--- Checking Slice constants to understand feature extraction ---")
for node in model.graph.node:
    if node.op_type == "Constant":
        for attr in node.attribute:
            # Use the symbolic enum rather than the magic number 4.
            if attr.type == onnx.AttributeProto.TENSOR:
                data = onnx.numpy_helper.to_array(attr.t)
                print(f"  Constant '{node.output[0]}': {data}")

# Check Add and Div constants
for node in model.graph.node:
    if node.op_type in ("Add", "Div"):
        # Separator added: the original printed inputs and outputs fused together.
        print(f"\n  {node.op_type}: {list(node.input)} -> {list(node.output)}")