oneocr / _archive /crack_config.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Crack the OneOCRFeatureExtract config blob — find the hidden weight matrix."""
import onnx
import numpy as np
from pathlib import Path
models_dir = Path("oneocr_extracted/onnx_models")

# Load model_11. The file is located by the "model_11_*" glob pattern; fail
# with a descriptive error when nothing matches instead of letting an empty
# list index raise a bare IndexError.
model_paths = list(models_dir.glob("model_11_*"))
if not model_paths:
    raise FileNotFoundError(
        f"No file matching 'model_11_*' found in {models_dir}"
    )
model = onnx.load(str(model_paths[0]))

# Get feature/config blob: take the first string_data entry of the
# initializer named "feature/config" as raw bytes.
config_blob = None
for init in model.graph.initializer:
    if init.name == "feature/config":
        config_blob = bytes(init.string_data[0])
        break
if config_blob is None:
    # Without this check the len() calls below fail with an opaque TypeError.
    raise LookupError(
        f"Initializer 'feature/config' not found in {model_paths[0]}"
    )

print(f"Config blob size: {len(config_blob)} bytes")
# Printing both the floor and the true quotient shows at a glance whether the
# blob is a whole number of float32 values.
print(f"As float32 count: {len(config_blob) // 4} = {len(config_blob) / 4}")
# Full float32 interpretation.
# np.frombuffer raises ValueError when the buffer length is not a multiple of
# the item size, so read only the whole float32 values and report any
# leftover bytes instead of aborting the exploration.
float_count, leftover_bytes = divmod(len(config_blob), 4)
all_floats = np.frombuffer(config_blob, dtype=np.float32, count=float_count)
if leftover_bytes:
    print(f"\nNote: ignoring {leftover_bytes} trailing byte(s) that do not form a float32")
print(f"\nFull blob as float32:")
print(f" Count: {len(all_floats)}")
# Non-finite values (NaN/inf) hint that parts of the blob are not float32 data.
print(f" Finite: {np.isfinite(all_floats).sum()}")
# Number of values with magnitude below 10.
print(f" In [-10,10]: {np.sum(np.abs(all_floats) < 10)}")
print(f" Range: [{all_floats.min():.4f}, {all_floats.max():.4f}]")
print(f" Mean: {all_floats.mean():.4f}, Std: {all_floats.std():.4f}")
print(f" First 20: {all_floats[:20]}")
# The blob is 4492 bytes, i.e. 1123 float32 values.
# Working hypothesis: [header] + 21×50 weight matrix + 50 bias values,
# which would leave 1123 - 1050 - 50 = 23 floats (92 bytes) of header.
# Scan candidate header lengths and nearby layer shapes for an exact fit.
total_floats = len(all_floats)
for header_floats in range(40):
    remaining = total_floats - header_floats
    for in_dim in (20, 21, 22):
        for out_dim in (48, 49, 50, 51, 52):
            # A dense layer needs in_dim*out_dim weights plus out_dim biases.
            weight_count = in_dim * out_dim
            needed = weight_count + out_dim
            if remaining != needed:
                continue

            print(f"\n *** MATCH: header={header_floats} ({header_floats*4}B) + "
                  f"W[{in_dim}×{out_dim}] + b[{out_dim}] = {needed} floats")

            # Weights follow the header directly; biases follow the weights.
            weights_end = header_floats + weight_count
            W = all_floats[header_floats:weights_end].reshape(in_dim, out_dim)
            b = all_floats[weights_end:header_floats + needed]
            print(f" W range: [{W.min():.4f}, {W.max():.4f}], mean={W.mean():.4f}")
            print(f" b range: [{b.min():.4f}, {b.max():.4f}], mean={b.mean():.4f}")

            # Show the leading values that the layer layout does not account for.
            if header_floats > 0:
                header = all_floats[:header_floats]
                print(f" Header values: {header}")
# Alternative readings: the blob might encode multiple layers, or the values
# might be quantized (int8/uint8) rather than stored as float32.
print(f"\n--- Trying int8 interpretation ---")
uint8_arr = np.frombuffer(config_blob, dtype=np.uint8)
# Reinterpret the same bytes as signed values instead of re-reading the blob.
int8_arr = uint8_arr.view(np.int8)
print(f" int8 range: [{int8_arr.min()}, {int8_arr.max()}]")
print(f" uint8 range: [{uint8_arr.min()}, {uint8_arr.max()}]")

# Maybe float16? Only possible when the blob is a whole number of 2-byte items.
blob_len_is_even = len(config_blob) % 2 == 0
if blob_len_is_even:
    f16_arr = np.frombuffer(config_blob, dtype=np.float16)
    finite_f16 = np.isfinite(f16_arr).sum()
    print(f" float16 count: {len(f16_arr)}, finite: {finite_f16}")
    # Report a value range only when more than 90% of the values are finite.
    mostly_finite = finite_f16 > len(f16_arr) * 0.9
    if mostly_finite:
        print(f" float16 could work! range=[{f16_arr[np.isfinite(f16_arr)].min():.4f}, {f16_arr[np.isfinite(f16_arr)].max():.4f}]")
# Check the Slice in model_11 to understand input dimensions.
# Prints the tensor payload of every Constant node in the graph, keyed by the
# node's first output name.
# Import the submodule explicitly: whether a plain `import onnx` exposes
# `onnx.numpy_helper` as an attribute depends on the installed onnx version.
from onnx import numpy_helper

print(f"\n--- Checking Slice constants to understand feature extraction ---")
for node in model.graph.node:
    if node.op_type != "Constant":
        continue
    for attr in node.attribute:
        # Use the named enum member instead of the bare value 4.
        if attr.type == onnx.AttributeProto.TENSOR:
            data = numpy_helper.to_array(attr.t)
            print(f" Constant '{node.output[0]}': {data}")
# List the input and output names of every Add and Div node in the graph.
arithmetic_ops = ("Add", "Div")
for node in (n for n in model.graph.node if n.op_type in arithmetic_ops):
    print(f"\n {node.op_type}: {list(node.input)}{list(node.output)}")