oneocr / _archive /inspect_config_blob.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Deep-dive into model_11 and model_22 graph structure — handle binary config."""
import onnx
import numpy as np
from pathlib import Path
models_dir = Path("oneocr_extracted/onnx_models")


def _payload_bytes(init):
    """Return the stored payload of an initializer.

    Uses the first ``string_data`` entry when the tensor has any, otherwise
    falls back to ``raw_data``.
    """
    return init.string_data[0] if init.string_data else init.raw_data


def _print_initializers(graph):
    """Print one summary line per initializer (weight tensor) of *graph*."""
    # Function-scope import so the submodule is loaded even on onnx versions
    # where a plain ``import onnx`` does not expose it as an attribute.
    from onnx import numpy_helper

    print(f"\n Initializers ({len(graph.initializer)}):")
    for init in graph.initializer:
        if init.data_type == onnx.TensorProto.STRING:
            raw = _payload_bytes(init)
            print(f" {init.name}: STRING, {len(raw)} bytes (binary)")
            continue
        data = numpy_helper.to_array(init)
        if data.size == 0:
            # min()/max() raise ValueError on zero-size arrays.
            print(f" {init.name}: shape={data.shape}, dtype={data.dtype}, range=[empty]")
            continue
        print(f" {init.name}: shape={data.shape}, dtype={data.dtype}, "
              f"range=[{data.min():.4f}, {data.max():.4f}]")


def _print_nodes(graph):
    """Print every node of *graph* with its scalar/int-list attributes.

    Only FLOAT, INT and INTS attributes are printed; other attribute types
    are skipped.
    """
    attr_types = onnx.AttributeProto
    print(f"\n Nodes ({len(graph.node)}):")
    for i, node in enumerate(graph.node):
        domain_str = f" [{node.domain}]" if node.domain else ""
        print(f" [{i}] {node.op_type}{domain_str}: {list(node.input)} → {list(node.output)}")
        for attr in node.attribute:
            if attr.type == attr_types.INT:
                print(f" {attr.name} = {attr.i}")
            elif attr.type == attr_types.FLOAT:
                print(f" {attr.name} = {attr.f}")
            elif attr.type == attr_types.INTS:
                print(f" {attr.name} = {list(attr.ints)}")


def _analyze_config_blob(blob, in_dim=21, out_dim=50, max_header=96):
    """Probe *blob* for a float32 dense layer hidden behind a small header.

    Hypothesis: header + weight_matrix(in_dim × out_dim) + bias(out_dim).
    With in_dim=21, out_dim=50: 21*50 = 1050 floats = 4200 bytes of weights
    and 50 floats = 200 bytes of bias, i.e. 4400 bytes in total, which would
    leave 4492 - 4400 = 92 bytes of header for a 4492-byte blob.

    Args:
        blob: the raw config bytes.
        in_dim: assumed input width of the dense layer.
        out_dim: assumed output width of the dense layer.
        max_header: header sizes 0, 4, ..., max_header - 4 are tried.
    """
    print(f"\n ── feature/config analysis ──")
    print(f" Total bytes: {len(blob)}")
    print(f" First 32 bytes hex: {blob[:32].hex()}")

    # Try reading the first few uint32 values as a header.
    header_u32 = [int.from_bytes(blob[i:i+4], 'little') for i in range(0, min(96, len(blob)), 4)]
    print(f" First 24 uint32 LE values: {header_u32}")

    n_weights = in_dim * out_dim
    # Try a float32 interpretation after each candidate header size.
    for offset in range(0, max_header, 4):
        n_floats = (len(blob) - offset) // 4
        # "<= 0", not "== 0": for a blob shorter than the offset the floor
        # division is negative, and a negative count would trivially pass
        # the 70% threshold below.
        if n_floats <= 0:
            continue
        # Explicit little-endian, matching the header parse above; count and
        # offset avoid copying the slice.
        arr = np.frombuffer(blob, dtype="<f4", count=n_floats, offset=offset)
        finite = np.isfinite(arr)
        valid = finite.sum()
        reasonable = np.sum((np.abs(arr) < 10) & finite)
        if reasonable > n_floats * 0.7:  # >70% reasonable values
            print(f" *** offset={offset}: {n_floats} floats, {valid} finite, "
                  f"{reasonable} in [-10,10] ({100*reasonable/n_floats:.0f}%)")
            print(f" First 10: {arr[:10]}")
            print(f" Stats: mean={arr.mean():.4f}, std={arr.std():.4f}")
            # Check whether the data could be an in_dim × out_dim weight
            # matrix followed by its bias vector.
            if n_floats >= n_weights + out_dim:
                W = arr[:n_weights].reshape(in_dim, out_dim)
                b = arr[n_weights:n_weights + out_dim]
                print(f" As {in_dim}×{out_dim} weight: W_range=[{W.min():.4f},{W.max():.4f}], "
                      f"b_range=[{b.min():.4f},{b.max():.4f}]")
                # Test with random input.
                x = np.random.randn(1, in_dim).astype(np.float32)
                y = x @ W + b
                print(f" Test: input({in_dim}) → output({out_dim}), y_range=[{y.min():.4f},{y.max():.4f}]")


for idx in [11, 22]:
    # sorted() makes the choice of file deterministic when several match.
    matches = sorted(models_dir.glob(f"model_{idx:02d}_*"))
    if not matches:
        # Report and move on instead of failing with IndexError on matches[0].
        print(f"\nmodel_{idx:02d}: no file matching 'model_{idx:02d}_*' in {models_dir} (skipped)")
        continue
    model = onnx.load(str(matches[0]))

    print(f"\n{'='*70}")
    print(f"FULL GRAPH: model_{idx:02d}")
    print(f"{'='*70}")

    # All initializers (weights)
    _print_initializers(model.graph)

    # All nodes
    _print_nodes(model.graph)

    # Analyze feature/config blob
    for init in model.graph.initializer:
        if "config" in init.name.lower():
            _analyze_config_blob(bytes(_payload_bytes(init)))