|
|
"""Crack the OneOCRFeatureExtract config blob — find the hidden weight matrix.""" |
|
|
import onnx |
|
|
import numpy as np |
|
|
from pathlib import Path |
|
|
|
|
|
# Directory that is searched for the model file.
models_dir = Path("oneocr_extracted/onnx_models")

# Sort the matches so the chosen file is deterministic (glob does not
# guarantee any ordering), and fail with a readable message instead of a bare
# IndexError when no file matches the pattern.
_model_candidates = sorted(models_dir.glob("model_11_*"))
if not _model_candidates:
    raise FileNotFoundError(
        f"No file matching 'model_11_*' found in {models_dir}"
    )

# Load the first matching file as an ONNX model.
model = onnx.load(str(_model_candidates[0]))
|
|
|
|
|
|
|
|
# Find the initializer named "feature/config"; the first entry of its
# string_data is the raw blob examined by the rest of this script.
_config_init = next(
    (init for init in model.graph.initializer if init.name == "feature/config"),
    None,
)

# Fail early with an explicit message: previously a missing initializer left
# config_blob as None and the script crashed later on len(None), and an empty
# string_data surfaced as an unexplained IndexError.
if _config_init is None:
    raise ValueError("Initializer 'feature/config' not found in the model graph")
if not _config_init.string_data:
    raise ValueError("Initializer 'feature/config' has no string_data payload")

config_blob = bytes(_config_init.string_data[0])

# Report the blob size in bytes, and how many 4-byte values it would hold
# (the true division shows whether the size is an exact multiple of 4).
print(f"Config blob size: {len(config_blob)} bytes")
print(f"As float32 count: {len(config_blob) // 4} = {len(config_blob) / 4}")
|
|
|
|
|
|
|
|
# Interpret the blob as a flat float32 array and print summary statistics.
#
# np.frombuffer raises ValueError when the buffer length is not a multiple of
# the item size, so only the leading bytes that form complete float32 values
# are used; any leftover trailing bytes are reported and ignored.
_usable_bytes = len(config_blob) - len(config_blob) % 4
all_floats = np.frombuffer(config_blob[:_usable_bytes], dtype=np.float32)

print("\nFull blob as float32:")
if _usable_bytes != len(config_blob):
    print(
        f" Note: ignoring {len(config_blob) - _usable_bytes} trailing byte(s) "
        "that do not form a complete float32"
    )
print(f" Count: {len(all_floats)}")
print(f" Finite: {np.isfinite(all_floats).sum()}")
print(f" In [-10,10]: {np.sum(np.abs(all_floats) < 10)}")
# min()/max() raise on an empty array, so only print these for non-empty data.
if all_floats.size:
    print(f" Range: [{all_floats.min():.4f}, {all_floats.max():.4f}]")
    print(f" Mean: {all_floats.mean():.4f}, Std: {all_floats.std():.4f}")
print(f" First 20: {all_floats[:20]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Brute-force search for a layout of the form
#     [header_floats values] + W (in_dim x out_dim values) + b (out_dim values)
# whose total length exactly equals the number of float32 values in the blob.
# Header sizes 0..39 are tried against a small grid of candidate dimensions.
_total_floats = len(all_floats)

for header_floats in range(40):
    remaining = _total_floats - header_floats

    for in_dim in (20, 21, 22):
        for out_dim in (48, 49, 50, 51, 52):
            _w_count = in_dim * out_dim
            needed = _w_count + out_dim
            if remaining != needed:
                continue

            print(f"\n *** MATCH: header={header_floats} ({header_floats*4}B) + "
                  f"W[{in_dim}×{out_dim}] + b[{out_dim}] = {needed} floats")

            # Everything after the header: first the matrix, then the vector.
            _payload = all_floats[header_floats:]
            W = _payload[:_w_count].reshape(in_dim, out_dim)
            b = _payload[_w_count:needed]
            print(f" W range: [{W.min():.4f}, {W.max():.4f}], mean={W.mean():.4f}")
            print(f" b range: [{b.min():.4f}, {b.max():.4f}], mean={b.mean():.4f}")

            # Show the skipped leading values when the layout has a header.
            if header_floats:
                header = all_floats[:header_floats]
                print(f" Header values: {header}")
|
|
|
|
|
|
|
|
|
|
|
# Reinterpret the same bytes as signed and as unsigned 8-bit integers and
# print the value range of each view.
int8_arr = np.frombuffer(config_blob, dtype=np.int8)
uint8_arr = np.frombuffer(config_blob, dtype=np.uint8)

print("\n--- Trying int8 interpretation ---")
for _label, _byte_view in (("int8", int8_arr), ("uint8", uint8_arr)):
    print(f" {_label} range: [{_byte_view.min()}, {_byte_view.max()}]")
|
|
|
|
|
|
|
|
# Try a float16 interpretation; this is only attempted when the blob has an
# even number of bytes (float16 values are 2 bytes each).
if len(config_blob) % 2 == 0:
    f16_arr = np.frombuffer(config_blob, dtype=np.float16)
    _finite_mask = np.isfinite(f16_arr)
    finite_f16 = _finite_mask.sum()
    print(f" float16 count: {len(f16_arr)}, finite: {finite_f16}")

    # Report the range of the finite values when more than 90% are finite.
    if finite_f16 > len(f16_arr) * 0.9:
        _finite_vals = f16_arr[_finite_mask]
        print(f" float16 could work! range=[{_finite_vals.min():.4f}, {_finite_vals.max():.4f}]")
|
|
|
|
|
|
|
|
# Print the tensor value of every Constant node in the graph, labelled with
# the name of the node's first output.
print("\n--- Checking Slice constants to understand feature extraction ---")
for node in model.graph.node:
    if node.op_type != "Constant":
        continue
    for attr in node.attribute:
        # Only tensor-valued attributes are converted; the named enum member
        # replaces the former magic number 4.
        if attr.type == onnx.AttributeProto.TENSOR:
            data = onnx.numpy_helper.to_array(attr.t)
            print(f" Constant '{node.output[0]}': {data}")
|
|
|
|
|
|
|
|
# List every Add and Div node together with its input and output names.
_arithmetic_ops = ("Add", "Div")
for node in model.graph.node:
    if node.op_type not in _arithmetic_ops:
        continue
    _inputs, _outputs = list(node.input), list(node.output)
    print(f"\n {node.op_type}: {_inputs} → {_outputs}")
|
|
|