"""Crack the OneOCRFeatureExtract config blob — find the hidden weight matrix.

Exploratory script. It loads the first ``model_11_*`` file from the extracted
ONNX models directory, pulls the ``feature/config`` string initializer out of
the graph, and prints several raw reinterpretations of its bytes (float32,
int8/uint8, float16). It then dumps the graph's Constant tensors and the
inputs/outputs of every Add and Div node.
"""
from pathlib import Path

import numpy as np
import onnx
from onnx import numpy_helper

models_dir = Path("oneocr_extracted/onnx_models")

# Load model_11
# Fail with a readable message instead of a bare IndexError when the
# extraction step has not produced the model file.
model_path = next(models_dir.glob("model_11_*"), None)
if model_path is None:
    raise SystemExit(f"No model_11_* file found in {models_dir}")
model = onnx.load(str(model_path))

# Get feature/config blob
config_blob = None
for init in model.graph.initializer:
    if init.name == "feature/config":
        # The blob is read from the first string_data entry; an initializer
        # with no string_data leaves config_blob unset.
        if init.string_data:
            config_blob = bytes(init.string_data[0])
        break

if not config_blob:
    raise SystemExit(
        f"Initializer 'feature/config' is missing or empty in {model_path}"
    )

print(f"Config blob size: {len(config_blob)} bytes")
print(f"As float32 count: {len(config_blob) // 4} = {len(config_blob) / 4}")

# Full float32 interpretation
# Passing an explicit count lets frombuffer accept a blob whose length is not
# a multiple of 4; the leftover bytes are reported rather than crashing.
n_floats, trailing_bytes = divmod(len(config_blob), 4)
all_floats = np.frombuffer(config_blob, dtype=np.float32, count=n_floats)
if trailing_bytes:
    print(f"Note: ignoring {trailing_bytes} trailing byte(s) for float32 view")

print("\nFull blob as float32:")
print(f" Count: {len(all_floats)}")
print(f" Finite: {np.isfinite(all_floats).sum()}")
print(f" In [-10,10]: {np.sum(np.abs(all_floats) < 10)}")
print(f" Range: [{all_floats.min():.4f}, {all_floats.max():.4f}]")
print(f" Mean: {all_floats.mean():.4f}, Std: {all_floats.std():.4f}")
print(f" First 20: {all_floats[:20]}")

# 4492 bytes / 4 = 1123 floats
# Hypothesis: some header + 21×50 weight matrix + 50 bias
# 1123 - 1050 - 50 = 23 extra floats (92 bytes header)

# Try different header sizes
for header_floats in range(0, 40):
    remaining = len(all_floats) - header_floats
    # Check if remaining = in_dim * out_dim + out_dim for some dimensions
    for in_dim in [20, 21, 22]:
        for out_dim in [48, 49, 50, 51, 52]:
            needed = in_dim * out_dim + out_dim
            if remaining != needed:
                continue
            print(f"\n *** MATCH: header={header_floats} ({header_floats*4}B) + "
                  f"W[{in_dim}×{out_dim}] + b[{out_dim}] = {needed} floats")
            # Layout assumed by the hypothesis: header, then W in row-major
            # (in_dim, out_dim) order, then the bias vector.
            w_end = header_floats + in_dim * out_dim
            W = all_floats[header_floats:w_end].reshape(in_dim, out_dim)
            b = all_floats[w_end:header_floats + needed]
            print(f" W range: [{W.min():.4f}, {W.max():.4f}], mean={W.mean():.4f}")
            print(f" b range: [{b.min():.4f}, {b.max():.4f}], mean={b.mean():.4f}")
            if header_floats > 0:
                header = all_floats[:header_floats]
                print(f" Header values: {header}")

# Also try: the blob might encode multiple layers
# Or maybe it's quantized (int8/uint8)?
print("\n--- Trying int8 interpretation ---")
int8_arr = np.frombuffer(config_blob, dtype=np.int8)
print(f" int8 range: [{int8_arr.min()}, {int8_arr.max()}]")
uint8_arr = np.frombuffer(config_blob, dtype=np.uint8)
print(f" uint8 range: [{uint8_arr.min()}, {uint8_arr.max()}]")

# Maybe float16?
if len(config_blob) % 2 == 0:
    f16_arr = np.frombuffer(config_blob, dtype=np.float16)
    # Compute the finite subset once; it feeds both the count and the range.
    finite_vals = f16_arr[np.isfinite(f16_arr)]
    finite_f16 = len(finite_vals)
    print(f" float16 count: {len(f16_arr)}, finite: {finite_f16}")
    if finite_f16 > len(f16_arr) * 0.9:
        print(f" float16 could work! range=[{finite_vals.min():.4f}, {finite_vals.max():.4f}]")

# Check the Slice in model_11 to understand input dimensions
print("\n--- Checking Slice constants to understand feature extraction ---")
for node in model.graph.node:
    if node.op_type != "Constant":
        continue
    for attr in node.attribute:
        # Only tensor-valued attributes can be converted with to_array.
        if attr.type == onnx.AttributeProto.TENSOR:
            data = numpy_helper.to_array(attr.t)
            print(f" Constant '{node.output[0]}': {data}")

# Check Add and Div constants
for node in model.graph.node:
    if node.op_type in ("Add", "Div"):
        print(f"\n {node.op_type}: {list(node.input)} → {list(node.output)}")