oneocr / _archive /crack_endian.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Test big-endian float32 interpretation of OneOCRFeatureExtract config blob."""
import onnx
import numpy as np
from pathlib import Path
models_dir = Path("oneocr_extracted/onnx_models")

# Sort the matches so the same file is picked on every run (glob order is not
# guaranteed), and fail with an explicit message when nothing matches instead
# of an IndexError from indexing an empty list.
candidates = sorted(models_dir.glob("model_11_*"))
if not candidates:
    raise FileNotFoundError(
        f"No file matching 'model_11_*' found in {models_dir}"
    )
model = onnx.load(str(candidates[0]))

# Get config blob: the first string_data entry of the "feature/config" initializer.
for init in model.graph.initializer:
    if init.name == "feature/config":
        if not init.string_data:
            raise ValueError(
                "Initializer 'feature/config' has no string_data entries"
            )
        blob = bytes(init.string_data[0])
        break
else:
    # Without this branch a missing initializer would only surface on the
    # next line as a NameError on `blob`.
    raise LookupError(
        f"Initializer 'feature/config' not found in {candidates[0]}"
    )
print(f"Blob: {len(blob)} bytes = {len(blob) // 4} float32s")
# Decode the same bytes under both byte orders so their statistics can be
# compared. Only whole 4-byte words are decoded: without an explicit count,
# np.frombuffer raises ValueError when the buffer size is not a multiple of
# the item size.
n_words = len(blob) // 4
leftover = len(blob) - 4 * n_words
if leftover:
    print(f"Warning: ignoring {leftover} trailing byte(s) that do not form a float32")
be_arr = np.frombuffer(blob, dtype='>f4', count=n_words)  # big-endian
le_arr = np.frombuffer(blob, dtype='<f4', count=n_words)  # little-endian

print("\nBig-endian float32:")
be_finite = be_arr[np.isfinite(be_arr)]
print(f" Finite: {be_finite.size} / {len(be_arr)}")
# "In range" counts finite values whose magnitude is below 10.
in_range = np.sum(np.abs(be_finite) < 10)
# Guard the percentage against an empty array (division by zero).
be_pct = 100 * in_range / len(be_arr) if len(be_arr) else 0.0
print(f" In [-10,10]: {in_range} ({be_pct:.1f}%)")
if be_finite.size:
    print(f" Mean: {be_finite.mean():.4f}, Std: {be_finite.std():.4f}")
    print(f" Range: [{be_finite.min():.4f}, {be_finite.max():.4f}]")
else:
    # min()/max() raise ValueError on an empty array, so report instead.
    print(" Mean/Std/Range: n/a (no finite values)")
print(f" First 20: {be_arr[:20]}")

print("\nLittle-endian float32:")
le_finite = le_arr[np.isfinite(le_arr)]
print(f" Finite: {le_finite.size} / {len(le_arr)}")
in_range_le = np.sum(np.abs(le_finite) < 10)
le_pct = 100 * in_range_le / len(le_arr) if len(le_arr) else 0.0
print(f" In [-10,10]: {in_range_le} ({le_pct:.1f}%)")
# If big-endian works, try to extract a 21×50 weight matrix + 50 bias values.
# Note carried over from the original analysis: 1123 total floats
# (21*50 + 50 = 1100, which would leave a 23-float header).
# Check feasible dimensions: for every header length (counted in float32
# elements) and every candidate (in_d, out_d), test whether
# header + in_d*out_d weights + out_d biases accounts for the whole array.
print("\n--- Dimension search for big-endian ---")
for header in range(0, 40):
    remaining = len(be_arr) - header
    for in_d in [20, 21, 22]:
        for out_d in [48, 49, 50, 51, 52]:
            n_weights = in_d * out_d
            if remaining != n_weights + out_d:
                continue
            W = be_arr[header:header + n_weights].reshape(in_d, out_d)
            b = be_arr[header + n_weights:]
            # Compute the finite subsets once instead of re-masking per print.
            w_vals = W[np.isfinite(W)]
            b_vals = b[np.isfinite(b)]
            w_finite = w_vals.size
            w_reasonable = np.sum(np.abs(w_vals) < 10)
            # Only report layouts where more than 70% of the weights are small.
            if w_reasonable <= n_weights * 0.7:
                continue
            print(f" *** header={header} + W[{in_d}×{out_d}] + b[{out_d}]")
            print(f" W finite={w_finite}, reasonable={w_reasonable}")
            print(f" W range: [{w_vals.min():.4f}, {w_vals.max():.4f}]")
            if b_vals.size:
                print(f" b range: [{b_vals.min():.4f}, {b_vals.max():.4f}]")
            else:
                # min()/max() raise ValueError on an empty array.
                print(" b range: n/a (no finite values)")
# Also test: could be a byteswapped structure with a header.
# Slide a 21-float (84-byte) big-endian window over the first 100 bytes, in
# 4-byte steps, to find where the "nice" values start.
print("\n--- Finding good float32 regions (big-endian) ---")
window_len = 21                # float32 values per window
window_bytes = 4 * window_len  # 84 bytes
min_decent = 18                # report windows with at least this many good values
for start_byte in range(0, 100, 4):
    if start_byte + window_bytes > len(blob):
        # A partial window would either make frombuffer raise or be
        # misreported as "ALL 21 values reasonable"; stop scanning.
        break
    chunk = np.frombuffer(blob, dtype='>f4', count=window_len, offset=start_byte)
    # A value is "reasonable" when it is finite and its magnitude is below 10.
    reasonable = np.isfinite(chunk) & (np.abs(chunk) < 10)
    decent = np.sum(reasonable)
    if decent == window_len:
        print(f" offset={start_byte}: ALL {window_len} values reasonable: {chunk}")
        break
    if decent >= min_decent:
        print(f" offset={start_byte}: {decent}/{window_len} reasonable: {chunk}")