oneocr / _archive /crack_config.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 about 16 hours ago

3.62 kB

	"""Crack the OneOCRFeatureExtract config blob — find the hidden weight matrix."""
	import onnx
	import numpy as np
	from pathlib import Path

	models_dir = Path("oneocr_extracted/onnx_models")

	# Load model_11. Only the "model_11_" prefix is fixed; the first path the
	# glob yields is used. Fail with a clear message instead of a bare
	# IndexError when nothing matches.
	model_path = next(models_dir.glob("model_11_*"), None)
	if model_path is None:
	    raise FileNotFoundError(
	        f"No file matching 'model_11_*' found in {models_dir}"
	    )
	model = onnx.load(str(model_path))

	# Get feature/config blob: the first string entry of the first initializer
	# named "feature/config".
	config_blob = None
	for init in model.graph.initializer:
	    if init.name == "feature/config":
	        if not init.string_data:
	            raise ValueError(
	                "Initializer 'feature/config' has no string_data entries"
	            )
	        config_blob = bytes(init.string_data[0])
	        break

	# Stop here rather than letting later code fail on len(None).
	if config_blob is None:
	    raise ValueError(
	        f"Initializer 'feature/config' not found in {model_path}"
	    )

	print(f"Config blob size: {len(config_blob)} bytes")
	print(f"As float32 count: {len(config_blob) // 4} = {len(config_blob) / 4}")

	# Full float32 interpretation.
	# Without an explicit count, np.frombuffer raises ValueError when the buffer
	# length is not a multiple of the item size, so read only the whole 4-byte
	# words and report any leftover bytes instead of crashing.
	float32_count, trailing_bytes = divmod(len(config_blob), 4)
	all_floats = np.frombuffer(config_blob, dtype=np.float32, count=float32_count)
	print("\nFull blob as float32:")
	if trailing_bytes:
	    print(f" Trailing bytes ignored: {trailing_bytes}")
	print(f" Count: {len(all_floats)}")
	print(f" Finite: {np.isfinite(all_floats).sum()}")
	print(f" In [-10,10]: {np.sum(np.abs(all_floats) < 10)}")
	if all_floats.size:
	    # min()/max() raise on a zero-size array, so skip the statistics then.
	    print(f" Range: [{all_floats.min():.4f}, {all_floats.max():.4f}]")
	    print(f" Mean: {all_floats.mean():.4f}, Std: {all_floats.std():.4f}")
	print(f" First 20: {all_floats[:20]}")

	# Per the original note: 4492 bytes / 4 = 1123 floats.
	# Working hypothesis: a header, then a 21×50 weight matrix, then a
	# 50-element bias. That would leave 1123 - 1050 - 50 = 23 floats
	# (92 bytes) for the header.

	# Scan candidate header lengths (0..39 floats) and a small grid of layer
	# shapes; report every combination where header + W + b accounts for the
	# whole float array.
	for header_floats in range(40):
	    remaining = len(all_floats) - header_floats
	    for in_dim in (20, 21, 22):
	        for out_dim in (48, 49, 50, 51, 52):
	            # Floats needed for an in_dim×out_dim matrix plus an out_dim bias.
	            needed = in_dim * out_dim + out_dim
	            if remaining != needed:
	                continue

	            print(
	                f"\n *** MATCH: header={header_floats} ({header_floats*4}B) + "
	                f"W[{in_dim}×{out_dim}] + b[{out_dim}] = {needed} floats"
	            )

	            # Weights come right after the header, the bias right after them.
	            weights_end = header_floats + in_dim * out_dim
	            W = all_floats[header_floats:weights_end].reshape(in_dim, out_dim)
	            b = all_floats[weights_end:header_floats + needed]
	            print(f" W range: [{W.min():.4f}, {W.max():.4f}], mean={W.mean():.4f}")
	            print(f" b range: [{b.min():.4f}, {b.max():.4f}], mean={b.mean():.4f}")

	            if header_floats:
	                header = all_floats[:header_floats]
	                print(f" Header values: {header}")

	# Alternative readings of the same bytes: the blob might encode several
	# layers, or hold quantized (int8/uint8) values.
	print("\n--- Trying int8 interpretation ---")
	int8_arr = np.frombuffer(config_blob, dtype=np.int8)
	int8_low, int8_high = int8_arr.min(), int8_arr.max()
	print(f" int8 range: [{int8_low}, {int8_high}]")

	uint8_arr = np.frombuffer(config_blob, dtype=np.uint8)
	uint8_low, uint8_high = uint8_arr.min(), uint8_arr.max()
	print(f" uint8 range: [{uint8_low}, {uint8_high}]")

	# Half-precision reading; only attempted when the byte count is even.
	if not len(config_blob) % 2:
	    f16_arr = np.frombuffer(config_blob, dtype=np.float16)
	    finite_mask = np.isfinite(f16_arr)
	    finite_f16 = finite_mask.sum()
	    print(f" float16 count: {len(f16_arr)}, finite: {finite_f16}")
	    # Report the value range only if more than 90% of the values are finite.
	    if finite_f16 > len(f16_arr) * 0.9:
	        finite_values = f16_arr[finite_mask]
	        print(f" float16 could work! range=[{finite_values.min():.4f}, {finite_values.max():.4f}]")

	# Print the tensor payload of every Constant node in model_11. The goal is
	# to read off the Slice parameters, but all Constant nodes are printed, not
	# only those that feed a Slice.
	print("\n--- Checking Slice constants to understand feature extraction ---")
	for node in model.graph.node:
	    if node.op_type != "Constant":
	        continue
	    for attr in node.attribute:
	        # Use the named enum value instead of the bare number 4.
	        if attr.type == onnx.AttributeProto.TENSOR:
	            t = attr.t
	            data = onnx.numpy_helper.to_array(t)
	            print(f" Constant '{node.output[0]}': {data}")

	# List each Add and Div node with the names of its inputs and outputs.
	for node in model.graph.node:
	    if node.op_type not in ("Add", "Div"):
	        continue
	    print(f"\n {node.op_type}: {list(node.input)} → {list(node.output)}")