oneocr / _archive /inspect_config_blob.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 2 days ago

4.06 kB

	"""Deep-dive into model_11 and model_22 graph structure — handle binary config."""
	import onnx
	import numpy as np
	from pathlib import Path

	models_dir = Path("oneocr_extracted/onnx_models")

	# Hypothesis for the config blob layout:
	#   header + weight_matrix(input_dim × output_dim) + bias(output_dim)
	# With input=21, output=50 that is 1050 + 50 float32 values (4400 bytes),
	# which would leave a 92-byte header in a 4492-byte blob.
	HYPOTHESIS_IN_DIM = 21
	HYPOTHESIS_OUT_DIM = 50

	# Leading bytes probed as a possible header: 24 uint32 words / 24 candidate offsets.
	HEADER_PROBE_BYTES = 96

	# Share of float32 values that must lie in (-10, 10) for an offset to be reported.
	REASONABLE_FRACTION = 0.7


	def initializer_payload(init):
	    """Return the byte payload of a STRING initializer.

	    Uses the first ``string_data`` element when there is one and falls back
	    to ``raw_data`` otherwise.
	    """
	    return init.string_data[0] if init.string_data else init.raw_data


	def header_words(blob, probe_bytes=HEADER_PROBE_BYTES):
	    """Decode the first ``probe_bytes`` of ``blob`` as little-endian uint32 words.

	    Blobs shorter than ``probe_bytes`` yield fewer words; a trailing partial
	    word is decoded from whatever bytes are available.
	    """
	    limit = min(probe_bytes, len(blob))
	    return [int.from_bytes(blob[i:i + 4], "little") for i in range(0, limit, 4)]


	def float_view(blob, offset):
	    """Interpret ``blob[offset:]`` as float32 values without copying.

	    Trailing bytes that do not fill a whole float are ignored.  Returns
	    ``None`` when fewer than four bytes remain after ``offset``.  That
	    includes an ``offset`` past the end of the blob: the remaining length is
	    negative there, and floor division would otherwise produce a negative
	    "float count" that slips past an ``== 0`` check.
	    """
	    n_floats = (len(blob) - offset) // 4
	    if n_floats <= 0:
	        return None
	    return np.frombuffer(blob, dtype=np.float32, count=n_floats, offset=offset)


	def plausible_counts(arr):
	    """Return ``(finite, reasonable)`` element counts for ``arr``.

	    ``finite`` counts entries that are neither NaN nor infinite;
	    ``reasonable`` counts the finite entries whose magnitude is below 10.
	    """
	    finite_mask = np.isfinite(arr)
	    return finite_mask.sum(), (finite_mask & (np.abs(arr) < 10)).sum()


	def try_dense_layer(arr, in_dim=HYPOTHESIS_IN_DIM, out_dim=HYPOTHESIS_OUT_DIM):
	    """Print how ``arr`` looks when read as an ``in_dim``×``out_dim`` weight plus bias.

	    Does nothing when ``arr`` holds fewer than ``in_dim * out_dim + out_dim``
	    values.
	    """
	    n_weights = in_dim * out_dim
	    if arr.size < n_weights + out_dim:
	        return
	    W = arr[:n_weights].reshape(in_dim, out_dim)
	    b = arr[n_weights:n_weights + out_dim]
	    print(f" As {in_dim}×{out_dim} weight: W_range=[{W.min():.4f},{W.max():.4f}], "
	          f"b_range=[{b.min():.4f},{b.max():.4f}]")
	    # Fixed seed: every offset (and every run) is probed with the same input,
	    # so the printed output ranges are directly comparable.
	    x = np.random.default_rng(0).standard_normal((1, in_dim)).astype(np.float32)
	    y = x @ W + b
	    print(f" Test: input({in_dim}) → output({out_dim}), "
	          f"y_range=[{y.min():.4f},{y.max():.4f}]")


	def analyze_config_blob(blob):
	    """Print header words and plausible float32 interpretations of ``blob``.

	    Every 4-byte-aligned offset inside the probed header region is tried as
	    the start of a float32 array; offsets where more than
	    ``REASONABLE_FRACTION`` of the values look like weights are reported and
	    checked against the dense-layer hypothesis.
	    """
	    print("\n ── feature/config analysis ──")
	    print(f" Total bytes: {len(blob)}")
	    print(f" First 32 bytes hex: {blob[:32].hex()}")

	    # Try reading the first few uint32 as header.
	    words = header_words(blob)
	    print(f" First {len(words)} uint32 LE values: {words}")

	    # Try float32 interpretation after various offsets.
	    for offset in range(0, HEADER_PROBE_BYTES, 4):
	        arr = float_view(blob, offset)
	        if arr is None:
	            continue
	        n_floats = arr.size
	        valid, reasonable = plausible_counts(arr)
	        if reasonable <= n_floats * REASONABLE_FRACTION:
	            continue
	        print(f" *** offset={offset}: {n_floats} floats, {valid} finite, "
	              f"{reasonable} in [-10,10] ({100*reasonable/n_floats:.0f}%)")
	        print(f" First 10: {arr[:10]}")
	        print(f" Stats: mean={arr.mean():.4f}, std={arr.std():.4f}")
	        try_dense_layer(arr)


	def dump_initializers(graph):
	    """Print name, shape, dtype and value range of every initializer in ``graph``."""
	    print(f"\n Initializers ({len(graph.initializer)}):")
	    for init in graph.initializer:
	        if init.data_type == onnx.TensorProto.STRING:
	            raw = initializer_payload(init)
	            print(f" {init.name}: STRING, {len(raw)} bytes (binary)")
	            continue
	        data = onnx.numpy_helper.to_array(init)
	        if data.size == 0:
	            # min()/max() raise ValueError on zero-size arrays.
	            print(f" {init.name}: shape={data.shape}, dtype={data.dtype}, empty")
	            continue
	        print(f" {init.name}: shape={data.shape}, dtype={data.dtype}, "
	              f"range=[{data.min():.4f}, {data.max():.4f}]")


	def dump_nodes(graph):
	    """Print every node of ``graph`` with its INT, FLOAT and INTS attributes."""
	    kinds = onnx.AttributeProto
	    print(f"\n Nodes ({len(graph.node)}):")
	    for i, node in enumerate(graph.node):
	        domain_str = f" [{node.domain}]" if node.domain else ""
	        print(f" [{i}] {node.op_type}{domain_str}: {list(node.input)} → {list(node.output)}")
	        for attr in node.attribute:
	            if attr.type == kinds.INT:
	                print(f" {attr.name} = {attr.i}")
	            elif attr.type == kinds.FLOAT:
	                print(f" {attr.name} = {attr.f}")
	            elif attr.type == kinds.INTS:
	                print(f" {attr.name} = {list(attr.ints)}")


	for idx in [11, 22]:
	    # sorted() makes the pick deterministic when several files share the prefix.
	    matches = sorted(models_dir.glob(f"model_{idx:02d}_*"))
	    if not matches:
	        print(f"\nmodel_{idx:02d}: no file matching 'model_{idx:02d}_*' "
	              f"in {models_dir}, skipping")
	        continue
	    model = onnx.load(str(matches[0]))

	    print(f"\n{'='*70}")
	    print(f"FULL GRAPH: model_{idx:02d}")
	    print(f"{'='*70}")

	    # All initializers (weights)
	    dump_initializers(model.graph)

	    # All nodes
	    dump_nodes(model.graph)

	    # Analyze feature/config blob
	    for init in model.graph.initializer:
	        if "config" in init.name.lower():
	            analyze_config_blob(bytes(initializer_payload(init)))