oneocr / _archive /analysis /analyze_model.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Analyze oneocr.onemodel file format."""
import os
import struct
# Built with os.path.join so the script also runs where "\" is not a path
# separator; on Windows this yields the same "ocr_data\oneocr.onemodel".
MODEL_PATH = os.path.join("ocr_data", "oneocr.onemodel")

# The whole file is read into memory; later analysis indexes into `data`.
with open(MODEL_PATH, "rb") as f:
    data = f.read()

# The header dump decodes fixed-width integers from the first 8 bytes.
# Stop with a readable message instead of a struct.error traceback when the
# file is too short to contain them.
if len(data) < 8:
    raise SystemExit(
        f"{MODEL_PATH}: only {len(data)} bytes, need at least 8 to inspect the header"
    )

print(f"Total size: {len(data)} bytes = {len(data)/1024/1024:.2f} MB")
print(f"First 8 bytes (hex): {data[:8].hex()}")
# Interpret the leading bytes as little-endian unsigned integers.
print(f"First 4 bytes as uint32 LE: {struct.unpack('<I', data[:4])[0]}")
print(f"First 8 bytes as uint64 LE: {struct.unpack('<Q', data[:8])[0]}")
print()
# Scan the file for ONNX / ONNX Runtime related byte signatures.
# Only the first occurrence of each signature is reported.
patterns = [
    b"onnx", b"ai.onnx", b"ONNX", b"ort_", b"onnxruntime",
    b"ir_version", b"ORTM", b"FORT", b"ORT ", b"model",
    b"graph", b"Conv", b"Relu", b"Softmax", b"tensor",
    b"float", b"int64", b"opset", b"producer",
]

for needle in patterns:
    pos = data.find(needle)
    if pos < 0:
        continue  # bytes.find returns -1 when the signature is absent
    # Show up to 8 bytes on either side of the hit, clamped to the file bounds.
    lo = max(0, pos - 8)
    hi = min(len(data), pos + len(needle) + 8)
    print(f"Found '{needle.decode(errors='replace')}' at offset {pos} (0x{pos:x})")
    print(f" Context hex: {data[lo:hi].hex()}")
# NOTE(review): the original indentation was lost; this blank separator is
# assumed to follow the loop rather than each match - confirm.
print()
# Check entropy by sections
import collections
def entropy_score(chunk: bytes) -> int:
    """Return the number of distinct values in *chunk*.

    Despite the name, this is not Shannon entropy: it is simply a count of
    how many different values occur, which the script prints as "N/256
    unique bytes".

    Args:
        chunk: The data to inspect; the script passes ``bytes`` slices.

    Returns:
        The count of distinct values: 0 for an empty chunk and at most 256
        when *chunk* is a ``bytes`` object.
    """
    # A set is sufficient: per-value frequencies were computed but never used.
    return len(set(chunk))
# Walk the first 64 KB in 4 KB steps and label each block by how many
# distinct byte values it contains.
print("Entropy analysis (unique byte values per 4KB block):")
scan_end = min(len(data), 64 * 1024)
for offset in range(0, scan_end, 4096):
    distinct = entropy_score(data[offset:offset + 4096])
    if distinct > 240:
        verdict = "(encrypted/compressed)"
    elif distinct < 100:
        verdict = "(structured)"
    else:
        verdict = ""
    # The verdict is a separate print argument, so a space separator is
    # emitted even when the verdict is empty.
    print(f" Offset 0x{offset:06x}: {distinct}/256 unique bytes", verdict)
# Hypothesis 1: the first little-endian uint32 is a header length.
# Print it and, if it points inside the file, the 32 bytes found there.
(hdr_size,) = struct.unpack('<I', data[:4])
print(f"\nFirst uint32 = {hdr_size} (0x{hdr_size:x})")
print(f"If header size, data starts at offset {hdr_size}")
if hdr_size < len(data):
    after_header = data[hdr_size:hdr_size+32]
    print(f"Data at offset {hdr_size}: {after_header.hex()}")

# Hypothesis 2: the payload begins at offset 8.
# Show bytes 8..15 and the distinct-byte count of the 4 KB starting there.
bytes_8_to_16 = data[8:16]
print(f"\nBytes 8-16: {bytes_8_to_16.hex()}")
distinct_from_8 = entropy_score(data[8:8+4096])
print(f"If offset 8 is data: unique bytes = {distinct_from_8}/256")
# Single-byte XOR probe: derive the key that would turn the first byte into
# the expected ONNX protobuf lead byte, then apply it to the first 16 bytes.
print("\nXOR key analysis (checking if XOR of first bytes gives ONNX protobuf header):")
# ONNX protobuf starts with 0x08 (varint, field 1 = ir_version), so the
# candidate key is whatever XORs the observed first byte onto 0x08.
key_guess = 0x08 ^ data[0]
print(f" If first byte should be 0x08: XOR key = 0x{key_guess:02x}")
# Apply the candidate key to the leading 16 bytes for visual inspection.
unmasked = bytes([value ^ key_guess for value in data[:16]])
print(f" XOR'd first 16 bytes: {unmasked.hex()}")