oneocr / _archive /attempts /peek_header.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 3 days ago

3.09 kB

	import struct

	# Path of the .onemodel file inspected by this script.
	filepath = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"

	# The analysis below slices data[22636:22636 + 4096] as a "body sample", so
	# the prefix read here must reach that far. Reading only 23000 bytes silently
	# truncated that sample to 364 bytes.
	prefix_len = 22636 + 4096
	with open(filepath, "rb") as f:
	    data = f.read(prefix_len)
	    # whence=2 seeks relative to the end of the file, so tell() then
	    # reports the total file size without reading the rest of it.
	    f.seek(0, 2)
	    filesize = f.tell()

	print(f"File size: {filesize} bytes ({filesize/1024/1024:.2f} MB)")
	print()

	# Hexdump of the leading 512 bytes: offset, hex column, printable-ASCII column.
	print("=== First 512 bytes hex dump ===")
	for row_start in range(0, 512, 16):
	    row = data[row_start:row_start + 16]
	    hex_cells = [f"{byte:02x}" for byte in row]
	    # Bytes outside the printable ASCII range 32..126 are shown as a dot.
	    text_cells = [chr(byte) if 32 <= byte < 127 else "." for byte in row]
	    hex_column = " ".join(hex_cells)
	    text_column = "".join(text_cells)
	    print(f"{row_start:08x}: {hex_column:<48s} {text_column}")

	print()
	print("=== uint32 LE values at key offsets ===")
	# Decode the first 64 bytes as sixteen consecutive little-endian uint32
	# words and show each one in decimal and hex next to its offset.
	for pos in range(0, 64, 4):
	    (word,) = struct.unpack_from("<I", data, pos)
	    print(f" offset {pos:4d} (0x{pos:04x}): {word:12d} (0x{word:08x})")

	print()
	print("=== Check around offset 22636 (header size?) ===")
	# Dump six 16-byte rows: two before offset 22636 and four from it onward.
	center = 22636
	for row_start in range(center - 32, center + 64, 16):
	    row = data[row_start:row_start + 16]
	    hex_column = " ".join(f"{byte:02x}" for byte in row)
	    # Non-printable bytes (outside 32..126) are rendered as a dot.
	    text_column = "".join(
	        chr(byte) if 32 <= byte < 127 else "." for byte in row
	    )
	    print(f"{row_start:08x}: {hex_column:<48s} {text_column}")

	print()
	print("=== Entropy analysis of header vs body ===")
	from collections import Counter

	# Split the bytes read so far at offset 22636: everything before it is
	# treated as the header, and up to 4096 bytes after it as a body sample.
	# Both slices are bounded by however many bytes `data` actually holds.
	split_at = 22636
	header = data[:split_at]
	body_sample = data[split_at:split_at + 4096]

	# Number of distinct byte values seen in each region (out of 256 possible).
	header_hist = Counter(header)
	body_hist = Counter(body_sample)
	print(f" Header unique bytes: {len(header_hist)}/256")
	print(f" Body sample unique bytes: {len(body_hist)}/256")

	# Share of zero bytes in the header.
	header_len = len(header)
	null_count = header.count(0)
	null_pct = 100 * null_count / header_len
	print(f" Header null bytes: {null_count}/{header_len} ({null_pct:.1f}%)")

	# Search the header for runs of four or more printable ASCII bytes
	# (0x20..0x7e) and list at most the first 30 of them.
	print()
	print("=== Looking for potential sub-structures in header ===")
	import re
	printable_runs = re.findall(b'[\x20-\x7e]{4,}', header)
	if not printable_runs:
	    print(" No ASCII strings >= 4 chars found in header")
	else:
	    print(" ASCII strings found in header:")
	    for run in printable_runs[:30]:
	        text = run.decode('ascii', errors='replace')
	        print(f" {text}")

	# Show the first bytes of the file as hex (4 and 8 bytes) and as a raw
	# bytes literal, to make a magic number easy to spot.
	print()
	print("=== Magic number checks at offset 0 ===")
	magic = data[0:8]
	print(f" Bytes 0-3: {magic[:4].hex()}")
	print(f" Bytes 0-7: {magic.hex()}")
	print(f" As string: {magic}")

	# Print the first 64 bytes as decimal values, 16 per row, each row prefixed
	# with the index of its first byte. (The heading says "frequency", but what
	# is printed are the raw byte values, not counts.)
	print()
	print("=== Byte frequency in first 64 bytes ===")
	for row_start in range(0, 64, 16):
	    print(f" {row_start:3d}: ", end="")
	    for pos in range(row_start, row_start + 16):
	        print(f"{data[pos]:3d}", end=" ")
	    print()

	# Scan the first 256 bytes (or fewer, if the header is shorter) in 4-byte
	# steps and report every little-endian uint32 that lies strictly between 0
	# and the file size, since such a value could be an offset or a length.
	print()
	print("=== Potential offset/size table at start ===")
	scan_limit = min(256, len(header))
	for pos in range(0, scan_limit, 4):
	    (word,) = struct.unpack_from("<I", data, pos)
	    if not (0 < word < filesize):
	        continue
	    print(f" offset {pos}: uint32={word} (could be offset/size, {word/1024:.1f}KB)")

	# Show five overlapping 16-byte windows near the start of the file, each
	# with its count of distinct byte values, as candidates for an IV.
	print()
	print("=== 16-byte blocks that could be IV ===")
	for start in range(4, 21, 4):
	    candidate = data[start:start + 16]
	    distinct = len(set(candidate))
	    print(f" offset {start:3d}: {candidate.hex()} (unique bytes: {distinct}/16)")