oneocr / _archive /attempts /peek_header.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
import struct
import re
import sys
from collections import Counter

# Exploratory dump of the leading bytes of a ``.onemodel`` file.
#
# The script prints, in order: the file size, a hex dump of the first 512
# bytes, little-endian uint32 values at the first offsets, a hex dump around
# the suspected header boundary, unique-byte / null-byte statistics for the
# header and a body sample, printable ASCII runs found in the header, the
# first bytes as a possible magic number, the first 64 byte values in decimal,
# uint32 values that could be offsets or sizes, and a few 16-byte windows that
# could hold an IV.
#
# Usage: python peek_header.py [path-to-model]

# Default input, used when no path is given on the command line.
DEFAULT_MODEL_PATH = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.onemodel"

# Suspected header length in bytes (hypothesis being checked by this script).
HEADER_SIZE = 22636

# Number of bytes after the header that are sampled as "body".
BODY_SAMPLE_SIZE = 4096

# Bytes shown per hex-dump / byte-value row.
ROW_WIDTH = 16


def _read_prefix(path, size):
    """Return ``(first size bytes of the file, total file size in bytes)``."""
    with open(path, "rb") as f:
        data = f.read(size)
        f.seek(0, 2)  # whence=2: seek relative to the end of the file
        return data, f.tell()


def _hex_rows(data, start, stop):
    """Format ``data[start:stop]`` as hex-dump rows of ROW_WIDTH bytes.

    ``start`` must be non-negative. Rows that would begin past the end of
    ``data`` are omitted, so a short file yields fewer rows instead of blank
    ones.
    """
    rows = []
    for pos in range(start, min(stop, len(data)), ROW_WIDTH):
        chunk = data[pos:pos + ROW_WIDTH]
        hex_part = " ".join(f"{b:02x}" for b in chunk)
        ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in chunk)
        rows.append(f"{pos:08x}: {hex_part:<48s} {ascii_part}")
    return rows


def _uint32_offsets(data, stop):
    """Return the 4-byte-aligned offsets below ``stop`` that hold a full uint32."""
    # A uint32 at offset o needs bytes o..o+3, hence the ``len(data) - 3`` bound.
    return range(0, min(stop, len(data) - 3), 4)


def _byte_value_rows(data, count):
    """Format the first ``count`` bytes of ``data`` as decimal values, ROW_WIDTH per row."""
    rows = []
    for pos in range(0, min(count, len(data)), ROW_WIDTH):
        chunk = data[pos:min(pos + ROW_WIDTH, count)]
        cells = "".join(f"{b:3d} " for b in chunk)
        rows.append(f" {pos:3d}: {cells}")
    return rows


def main(path=DEFAULT_MODEL_PATH):
    """Print the header analysis report for the file at ``path``.

    Reads only the first ``HEADER_SIZE + BODY_SAMPLE_SIZE`` bytes, so the body
    sample really spans ``BODY_SAMPLE_SIZE`` bytes when the file is long
    enough. Every section is bounded by the number of bytes actually read, so
    short or empty files produce a shorter report rather than an exception.

    Raises:
        OSError: if ``path`` cannot be opened or read.
    """
    data, filesize = _read_prefix(path, HEADER_SIZE + BODY_SAMPLE_SIZE)
    header = data[:HEADER_SIZE]
    body_sample = data[HEADER_SIZE:HEADER_SIZE + BODY_SAMPLE_SIZE]

    print(f"File size: {filesize} bytes ({filesize/1024/1024:.2f} MB)")
    print()

    print("=== First 512 bytes hex dump ===")
    for row in _hex_rows(data, 0, 512):
        print(row)
    print()

    print("=== uint32 LE values at key offsets ===")
    for off in _uint32_offsets(data, 64):
        val = struct.unpack_from("<I", data, off)[0]
        print(f" offset {off:4d} (0x{off:04x}): {val:12d} (0x{val:08x})")
    print()

    print(f"=== Check around offset {HEADER_SIZE} (header size?) ===")
    for row in _hex_rows(data, HEADER_SIZE - 32, HEADER_SIZE + 64):
        print(row)
    print()

    print("=== Entropy analysis of header vs body ===")
    print(f" Header unique bytes: {len(Counter(header))}/256")
    print(f" Body sample unique bytes: {len(Counter(body_sample))}/256")
    null_count = header.count(0)
    # Guard the percentage so an empty file does not divide by zero.
    null_pct = 100 * null_count / len(header) if header else 0.0
    print(f" Header null bytes: {null_count}/{len(header)} ({null_pct:.1f}%)")
    print()

    print("=== Looking for potential sub-structures in header ===")
    # Runs of at least four printable ASCII characters.
    strings = re.findall(b'[\x20-\x7e]{4,}', header)
    if strings:
        print(" ASCII strings found in header:")
        for s in strings[:30]:
            print(f" {s.decode('ascii', errors='replace')}")
    else:
        print(" No ASCII strings >= 4 chars found in header")
    print()

    print("=== Magic number checks at offset 0 ===")
    print(f" Bytes 0-3: {data[0:4].hex()}")
    print(f" Bytes 0-7: {data[0:8].hex()}")
    print(f" As string: {data[0:8]}")
    print()

    # Despite the heading, this section lists the raw byte values in decimal.
    print("=== Byte frequency in first 64 bytes ===")
    for row in _byte_value_rows(data, 64):
        print(row)
    print()

    print("=== Potential offset/size table at start ===")
    for off in _uint32_offsets(data, 256):
        val = struct.unpack_from("<I", data, off)[0]
        # Only values that fit inside the file can be offsets or sizes.
        if 0 < val < filesize:
            print(f" offset {off}: uint32={val} (could be offset/size, {val/1024:.1f}KB)")
    print()

    print("=== 16-byte blocks that could be IV ===")
    for start in (4, 8, 12, 16, 20):
        block = data[start:start + 16]
        unique = len(set(block))
        print(f" offset {start:3d}: {block.hex()} (unique bytes: {unique}/16)")


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_MODEL_PATH)