oneocr / _archive /analysis /analyze_boundaries.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 1 day ago

3.45 kB

	"""Analyze exact chunk boundary structure in the .onemodel file."""
	import struct, json

	# Read the whole model container into memory; the rest of the script
	# searches and slices these bytes by absolute file offset.
	with open("ocr_data/oneocr.onemodel", "rb") as f:
	    fdata = f.read()

	# Load the recorded crypto operations. The handle is closed as soon as the
	# JSON is parsed instead of being left open until garbage collection.
	with open("temp/crypto_log.json") as log_file:
	    log = json.load(log_file)

	# Split the log by operation type.
	sha256s = [op for op in log if op["op"] == "sha256"]
	# Maps each SHA-256 output to the input that produced it.
	sha_map = {s["output"]: s["input"] for s in sha256s}
	decrypts = [op for op in log if op["op"] == "decrypt"]

	# Get info for first few payload chunks
	def get_chunk_info(dec_idx):
	    """Collect size and location details for the decrypt record at ``dec_idx``.

	    The record's ``aes_key`` is looked up in ``sha_map`` to recover the
	    hex-encoded SHA-256 input. That input is decoded and split into two
	    little-endian uint64 values (bytes 0-15) followed by the slice at
	    bytes 16-31, which is then searched for in ``fdata``.

	    Returns a dict with the keys ``dec_idx``, ``enc_size``, ``size1``,
	    ``size2``, ``chk`` and ``chk_pos``. ``chk_pos`` is -1 when the slice
	    does not occur in ``fdata``.
	    """
	    record = decrypts[dec_idx]
	    sha_input = bytes.fromhex(sha_map[record["aes_key"]])
	    first_size, second_size = struct.unpack_from("<QQ", sha_input, 0)
	    chk_bytes = sha_input[16:32]
	    return dict(
	        dec_idx=dec_idx,
	        enc_size=record["input_size"],
	        size1=first_size,
	        size2=second_size,
	        chk=chk_bytes,
	        chk_pos=fdata.find(chk_bytes),
	    )

	# Restrict the analysis to the first few consecutive large chunks.
	# From the sorted output, the order in file is: dec#02, dec#03, dec#06, dec#11, dec#16, dec#23, ...
	chunks_in_order = [2, 3, 6, 11, 16, 23, 28, 33]
	infos = list(map(get_chunk_info, chunks_in_order))

	print("=== Chunk boundary analysis ===\n")
	# Pair every chunk with its predecessor; the first chunk has none.
	for prev, info in zip([None, *infos], infos):
	    print(f"dec#{info['dec_idx']:02d}: chk_pos={info['chk_pos']}, size1={info['size1']}, enc_size={info['enc_size']}")

	    if prev is None:
	        continue

	    # Hypothesis under test: the previous chunk's data starts 32 bytes after
	    # its chk position and occupies size1 + 8 bytes on disk
	    # (data_size + container_header), so the next chk should follow directly.
	    expected_next_chk = (prev['chk_pos'] + 32) + (prev['size1'] + 8)
	    actual_next_chk = info['chk_pos']
	    delta = actual_next_chk - expected_next_chk
	    print(f" Expected chk_pos: {expected_next_chk}, actual: {actual_next_chk}, delta: {delta}")

	# Now figure out the EXACT header structure
	print("\n=== Bytes around first few chunk boundaries ===\n")

	# Region between DX and the first chunk.
	# NOTE(review): 24 + 22624 is taken as the end offset of DX — confirm both constants.
	dx_end = 24 + 22624 # = 22648
	print(f"--- DX end ({dx_end}) to first chunk ---")
	# Walk in 8-byte steps from the DX end to 48 bytes past the first chk position.
	first_dump_stop = infos[0]['chk_pos'] + 48
	for off in range(dx_end, first_dump_stop, 8):
	    raw = fdata[off:off + 8]
	    if len(raw) == 8:
	        (val,) = struct.unpack_from("<Q", raw)
	    else:
	        # A slice cut short by the end of the file is reported as 0.
	        val = 0
	    print(f" {off:>8}: {raw.hex()} (uint64={val})")

	# Region between chunk 0 and chunk 1.
	c0, c1 = infos[0], infos[1]
	# Data is taken to start at chk_pos + 32; the on-disk size is approximately
	# size1 + 8 (or enc_size), so inspect the bytes around that estimated end.
	c0_data_start = c0['chk_pos'] + 32
	c0_approx_end = c0_data_start + c0['size1'] + 8
	print(
	    f"\n--- End of dec#{c0['dec_idx']:02d} / Start of dec#{c1['dec_idx']:02d} ---",
	    f" c0 data_start: {c0_data_start}",
	    f" c0 size1+8: {c0['size1']+8}",
	    f" c0 approx end: {c0_approx_end}",
	    f" c1 chk_pos: {c1['chk_pos']}",
	    sep="\n",
	)

	# Start 16 bytes before the estimated end and stop 48 bytes past chunk 1's chk position.
	for off in range(c0_approx_end - 16, c1['chk_pos'] + 48, 8):
	    raw = fdata[off:off + 8]
	    if len(raw) == 8:
	        (val,) = struct.unpack_from("<Q", raw)
	    else:
	        # A slice cut short by the end of the file is reported as 0.
	        val = 0
	    # Printable ASCII is shown as-is, every other byte as '.'.
	    printable = []
	    for b in raw:
	        printable.append(chr(b) if 32 <= b < 127 else '.')
	    ascii_s = ''.join(printable)
	    print(f" {off:>8}: {raw.hex()} val={val:<15d} {ascii_s}")

	# Interpret the first 8 bytes of the file as one little-endian uint64.
	(header_size,) = struct.unpack_from("<Q", fdata, 0)
	print(f"\nFile header uint64: {header_size}")
	print(" = file[0:8] as uint64 LE")

	# Alternative reading: the same 8 bytes as two little-endian uint32 values.
	h1, h2 = struct.unpack_from("<II", fdata, 0)
	print(f" As two uint32: ({h1}, {h2})")

	# Detailed view of file[0:24], one 8-byte word per line.
	print("\nFile header [0:24]:")
	for off in (0, 8, 16):
	    raw = fdata[off:off + 8]
	    (val,) = struct.unpack_from("<Q", raw)
	    print(f" {off:>3}: {raw.hex()} uint64={val}")