oneocr / _archive /analysis /analyze_dx.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Analyze DX index structure to understand chunk record format."""
import hashlib
import struct
import json
from pathlib import Path
from Crypto.Cipher import AES
# Master key material and fixed IV used to decrypt the DX index.
KEY = b'kj)TGtrK>f]b[Piow.gU+nC@s""""""4'
IV = b"Copyright @ OneO"

file_data = Path("ocr_data/oneocr.onemodel").read_bytes()

# Step 1: Decrypt DX
# The DX key is SHA256(master key + the 16 bytes stored at file[8:24]).
header_hash = file_data[8:24]
dx_key = hashlib.sha256(KEY + header_hash).digest()

# The encrypted DX index is the 22624 bytes starting at file offset 24.
encrypted_dx = file_data[24:24 + 22624]
cipher = AES.new(dx_key, AES.MODE_CFB, iv=IV, segment_size=128)
dx = cipher.decrypt(encrypted_dx)

# Validate the plaintext magic with an explicit check: a bare `assert` is
# removed under `python -O`, which would let a failed decryption flow into
# every analysis step below.
if dx[:2] != b"DX":
    raise ValueError(
        f"DX decryption failed: expected magic b'DX', got {dx[:2]!r}"
    )
# Load crypto log
# Use a context manager so the file handle is closed as soon as parsing ends
# (the previous `json.load(open(...))` left it open until garbage collection).
with open("temp/crypto_log.json") as log_file:
    crypto_log = json.load(log_file)

# Get unique SHA256 inputs in order
# Only the first log entry for each distinct 'input' value is kept, and the
# order of first appearance is preserved.
sha_ops = [x for x in crypto_log if x['op'] == 'sha256']
seen = set()
unique_sha = []
for s in sha_ops:
    if s['input'] not in seen:
        seen.add(s['input'])
        unique_sha.append(s)

# Get decrypt ops
dec_ops = [x for x in crypto_log if x['op'] == 'decrypt']
# For each SHA256 input, find its position in DX
print("=" * 80)
print("DX Index Structure Analysis")
print("=" * 80)
print(f"DX size: {len(dx)} bytes, valid: {struct.unpack('<Q', dx[8:16])[0]}")
print()

# Skip first SHA256 (DX key derivation uses master_key + file_header, not DX data)
print("SHA256 input #0: DX key = SHA256(master_key + file[8:24]) [special case]")
print()

for i, s in enumerate(unique_sha[1:], 1):
    inp = bytes.fromhex(s['input'])
    label = f"SHA256 #{i:3d}: len={s['input_len']:2d}"

    # Best case: the whole hash input appears verbatim in the DX index.
    pos = dx.find(inp)
    if pos >= 0:
        print(f"{label} found at DX offset {pos:5d} (0x{pos:04x})")
        continue

    # Everything below interprets the first 16 bytes as two little-endian
    # uint64 values. A shorter input cannot be unpacked that way, so report
    # it instead of letting struct.unpack raise struct.error.
    if len(inp) < 16:
        print(f"{label} NOT FOUND (input shorter than 16 bytes, no size fields)")
        continue

    # Unpack once; both the "rearranged" and "not found" reports use these.
    size1, size2 = struct.unpack('<QQ', inp[:16])

    # Also try finding parts of the input (a single scan, reused below).
    pos_partial = dx.find(inp[:8])
    if pos_partial < 0:
        print(f"{label} NOT FOUND (size1={size1} size2={size2})")
        continue

    # The input might be rearranged from DX.
    # Check if sizes and checksum are nearby but in different order.
    pos_sizes = dx.find(inp[:16])
    if pos_sizes >= 0:
        checksum = inp[16:]
        pos_check = dx.find(checksum) if checksum else -1
        print(f"{label} sizes at DX offset {pos_sizes:5d}, checksum at {pos_check}")
    else:
        # Sizes might be in different order or interleaved
        print(f"{label} first_uint64 at DX offset {pos_partial:5d} (rearranged?)")
        print(f" size1={size1} size2={size2} diff={size2-size1}")
# Now let's dump DX structure around the first few records
print()
rule = "=" * 80
print(rule)
print("DX Record Structure (first 128 bytes)")
print(rule)


def _u64(raw):
    """Decode an 8-byte slice as a little-endian unsigned 64-bit integer."""
    return struct.unpack('<Q', raw)[0]


def _render_value(raw):
    """Render a uint64 field as decimal followed by its hex form."""
    number = _u64(raw)
    return f"{number} (0x{number:x})"


def _render_sizes(raw):
    """Render two consecutive uint64 fields plus their difference."""
    first, second = _u64(raw[:8]), _u64(raw[8:16])
    return f"{first}, {second} (diff={second-first})"


# Leading fields in file order: (printed label, width in bytes, renderer).
# The labels are the script's working names for each field.
leading_fields = (
    ("DX Magic", 8, lambda raw: f"{raw!r}"),
    ("Valid Size", 8, lambda raw: f"{_u64(raw)}"),
    ("Container", 8, lambda raw: raw.hex()),
    ("Value", 8, _render_value),
    ("Checksum", 16, lambda raw: raw.hex()),
    ("Sizes", 16, _render_sizes),
)

# Walk the fields, printing each at its offset and advancing the cursor.
off = 0
for field_label, field_width, render in leading_fields:
    print(f"[{off:4d}] {field_label}: {render(dx[off:off + field_width])}")
    off += field_width

# `off` now points at the first byte after the fixed fields.
print(f"[{off:4d}] Enc data starts: {dx[off:off+32].hex()}")

# The config chunk data is here, 11920 bytes
config_enc_size = 11920
config_end = off + config_enc_size
print(f" Config encrypted data: offset {off} to {config_end} ({config_enc_size} bytes)")
# What's after the config?
print(f"\n--- After config chunk ({config_end}) ---")

# Byte translation table: printable ASCII (32..126) maps to itself,
# every other byte value maps to '.'.
_PRINTABLE = bytes(b if 32 <= b < 127 else ord('.') for b in range(256))

# Dump up to five 16-byte rows; stop at the first row that would run past
# the end of the DX buffer.
for pos in range(config_end, config_end + 80, 16):
    if pos + 16 > len(dx):
        break
    chunk = dx[pos:pos + 16]
    hex_str = ' '.join('%02x' % b for b in chunk)
    ascii_str = chunk.translate(_PRINTABLE).decode('ascii')
    print(f" {pos:5d} ({pos:#06x}): {hex_str:<48s} {ascii_str}")
# Look at the area around found patterns
# Each entry is (label, DX offset); the label text is printed as-is.
regions = [
    ("Chunk2(encrypt) 0x2ed7", 0x2ed7),
    ("Chunk4(ONNX) 0x2f80", 0x2f80),
    ("Chunk5(ONNX2) 0x4692", 0x4692),
]

# Byte translation table: printable ASCII (32..126) maps to itself,
# every other byte value maps to '.'.
_PRINTABLE = bytes(b if 32 <= b < 127 else ord('.') for b in range(256))

for name, dx_off in regions:
    print(f"\n--- Area around {name} ---")
    # Start 48 bytes before the offset (clamped at 0) and show up to eight
    # 16-byte rows, stopping at the first row that would overrun the buffer.
    start = max(0, dx_off - 48)
    for pos in range(start, start + 128, 16):
        if pos + 16 > len(dx):
            break
        chunk = dx[pos:pos + 16]
        hex_str = ' '.join('%02x' % b for b in chunk)
        ascii_str = chunk.translate(_PRINTABLE).decode('ascii')
        # Flag the row that begins exactly at the offset of interest.
        marker = " <<<" if pos == dx_off else ""
        print(f" {pos:5d} ({pos:#06x}): {hex_str:<48s} {ascii_str}{marker}")