oneocr / _archive /attempts /disasm_crypto.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 1 day ago

6.23 kB

	"""
	Disassemble the Cipher function in oneocr.dll to find the exact crypto parameters.
	Find code references to the crypto strings we identified.
	"""
	import struct
	import re

	# Load the whole DLL into memory; every later lookup indexes into `data`.
	dll_path = r"c:\Users\MattyMroz\Desktop\PROJECTS\ONEOCR\ocr_data\oneocr.dll"
	with open(dll_path, "rb") as f:
	    data = f.read()

	# Parse PE headers to find section info.
	# The 32-bit value at file offset 0x3C is used as the offset of the PE signature.
	pe_sig_offset = struct.unpack_from("<I", data, 0x3C)[0]
	# Explicit raise instead of `assert`: asserts vanish under `python -O`,
	# which would let a non-PE file be mis-parsed silently.
	if data[pe_sig_offset:pe_sig_offset + 4] != b"PE\x00\x00":
	    raise ValueError(f"PE signature not found at file offset 0x{pe_sig_offset:x}")

	# COFF header (starts right after the 4-byte signature)
	coff_start = pe_sig_offset + 4
	num_sections = struct.unpack_from("<H", data, coff_start + 2)[0]
	opt_header_size = struct.unpack_from("<H", data, coff_start + 16)[0]

	# Optional header (the COFF header read above is 20 bytes long)
	opt_start = coff_start + 20
	magic = struct.unpack_from("<H", data, opt_start)[0]
	if magic != 0x20B:  # PE32+
	    raise ValueError(f"Unsupported optional header magic 0x{magic:04x} (expected 0x020b, PE32+)")

	# 64-bit image base, read at offset 24 of the PE32+ optional header
	image_base = struct.unpack_from("<Q", data, opt_start + 24)[0]

	# Sections: the table follows the optional header, 40 bytes per entry.
	# Each entry is stored as (name, va, vsize, raw_ptr, raw_size).
	section_start = opt_start + opt_header_size
	sections = []
	for i in range(num_sections):
	    s_off = section_start + i * 40
	    # Field order in the header: 8-byte name, virtual size, virtual address,
	    # raw data size, raw data pointer.
	    raw_name, vsize, va, raw_size, raw_ptr = struct.unpack_from("<8sIIII", data, s_off)
	    name = raw_name.rstrip(b"\x00").decode("ascii", errors="replace")
	    sections.append((name, va, vsize, raw_ptr, raw_size))
	    print(f"Section: {name:10s} VA=0x{va:08x} VSize=0x{vsize:08x} RawPtr=0x{raw_ptr:08x} RawSize=0x{raw_size:08x}")

	print(f"\nImage base: 0x{image_base:016x}")

	def rva_to_file_offset(rva, section_table=None):
	    """Translate a relative virtual address into an offset in the file.

	    Args:
	        rva: Relative virtual address to translate.
	        section_table: Optional iterable of
	            ``(name, va, vsize, raw_ptr, raw_size)`` tuples. When omitted,
	            the module-level ``sections`` list is used.

	    Returns:
	        The file offset backing ``rva``, or ``None`` when no section
	        contains it or when it lies in the part of a section that has no
	        bytes in the file.
	    """
	    table = sections if section_table is None else section_table
	    for _name, va, vsize, raw_ptr, raw_size in table:
	        if va <= rva < va + vsize:
	            delta = rva - va
	            # A section may be larger in memory than on disk; the tail has
	            # no file bytes, so raw_ptr + delta would point into unrelated
	            # data that follows this section in the file.
	            if delta >= raw_size:
	                return None
	            return raw_ptr + delta
	    return None

	def file_offset_to_rva(offset, section_table=None):
	    """Translate an offset in the file into a relative virtual address.

	    Args:
	        offset: Byte offset into the file.
	        section_table: Optional iterable of
	            ``(name, va, vsize, raw_ptr, raw_size)`` tuples. When omitted,
	            the module-level ``sections`` list is used.

	    Returns:
	        The RVA of the byte at ``offset``, taken from the first section
	        whose raw data range contains it, or ``None`` when no section's
	        raw data covers that offset.
	    """
	    table = sections if section_table is None else section_table
	    for _name, va, _vsize, raw_ptr, raw_size in table:
	        if raw_ptr <= offset < raw_ptr + raw_size:
	            return va + (offset - raw_ptr)
	    return None

	# Key string offsets we found.
	# Values are offsets into the DLL file (they are fed to file_offset_to_rva
	# below); keys are labels used only for printing.
	# NOTE(review): "(wide)" presumably marks UTF-16 strings - confirm.
	crypto_strings = {
	    "SHA256 (wide)": 0x02724b60,
	    "AES (wide)": 0x02724b70,
	    "BlockLength (wide)": 0x02724b78,
	    "ChainingModeCFB (wide)": 0x02724b90,
	    "meta->magic_number == MAGIC_NUMBER": 0x02724bb0,
	    "Unable to uncompress": 0x02724bd8,
	    "Crypto.cpp": 0x02724c08,
	    "Error returned from crypto API": 0x02724c40,
	    "ChainingMode (wide)": 0x02724c80,
	    "MessageBlockLength (wide)": 0x02724ca0,
	}

	# Calculate RVAs of these strings
	print("\n=== String RVAs ===")
	for name, file_off in crypto_strings.items():
	    rva = file_offset_to_rva(file_off)
	    # Test for None explicitly: a plain truthiness check would also drop a
	    # legitimate result of 0, not just "offset not covered by any section".
	    if rva is not None:
	        print(f" {name}: file=0x{file_off:08x} RVA=0x{rva:08x}")

	# Find code references to these strings via LEA instruction patterns.
	# In x64, LEA reg, [rip+disp32] is encoded as:
	#   48 8D xx yy yy yy yy   (xx is the ModRM byte, which selects the register)
	#   4C 8D xx yy yy yy yy   (same, register taken from r8-r15)
	# The target address = instruction_address + 7 + disp32.
	# NOTE: only these two prefix bytes are searched, exactly as before; other
	# encodings of a RIP-relative LEA are not detected by this script.

	print("\n=== Searching for code references to crypto strings ===")

	# Focus on the most important strings (label -> offset in the DLL file)
	key_strings = {
	    "ChainingModeCFB (wide)": 0x02724b90,
	    "SHA256 (wide)": 0x02724b60,
	    "AES (wide)": 0x02724b70,
	    "Crypto.cpp": 0x02724c08,
	    "MessageBlockLength (wide)": 0x02724ca0,
	    "meta->magic_number": 0x02724bb0,
	}

	# Find the .text section (code)
	text_section = None
	for name, va, vsize, raw_ptr, raw_size in sections:
	    if name == ".text":
	        text_section = (va, vsize, raw_ptr, raw_size)
	        break

	if text_section:
	    text_va, text_vsize, text_raw, text_rawsize = text_section
	    print(f"\n.text section: VA=0x{text_va:08x} size=0x{text_vsize:08x}")

	    # Length of the matched LEA encoding: prefix + opcode + ModRM + disp32.
	    lea_length = 7
	    reg_names = ["rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
	                 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"]
	    # Prefix 48/4C, opcode 8D, then a ModRM byte with mod=00 and rm=101
	    # (RIP-relative), i.e. (modrm & 0xC7) == 0x05 for any register field.
	    lea_pattern = re.compile(rb"[\x48\x4c]\x8d[\x05\x0d\x15\x1d\x25\x2d\x35\x3d]")

	    # Resolve every string's RVA once and prepare one bucket of hits per
	    # target RVA, so .text is scanned a single time instead of once per
	    # string. Strings whose offset maps to no section are skipped.
	    string_rvas = {}
	    refs_by_target = {}
	    for string_name, string_file_off in key_strings.items():
	        string_rva = file_offset_to_rva(string_file_off)
	        if string_rva is None:
	            continue
	        string_rvas[string_name] = string_rva
	        refs_by_target.setdefault(string_rva, [])

	    # The regex matches the first 3 bytes; the 4-byte displacement must
	    # also fit inside the section (and inside the file), so matches have to
	    # end at least 4 bytes before the end. This includes the final position
	    # where a whole 7-byte instruction still fits.
	    text_end = min(text_raw + text_rawsize, len(data))
	    scan_end = max(text_raw, text_end - (lea_length - 3))
	    for match in lea_pattern.finditer(data, text_raw, scan_end):
	        code_off = match.start()
	        disp32 = struct.unpack_from("<i", data, code_off + 3)[0]
	        instr_rva = file_offset_to_rva(code_off)
	        if instr_rva is None:
	            continue
	        # RIP-relative target: address of the next instruction + disp32.
	        refs = refs_by_target.get(instr_rva + lea_length + disp32)
	        if refs is None:
	            continue
	        reg_idx = (data[code_off + 2] >> 3) & 7
	        if data[code_off] == 0x4C:
	            reg_idx += 8
	        refs.append((code_off, instr_rva, reg_names[reg_idx]))

	    # Report per string, in the order the strings were declared; hits are
	    # already in ascending file-offset order because the scan is sequential.
	    for string_name, string_rva in string_rvas.items():
	        refs_found = refs_by_target[string_rva]
	        if refs_found:
	            print(f"\n References to '{string_name}' (RVA=0x{string_rva:08x}):")
	            # Only the first five references get a context dump.
	            for code_off, instr_rva, reg in refs_found[:5]:
	                print(f" at file=0x{code_off:08x} RVA=0x{instr_rva:08x}: LEA {reg}, [{string_name}]")
	                # Dump surrounding code, clamped to the .text raw data range
	                ctx_start = max(text_raw, code_off - 64)
	                ctx_end = min(text_raw + text_rawsize, code_off + 128)

	                # Simple hex dump, 16 bytes per row; the row holding the LEA is marked
	                print(f" Context (file offset 0x{ctx_start:08x} - 0x{ctx_end:08x}):")
	                for i in range(ctx_start, ctx_end, 16):
	                    chunk = data[i:i+16]
	                    hex_part = " ".join(f"{b:02x}" for b in chunk)
	                    rva_i = file_offset_to_rva(i)
	                    marker = " <<<" if i <= code_off < i + 16 else ""
	                    print(f" {rva_i:08x}: {hex_part}{marker}")
	        else:
	            print(f"\n No code references found for '{string_name}'")