Gladius / tokenizers /bytecode_tokenizer.py

GLADIUS v5.0 — Cognitive kernel with Synthase depth attention, PUP uncertainty, Memory V2, multi-tokenizer architecture

3f42614 about 2 months ago

17.5 kB

	#!/usr/bin/env python3
	"""
	GLADIUS v4.0 — Bytecode Tokenizer

	A structural tokenizer for bytecode. Not BPE. Not byte-level.
	Direct opcode-to-token mapping with argument quantization.

	Design:
	Token 0-255: Reserved for byte-level (machine code curriculum, future)
	Token 256-258: PAD, BOS, EOS
	Token 259-511: Opcodes (253 slots; the built-in list defines 267 opcodes, more than fit in this range)
	Token 512-639: Numeric arguments (128 quantized buckets)
	Token 640-671: Register/local indices (0-31)
	Token 672-687: Comparison operators (16 slots)
	Token 688-703: Task markers (NEXT, STACK, EXEC, FILL, etc.)
	Token 704-767: Stack names, difficulty markers, structural tokens
	Token 768+: Overflow / future expansion

	Total vocab: ~768 active tokens (fits easily in 32K embedding space)

	Encoding is O(1) per token — table lookup, no iteration, no probability.
	Decoding is deterministic — token → exact opcode string.
	"""

	import re
	from typing import List, Optional, Dict, Tuple
	from pathlib import Path


	class BytecodeTokenizer:
	    """Structural tokenizer for Python/WASM/EVM bytecode.

	    Every symbol maps to a fixed token ID through table lookup:

	    * opcodes start at ``OPCODE_START``; entries that do not fit below
	      ``NUMERIC_START`` are placed at ``OVERFLOW_START`` and upward so they
	      never share an ID with a numeric bucket,
	    * numeric arguments are quantized into 128 buckets at ``NUMERIC_START``,
	    * comparison operators, task markers and structural tokens each have
	      their own range.
	    """

	    # Ranges
	    BYTE_START = 0        # 0-255: byte-level (reserved)
	    PAD = 256
	    BOS = 257
	    EOS = 258
	    OPCODE_START = 259
	    NUMERIC_START = 512   # quantized numeric values
	    REGISTER_START = 640  # local/register indices 0-31
	    COMPARE_START = 672   # comparison operators
	    TASK_START = 688      # task markers
	    STRUCT_START = 704    # structural tokens
	    OVERFLOW_START = 768  # opcodes that do not fit in OPCODE_START..NUMERIC_START-1

	    # Numeric quantization: map integers to 128 buckets
	    # Bucket 0-63:    exact values -32 to 31
	    # Bucket 64-91:   values 32-255 in 8-value steps (-256..-33 alias onto
	    #                 the same buckets, so the sign is lost for them)
	    # Bucket 92-107:  magnitudes 256-4095 in 256-value steps
	    # Bucket 108-127: magnitudes >= 4096, one bucket per power of two,
	    #                 saturating at bucket 127

	    COMPARE_OPS = ['<', '>', '==', '!=', '<=', '>=',
	                   'lt_s', 'lt_u', 'gt_s', 'gt_u', 'le_s', 'le_u',
	                   'ge_s', 'ge_u', 'eq', 'ne']

	    TASK_MARKERS = ['NEXT', 'STACK', 'EXEC', 'FILL', 'TRACE', 'OUT', 'QED',
	                    'D1', 'D2', 'D3', 'D4', 'D5',
	                    'python', 'wasm', 'evm']

	    STRUCT_TOKENS = ['(', ')', ',', '|', ':', '=',
	                     'o=', 'a=',   # WASM memory args
	                     'computed',   # EXEC result placeholder
	                     'SEPARATOR']

	    # One op per match: OPCODE, OPCODE(arg) or OPCODE(arg1,arg2), terminated
	    # by whitespace or end of string. Compiled once for all instances.
	    _OP_PATTERN = re.compile(r'(\S+?)(?:\(([^)]*)\))?(?:\s|$)')

	    def __init__(self, vocab_file: Optional[str] = None):
	        """Initialize with opcode vocabulary.

	        Args:
	            vocab_file: Optional path to a text file with one opcode per
	                line. When it exists, its opcodes are placed first and the
	                built-in opcodes not already listed follow. When it is
	                None or missing, only the built-in vocabulary is used.
	        """
	        self._opcode_to_id: Dict[str, int] = {}
	        self._id_to_opcode: Dict[int, str] = {}
	        self._compare_to_id: Dict[str, int] = {}
	        self._task_to_id: Dict[str, int] = {}
	        self._struct_to_id: Dict[str, int] = {}

	        # Build opcode vocabulary
	        opcodes = self._get_builtin_opcodes()
	        if vocab_file and Path(vocab_file).exists():
	            with open(vocab_file) as f:
	                file_ops = [line.strip() for line in f if line.strip()]
	            # Merge (file ops take precedence for ordering)
	            opcodes = list(dict.fromkeys(file_ops + opcodes))

	        # Only NUMERIC_START - OPCODE_START IDs are free before the numeric
	        # buckets begin. Anything beyond that must not run into the numeric
	        # range (an opcode and a number would share one ID), so it is
	        # relocated to the overflow range instead.
	        primary_slots = self.NUMERIC_START - self.OPCODE_START
	        for i, op in enumerate(opcodes):
	            if i < primary_slots:
	                tid = self.OPCODE_START + i
	            else:
	                tid = self.OVERFLOW_START + (i - primary_slots)
	            self._opcode_to_id[op] = tid
	            self._id_to_opcode[tid] = op

	        # Comparison operators
	        for i, cmp in enumerate(self.COMPARE_OPS):
	            tid = self.COMPARE_START + i
	            self._compare_to_id[cmp] = tid
	            self._id_to_opcode[tid] = f"CMP:{cmp}"

	        # Task markers
	        for i, task in enumerate(self.TASK_MARKERS):
	            tid = self.TASK_START + i
	            self._task_to_id[task] = tid
	            self._id_to_opcode[tid] = f"TASK:{task}"

	        # Structural tokens
	        for i, st in enumerate(self.STRUCT_TOKENS):
	            tid = self.STRUCT_START + i
	            self._struct_to_id[st] = tid
	            self._id_to_opcode[tid] = f"STRUCT:{st}"

	        # The "+ 1" is kept from the previous sizing so the reported size
	        # never shrinks; overflow opcodes extend it so every ID is covered.
	        overflow_count = max(0, len(opcodes) - primary_slots)
	        self.vocab_size = self.STRUCT_START + len(self.STRUCT_TOKENS) + 1
	        if overflow_count:
	            self.vocab_size = max(self.vocab_size,
	                                  self.OVERFLOW_START + overflow_count)
	        self._num_opcodes = len(opcodes)

	    def _get_builtin_opcodes(self) -> List[str]:
	        """Built-in opcode vocabulary covering Python/WASM/EVM.

	        The order is significant: the position in this list determines the
	        token ID assigned in ``__init__``.
	        """
	        return [
	            # Python (CPython dis)
	            'LOAD_CONST', 'LOAD_FAST', 'LOAD_GLOBAL', 'LOAD_NAME',
	            'LOAD_ATTR', 'LOAD_DEREF', 'LOAD_CLOSURE',
	            'STORE_FAST', 'STORE_GLOBAL', 'STORE_NAME', 'STORE_ATTR',
	            'STORE_DEREF', 'STORE_SUBSCR',
	            'BINARY_ADD', 'BINARY_SUBTRACT', 'BINARY_MULTIPLY',
	            'BINARY_TRUE_DIVIDE', 'BINARY_FLOOR_DIVIDE', 'BINARY_MODULO',
	            'BINARY_POWER', 'BINARY_AND', 'BINARY_OR', 'BINARY_XOR',
	            'BINARY_LSHIFT', 'BINARY_RSHIFT', 'BINARY_SUBSCR',
	            'UNARY_POSITIVE', 'UNARY_NEGATIVE', 'UNARY_NOT', 'UNARY_INVERT',
	            'COMPARE_OP',
	            'JUMP_ABSOLUTE', 'JUMP_FORWARD',
	            'POP_JUMP_IF_TRUE', 'POP_JUMP_IF_FALSE',
	            'JUMP_IF_TRUE_OR_POP', 'JUMP_IF_FALSE_OR_POP',
	            'CALL_FUNCTION', 'CALL_FUNCTION_KW', 'CALL_METHOD',
	            'BUILD_TUPLE', 'BUILD_LIST', 'BUILD_SET', 'BUILD_MAP',
	            'BUILD_CONST_KEY_MAP', 'BUILD_STRING', 'BUILD_SLICE',
	            'LIST_APPEND', 'SET_ADD', 'MAP_ADD',
	            'POP_TOP', 'ROT_TWO', 'ROT_THREE', 'ROT_FOUR',
	            'DUP_TOP', 'DUP_TOP_TWO',
	            'RETURN_VALUE', 'GET_ITER', 'FOR_ITER', 'GET_YIELD_FROM_ITER',
	            'NOP', 'MAKE_FUNCTION', 'SETUP_LOOP', 'POP_BLOCK',
	            'SETUP_EXCEPT', 'SETUP_FINALLY', 'RAISE_VARARGS',
	            'IMPORT_NAME', 'IMPORT_FROM', 'UNPACK_SEQUENCE',
	            # WASM
	            'i32.const', 'i64.const', 'f32.const', 'f64.const',
	            'i32.add', 'i32.sub', 'i32.mul', 'i32.div_s', 'i32.div_u',
	            'i32.rem_s', 'i32.rem_u', 'i32.and', 'i32.or', 'i32.xor',
	            'i32.shl', 'i32.shr_s', 'i32.shr_u', 'i32.rotl', 'i32.rotr',
	            'i32.clz', 'i32.ctz', 'i32.popcnt',
	            'i64.add', 'i64.sub', 'i64.mul', 'i64.div_s',
	            'i64.and', 'i64.or', 'i64.xor',
	            'f32.add', 'f32.sub', 'f32.mul', 'f32.div',
	            'f32.sqrt', 'f32.min', 'f32.max', 'f32.abs', 'f32.neg',
	            'f64.add', 'f64.sub', 'f64.mul', 'f64.div',
	            'f64.sqrt', 'f64.min', 'f64.max', 'f64.abs', 'f64.neg',
	            'i32.eqz', 'i32.eq', 'i32.ne', 'i32.lt_s', 'i32.lt_u',
	            'i32.gt_s', 'i32.gt_u', 'i32.le_s', 'i32.le_u',
	            'i32.ge_s', 'i32.ge_u',
	            'i64.eqz', 'i64.eq', 'i64.ne',
	            'f32.eq', 'f32.ne', 'f32.lt', 'f32.gt',
	            'f64.eq', 'f64.ne', 'f64.lt', 'f64.gt',
	            'i32.load', 'i64.load', 'f32.load', 'f64.load',
	            'i32.store', 'i64.store', 'f32.store', 'f64.store',
	            'i32.load8_s', 'i32.load8_u', 'i32.load16_s', 'i32.load16_u',
	            'memory.size', 'memory.grow',
	            'block', 'loop', 'if', 'else', 'end',
	            'br', 'br_if', 'br_table', 'return',
	            'call', 'call_indirect',
	            'local.get', 'local.set', 'local.tee',
	            'global.get', 'global.set',
	            'i32.wrap_i64', 'i64.extend_i32_s', 'i64.extend_i32_u',
	            'f32.convert_i32_s', 'f64.convert_i32_s',
	            'i32.trunc_f32_s', 'i32.trunc_f64_s',
	            'f32.demote_f64', 'f64.promote_f32',
	            'i32.reinterpret_f32', 'f32.reinterpret_i32',
	            'drop', 'select', 'nop', 'unreachable',
	            # EVM
	            'PUSH1', 'PUSH2', 'PUSH32', 'POP',
	            'DUP1', 'DUP2', 'DUP3', 'DUP4',
	            'SWAP1', 'SWAP2', 'SWAP3', 'SWAP4',
	            'ADD', 'MUL', 'SUB', 'DIV', 'SDIV', 'MOD', 'SMOD',
	            'ADDMOD', 'MULMOD', 'EXP', 'SIGNEXTEND',
	            'LT', 'GT', 'SLT', 'SGT', 'EQ', 'ISZERO',
	            'AND', 'OR', 'XOR', 'NOT', 'BYTE', 'SHL', 'SHR', 'SAR',
	            'SHA3',
	            'MLOAD', 'MSTORE', 'MSTORE8', 'MSIZE',
	            'SLOAD', 'SSTORE',
	            'JUMP', 'JUMPI', 'JUMPDEST', 'STOP', 'RETURN', 'REVERT',
	            'ADDRESS', 'BALANCE', 'ORIGIN', 'CALLER', 'CALLVALUE',
	            'CALLDATALOAD', 'CALLDATASIZE', 'CALLDATACOPY',
	            'CODESIZE', 'CODECOPY', 'GASPRICE', 'RETURNDATASIZE',
	            'RETURNDATACOPY', 'BLOCKHASH', 'COINBASE', 'TIMESTAMP',
	            'NUMBER', 'DIFFICULTY', 'GASLIMIT', 'CHAINID', 'SELFBALANCE',
	            'GAS',
	            'LOG0', 'LOG1', 'LOG2', 'LOG3', 'LOG4',
	            'CALL', 'DELEGATECALL', 'STATICCALL', 'CREATE', 'CREATE2',
	            'SELFDESTRUCT',
	        ]

	    def _quantize_number(self, val: float) -> int:
	        """Quantize a number to a bucket index (0-127).

	        Args:
	            val: Integer or float; floats are truncated toward zero.

	        Returns:
	            Bucket index in 0..127.

	        Raises:
	            ValueError: If ``val`` is NaN.
	            OverflowError: If ``val`` is infinite.
	        """
	        v = int(val)

	        # Exact range: -32 to 31 -> buckets 0-63
	        if -32 <= v <= 31:
	            return v + 32

	        # Medium range: 32-255 -> buckets 64-91 (8-value steps)
	        if 32 <= v <= 255:
	            return 64 + min(27, (v - 32) // 8)
	        # Negative medium values reuse the same buckets (sign is dropped).
	        if -256 <= v < -32:
	            return 64 + min(27, (-v - 33) // 8)

	        magnitude = abs(v)

	        # Large range: 256-4095 -> buckets 92-107 (256-value steps)
	        if magnitude <= 4095:
	            return 92 + min(15, (magnitude - 256) // 256)

	        # Huge range: 4096+ -> buckets 108-127, one per power of two.
	        # bit_length() - 13 == floor(log2(magnitude / 4096)), computed in
	        # exact integer arithmetic so arbitrarily large ints cannot overflow
	        # a float conversion.
	        return 108 + min(19, magnitude.bit_length() - 13)

	    def _dequantize_number(self, bucket: int) -> int:
	        """Reverse quantization: return the lowest magnitude of a bucket.

	        The result is approximate; values that shared a bucket (including
	        negative values outside -32..-1) all map back to one non-negative
	        representative. Buckets outside 0..127 yield 0.
	        """
	        if 0 <= bucket <= 63:
	            return bucket - 32
	        if 64 <= bucket <= 91:
	            return 32 + (bucket - 64) * 8
	        if 92 <= bucket <= 107:
	            return 256 + (bucket - 92) * 256
	        if 108 <= bucket <= 127:
	            return 4096 * (2 ** (bucket - 108))
	        return 0

	    def encode_line(self, line: str) -> List[int]:
	        """Encode a complete bytecode line into token IDs.

	        Input format: D{n}|{stack}|{opcode_sequence}|{task}

	        Lines with fewer than four ``|``-separated fields encode to
	        ``[BOS, EOS]``. Unknown difficulty, stack or task markers are
	        skipped rather than raising.

	        Returns: [BOS, ...token_ids..., EOS]
	        """
	        tokens = [self.BOS]

	        parts = line.split('|')
	        if len(parts) < 4:
	            return tokens + [self.EOS]

	        separator = self._struct_to_id.get('|', self.STRUCT_START + 3)

	        # Difficulty marker
	        diff = parts[0].strip()
	        if diff in self._task_to_id:
	            tokens.append(self._task_to_id[diff])

	        # Stack name (stored alongside the task markers)
	        stack = parts[1].strip()
	        if stack in self._task_to_id:
	            tokens.append(self._task_to_id[stack])

	        # Separator
	        tokens.append(separator)

	        # Opcode sequence
	        tokens.extend(self._encode_ops(parts[2].strip()))

	        # Task separator
	        tokens.append(separator)

	        # Task: MARKER or MARKER:content
	        task_parts = parts[3].strip().split(':', 1)
	        task_marker = task_parts[0]
	        if task_marker in self._task_to_id:
	            tokens.append(self._task_to_id[task_marker])

	        if len(task_parts) > 1:
	            task_content = task_parts[1].strip()
	            # Task content is either opcodes (NEXT/FILL) or a value (STACK/EXEC)
	            if task_marker in ('NEXT', 'FILL'):
	                tokens.extend(self._encode_ops(task_content))
	            elif task_marker == 'STACK':
	                try:
	                    bucket = self._quantize_number(int(task_content))
	                except ValueError:
	                    # Non-integer stack value: best effort, emit nothing.
	                    pass
	                else:
	                    tokens.append(self.NUMERIC_START + bucket)
	            elif task_marker == 'EXEC':
	                if task_content in self._struct_to_id:
	                    tokens.append(self._struct_to_id[task_content])

	        tokens.append(self.EOS)
	        return tokens

	    def _encode_ops(self, ops_str: str) -> List[int]:
	        """Encode a space-separated sequence of opcodes with arguments.

	        Each item is ``OPCODE``, ``OPCODE(arg)`` or ``OPCODE(arg1,arg2)``.
	        Opcodes missing from the vocabulary are dropped together with their
	        arguments.
	        """
	        tokens: List[int] = []

	        for match in self._OP_PATTERN.finditer(ops_str):
	            opcode = match.group(1)
	            args = match.group(2)

	            op_id = self._opcode_to_id.get(opcode)
	            if op_id is None:
	                # Unknown opcode - skip (shouldn't happen with our corpus)
	                continue
	            tokens.append(op_id)

	            # Encode arguments
	            if args:
	                for arg in args.split(','):
	                    tokens.extend(self._encode_arg(arg.strip()))

	        return tokens

	    def _encode_arg(self, arg: str) -> List[int]:
	        """Encode a single argument to token(s).

	        Tried in order: comparison operator, ``key=value`` pair, ``0x`` hex
	        literal, decimal number. Anything else, including NaN and infinite
	        literals, yields no tokens.
	        """
	        tokens: List[int] = []

	        # Check if it's a comparison operator
	        if arg in self._compare_to_id:
	            return [self._compare_to_id[arg]]

	        # Check for key=value (WASM memory args)
	        if '=' in arg and not arg.startswith('0x'):
	            key, val = arg.split('=', 1)
	            key_tok = key + '='
	            if key_tok in self._struct_to_id:
	                tokens.append(self._struct_to_id[key_tok])
	            try:
	                tokens.append(self.NUMERIC_START + self._quantize_number(int(val)))
	            except ValueError:
	                pass
	            return tokens

	        # Try as hex
	        if arg.startswith('0x'):
	            try:
	                return [self.NUMERIC_START + self._quantize_number(int(arg, 16))]
	            except ValueError:
	                pass

	        # Try as number. ValueError covers non-numeric text and NaN;
	        # OverflowError covers infinities such as "inf" or "1e999".
	        try:
	            return [self.NUMERIC_START + self._quantize_number(float(arg))]
	        except (ValueError, OverflowError):
	            pass

	        # NOTE: no register tokens (REGISTER_START..) are produced here.
	        # Every string int() accepts is also accepted by float() above, so
	        # integer arguments are always encoded as numeric buckets; decode()
	        # still understands register IDs.
	        return tokens

	    def decode(self, token_ids: List[int]) -> str:
	        """Decode token IDs back to bytecode string (approximate).

	        PAD tokens are dropped, numeric buckets render as ``#<value>`` using
	        the bucket's representative value, register IDs as ``r<n>`` and IDs
	        with no meaning as ``?<id>``.
	        """
	        parts = []
	        for tid in token_ids:
	            if tid == self.BOS:
	                parts.append('<BOS>')
	            elif tid == self.EOS:
	                parts.append('<EOS>')
	            elif tid == self.PAD:
	                continue
	            elif tid in self._id_to_opcode:
	                parts.append(self._id_to_opcode[tid])
	            elif self.NUMERIC_START <= tid < self.REGISTER_START:
	                val = self._dequantize_number(tid - self.NUMERIC_START)
	                parts.append(f"#{val}")
	            elif self.REGISTER_START <= tid < self.COMPARE_START:
	                parts.append(f"r{tid - self.REGISTER_START}")
	            else:
	                parts.append(f"?{tid}")
	        return ' '.join(parts)

	    def stats(self) -> Dict[str, int]:
	        """Return vocabulary size figures for reporting."""
	        return {
	            'vocab_size': self.vocab_size,
	            'num_opcodes': self._num_opcodes,
	            'num_comparisons': len(self._compare_to_id),
	            'num_task_markers': len(self._task_to_id),
	            'num_struct_tokens': len(self._struct_to_id),
	            'numeric_buckets': 128,
	            'register_slots': 32,
	        }


	def main():
	    """Test the tokenizer.

	    Prints vocabulary statistics, encodes and decodes three sample lines,
	    and, when ``../corpus/bytecode.txt`` exists relative to this file,
	    reports token counts for the whole corpus.
	    """
	    tok = BytecodeTokenizer()
	    print("BytecodeTokenizer initialized:")
	    for k, v in tok.stats().items():
	        print(f" {k}: {v}")

	    # Test encoding
	    test_lines = [
	        "D1|python|LOAD_CONST(42) LOAD_CONST(10) BINARY_ADD RETURN_VALUE|NEXT:RETURN_VALUE",
	        "D2|wasm|local.get(1) i32.const(54) i32.lt_u if i32.const(60)|STACK:3",
	        "D3|evm|PUSH1(0x00) CALLDATALOAD PUSH2(0x3a8a) EQ PUSH1(0x29) JUMPI|NEXT:STOP",
	    ]

	    print("\nEncoding tests:")
	    for line in test_lines:
	        ids = tok.encode_line(line)
	        decoded = tok.decode(ids)
	        print(f"\n Input: {line[:80]}...")
	        print(f" Tokens: {ids}")
	        print(f" Length: {len(ids)}")
	        print(f" Decoded: {decoded[:80]}...")

	    # Roundtrip test on full corpus
	    corpus_path = Path(__file__).parent.parent / 'corpus' / 'bytecode.txt'
	    if corpus_path.exists():
	        print(f"\nCorpus encoding test ({corpus_path}):")
	        total = 0
	        total_tokens = 0
	        empty = 0
	        raw_chars = 0  # length of every raw line, blank ones included
	        with open(corpus_path) as f:
	            for raw in f:
	                # Counted in the same pass so the file is opened (and
	                # closed) exactly once.
	                raw_chars += len(raw)
	                line = raw.strip()
	                if not line:
	                    continue
	                ids = tok.encode_line(line)
	                total += 1
	                total_tokens += len(ids)
	                if len(ids) <= 2:  # only BOS+EOS
	                    empty += 1
	        # An empty corpus file would otherwise divide by zero.
	        ratio = total_tokens / raw_chars if raw_chars else 0.0
	        print(f" Lines: {total}")
	        print(f" Total tokens: {total_tokens}")
	        print(f" Avg tokens/line: {total_tokens / max(1, total):.1f}")
	        print(f" Empty (BOS+EOS only): {empty}")
	        print(f" Compression vs raw chars: {ratio:.2f}x")


	# Run the self-test only when executed as a script, not on import.
	if __name__ == '__main__':
	    main()