Gladius / tokenizers /math_tokenizer.py
amuzetnoM's picture
GLADIUS v5.0 — Cognitive kernel with Synthase depth attention, PUP uncertainty, Memory V2, multi-tokenizer architecture
3f42614
"""
GLADIUS MathTokenizer — Structural tokenizer for mathematical expressions.
128 tokens, purpose-built for mathematical reasoning.
Every token IS the mathematical object — no BPE fragmentation.
"""
from typing import List
class MathTokenizer:
"""
Structural tokenizer for mathematical expressions.
128 tokens in range 17000-17127 (but internally remapped to 0-127).
Every token IS the mathematical object — no BPE fragmentation.
Grid tokenizer proved: purpose-built tokens → 91% loss drop.
"""
# Internal vocab (0-indexed for nn.Embedding)
VOCAB_SIZE = 128
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
# Original range (for reference/compatibility)
ORIGINAL_BASE = 17000
ORIGINAL_RANGE = (17000, 17127)
def __init__(self):
# Build token maps (0-indexed)
self._build_maps()
def _build_maps(self):
"""Build all token mappings."""
self.token_to_id = {}
self.id_to_token = {}
idx = 0
# Specials: 0-2
for name in ['[PAD_MATH]', '[BOS_MATH]', '[EOS_MATH]']:
self.token_to_id[name] = idx
self.id_to_token[idx] = name
idx += 1
# Digits 0-9: 3-12
for i in range(10):
self.token_to_id[str(i)] = idx
self.id_to_token[idx] = str(i)
idx += 1
# Operators: 13-30
for op in ['+', '-', '*', '/', '=', '^', '√', '!', '%', '<', '>',
'≤', '≥', '≠', '≈', '→', '∈', '.']:
self.token_to_id[op] = idx
self.id_to_token[idx] = op
idx += 1
# Variables a-z: 31-56
for i in range(26):
ch = chr(ord('a') + i)
self.token_to_id[ch] = idx
self.id_to_token[idx] = ch
idx += 1
# Greek/constants: 57-69
for g in ['π', 'e_const', 'φ', 'i', '∞', 'α', 'β', 'γ', 'δ', 'θ', 'λ', 'σ', 'ω']:
self.token_to_id[g] = idx
self.id_to_token[idx] = g
idx += 1
# Functions: 70-89
for f in ['sin', 'cos', 'tan', 'log', 'ln', 'exp', 'lim', 'Σ', '∫', 'd/dx',
'abs', 'floor', 'ceil', 'max', 'min', 'gcd', 'lcm', 'mod', 'det', 'tr']:
self.token_to_id[f] = idx
self.id_to_token[idx] = f
idx += 1
# Structure/delimiters: 90-114
for s in ['(', ')', '[', ']', '{', '}', ',', ';', ':', '_', '|', '\\', ' ', '\n',
'…', '∴', '∵', '⟨', '⟩', 'QED', 'GIVEN', 'STEP', 'OUT', 'FILL', 'NEXT']:
self.token_to_id[s] = idx
self.id_to_token[idx] = s
idx += 1
# Multi-char sorted by length for greedy matching
self._multi_char = {k: v for k, v in self.token_to_id.items() if len(k) > 1}
self._sorted_multi = sorted(self._multi_char.keys(), key=len, reverse=True)
self._single_char = {k: v for k, v in self.token_to_id.items() if len(k) == 1}
@property
def vocab_size(self) -> int:
return self.VOCAB_SIZE
def encode(self, text: str, add_special: bool = True) -> List[int]:
"""Encode mathematical expression to token IDs (0-indexed)."""
tokens = []
if add_special:
tokens.append(self.BOS_ID)
i = 0
text = text.strip()
while i < len(text):
if text[i] == ' ':
i += 1
continue
# Multi-char (longest match first)
matched = False
for multi in self._sorted_multi:
if text[i:i+len(multi)] == multi:
tokens.append(self._multi_char[multi])
i += len(multi)
matched = True
break
if matched:
continue
# Single-char
ch = text[i]
if ch in self._single_char:
tokens.append(self._single_char[ch])
i += 1
continue
# R as remainder
if ch == 'R':
tokens.append(self.token_to_id.get(',', 96))
i += 1
continue
# Uppercase → lowercase variable
if ch.isupper() and ch.lower() in self._single_char:
tokens.append(self._single_char[ch.lower()])
i += 1
continue
i += 1 # skip unknown
if add_special:
tokens.append(self.EOS_ID)
return tokens
def encode_corpus_line(self, line: str) -> List[int]:
"""Encode a corpus line with D{n}| prefix and |GIVEN:|STEP:|QED:|OUT: segments."""
import re
line = line.strip()
tokens = [self.BOS_ID]
# Handle D{n}| prefix
if re.match(r'^D\d\|', line):
tokens.append(self.token_to_id[line[1]]) # digit
tokens.append(self.token_to_id['|'])
line = line[3:]
segments = line.split('|')
for seg_idx, segment in enumerate(segments):
if seg_idx > 0:
tokens.append(self.token_to_id['|'])
for label in ['GIVEN:', 'QED:', 'OUT:', 'FILL:', 'NEXT:']:
if segment.startswith(label):
key = label[:-1] # strip colon
tokens.append(self.token_to_id.get(key, 0))
tokens.append(self.token_to_id[':'])
tokens.extend(self.encode(segment[len(label):], add_special=False))
break
else:
# Check STEP{n}:
step_match = re.match(r'^STEP(\d+):', segment)
if step_match:
tokens.append(self.token_to_id['STEP'])
for d in step_match.group(1):
tokens.append(self.token_to_id[d])
tokens.append(self.token_to_id[':'])
tokens.extend(self.encode(segment[step_match.end():], add_special=False))
else:
tokens.extend(self.encode(segment, add_special=False))
tokens.append(self.EOS_ID)
return tokens
def decode(self, token_ids: List[int], skip_special: bool = True) -> str:
"""Decode token IDs back to string."""
parts = []
for tid in token_ids:
if skip_special and tid in (self.PAD_ID, self.BOS_ID, self.EOS_ID):
continue
parts.append(self.id_to_token.get(tid, f'<?{tid}>'))
return ''.join(parts)
# ─── Byte Tokenizer (inline — no external dependency) ────────