File size: 6,781 Bytes
3f42614 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | """
GLADIUS MathTokenizer — Structural tokenizer for mathematical expressions.
128 tokens, purpose-built for mathematical reasoning.
Every token IS the mathematical object — no BPE fragmentation.
"""
from typing import List
class MathTokenizer:
"""
Structural tokenizer for mathematical expressions.
128 tokens in range 17000-17127 (but internally remapped to 0-127).
Every token IS the mathematical object — no BPE fragmentation.
Grid tokenizer proved: purpose-built tokens → 91% loss drop.
"""
# Internal vocab (0-indexed for nn.Embedding)
VOCAB_SIZE = 128
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
# Original range (for reference/compatibility)
ORIGINAL_BASE = 17000
ORIGINAL_RANGE = (17000, 17127)
def __init__(self):
# Build token maps (0-indexed)
self._build_maps()
def _build_maps(self):
"""Build all token mappings."""
self.token_to_id = {}
self.id_to_token = {}
idx = 0
# Specials: 0-2
for name in ['[PAD_MATH]', '[BOS_MATH]', '[EOS_MATH]']:
self.token_to_id[name] = idx
self.id_to_token[idx] = name
idx += 1
# Digits 0-9: 3-12
for i in range(10):
self.token_to_id[str(i)] = idx
self.id_to_token[idx] = str(i)
idx += 1
# Operators: 13-30
for op in ['+', '-', '*', '/', '=', '^', '√', '!', '%', '<', '>',
'≤', '≥', '≠', '≈', '→', '∈', '.']:
self.token_to_id[op] = idx
self.id_to_token[idx] = op
idx += 1
# Variables a-z: 31-56
for i in range(26):
ch = chr(ord('a') + i)
self.token_to_id[ch] = idx
self.id_to_token[idx] = ch
idx += 1
# Greek/constants: 57-69
for g in ['π', 'e_const', 'φ', 'i', '∞', 'α', 'β', 'γ', 'δ', 'θ', 'λ', 'σ', 'ω']:
self.token_to_id[g] = idx
self.id_to_token[idx] = g
idx += 1
# Functions: 70-89
for f in ['sin', 'cos', 'tan', 'log', 'ln', 'exp', 'lim', 'Σ', '∫', 'd/dx',
'abs', 'floor', 'ceil', 'max', 'min', 'gcd', 'lcm', 'mod', 'det', 'tr']:
self.token_to_id[f] = idx
self.id_to_token[idx] = f
idx += 1
# Structure/delimiters: 90-114
for s in ['(', ')', '[', ']', '{', '}', ',', ';', ':', '_', '|', '\\', ' ', '\n',
'…', '∴', '∵', '⟨', '⟩', 'QED', 'GIVEN', 'STEP', 'OUT', 'FILL', 'NEXT']:
self.token_to_id[s] = idx
self.id_to_token[idx] = s
idx += 1
# Multi-char sorted by length for greedy matching
self._multi_char = {k: v for k, v in self.token_to_id.items() if len(k) > 1}
self._sorted_multi = sorted(self._multi_char.keys(), key=len, reverse=True)
self._single_char = {k: v for k, v in self.token_to_id.items() if len(k) == 1}
@property
def vocab_size(self) -> int:
return self.VOCAB_SIZE
def encode(self, text: str, add_special: bool = True) -> List[int]:
"""Encode mathematical expression to token IDs (0-indexed)."""
tokens = []
if add_special:
tokens.append(self.BOS_ID)
i = 0
text = text.strip()
while i < len(text):
if text[i] == ' ':
i += 1
continue
# Multi-char (longest match first)
matched = False
for multi in self._sorted_multi:
if text[i:i+len(multi)] == multi:
tokens.append(self._multi_char[multi])
i += len(multi)
matched = True
break
if matched:
continue
# Single-char
ch = text[i]
if ch in self._single_char:
tokens.append(self._single_char[ch])
i += 1
continue
# R as remainder
if ch == 'R':
tokens.append(self.token_to_id.get(',', 96))
i += 1
continue
# Uppercase → lowercase variable
if ch.isupper() and ch.lower() in self._single_char:
tokens.append(self._single_char[ch.lower()])
i += 1
continue
i += 1 # skip unknown
if add_special:
tokens.append(self.EOS_ID)
return tokens
def encode_corpus_line(self, line: str) -> List[int]:
"""Encode a corpus line with D{n}| prefix and |GIVEN:|STEP:|QED:|OUT: segments."""
import re
line = line.strip()
tokens = [self.BOS_ID]
# Handle D{n}| prefix
if re.match(r'^D\d\|', line):
tokens.append(self.token_to_id[line[1]]) # digit
tokens.append(self.token_to_id['|'])
line = line[3:]
segments = line.split('|')
for seg_idx, segment in enumerate(segments):
if seg_idx > 0:
tokens.append(self.token_to_id['|'])
for label in ['GIVEN:', 'QED:', 'OUT:', 'FILL:', 'NEXT:']:
if segment.startswith(label):
key = label[:-1] # strip colon
tokens.append(self.token_to_id.get(key, 0))
tokens.append(self.token_to_id[':'])
tokens.extend(self.encode(segment[len(label):], add_special=False))
break
else:
# Check STEP{n}:
step_match = re.match(r'^STEP(\d+):', segment)
if step_match:
tokens.append(self.token_to_id['STEP'])
for d in step_match.group(1):
tokens.append(self.token_to_id[d])
tokens.append(self.token_to_id[':'])
tokens.extend(self.encode(segment[step_match.end():], add_special=False))
else:
tokens.extend(self.encode(segment, add_special=False))
tokens.append(self.EOS_ID)
return tokens
def decode(self, token_ids: List[int], skip_special: bool = True) -> str:
"""Decode token IDs back to string."""
parts = []
for tid in token_ids:
if skip_special and tid in (self.PAD_ID, self.BOS_ID, self.EOS_ID):
continue
parts.append(self.id_to_token.get(tid, f'<?{tid}>'))
return ''.join(parts)
# ─── Byte Tokenizer (inline — no external dependency) ────────
|