| """ |
| GLADIUS MathTokenizer — Structural tokenizer for mathematical expressions. |
| 128 tokens, purpose-built for mathematical reasoning. |
| Every token IS the mathematical object — no BPE fragmentation. |
| """ |
|
|
| from typing import List |
|
|
class MathTokenizer:
    """
    Structural tokenizer for mathematical expressions.

    The vocabulary has 128 slots, nominally the range 17000-17127 but
    remapped internally to 0-127.  Every token IS the mathematical object —
    no BPE fragmentation.
    Grid tokenizer proved: purpose-built tokens → 91% loss drop.

    Only IDs 0-114 are currently assigned (115 tokens); IDs 115-127 are
    unassigned and decode to a ``<?id>`` placeholder.

    Attributes built by ``_build_maps``:
        token_to_id: token string -> ID.
        id_to_token: ID -> token string.
    """

    # Size of the ID space and the three special-token IDs.
    VOCAB_SIZE = 128
    PAD_ID = 0
    BOS_ID = 1
    EOS_ID = 2

    # Nominal (un-remapped) ID range mentioned in the class docstring.
    ORIGINAL_BASE = 17000
    ORIGINAL_RANGE = (17000, 17127)

    def __init__(self) -> None:
        """Create a tokenizer and populate its lookup tables."""
        self._build_maps()

    def _build_maps(self) -> None:
        """Build all token mappings.

        IDs are handed out sequentially in the order the groups are listed
        below, so reordering or inserting tokens changes every later ID.
        """
        groups = (
            # Special tokens: must land on PAD_ID / BOS_ID / EOS_ID (0, 1, 2).
            ['[PAD_MATH]', '[BOS_MATH]', '[EOS_MATH]'],
            # Decimal digits.
            [str(d) for d in range(10)],
            # Operators and relations.
            ['+', '-', '*', '/', '=', '^', '√', '!', '%', '<', '>',
             '≤', '≥', '≠', '≈', '→', '∈', '.'],
            # Lowercase Latin letters (variables).
            [chr(ord('a') + k) for k in range(26)],
            # Constants and Greek letters.
            # NOTE: 'i' is also registered by the letter group above.  The
            # entry here overwrites token_to_id['i'], so encode() emits this
            # later ID; the earlier letter ID still decodes to 'i' but is
            # never produced.  Kept as-is because changing it would shift IDs.
            ['π', 'e_const', 'φ', 'i', '∞', 'α', 'β', 'γ', 'δ', 'θ', 'λ',
             'σ', 'ω'],
            # Named functions and operators.
            ['sin', 'cos', 'tan', 'log', 'ln', 'exp', 'lim', 'Σ', '∫', 'd/dx',
             'abs', 'floor', 'ceil', 'max', 'min', 'gcd', 'lcm', 'mod', 'det',
             'tr'],
            # Brackets, punctuation, whitespace and corpus keywords.
            ['(', ')', '[', ']', '{', '}', ',', ';', ':', '_', '|', '\\',
             ' ', '\n', '…', '∴', '∵', '⟨', '⟩', 'QED', 'GIVEN', 'STEP',
             'OUT', 'FILL', 'NEXT'],
        )

        self.token_to_id = {}
        self.id_to_token = {}
        next_id = 0
        for group in groups:
            for token in group:
                self.token_to_id[token] = next_id
                self.id_to_token[next_id] = token
                next_id += 1

        # Split the vocabulary by token length.  Multi-character tokens are
        # tried longest-first so that e.g. 'floor' is matched as one token.
        self._multi_char = {
            tok: tid for tok, tid in self.token_to_id.items() if len(tok) > 1
        }
        self._sorted_multi = sorted(self._multi_char, key=len, reverse=True)
        self._single_char = {
            tok: tid for tok, tid in self.token_to_id.items() if len(tok) == 1
        }

    @property
    def vocab_size(self) -> int:
        """Return the size of the ID space (``VOCAB_SIZE``)."""
        return self.VOCAB_SIZE

    def encode(self, text: str, add_special: bool = True) -> List[int]:
        """Encode mathematical expression to token IDs (0-indexed).

        Args:
            text: Expression to tokenize.  Leading/trailing whitespace is
                stripped and space characters are skipped entirely.
            add_special: When true, wrap the result in BOS_ID ... EOS_ID.

        Returns:
            The list of token IDs.  Characters that match nothing in the
            vocabulary are silently dropped.
        """
        tokens = [self.BOS_ID] if add_special else []

        text = text.strip()
        length = len(text)
        pos = 0
        while pos < length:
            ch = text[pos]

            # Spaces never produce a token, even though ' ' is in the vocab.
            if ch == ' ':
                pos += 1
                continue

            # Longest multi-character token starting at this position.
            multi = next(
                (m for m in self._sorted_multi if text.startswith(m, pos)),
                None,
            )
            if multi is not None:
                tokens.append(self._multi_char[multi])
                pos += len(multi)
                continue

            if ch in self._single_char:
                tokens.append(self._single_char[ch])
            elif ch == 'R':
                # 'R' is encoded as the comma token.
                # NOTE(review): presumably for "quotient R remainder"
                # notation — confirm against the corpus format.
                tokens.append(self.token_to_id[','])
            elif ch.isupper() and ch.lower() in self._single_char:
                # Other uppercase characters fold to their lowercase token.
                tokens.append(self._single_char[ch.lower()])
            # Anything else is unknown and skipped.
            pos += 1

        if add_special:
            tokens.append(self.EOS_ID)
        return tokens

    def encode_corpus_line(self, line: str) -> List[int]:
        """Encode a corpus line with D{n}| prefix and |GIVEN:|STEP:|QED:|OUT: segments.

        The optional ``D<digit>|`` prefix is encoded as the digit token
        followed by '|' (the 'D' itself is not emitted).  The remainder is
        split on '|'; each segment may start with a ``GIVEN:``, ``QED:``,
        ``OUT:``, ``FILL:``, ``NEXT:`` or ``STEP<digits>:`` label, which is
        encoded as keyword (+ digits) + ':' before the segment body.

        Args:
            line: One raw corpus line.

        Returns:
            Token IDs, always wrapped in BOS_ID ... EOS_ID.
        """
        import re

        line = line.strip()
        tokens = [self.BOS_ID]
        bar_id = self.token_to_id['|']
        colon_id = self.token_to_id[':']

        # ASCII-only digit class: `\d` would also match non-ASCII Unicode
        # digits, which are not in the vocabulary and would raise KeyError.
        if re.match(r'^D[0-9]\|', line):
            tokens.append(self.token_to_id[line[1]])
            tokens.append(bar_id)
            line = line[3:]

        for seg_idx, segment in enumerate(line.split('|')):
            if seg_idx > 0:
                tokens.append(bar_id)

            keyword = next(
                (kw for kw in ('GIVEN', 'QED', 'OUT', 'FILL', 'NEXT')
                 if segment.startswith(kw + ':')),
                None,
            )
            if keyword is not None:
                tokens.append(self.token_to_id.get(keyword, 0))
                tokens.append(colon_id)
                body = segment[len(keyword) + 1:]
            else:
                step_match = re.match(r'^STEP([0-9]+):', segment)
                if step_match:
                    tokens.append(self.token_to_id['STEP'])
                    tokens.extend(
                        self.token_to_id[d] for d in step_match.group(1)
                    )
                    tokens.append(colon_id)
                    body = segment[step_match.end():]
                else:
                    body = segment
            tokens.extend(self.encode(body, add_special=False))

        tokens.append(self.EOS_ID)
        return tokens

    def decode(self, token_ids: List[int], skip_special: bool = True) -> str:
        """Decode token IDs back to string.

        Args:
            token_ids: IDs to decode.
            skip_special: When true, PAD/BOS/EOS IDs are omitted from the
                output.

        Returns:
            The concatenated token strings; IDs with no assigned token are
            rendered as ``<?id>``.
        """
        special_ids = (self.PAD_ID, self.BOS_ID, self.EOS_ID)
        return ''.join(
            self.id_to_token.get(tid, f'<?{tid}>')
            for tid in token_ids
            if not (skip_special and tid in special_ids)
        )
|
|
|
|
| |
|
|