|
|
| """ |
| Double-Array Trie (DAT) Compiler for Crayon. |
| Compiles a sorted vocabulary list into a highly compressed, cache-local binary format (.dat). |
| |
| Algorithm: |
| - Base[s] + c = t |
| - Check[t] = s |
| """ |
|
|
| import struct |
| import sys |
| import array |
| from typing import List, Tuple, Dict |
|
|
| class DATBuilder: |
| def __init__(self): |
| |
| |
| |
| self.base = array.array('i', [0] * 1024) |
| self.check = array.array('i', [0] * 1024) |
| self.used = array.array('b', [0] * 1024) |
| self.check[0] = 0 |
| self.size = 1024 |
| self.max_idx = 0 |
| |
| |
| self.output = {} |
|
|
| def _resize(self, new_size): |
| if new_size <= self.size: |
| return |
| |
| extension = [0] * (new_size - self.size) |
| self.base.extend(extension) |
| self.check.extend(extension) |
| self.used.extend([0] * (new_size - self.size)) |
| self.size = new_size |
|
|
| def _find_base(self, children_keys: List[int]) -> int: |
| """Finds a base offset 'b' such that check[b + c] are all empty for each c in children.""" |
| if not children_keys: |
| return 1 |
| |
| first = children_keys[0] |
| |
| b = 1 |
| while True: |
| |
| pos = b + first |
| if pos >= self.size: |
| self._resize(pos + 256) |
| |
| if self.check[pos] != 0: |
| |
| b += 1 |
| continue |
| |
| |
| overlap = False |
| max_pos = 0 |
| for k in children_keys: |
| p = b + k |
| if p >= self.size: |
| self._resize(p + 256) |
| max_pos = max(max_pos, p) |
| |
| if self.check[p] != 0: |
| overlap = True |
| break |
| |
| if not overlap: |
| return b |
| |
| b += 1 |
|
|
| def build(self, tokens: List[str]) -> bytes: |
| """ |
| Builds the Double-Array Trie from sorted tokens. |
| """ |
| |
| |
| trie = {'id': -1, 'children': {}} |
| |
| for i, token in enumerate(tokens): |
| node = trie |
| for char in token: |
| key = ord(char) |
| if key not in node['children']: |
| node['children'][key] = {'id': -1, 'children': {}} |
| node = node['children'][key] |
| node['id'] = i |
| |
| |
| |
| queue: List[Tuple[Dict, int]] = [(trie, 0)] |
| |
| |
| self.base[0] = 1 |
| self._resize(256) |
| |
| processed_count = 0 |
| |
| while queue: |
| node, state = queue.pop(0) |
| |
| if node['id'] != -1: |
| self.output[state] = node['id'] |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| pass |
|
|
| children = node['children'] |
| if not children: |
| continue |
| |
| sorted_keys = sorted(children.keys()) |
| |
| |
| base_offset = self._find_base(sorted_keys) |
| self.base[state] = base_offset |
| |
| |
| for k in sorted_keys: |
| next_state = base_offset + k |
| self.check[next_state] = state |
| self.used[next_state] = 1 |
| self.max_idx = max(self.max_idx, next_state) |
| |
| queue.append((children[k], next_state)) |
| |
| processed_count += 1 |
| if processed_count % 1000 == 0: |
| print(f"Compiled {processed_count} states...", end='\r') |
|
|
| print(f"\nDAT Construction Complete. {self.max_idx} states.") |
| return self._serialize() |
|
|
| def _serialize(self) -> bytes: |
| """ |
| Format: |
| [HEADER: 16 bytes] |
| - Magic: "CRYN" (4) |
| - Version: 1 (4) |
| - Size: int (4) |
| [BODY] |
| - Base: int32 * size |
| - Check: int32 * size |
| - Terminals: int32 * size (Token mapping) |
| """ |
| |
| final_size = self.max_idx + 1 |
| |
| |
| terminals = array.array('i', [-1] * final_size) |
| for state, pid in self.output.items(): |
| if state < final_size: |
| terminals[state] = pid |
| |
| header = struct.pack('<4sII', b'CRYN', 1, final_size) |
| |
| |
| final_base = self.base[:final_size] |
| final_check = self.check[:final_size] |
| |
| print(f"Serialized Size: {(final_size * 12 + 12) / 1024 / 1024:.2f} MB") |
| |
| return ( |
| header + |
| final_base.tobytes() + |
| final_check.tobytes() + |
| terminals.tobytes() |
| ) |
|
|
| def compile_dat(tokens: List[str], output_path: str): |
| builder = DATBuilder() |
| data = builder.build(tokens) |
| with open(output_path, 'wb') as f: |
| f.write(data) |
| print(f"Saved: {output_path}") |
|
|
|
|