|
|
| """ |
| Hyper-Production Double-Array Trie (DAT) Compiler. |
| Compiles standard JSON vocabulary into cache-optimized binary arrays. |
| Algorithm: First-Fit Linear Scan with Collision Resolution. |
| """ |
|
|
| import struct |
| import json |
| import logging |
| from typing import List, Dict, Tuple, Optional |
|
|
| |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - [DAT-BUILDER] - %(message)s') |
|
|
| class DATBuilder: |
| def __init__(self): |
| |
| self.init_size = 65536 |
| self.base = [1] * self.init_size |
| self.check = [-1] * self.init_size |
| self.values = [-1] * self.init_size |
| |
| |
| self.base[0] = 1 |
| self.check[0] = 0 |
| |
| self.size = self.init_size |
| self.next_check_pos = 1 |
|
|
| def _resize(self, required_index: int): |
| """Exponential resizing strategy to amortize cost.""" |
| if required_index < self.size: |
| return |
|
|
| new_size = max(required_index + 1024, self.size * 2) |
| expand_count = new_size - self.size |
| |
| self.base.extend([1] * expand_count) |
| self.check.extend([-1] * expand_count) |
| self.values.extend([-1] * expand_count) |
| self.size = new_size |
|
|
| def _find_base(self, children_codes: List[int]) -> int: |
| """ |
| Finds a base offset 'q' such that for all char_code 'c': |
| check[q + c] is available (== -1). |
| """ |
| if not children_codes: |
| return 1 |
|
|
| |
| q = self.next_check_pos |
| first_char = children_codes[0] |
|
|
| while True: |
| |
| if q + first_char >= self.size: |
| self._resize(q + first_char + 256) |
| |
| |
| if self.check[q + first_char] != -1: |
| q += 1 |
| continue |
| |
| |
| collision = False |
| max_idx_needed = 0 |
| |
| for c in children_codes: |
| idx = q + c |
| if idx >= self.size: |
| self._resize(idx + 1024) |
| |
| if self.check[idx] != -1: |
| collision = True |
| break |
| |
| if idx > max_idx_needed: |
| max_idx_needed = idx |
| |
| if not collision: |
| |
| if q == self.next_check_pos: |
| self.next_check_pos += 1 |
| return q |
| |
| q += 1 |
|
|
| def build(self, vocab: List[str]) -> None: |
| """ |
| Compiles the list of strings into the DAT structure. |
| """ |
| logging.info(f"Compiling vocabulary of {len(vocab)} tokens...") |
| |
| |
| root = {'children': {}, 'val': -1} |
| for token_id, token in enumerate(vocab): |
| node = root |
| |
| for byte_val in token.encode('utf-8'): |
| if byte_val not in node['children']: |
| node['children'][byte_val] = {'children': {}, 'val': -1} |
| node = node['children'][byte_val] |
| node['val'] = token_id |
|
|
| |
| |
| queue = [(root, 0)] |
| |
| processed_nodes = 0 |
| |
| while queue: |
| curr_node, curr_dat_idx = queue.pop(0) |
| children_map = curr_node['children'] |
| |
| if not children_map: |
| continue |
|
|
| |
| children_bytes = sorted(children_map.keys()) |
| |
| |
| base_offset = self._find_base(children_bytes) |
| self.base[curr_dat_idx] = base_offset |
| |
| |
| for byte_val in children_bytes: |
| child_node = children_map[byte_val] |
| next_dat_idx = base_offset + byte_val |
| |
| self.check[next_dat_idx] = curr_dat_idx |
| self.values[next_dat_idx] = child_node['val'] |
| |
| queue.append((child_node, next_dat_idx)) |
| |
| processed_nodes += 1 |
| |
| |
| |
| last_used = 0 |
| for i in range(self.size - 1, -1, -1): |
| if self.check[i] != -1 or self.base[i] != 1: |
| last_used = i |
| break |
| |
| final_size = last_used + 1 |
| self.base = self.base[:final_size] |
| self.check = self.check[:final_size] |
| self.values = self.values[:final_size] |
| self.size = final_size |
| |
| logging.info(f"Compilation Complete. Final Array Size: {self.size}") |
|
|
| def save(self, output_path: str): |
| """ |
| Saves the memory-mappable binary format. |
| Format: [MAGIC 4b][VER 4b][SIZE 4b][BASE int32 array][CHECK int32 array][VALS int32 array] |
| """ |
| logging.info(f"Saving binary to {output_path}...") |
| |
| with open(output_path, "wb") as f: |
| |
| f.write(b"CRAY") |
| f.write(struct.pack("<I", 2)) |
| f.write(struct.pack("<I", self.size)) |
| |
| |
| |
| fmt = f"<{self.size}i" |
| f.write(struct.pack(fmt, *self.base)) |
| f.write(struct.pack(fmt, *self.check)) |
| f.write(struct.pack(fmt, *self.values)) |
| |
| logging.info("Save successful.") |
|
|