| """ |
| Crayon Vocabulary Training Module. |
| |
| Implements Algorithm 3.1 from the XERV Crayon Engineering Treatise: |
| - Extract substring candidates up to SIMD limit (16 bytes) |
| - Calculate information gain with entropy reduction |
| - Select top-K candidates maximizing gain-to-cost ratio |
| |
| This is the production-grade implementation for building optimal vocabularies |
| from either user-provided corpora or the built-in default sources. |
| """ |
|
|
| import math |
| import logging |
| import string |
| from collections import defaultdict |
| from typing import List, Tuple, Dict, Iterator, Optional, Callable |
|
|
| |
| logger = logging.getLogger(__name__) |
|
|
| |
| MAX_TOKEN_LENGTH = 16 |
|
|
| |
| DEFAULT_MIN_FREQUENCY = 2 |
|
|
|
|
| def build_default_vocabulary( |
| target_size: int = 500000, |
| progress_callback: Optional[Callable[[str], None]] = None |
| ) -> List[str]: |
| """ |
| Builds a 'Batteries-Included' vocabulary using Xerv-AI's curated datasets. |
| |
| Sources: |
| - Xerv-AI/GRAD (Graduate Mathematics) |
| - Xerv-AI/Physics-dataset-700 (Scientific Reasoning) |
| - Xerv-AI/RainDrop-DTS (General Instruction) |
| - Tiny Shakespeare (Classical Literature) |
| - Built-in corpus (Baseline Coverage) |
| |
| No local files are required; data is streamed directly into the entropy engine. |
| |
| Args: |
| target_size: Maximum vocabulary size (default 500k) |
| progress_callback: Optional callback for progress updates |
| |
| Returns: |
| List of token strings ordered by utility |
| """ |
| from .resources import get_default_corpus_iterator |
| |
| if progress_callback: |
| progress_callback("Initializing default corpus stream...") |
| |
| corpus_stream = get_default_corpus_iterator() |
| return train_vocabulary( |
| corpus_stream, |
| target_size=target_size, |
| progress_callback=progress_callback |
| ) |
|
|
|
|
| def train_vocabulary( |
| corpus_iterator: Iterator[str], |
| target_size: int = 500000, |
| min_frequency: int = DEFAULT_MIN_FREQUENCY, |
| progress_callback: Optional[Callable[[str], None]] = None |
| ) -> List[str]: |
| """ |
| Constructs an optimal vocabulary from a corpus using first-principles entropy analysis. |
| |
| Algorithm 3.1 [cite: 127-135]: |
| 1. Extract all substrings up to MAX_TOKEN_LENGTH (16 bytes for AVX2). |
| 2. Calculate Information Gain: Gain(s) = Frequency(s) × Entropy(s) - Cost(s). |
| 3. Select Top-K candidates maximizing utility score. |
| |
| Args: |
| corpus_iterator: Iterator yielding chunks/lines of text |
| target_size: Maximum vocabulary size (default 500k) |
| min_frequency: Minimum token frequency threshold |
| progress_callback: Optional callback for progress updates |
| |
| Returns: |
| List of token strings ordered for stable ID assignment |
| """ |
| if progress_callback: |
| progress_callback("Starting Entropy-Guided Vocabulary Construction...") |
| |
| logger.info("Starting Entropy-Guided Vocabulary Construction...") |
| |
| |
| |
| |
| candidates: Dict[str, int] = defaultdict(int) |
| total_chars = 0 |
| chunk_count = 0 |
| |
| |
| for text_chunk in corpus_iterator: |
| if not text_chunk: |
| continue |
| |
| text_len = len(text_chunk) |
| total_chars += text_len |
| chunk_count += 1 |
| |
| |
| for i in range(text_len): |
| |
| limit = min(i + MAX_TOKEN_LENGTH, text_len) |
| for j in range(i + 1, limit + 1): |
| token = text_chunk[i:j] |
| |
| |
| if len(token.encode('utf-8')) <= MAX_TOKEN_LENGTH: |
| candidates[token] += 1 |
| |
| |
| if chunk_count % 100 == 0 and progress_callback: |
| progress_callback(f"Processed {chunk_count} chunks, {len(candidates):,} candidates...") |
| |
| if progress_callback: |
| progress_callback(f"Extracted {len(candidates):,} unique candidates from {total_chars:,} chars") |
| |
| logger.info(f"Extracted {len(candidates):,} unique candidates from {total_chars:,} chars.") |
|
|
| |
| |
| |
| if progress_callback: |
| progress_callback("Scoring candidates by information gain...") |
| |
| scored_candidates: List[Tuple[str, float]] = [] |
| |
| for token, freq in candidates.items(): |
| |
| if freq < min_frequency: |
| continue |
| |
| |
| if not token or not token.isprintable(): |
| continue |
| |
| |
| p_s = freq / total_chars |
| if p_s <= 0: |
| continue |
| |
| |
| |
| entropy = -math.log2(p_s) |
| |
| |
| |
| byte_length = len(token.encode('utf-8')) |
| comp_cost = byte_length * 0.1 + 1.0 |
| |
| |
| |
| gain = (entropy * freq) / comp_cost |
| |
| scored_candidates.append((token, gain)) |
|
|
| if progress_callback: |
| progress_callback(f"Scored {len(scored_candidates):,} viable candidates") |
| |
| logger.info(f"Scored {len(scored_candidates):,} viable candidates") |
|
|
| |
| |
| |
| if progress_callback: |
| progress_callback("Building final vocabulary...") |
| |
| |
| scored_candidates.sort(key=lambda x: x[1], reverse=True) |
| |
| |
| vocab_set: set = set() |
| |
| |
| specials = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"] |
| for s in specials: |
| vocab_set.add(s) |
| |
| |
| for c in string.printable: |
| if c not in vocab_set and c.strip(): |
| vocab_set.add(c) |
| |
| |
| for i in range(256): |
| try: |
| char = chr(i) |
| if char.isprintable() and char not in vocab_set: |
| vocab_set.add(char) |
| except (ValueError, UnicodeDecodeError): |
| pass |
| |
| |
| remaining_slots = target_size - len(vocab_set) |
| added_count = 0 |
| |
| for token, gain in scored_candidates: |
| if added_count >= remaining_slots: |
| break |
| if token not in vocab_set: |
| vocab_set.add(token) |
| added_count += 1 |
| |
| final_vocab = list(vocab_set) |
| |
| if progress_callback: |
| progress_callback(f"Final vocabulary: {len(final_vocab):,} tokens") |
| |
| logger.info(f"Final vocabulary: {len(final_vocab):,} tokens") |
| |
| return final_vocab |
|
|
|
|
| def calculate_corpus_entropy(corpus_iterator: Iterator[str]) -> float: |
| """ |
| Calculate Shannon entropy of a corpus [cite: 93-96]. |
| |
| H(X) = -Σ p(x) log2(p(x)) |
| |
| Args: |
| corpus_iterator: Iterator yielding text chunks |
| |
| Returns: |
| Entropy in bits per character |
| """ |
| char_counts: Dict[str, int] = defaultdict(int) |
| total = 0 |
| |
| for chunk in corpus_iterator: |
| for char in chunk: |
| char_counts[char] += 1 |
| total += 1 |
| |
| if total == 0: |
| return 0.0 |
| |
| entropy = 0.0 |
| for count in char_counts.values(): |
| p = count / total |
| if p > 0: |
| entropy -= p * math.log2(p) |
| |
| return entropy |
|
|
|
|
| def estimate_optimal_vocab_size(entropy: float, epsilon: float = 0.5) -> int: |
| """ |
| Calculate optimal vocabulary size from corpus entropy [cite: 94]. |
| |
| V_optimal ≈ 2^(H(corpus) + ε) |
| |
| For English text (H ≈ 1.2 bits/char), this yields ~500k tokens. |
| |
| Args: |
| entropy: Corpus entropy in bits per character |
| epsilon: Adjustment factor (default 0.5) |
| |
| Returns: |
| Estimated optimal vocabulary size |
| """ |
| return int(2 ** (entropy + epsilon)) |
|
|