| from typing import List | |
| from .vocabulary import CrayonVocab | |
| # Try importing C-extension | |
| try: | |
| from ..c_ext import _core | |
| _C_EXT_AVAILABLE = True | |
| except ImportError: | |
| _C_EXT_AVAILABLE = False | |
def crayon_tokenize(text: str, vocab: CrayonVocab) -> List[int]:
    """Convert ``text`` into a list of token ids using greedy longest-match.

    Two execution paths exist:

    * Native path: taken when the module-level ``_C_EXT_AVAILABLE`` flag is
      set, ``vocab._c_ext_available`` is truthy and ``vocab._c_trie`` is not
      ``None``. The work is delegated entirely to
      ``_core.crayon_tokenize_fast`` and its result is returned unchanged.
    * Pure-Python fallback: scans ``text`` left to right, asking
      ``vocab.longest_match(text, position)`` for a
      ``(token_id, match_length)`` pair at each position.

    Args:
        text: The string to tokenize.
        vocab: Vocabulary providing ``longest_match``, ``unk_token_id`` and
            the C-extension state attributes read above.

    Returns:
        Token ids in input order. In the fallback path, every position with
        no positive-length match contributes one ``vocab.unk_token_id`` and
        consumes exactly one character.

    Note:
        The fallback makes one ``longest_match`` call per emitted token, so
        it is linear in ``len(text)`` provided each call does bounded work
        (presumably true via a capped lookahead; confirm in ``CrayonVocab``).
    """
    # Same short-circuit order as a chained `and`: the vocab attributes are
    # only touched once the module-level import flag is known to be set.
    native_ready = (
        _C_EXT_AVAILABLE
        and vocab._c_ext_available
        and vocab._c_trie is not None
    )
    if native_ready:
        return _core.crayon_tokenize_fast(
            text, vocab._c_trie, vocab.unk_token_id
        )

    # Fallback: bind everything the loop needs to locals up front so the
    # loop body performs no attribute lookups.
    end: int = len(text)
    find_longest = vocab.longest_match
    result: List[int] = []
    emit = result.append
    fallback_id = vocab.unk_token_id

    cursor: int = 0
    while cursor < end:
        matched_id, span = find_longest(text, cursor)
        if span > 0:
            emit(matched_id)
        else:
            # Nothing in the vocabulary starts here: emit the unknown token
            # and step over a single character so the scan always advances.
            emit(fallback_id)
            span = 1
        cursor += span

    return result