File size: 1,601 Bytes
from typing import List
from .vocabulary import CrayonVocab
# Optional accelerator: probe for the compiled `_core` module once at import
# time and record the outcome in a module-level flag. On failure `_core` is
# left unbound, so callers must check `_C_EXT_AVAILABLE` before touching it.
try:
    from ..c_ext import _core
except ImportError:
    _C_EXT_AVAILABLE = False
else:
    _C_EXT_AVAILABLE = True
def crayon_tokenize(text: str, vocab: CrayonVocab) -> List[int]:
    """
    Turn ``text`` into a list of token ids using greedy longest-match lookup.

    Two execution paths exist:

    * Native path: taken when the module-level ``_C_EXT_AVAILABLE`` flag is
      set, ``vocab._c_ext_available`` is truthy and ``vocab._c_trie`` is not
      ``None``. The call is forwarded to ``_core.crayon_tokenize_fast`` and
      its result is returned unchanged.
    * Pure-Python path: scans ``text`` left to right. At each position
      ``vocab.longest_match(text, position)`` is expected to return a
      ``(token_id, match_length)`` pair. A positive length emits that id and
      skips the matched span; otherwise ``vocab.unk_token_id`` is emitted and
      the scan moves forward by one character.

    Args:
        text: The input string to tokenize.
        vocab: Vocabulary object supplying ``longest_match``,
            ``unk_token_id`` and the C-trie attributes read above.

    Returns:
        The token ids in input order; an empty list for empty ``text``.

    Note:
        The fallback loop advances by at least one character per iteration,
        so it makes at most ``len(text)`` calls to ``longest_match``. The
        overall running time depends on the cost of that call, which is not
        visible from this function.
    """
    # Fast path: hand everything to the compiled tokenizer when it is usable.
    use_native = (
        _C_EXT_AVAILABLE
        and vocab._c_ext_available
        and vocab._c_trie is not None
    )
    if use_native:
        return _core.crayon_tokenize_fast(
            text, vocab._c_trie, vocab.unk_token_id
        )

    # Fallback: bind everything the loop needs to locals up front so the hot
    # loop performs no attribute lookups.
    end = len(text)
    find_longest = vocab.longest_match
    fallback_id = vocab.unk_token_id
    out: List[int] = []
    emit = out.append

    cursor = 0
    while cursor < end:
        matched_id, span = find_longest(text, cursor)
        if span > 0:
            # Known token: record it and jump past the whole match.
            emit(matched_id)
            cursor += span
            continue
        # Nothing in the vocabulary starts here: emit the unknown-token id
        # and step over a single character so the scan always progresses.
        emit(fallback_id)
        cursor += 1
    return out