# CRAYON-tokenizer/src/crayon/core/tokenizer.py
# Phase-Technologies: "Upload folder using huggingface_hub"
# Commit 708f4a3 (verified)
from typing import List
from .vocabulary import CrayonVocab
# Optional native accelerator: record whether the compiled `_core` module
# could be imported so callers can choose between it and pure Python.
try:
    from ..c_ext import _core
except ImportError:
    _C_EXT_AVAILABLE = False
else:
    _C_EXT_AVAILABLE = True
def crayon_tokenize(text: str, vocab: CrayonVocab) -> List[int]:
    """Convert ``text`` into a list of token ids using greedy longest-match.

    Two execution paths exist:

    * Native path: taken when the C extension was imported, the vocabulary
      reports it as available, and the vocabulary holds a built C trie. The
      whole job is delegated to ``_core.crayon_tokenize_fast``.
    * Pure-Python path: walks ``text`` left to right, asking
      ``vocab.longest_match`` for the longest token starting at the current
      position. A position with no match emits ``vocab.unk_token_id`` and is
      skipped by exactly one character.

    Args:
        text: The string to tokenize.
        vocab: Vocabulary providing ``longest_match(text, position)``, which
            returns a ``(token_id, match_length)`` pair, and ``unk_token_id``.

    Returns:
        The token ids in the order they were matched. An empty ``text``
        yields an empty list on the pure-Python path.
    """
    native_ready = (
        _C_EXT_AVAILABLE
        and vocab._c_ext_available
        and vocab._c_trie is not None
    )
    if native_ready:
        return _core.crayon_tokenize_fast(text, vocab._c_trie, vocab.unk_token_id)

    # Pure-Python fallback. Bound methods and attributes are hoisted into
    # locals so the loop body avoids repeated attribute lookups.
    ids: List[int] = []
    cursor: int = 0
    end: int = len(text)
    find_longest = vocab.longest_match
    emit = ids.append
    fallback_id = vocab.unk_token_id

    while cursor < end:
        matched_id, span = find_longest(text, cursor)
        hit = span > 0
        # On a miss, emit the unknown id and step over one character so the
        # scan always makes progress.
        emit(matched_id if hit else fallback_id)
        cursor += span if hit else 1

    return ids