File size: 1,601 Bytes
708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from typing import List
from .vocabulary import CrayonVocab

# Optional native backend: attempt to load the compiled ``_core`` module and
# record the outcome in ``_C_EXT_AVAILABLE`` so the tokenizer can fall back to
# its pure-Python loop when the extension is missing.
try:
    from ..c_ext import _core
except ImportError:
    _C_EXT_AVAILABLE = False
else:
    _C_EXT_AVAILABLE = True

def crayon_tokenize(text: str, vocab: CrayonVocab) -> List[int]:
    """Tokenize ``text`` into token ids using greedy longest-match.

    Two routes are available:

    * Native route: taken when the module-level ``_C_EXT_AVAILABLE`` flag is
      set, ``vocab._c_ext_available`` is truthy and ``vocab._c_trie`` is not
      ``None``. The call is handed to ``_core.crayon_tokenize_fast`` and its
      result is returned unchanged.
    * Pure-Python route: the text is scanned left to right. At every position
      ``vocab.longest_match(text, position)`` supplies a
      ``(token_id, match_length)`` pair. A positive length emits that token
      and skips the matched span; otherwise ``vocab.unk_token_id`` is emitted
      and the scan moves forward by one character.

    Args:
        text: The string to tokenize.
        vocab: Vocabulary object providing ``longest_match``,
            ``unk_token_id`` and, optionally, a prebuilt C trie.

    Returns:
        The token ids in text order. On the pure-Python route an empty
        ``text`` yields an empty list.

    Note:
        The pure-Python route calls ``longest_match`` once per emitted token,
        so it is linear in ``len(text)`` provided each call does a bounded
        amount of work (assumed; that method is not visible from here).
    """
    # Native route: only consult the trie once the extension is known to be
    # importable and the vocabulary says it can use it.
    if _C_EXT_AVAILABLE and vocab._c_ext_available:
        native_trie = vocab._c_trie
        if native_trie is not None:
            return _core.crayon_tokenize_fast(text, native_trie, vocab.unk_token_id)

    # Pure-Python route. Bound methods and the unknown id are cached in
    # locals so the loop body avoids repeated attribute lookups.
    out: List[int] = []
    end: int = len(text)
    find_longest = vocab.longest_match
    emit = out.append
    fallback_id = vocab.unk_token_id

    cursor: int = 0
    while cursor < end:
        matched_id, span = find_longest(text, cursor)
        hit = span > 0
        # No match at this position: emit the unknown id and step a single
        # character so the scan always makes progress.
        emit(matched_id if hit else fallback_id)
        cursor += span if hit else 1

    return out