File size: 3,721 Bytes
708f4a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import mmap
import os
from typing import Iterator, Tuple, List
from ..core.vocabulary import CrayonVocab

class ZeroCopyTokenizer:
    """
    Tokenizer that streams a file through an OS memory map (mmap).

    The file is mapped read-only and processed in fixed-size windows, so only
    one window is decoded at a time. Decoding to ``str`` still copies data in
    pure Python; avoiding that copy entirely would require a C extension.

    The vocabulary object must provide ``longest_match(text, pos)`` returning
    ``(token_id, match_len)`` and an ``unk_token_id`` attribute.
    """

    def __init__(self, vocab: "CrayonVocab"):
        self.vocab = vocab

    def tokenize_file_zerocopy(self, file_path: str) -> Iterator[Tuple[int, int]]:
        """
        Tokenize a file window by window without reading it all into memory.

        Args:
            file_path: Path of the file to tokenize; opened in binary mode.

        Yields:
            ``(token_id, file_offset)`` pairs, where ``file_offset`` is the
            byte offset in the file at which that token starts.

        Raises:
            OSError: If the file cannot be opened or mapped.
            RuntimeError: If a window makes no progress (guards against an
                endless loop).

        An empty file yields nothing. Each undecodable byte sequence is
        reported as one unknown token.
        """
        chunk_size = 64 * 1024  # nominal window size in bytes
        overlap = 1024  # extra bytes mapped past the window for boundary tokens

        with open(file_path, 'rb') as f:
            # mmap refuses to map a zero-length file, so handle it up front.
            if os.fstat(f.fileno()).st_size == 0:
                return

            with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmapped:
                file_size = len(mmapped)
                offset = 0

                while offset < file_size:
                    chunk_end = min(offset + chunk_size, file_size)
                    # The window is extended by `overlap` so a token that
                    # straddles `chunk_end` can still be matched whole.
                    view_end = min(chunk_end + overlap, file_size)
                    # Slicing an mmap already returns an independent bytes
                    # object, so no reference to the mapping is kept.
                    chunk_bytes = mmapped[offset:view_end]

                    # Only the final window may be consumed right to its end.
                    is_last = (chunk_end == file_size)
                    tokens, offsets, consumed = self._tokenize_chunk_with_offsets(
                        memoryview(chunk_bytes), offset, is_last
                    )

                    if consumed <= 0:
                        raise RuntimeError(
                            f"tokenizer made no progress at byte offset {offset}"
                        )

                    yield from zip(tokens, offsets)

                    # Advance by the bytes actually tokenized, not by the
                    # window size, so the next window starts on a token edge.
                    offset += consumed

    def _tokenize_chunk_with_boundaries(self,
                                      chunk_view: memoryview,
                                      base_offset: int,
                                      is_last: bool) -> Tuple[List[int], int]:
        """
        Tokenize one window and report how many of its bytes were consumed.

        Args:
            chunk_view: Raw bytes of the window.
            base_offset: File offset of the first byte of the window.
            is_last: True when the window reaches the end of the file.

        Returns:
            ``(tokens, consumed_bytes)``. See ``_tokenize_chunk_with_offsets``
            for the stopping rules.
        """
        tokens, _, consumed_bytes = self._tokenize_chunk_with_offsets(
            chunk_view, base_offset, is_last
        )
        return tokens, consumed_bytes

    def _tokenize_chunk_with_offsets(self,
                                     chunk_view: memoryview,
                                     base_offset: int,
                                     is_last: bool) -> Tuple[List[int], List[int], int]:
        """
        Tokenize one window, tracking the byte offset of every token.

        Only the longest cleanly decodable UTF-8 prefix of the window is
        tokenized, so character positions map back to exact byte counts.

        Stopping rules:
          * Window not last and the prefix runs to the window end (or is cut
            by a multi-byte character split at the window end): stop once the
            position passes ``len(text) - 100`` characters, leaving the tail
            for the next window.
          * Otherwise (last window, or an undecodable byte sequence inside
            the window): tokenize the whole prefix. If an undecodable sequence
            follows, emit one unknown token for it and consume its bytes; the
            caller resumes right after it.

        Args:
            chunk_view: Raw bytes of the window.
            base_offset: File offset of the first byte of the window.
            is_last: True when the window reaches the end of the file.

        Returns:
            ``(tokens, offsets, consumed_bytes)`` where ``offsets[i]`` is
            ``base_offset`` plus the byte position of ``tokens[i]`` within
            the window.
        """
        data = chunk_view.tobytes()
        total = len(data)

        bad_len = 0         # length of the undecodable sequence to skip, if any
        hard_end = is_last  # True when the whole decoded prefix may be consumed
        try:
            text = data.decode('utf-8')
            valid_len = total
        except UnicodeDecodeError as exc:
            valid_len = exc.start
            text = data[:valid_len].decode('utf-8')
            if is_last or exc.end < total:
                # Genuinely invalid bytes: no token can span them, so the
                # prefix ends on a real boundary.
                hard_end = True
                bad_len = exc.end - exc.start
            # else: the error touches the window end and more data follows;
            # the next window will see those bytes in full.

        tokens: List[int] = []
        offsets: List[int] = []
        text_len = len(text)
        limit = text_len if hard_end else text_len - 100  # safety margin
        # One byte per character means character index == byte index.
        ascii_only = (text_len == valid_len)
        unk_id = self.vocab.unk_token_id

        pos = 0
        byte_pos = 0
        while pos < text_len:
            # Stop inside the safety margin unless the prefix end is final.
            if not hard_end and pos > limit:
                break

            token_id, match_len = self.vocab.longest_match(text, pos)
            if match_len > 0:
                step = match_len
            else:
                # No vocabulary entry matches: emit unknown for one character.
                token_id = unk_id
                step = 1

            tokens.append(token_id)
            offsets.append(base_offset + byte_pos)

            end = min(pos + step, text_len)
            if ascii_only:
                byte_pos = end
            else:
                byte_pos += len(text[pos:end].encode('utf-8'))
            pos = end

        if bad_len:
            # Represent the undecodable sequence as a single unknown token and
            # step over it so the caller always advances.
            tokens.append(unk_id)
            offsets.append(base_offset + byte_pos)
            byte_pos += bad_len

        return tokens, offsets, byte_pos