CRAYON-tokenizer / src /crayon /memory /zerocopy.py
Phase-Technologies's picture
Upload folder using huggingface_hub
708f4a3 verified
import mmap
import os
from typing import Iterator, Tuple, List
from ..core.vocabulary import CrayonVocab
class ZeroCopyTokenizer:
"""
Zero-copy tokenizer minimizing memory allocation and data movement.
Uses OS virtual memory (mmap) to handle files larger than RAM[cite: 844].
"""
def __init__(self, vocab: CrayonVocab):
self.vocab = vocab
def tokenize_file_zerocopy(self, file_path: str) -> Iterator[Tuple[int, int]]:
"""
Tokenize large files without loading entire content into memory.
Yields: (token_id, file_offset)
"""
file_size = os.path.getsize(file_path)
chunk_size = 64 * 1024 # 64KB fits L2 cache [cite: 858]
overlap = 1024 # Safety margin for boundary tokens
with open(file_path, 'rb') as f:
# Memory map the entire file [cite: 854]
with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmapped:
offset = 0
while offset < file_size:
chunk_end = min(offset + chunk_size, file_size)
# Create zero-copy memoryview [cite: 860]
# Includes overlap to catch tokens spanning chunks
view_end = min(chunk_end + overlap, file_size)
# Convert to bytes immediately to avoid holding mmap reference
chunk_bytes = bytes(mmapped[offset:view_end])
# Process chunk
# Note: We pass is_last to know if we can consume the very end
is_last = (chunk_end == file_size)
tokens, consumed = self._tokenize_chunk_with_boundaries(
memoryview(chunk_bytes), offset, is_last
)
for tid in tokens:
yield tid, offset # In reality, offset needs strict tracking per token
# Advance
offset += consumed
def _tokenize_chunk_with_boundaries(self,
chunk_view: memoryview,
base_offset: int,
is_last: bool) -> Tuple[List[int], int]:
"""
Tokenize memory chunk handling token boundaries at edges[cite: 877].
"""
# Decode (copy happens here unfortunately in Python, unless C-ext used)
# In strict zero-copy C-ext, we'd pass the pointer directly.
try:
text = chunk_view.tobytes().decode('utf-8')
except UnicodeDecodeError:
# Handle partial UTF-8 at end of view
text = chunk_view.tobytes().decode('utf-8', errors='ignore')
tokens = []
pos = 0
text_len = len(text)
limit = text_len if is_last else text_len - 100 # Safety margin [cite: 892]
while pos < text_len:
# Stop if we are in the danger zone (overlap area) and not at EOF
if not is_last and pos > limit:
break
token_id, match_len = self.vocab.longest_match(text, pos)
if match_len > 0:
tokens.append(token_id)
pos += match_len
else:
tokens.append(self.vocab.unk_token_id)
pos += 1
# Calculate actual bytes consumed to adjust file offset correctly
# This part is tricky in Python due to char vs byte length mismatch
consumed_bytes = len(text[:pos].encode('utf-8'))
return tokens, consumed_bytes