Xerv-AI
/

CRAYON-tokenizer

Text Generation

hardware-accelerated

double-array-trie

Model card Files Files and versions

CRAYON-tokenizer / src /crayon /memory /zerocopy.py

Phase-Technologies's picture

Phase-Technologies

Upload folder using huggingface_hub

708f4a3 verified 4 days ago

history blame contribute delete

3.72 kB

	import mmap
	import os
	from typing import Iterator, Tuple, List
	from ..core.vocabulary import CrayonVocab

	class ZeroCopyTokenizer:
	    """
	    Streaming tokenizer for files that may be larger than RAM.

	    The file is memory-mapped and walked in fixed-size windows; each window
	    is decoded as UTF-8 and matched greedily against ``vocab`` via
	    ``vocab.longest_match(text, pos)``. Positions with no match produce
	    ``vocab.unk_token_id`` and advance by one character.

	    Bytes that are not valid UTF-8 are presented to the vocabulary as
	    U+FFFD (one character per invalid byte) so that they stay visible in
	    the token stream and byte offsets remain exact.
	    """

	    # Bytes of the file covered by one window, before the overlap is added.
	    CHUNK_SIZE = 64 * 1024
	    # Extra bytes read past each window so tokens straddling an edge are seen whole.
	    OVERLAP = 1024
	    # Characters at the tail of a non-final window that are left for the next window.
	    SAFETY_MARGIN = 100
	    # ``surrogateescape`` turns each undecodable byte into U+DC80..U+DCFF; this
	    # table swaps those for U+FFFD so the vocabulary never sees lone surrogates.
	    _INVALID_BYTE_MAP = dict.fromkeys(range(0xDC80, 0xDD00), 0xFFFD)

	    def __init__(self, vocab: "CrayonVocab"):
	        """Store the vocabulary used for longest-match lookups."""
	        self.vocab = vocab

	    def tokenize_file_zerocopy(self, file_path: str) -> Iterator[Tuple[int, int]]:
	        """
	        Tokenize a file without loading its entire content into memory.

	        Args:
	            file_path: Path of the file to tokenize.

	        Yields:
	            ``(token_id, file_offset)`` pairs, where ``file_offset`` is the
	            byte offset in the file at which that token starts.

	        Raises:
	            OSError: If the file cannot be opened or mapped.
	            RuntimeError: If a window makes no progress (only possible when
	                the class-level size constants are tuned inconsistently).
	        """
	        with open(file_path, 'rb') as f:
	            # mmap refuses zero-length mappings, and an empty file has no tokens.
	            if os.fstat(f.fileno()).st_size == 0:
	                return

	            with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mmapped:
	                file_size = len(mmapped)
	                offset = 0

	                while offset < file_size:
	                    chunk_end = min(offset + self.CHUNK_SIZE, file_size)
	                    # Read a little past the window so boundary tokens are complete.
	                    view_end = min(chunk_end + self.OVERLAP, file_size)
	                    # Only the final window may be consumed right up to its end.
	                    is_last = (chunk_end == file_size)

	                    # Slicing an mmap copies into a bytes object, so no buffer
	                    # export is held on the mapping while the generator is
	                    # suspended at a yield.
	                    chunk_bytes = mmapped[offset:view_end]

	                    pairs, consumed = self._tokenize_chunk_with_offsets(
	                        memoryview(chunk_bytes), offset, is_last
	                    )
	                    yield from pairs

	                    if consumed <= 0:
	                        # Without this guard the loop would spin forever.
	                        raise RuntimeError(
	                            "tokenizer made no progress at byte offset %d" % offset
	                        )
	                    offset += consumed

	    def _tokenize_chunk_with_boundaries(self,
	                                        chunk_view: memoryview,
	                                        base_offset: int,
	                                        is_last: bool) -> Tuple[List[int], int]:
	        """
	        Tokenize one window, leaving an unconsumed margin unless it is the last.

	        Args:
	            chunk_view: The window's raw bytes.
	            base_offset: File offset of the first byte of ``chunk_view``.
	            is_last: True when the window reaches the end of the file and may
	                therefore be consumed completely.

	        Returns:
	            ``(token_ids, consumed_bytes)`` where ``consumed_bytes`` is the
	            number of bytes of ``chunk_view`` covered by the returned tokens.
	        """
	        pairs, consumed_bytes = self._tokenize_chunk_with_offsets(
	            chunk_view, base_offset, is_last
	        )
	        return [token_id for token_id, _ in pairs], consumed_bytes

	    def _tokenize_chunk_with_offsets(self,
	                                     chunk_view: memoryview,
	                                     base_offset: int,
	                                     is_last: bool) -> Tuple[List[Tuple[int, int]], int]:
	        """
	        Tokenize one window and record the file offset of every token.

	        Returns:
	            ``(pairs, consumed_bytes)`` where ``pairs`` is a list of
	            ``(token_id, absolute_byte_offset)`` and ``consumed_bytes`` is the
	            number of bytes of ``chunk_view`` covered by those tokens.
	        """
	        data = chunk_view.tobytes()
	        try:
	            raw = data.decode('utf-8')
	            text = raw
	        except UnicodeDecodeError:
	            # Invalid bytes, or a multi-byte character cut off by the window
	            # edge. surrogateescape keeps a lossless char -> byte mapping
	            # (re-encoding yields the original bytes), which keeps the byte
	            # accounting below exact.
	            raw = data.decode('utf-8', errors='surrogateescape')
	            # Same length as ``raw``, so positions are interchangeable.
	            text = raw.translate(self._INVALID_BYTE_MAP)

	        text_len = len(text)
	        # Non-final windows stop short of the end: the tail may hold a token
	        # (or character) truncated by the window edge.
	        limit = text_len if is_last else text_len - self.SAFETY_MARGIN

	        pairs = []
	        pos = 0       # position in characters
	        byte_pos = 0  # the same position in bytes

	        while pos < text_len:
	            if not is_last and pos > limit:
	                break

	            token_id, match_len = self.vocab.longest_match(text, pos)
	            if match_len > 0:
	                step = match_len
	            else:
	                token_id = self.vocab.unk_token_id
	                step = 1

	            pairs.append((token_id, base_offset + byte_pos))
	            # Characters and bytes differ in length for non-ASCII text, so
	            # measure the token's width on the lossless decoding.
	            byte_pos += len(raw[pos:pos + step].encode('utf-8', 'surrogateescape'))
	            pos += step

	        return pairs, byte_pos