Xerv-AI
/

CRAYON-tokenizer

Text Generation

hardware-accelerated

double-array-trie

Model card Files Files and versions

CRAYON-tokenizer/src/crayon/core/tokenizer.py

Phase-Technologies's picture

Phase-Technologies

Upload folder using huggingface_hub

708f4a3 verified 4 days ago

history blame contribute delete

1.6 kB

	from typing import List
	from .vocabulary import CrayonVocab

	# Optional native backend: attempt to load the compiled `_core` module and
	# record the outcome in `_C_EXT_AVAILABLE`. When the import fails, `_core`
	# is left unbound, so callers must check the flag before touching it.
	try:
	    from ..c_ext import _core
	except ImportError:
	    _C_EXT_AVAILABLE = False
	else:
	    _C_EXT_AVAILABLE = True

	def crayon_tokenize(text: str, vocab: CrayonVocab) -> List[int]:
	    """
	    Convert ``text`` into a list of token ids using ``vocab``.

	    Dispatch:
	        * If the ``_core`` extension was imported, ``vocab`` reports that
	          its C support is available, and ``vocab._c_trie`` is set, the
	          whole string is handed to ``_core.crayon_tokenize_fast`` and its
	          result is returned as-is.
	        * Otherwise a pure-Python greedy scan is used: at each position
	          ``vocab.longest_match(text, position)`` is asked for a
	          ``(token_id, match_length)`` pair. A positive length emits that
	          token and skips the matched span; anything else emits
	          ``vocab.unk_token_id`` and skips exactly one character.

	    The Python scan advances by at least one character per iteration, so
	    it makes at most ``len(text)`` calls to ``vocab.longest_match``.

	    Args:
	        text: Input string to tokenize.
	        vocab: Vocabulary providing ``longest_match``, ``unk_token_id`` and
	            the C-trie attributes checked above.

	    Returns:
	        The token ids in input order; an empty list for empty ``text`` on
	        the pure-Python path.
	    """
	    # Native path: all three conditions must hold before `_core` is touched,
	    # because `_core` is only bound when the extension import succeeded.
	    native_ready = (
	        _C_EXT_AVAILABLE
	        and vocab._c_ext_available
	        and vocab._c_trie is not None
	    )
	    if native_ready:
	        return _core.crayon_tokenize_fast(text, vocab._c_trie, vocab.unk_token_id)

	    # Pure-Python fallback. Bound methods and constants are cached in locals
	    # once so the loop body performs no attribute lookups.
	    ids: List[int] = []
	    end: int = len(text)
	    find_longest = vocab.longest_match
	    emit = ids.append
	    fallback_id = vocab.unk_token_id
	    cursor: int = 0

	    while cursor < end:
	        token_id, consumed = find_longest(text, cursor)

	        if consumed > 0:
	            # A vocabulary entry matched: emit it and jump past the match.
	            emit(token_id)
	            cursor += consumed
	            continue

	        # Nothing in the vocabulary starts here: emit the unknown-token id
	        # and step over a single character so the scan always progresses.
	        emit(fallback_id)
	        cursor += 1

	    return ids