Xerv-AI
/

CRAYON-tokenizer

Text Generation

hardware-accelerated

double-array-trie

Model card Files Files and versions

CRAYON-tokenizer / src /crayon /unicode /multilingual.py

Phase-Technologies's picture

Phase-Technologies

Upload folder using huggingface_hub

708f4a3 verified 4 days ago

history blame contribute delete

2.7 kB

	import re
	from typing import List, Tuple, Dict, Any

	class MultilingualProcessor:
	"""
	Optimizes processing based on detected scripts.

	Section 5.3: Handles mixed-script content by segmenting text into
	homogeneous blocks for specialized tokenizer handling.
	"""

	def __init__(self):
	# Pre-compiled regex patterns for common scripts
	# Optimized for rapid scanning of large text blocks
	self.script_patterns = {
	'latin': re.compile(r'[a-zA-Z0-9\u00C0-\u024F]+'),
	'cyrillic': re.compile(r'[\u0400-\u04FF]+'),
	'arabic': re.compile(r'[\u0600-\u06FF]+'),
	'cjk': re.compile(r'[\u4E00-\u9FFF]+'),
	'emoji': re.compile(r'[\U0001F600-\U0001F64F]+')
	}
	# Fallback for anything not caught above
	self.generic_pattern = re.compile(r'\S+')

	def process_multilingual_text(self, text: str, tokenizer_func: Any) -> List[int]:
	"""
	Segment text by script and apply optimized tokenization.

	Args:
	text: Raw input text
	tokenizer_func: The core tokenizer callable (usually C-ext function)

	Returns:
	List of token IDs
	"""
	tokens: List[int] = []

	# In a full C-optimized implementation, this segmentation happens
	# inside the C-extension using SIMD classification (Section 6.3).
	# This Python implementation serves as the reference logic for
	# complex mixed-script scenarios.

	# Simple whitespace tokenization as a baseline for segmentation
	# (Real implementation uses the regexes to split)
	# Here we demonstrate the logic flow:

	position = 0
	length = len(text)

	while position < length:
	# 1. Identify script at current position
	# This is a simplified heuristic. Production would use a scanning loop.
	# For strict high-performance, we pass the whole string to C-ext
	# and let it handle UTF-8 boundaries.

	# Direct pass-through to core tokenizer is usually faster than
	# python-level segmentation unless specific rules apply (e.g. Arabic RTL).
	pass

	# Since the C-Extension handles UTF-8 natively now (Section 6),
	# this processor acts mainly as a pre-filter for domain-specific logic
	# or legacy support.

	# Overachieving target: We bypass Python segmentation for speed
	# and rely on the C-layer unless specifically invoked.
	return tokenizer_func(text)

	return tokens