CRAYON-tokenizer / src /crayon /unicode /multilingual.py
Phase-Technologies's picture
Upload folder using huggingface_hub
708f4a3 verified
import re
from typing import List, Tuple, Dict, Any
class MultilingualProcessor:
    """
    Optimizes processing based on detected scripts.

    Section 5.3: Handles mixed-script content by segmenting text into
    homogeneous blocks for specialized tokenizer handling.

    The script patterns compiled in ``__init__`` are exposed as attributes
    for script-specific pre-filtering. ``process_multilingual_text`` does
    not consult them: it hands the whole string to the supplied tokenizer.
    """

    def __init__(self):
        # Pre-compiled regex patterns for common scripts, keyed by script
        # name. Each pattern matches one or more consecutive characters of
        # that script.
        self.script_patterns = {
            # ASCII letters/digits plus U+00C0-U+024F (accented Latin).
            'latin': re.compile(r'[a-zA-Z0-9\u00C0-\u024F]+'),
            # U+0400-U+04FF.
            'cyrillic': re.compile(r'[\u0400-\u04FF]+'),
            # U+0600-U+06FF.
            'arabic': re.compile(r'[\u0600-\u06FF]+'),
            # U+4E00-U+9FFF.
            'cjk': re.compile(r'[\u4E00-\u9FFF]+'),
            # U+1F600-U+1F64F only; emoji outside this range do not match.
            'emoji': re.compile(r'[\U0001F600-\U0001F64F]+')
        }
        # Fallback for anything not caught above: any run of non-whitespace.
        self.generic_pattern = re.compile(r'\S+')

    def process_multilingual_text(self, text: str, tokenizer_func: Any) -> List[int]:
        """
        Tokenize ``text`` by delegating to the core tokenizer.

        No Python-level script segmentation is performed here. Per the
        original design notes, the core tokenizer is expected to handle
        UTF-8 and mixed scripts itself, so the whole string is forwarded
        in a single call.

        Args:
            text: Raw input text
            tokenizer_func: The core tokenizer callable (usually C-ext
                function); called once as ``tokenizer_func(text)``.

        Returns:
            List of token IDs: an empty list when ``text`` is empty
            (``tokenizer_func`` is not called in that case), otherwise
            whatever ``tokenizer_func(text)`` returns.
        """
        # Empty input yields no tokens. This guard replaces a
        # `while position < length` loop that never advanced `position`.
        if not text:
            return []

        # Direct pass-through: the full, unsegmented string goes to the
        # core tokenizer in one call.
        return tokenizer_func(text)