CRAYON-tokenizer / src /crayon /c_ext /crayon_cpu_fallback.py
Phase-Technologies's picture
Upload folder using huggingface_hub
708f4a3 verified
"""
Pure Python Fallback Implementation for Crayon
==============================================
This module provides a pure Python implementation for use when the compiled
extensions are not available. It is slower than the compiled extension, but
remains functional.
"""
import re
import json
import os
from typing import List, Dict, Any
class PurePythonCPUBackend:
    """Pure Python fallback for CPU tokenization.

    Holds a word -> id mapping (``vocab``) and its inverse (``reverse_vocab``).
    The mapping is read from a JSON payload by :meth:`load_dat`; when the
    payload cannot be interpreted as a JSON object or array, a small built-in
    word list is installed instead so that tokenization still works.
    """

    # Id emitted for out-of-vocabulary words when the loaded vocabulary has no
    # "<UNK>" entry. It matches the position of "<UNK>" in the built-in list.
    _DEFAULT_UNK_ID = 1

    # Compiled once instead of on every tokenize() call.
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    def __init__(self):
        self.vocab: Dict[str, Any] = {}
        self.reverse_vocab: Dict[Any, str] = {}
        self.loaded = False
        self.dat_path = None

    def load_dat(self, buffer: bytes) -> int:
        """Load the vocabulary from *buffer* and return its size.

        Args:
            buffer: The vocabulary payload. ``bytes``/``bytearray``, ``str``,
                ``memoryview`` and objects exposing ``read()`` (for example
                an ``mmap``) are accepted. A JSON object is used directly as
                the word -> id mapping; a JSON array is enumerated so that
                each entry's index becomes its id.

        Returns:
            The number of entries in the vocabulary that was installed.
            If the payload is not a JSON object or array, the built-in
            fallback vocabulary is installed and its size is returned.

        Raises:
            RuntimeError: If reading the payload or installing a vocabulary
                fails unexpectedly.
        """
        try:
            payload = self._read_payload(buffer)
            if self._load_json_vocab(payload):
                return len(self.vocab)
            return self._load_fallback_vocab()
        except Exception as e:
            raise RuntimeError(f"Failed to load vocabulary: {e}") from e

    @staticmethod
    def _read_payload(buffer):
        """Return the raw payload (``str`` or bytes-like) held by *buffer*."""
        if hasattr(buffer, 'read'):
            try:
                return buffer.read()
            except Exception:
                # Best effort: the object may still support the buffer
                # protocol even though read() failed.
                return bytes(buffer)
        if isinstance(buffer, memoryview):
            # json.loads() does not accept memoryview objects directly.
            return buffer.tobytes()
        return buffer

    def _load_json_vocab(self, payload) -> bool:
        """Install the vocabulary described by a JSON *payload*.

        Returns ``True`` when a vocabulary was installed and ``False`` when
        the payload is not usable, in which case the current state is left
        untouched.
        """
        try:
            json_data = json.loads(payload)
            if isinstance(json_data, dict):
                vocab = json_data
                origin = "JSON"
            elif isinstance(json_data, list):
                vocab = {word: i for i, word in enumerate(json_data)}
                origin = "JSON list"
            else:
                # Valid JSON, but not a shape a vocabulary can be built from.
                return False
            # Built before touching self.* so a failure here (e.g. unhashable
            # values) cannot leave vocab and reverse_vocab out of sync.
            reverse_vocab = {v: k for k, v in vocab.items()}
        except json.JSONDecodeError as e:
            print(f"⚠ JSON decode failed: {e}")
            return False
        except Exception as e:
            # Deliberately broad: any other problem with the payload (wrong
            # type, undecodable bytes, unhashable entries) means "not a JSON
            # vocabulary" and the caller falls back to the built-in list.
            print(f"⚠ JSON parsing failed: {e}")
            return False

        self.vocab = vocab
        self.reverse_vocab = reverse_vocab
        self.loaded = True
        print(f"✅ Loaded vocabulary with {len(self.vocab)} tokens from {origin}")
        return True

    def _load_fallback_vocab(self) -> int:
        """Install the built-in word list as the vocabulary; return its size."""
        # This is a fallback - real solution is to fix the compiled extension
        print("🔄 Creating fallback vocabulary (compiled extension recommended)")
        basic_words = [
            "<PAD>", "<UNK>", "<BOS>", "<EOS>", "the", "a", "to", "and", "is", "of", "in", "that", "it", "for",
            "on", "with", "as", "at", "this", "be", "are", "from", "or", "an", "by", "not", "but", "what",
            "all", "was", "were", "have", "has", "had", "do", "does", "did", "will", "would", "could",
            "should", "may", "might", "must", "can", "said", "say", "go", "get", "make", "know", "think",
            "take", "see", "come", "want", "look", "use", "find", "give", "tell", "work", "call", "try", "ask",
            "need", "feel", "become", "leave", "put", "mean", "keep", "let", "seem", "help", "talk", "turn",
            "start", "show", "hear", "play", "run", "move", "live", "believe", "hold", "bring", "happen", "write",
            "provide", "sit", "stand", "lose", "pay", "meet", "include", "continue", "set", "learn", "change", "lead",
            "understand", "watch", "follow", "stop", "create", "speak", "read", "allow", "add", "spend", "grow",
            "open", "walk", "win", "offer", "remember", "love", "consider", "appear", "buy", "wait", "serve",
            "die", "send", "expect", "build", "stay", "fall", "cut", "reach", "kill", "remain", "suggest",
            "raise", "pass", "sell", "require", "report", "decide", "pull", "cover", "stop", "break", "miss",
            "hit", "lie", "move", "touch", "protect", "measure", "mention", "discover", "avoid", "raise", "pass",
            "sell", "decide", "pull", "cover", "stop", "break", "miss", "hit", "lie", "touch", "protect",
            "measure", "mention", "discover", "avoid", "raise", "pass", "sell", "decide", "pull", "cover",
            # Add common words for better coverage
            "time", "person", "year", "way", "day", "man", "thing", "woman", "life", "child",
            "world", "school", "state", "family", "student", "group", "country", "problem", "hand",
            "part", "place", "case", "week", "company", "system", "program", "question", "work",
            "government", "number", "night", "point", "home", "water", "room", "mother", "area",
            "money", "story", "fact", "month", "lot", "right", "study", "book", "eye", "job",
            "word", "business", "issue", "side", "kind", "head", "house", "service", "friend",
            "father", "power", "hour", "game", "line", "end", "member", "law", "car", "city",
            "community", "name", "president", "team", "minute", "idea", "kid", "parent", "face",
            "door", "health", "history", "party", "result", "morning", "reason", "research", "girl",
            "guy", "food", "moment", "air", "teacher", "force", "help", "online", "computer", "information",
            "data", "back", "process", "support", "technology", "software", "market", "price", "product",
            "service", "project", "access", "control", "development", "design", "management", "security",
            "network", "database", "application", "server", "system", "analysis", "method", "approach",
            "strategy", "performance", "quality", "experience", "knowledge", "skill", "ability", "training",
            "education", "background", "career", "opportunity", "position", "department", "team", "role",
            "responsibility", "objective", "goal", "target", "achievement", "success", "failure", "challenge",
            "solution", "improvement", "innovation", "creativity", "communication", "collaboration", "leadership"
        ]
        # The list above repeats some words. Dropping the repeats (keeping
        # first-seen order) gives every word exactly one id, keeps the ids
        # contiguous from 0, and makes reverse_vocab the exact inverse of vocab.
        unique_words = list(dict.fromkeys(basic_words))

        # Replace rather than update, so nothing from an earlier load survives.
        self.vocab = {word: i for i, word in enumerate(unique_words)}
        self.reverse_vocab = dict(enumerate(unique_words))
        self.loaded = True
        print(f"✅ Created fallback vocabulary with {len(self.vocab)} tokens")
        return len(self.vocab)

    def tokenize(self, text: str) -> List[int]:
        """Tokenize *text* into a list of token ids.

        The text is lower-cased and split into runs of word characters;
        punctuation and whitespace are discarded. Words missing from the
        vocabulary map to the id of the ``"<UNK>"`` entry, or to 1 when the
        vocabulary has no such entry.

        Raises:
            RuntimeError: If no vocabulary has been loaded yet.
        """
        if not self.loaded:
            raise RuntimeError("Vocabulary not loaded. Call load_dat() first.")

        # Resolve the unknown-token id from the vocabulary itself: a loaded
        # JSON vocabulary is free to place "<UNK>" at any id.
        unk_id = self.vocab.get("<UNK>", self._DEFAULT_UNK_ID)
        lookup = self.vocab.get
        return [lookup(word, unk_id) for word in self._WORD_PATTERN.findall(text.lower())]

    def get_hardware_info(self) -> str:
        """Return a one-line description of this backend, CPU and Python version."""
        import platform
        import sys

        cpu_info = platform.processor() or "Unknown CPU"
        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
        return f"Pure Python Backend [{cpu_info}] [Python {python_version}]"
# Module-wide backend instance, built lazily on first request.
_pure_python_backend = None


def get_pure_python_backend():
    """Return the shared pure Python backend, constructing it on first use."""
    global _pure_python_backend
    backend = _pure_python_backend
    if backend is None:
        backend = PurePythonCPUBackend()
        _pure_python_backend = backend
    return backend
# Export functions that match the C++ extension interface
def tokenize(text: str) -> List[int]:
    """Convert *text* to token ids by delegating to the shared pure Python backend."""
    return get_pure_python_backend().tokenize(text)
def load_dat(buffer: bytes) -> int:
    """Load a vocabulary into the shared pure Python backend and return its size."""
    return get_pure_python_backend().load_dat(buffer)
def get_hardware_info() -> str:
    """Return the hardware description reported by the shared pure Python backend."""
    return get_pure_python_backend().get_hardware_info()
__all__ = ['tokenize', 'load_dat', 'get_hardware_info']