| """ |
| Pure Python Fallback Implementation for Crayon |
| ========================================== |
| |
| This provides a pure Python implementation for use when the compiled extensions are not available. |
| It is slower than the compiled extensions but remains functional. |
| """ |
|
|
| import re |
| import json |
| import os |
| from typing import List, Dict, Any |
|
|
class PurePythonCPUBackend:
    """Pure Python fallback for CPU tokenization.

    The backend keeps a ``word -> id`` mapping (``vocab``) and its inverse
    (``reverse_vocab``). A vocabulary is installed by :meth:`load_dat`;
    :meth:`tokenize` then maps lower-cased words to IDs.
    """

    # ID emitted by tokenize() for words that are not in the vocabulary.
    # It equals the index of "<UNK>" in the built-in fallback word list.
    _UNK_ID = 1

    # Compiled once; tokenize() extracts runs of word characters with it.
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    # Built-in word list used when the input cannot be parsed as JSON.
    # The list contains repeated words on purpose of compatibility: IDs are
    # assigned by list index and a repeated word keeps the index of its
    # LAST occurrence, so the list must not be reordered or de-duplicated
    # without changing the token IDs this backend emits.
    _FALLBACK_WORDS = (
        "<PAD>", "<UNK>", "<BOS>", "<EOS>", "the", "a", "to", "and", "is", "of", "in", "that", "it", "for",
        "on", "with", "as", "at", "this", "be", "are", "from", "or", "an", "by", "not", "but", "what",
        "all", "was", "were", "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "can", "said", "say", "go", "get", "make", "know", "think",
        "take", "see", "come", "want", "look", "use", "find", "give", "tell", "work", "call", "try", "ask",
        "need", "feel", "become", "leave", "put", "mean", "keep", "let", "seem", "help", "talk", "turn",
        "start", "show", "hear", "play", "run", "move", "live", "believe", "hold", "bring", "happen", "write",
        "provide", "sit", "stand", "lose", "pay", "meet", "include", "continue", "set", "learn", "change", "lead",
        "understand", "watch", "follow", "stop", "create", "speak", "read", "allow", "add", "spend", "grow",
        "open", "walk", "win", "offer", "remember", "love", "consider", "appear", "buy", "wait", "serve",
        "die", "send", "expect", "build", "stay", "fall", "cut", "reach", "kill", "remain", "suggest",
        "raise", "pass", "sell", "require", "report", "decide", "pull", "cover", "stop", "break", "miss",
        "hit", "lie", "move", "touch", "protect", "measure", "mention", "discover", "avoid", "raise", "pass",
        "sell", "decide", "pull", "cover", "stop", "break", "miss", "hit", "lie", "touch", "protect",
        "measure", "mention", "discover", "avoid", "raise", "pass", "sell", "decide", "pull", "cover",

        "time", "person", "year", "way", "day", "man", "thing", "woman", "life", "child",
        "world", "school", "state", "family", "student", "group", "country", "problem", "hand",
        "part", "place", "case", "week", "company", "system", "program", "question", "work",
        "government", "number", "night", "point", "home", "water", "room", "mother", "area",
        "money", "story", "fact", "month", "lot", "right", "study", "book", "eye", "job",
        "word", "business", "issue", "side", "kind", "head", "house", "service", "friend",
        "father", "power", "hour", "game", "line", "end", "member", "law", "car", "city",
        "community", "name", "president", "team", "minute", "idea", "kid", "parent", "face",
        "door", "health", "history", "party", "result", "morning", "reason", "research", "girl",
        "guy", "food", "moment", "air", "teacher", "force", "help", "online", "computer", "information",
        "data", "back", "process", "support", "technology", "software", "market", "price", "product",
        "service", "project", "access", "control", "development", "design", "management", "security",
        "network", "database", "application", "server", "system", "analysis", "method", "approach",
        "strategy", "performance", "quality", "experience", "knowledge", "skill", "ability", "training",
        "education", "background", "career", "opportunity", "position", "department", "team", "role",
        "responsibility", "objective", "goal", "target", "achievement", "success", "failure", "challenge",
        "solution", "improvement", "innovation", "creativity", "communication", "collaboration", "leadership",
    )

    def __init__(self):
        self.vocab = {}
        self.reverse_vocab = {}
        self.loaded = False
        self.dat_path = None

    @staticmethod
    def _parse_json_vocab(payload):
        """Try to interpret *payload* as a JSON vocabulary.

        Returns ``(vocab, reverse_vocab, label)`` when the payload is a JSON
        object (used as-is as ``word -> id``) or a JSON array (IDs are the
        list indices). Returns ``None`` when the payload is not usable; a
        warning is printed if parsing raised.
        """
        try:
            if not isinstance(payload, (str, bytes, bytearray)):
                # Accept other buffer-protocol objects. memoryview() rejects
                # non-buffers such as None or int, which bytes() would not.
                payload = bytes(memoryview(payload))
            data = json.loads(payload)
            if isinstance(data, dict):
                vocab, label = data, "JSON"
            elif isinstance(data, list):
                vocab = {word: i for i, word in enumerate(data)}
                label = "JSON list"
            else:
                return None
            # Build the inverse here so that unhashable IDs are reported as
            # a parse failure before any backend state is touched.
            reverse = {token_id: word for word, token_id in vocab.items()}
        except json.JSONDecodeError as e:
            print(f"⚠ JSON decode failed: {e}")
            return None
        except Exception as e:
            print(f"⚠ JSON parsing failed: {e}")
            return None
        return vocab, reverse, label

    def _install(self, vocab, reverse):
        """Replace the current vocabulary with a fully built pair of maps."""
        self.vocab = vocab
        self.reverse_vocab = reverse
        self.loaded = True

    def load_dat(self, buffer: bytes) -> int:
        """Load vocabulary from DAT buffer.

        Args:
            buffer: JSON text as ``bytes``, ``bytearray`` or ``str``, another
                buffer-protocol object, or an object with a ``read()``
                method returning one of those.

        Returns:
            The number of entries in the installed vocabulary.

        Raises:
            RuntimeError: If an unexpected error occurs while loading.

        If the input cannot be parsed as a JSON object or array, the
        built-in fallback word list is installed instead. Any previously
        loaded vocabulary is replaced, never merged.
        """
        try:
            if hasattr(buffer, 'read'):
                try:
                    payload = buffer.read()
                except Exception:
                    # Last resort for objects whose read() fails but that
                    # still expose their content through the buffer protocol.
                    payload = bytes(buffer)
            else:
                payload = buffer

            parsed = self._parse_json_vocab(payload)
            if parsed is not None:
                vocab, reverse, label = parsed
                self._install(vocab, reverse)
                print(f"✅ Loaded vocabulary with {len(self.vocab)} tokens from {label}")
                return len(self.vocab)

            print("🔄 Creating fallback vocabulary (compiled extension recommended)")
            vocab = {word: i for i, word in enumerate(self._FALLBACK_WORDS)}
            # Invert the final mapping so that reverse_vocab only contains
            # IDs that vocab can actually produce.
            reverse = {token_id: word for word, token_id in vocab.items()}
            self._install(vocab, reverse)
            print(f"✅ Created fallback vocabulary with {len(self.vocab)} tokens")
            return len(self.vocab)
        except Exception as e:
            raise RuntimeError(f"Failed to load vocabulary: {e}") from e

    def tokenize(self, text: str) -> List[int]:
        """Tokenize text into token IDs.

        The text is lower-cased and split into runs of word characters;
        each word is looked up in the vocabulary and unknown words map to
        ID 1.

        Raises:
            RuntimeError: If no vocabulary has been loaded yet.
        """
        if not self.loaded:
            raise RuntimeError("Vocabulary not loaded. Call load_dat() first.")

        lookup = self.vocab.get
        unknown = self._UNK_ID
        return [lookup(word, unknown) for word in self._WORD_PATTERN.findall(text.lower())]

    def get_hardware_info(self) -> str:
        """Get hardware information.

        Returns a one-line description containing the processor name
        reported by ``platform.processor()`` (or "Unknown CPU" when empty)
        and the running Python major.minor version.
        """
        import platform
        import sys

        cpu_info = platform.processor() or "Unknown CPU"
        python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

        return f"Pure Python Backend [{cpu_info}] [Python {python_version}]"
|
|
| |
# Module-wide backend instance; stays None until it is first requested.
_pure_python_backend = None


def get_pure_python_backend():
    """Return the shared pure Python backend, creating it on first use."""
    global _pure_python_backend
    if _pure_python_backend is not None:
        return _pure_python_backend
    _pure_python_backend = PurePythonCPUBackend()
    return _pure_python_backend
|
|
| |
def tokenize(text: str) -> List[int]:
    """Convert *text* to token IDs via the shared pure Python backend."""
    return get_pure_python_backend().tokenize(text)
|
|
def load_dat(buffer: bytes) -> int:
    """Load a vocabulary into the shared pure Python backend.

    Returns whatever the backend's ``load_dat`` returns for *buffer*.
    """
    return get_pure_python_backend().load_dat(buffer)
|
|
def get_hardware_info() -> str:
    """Return the hardware description reported by the shared backend."""
    return get_pure_python_backend().get_hardware_info()
|
|
# Public names exported by a star-import of this module.
__all__ = [
    'tokenize',
    'load_dat',
    'get_hardware_info',
]
|
|