""" OmniCoreX Custom Tokenizer A super advanced, ultra high-tech tokenizer utility designed for OmniCoreX to handle custom tokenization requirements beyond standard libraries. Features: - Subword tokenization using Byte-Pair Encoding (BPE) - Efficient vocabulary management with encoding and decoding - Support for special tokens and adaptable vocabulary expansion - Fast string-to-token and token-to-string translation - Serialization and deserialization utilities for tokenizer state """ import re import json from collections import defaultdict from typing import List, Dict, Optional class BPETokenizer: def __init__(self, vocab: Optional[Dict[str, int]] = None, merges: Optional[List[List[str]]] = None): """ Initialize the BPE tokenizer. Args: vocab: Dictionary mapping tokens to indices. merges: List of token pair merges in order. """ self.vocab = vocab or {} self.merges = merges or [] # Build merge pairs to rank for quick lookup self.bpe_ranks = {tuple(pair): i for i, pair in enumerate(self.merges)} self.cache = {} self.pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE) self.special_tokens = ["", "", "", ""] for token in self.special_tokens: if token not in self.vocab: self.vocab[token] = len(self.vocab) def get_vocab_size(self) -> int: return len(self.vocab) def tokenize(self, text: str) -> List[str]: """ Tokenize input text to list of subword tokens using BPE. Args: text: Input string. Returns: List of tokens. """ tokens = [] words = self.pattern.findall(text) for word in words: word_tokens = self.bpe(word) tokens.extend(word_tokens) return tokens def bpe(self, token: str) -> List[str]: """ Perform Byte Pair Encoding on a single token. Args: token: Token string. Returns: List of BPE sub-tokens. """ if token in self.cache: return self.cache[token] word = list(token) + [""] pairs = self.get_pairs(word) while True: if not pairs: break # Find lowest rank pair min_pair = None min_rank = float('inf') for pair in pairs: rank = self.bpe_ranks.get(pair, None) if rank is not None and rank < min_rank: min_rank = rank min_pair = pair if min_pair is None: break first, second = min_pair new_word = [] i = 0 while i < len(word): try: j = word.index(first, i) except ValueError: new_word.extend(word[i:]) break new_word.extend(word[i:j]) if j < len(word)-1 and word[j+1] == second: new_word.append(first+second) i = j + 2 else: new_word.append(word[j]) i = j + 1 word = new_word pairs = self.get_pairs(word) if word[-1] == "": word = word[:-1] self.cache[token] = word return word def get_pairs(self, word: List[str]) -> set: """ Return set of symbol pairs in a word. Args: word: List of symbols. Returns: Set of adjacent pairs. """ pairs = set() prev_char = word[0] for char in word[1:]: pairs.add((prev_char, char)) prev_char = char return pairs def encode(self, text: str) -> List[int]: """ Tokenize and convert tokens to indices. Args: text: Input string. Returns: List of token indices. """ tokens = self.tokenize(text) indices = [self.vocab.get(token, self.vocab.get("")) for token in tokens] return indices def decode(self, indices: List[int]) -> str: """ Convert indices back to string. Args: indices: List of token indices. Returns: Decoded string. """ inv_vocab = {v: k for k, v in self.vocab.items()} tokens = [inv_vocab.get(idx, "") for idx in indices] # Remove end of word tokens and join text = "".join([token.replace("", " ") for token in tokens]) return text.strip() def save(self, vocab_path: str, merges_path: str): """ Save vocabulary and merges to files. Args: vocab_path: Path for vocab JSON. 
    def save(self, vocab_path: str, merges_path: str):
        """
        Save the vocabulary and merges to files.

        Args:
            vocab_path: Path for vocab JSON.
            merges_path: Path for merges JSON.
        """
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        with open(merges_path, "w", encoding="utf-8") as f:
            json.dump(self.merges, f, indent=2)

    def load(self, vocab_path: str, merges_path: str):
        """
        Load the vocabulary and merges from files.

        Args:
            vocab_path: Path for vocab JSON.
            merges_path: Path for merges JSON.
        """
        with open(vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        with open(merges_path, "r", encoding="utf-8") as f:
            self.merges = json.load(f)
        # Rebuild the rank table and invalidate the cache for the new merges.
        self.bpe_ranks = {tuple(pair): i for i, pair in enumerate(self.merges)}
        self.cache = {}


if __name__ == "__main__":
    # Simple usage example with a dummy vocab and merge list.
    dummy_vocab = {
        "<pad>": 0,
        "<unk>": 1,
        "a": 2,
        "b": 3,
        "c": 4,
        "ab": 5,
        "bc": 6,
        "abc": 7,
        "</w>": 8,
    }
    dummy_merges = [["a", "b"], ["b", "c"], ["ab", "c"]]

    tokenizer = BPETokenizer(vocab=dummy_vocab, merges=dummy_merges)

    sample_text = "abc cab"
    print(f"Encoding text: {sample_text}")
    encoded = tokenizer.encode(sample_text)
    print(f"Encoded tokens: {encoded}")
    decoded = tokenizer.decode(encoded)
    print(f"Decoded text: '{decoded}'")
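
    # Illustrative round-trip of the serialization utilities listed in the
    # module docstring. The file names are arbitrary (not part of the original
    # example); a temporary directory keeps the demo self-contained.
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        vocab_path = os.path.join(tmp_dir, "vocab.json")
        merges_path = os.path.join(tmp_dir, "merges.json")
        tokenizer.save(vocab_path, merges_path)

        restored = BPETokenizer()
        restored.load(vocab_path, merges_path)
        # A freshly loaded tokenizer should reproduce the same encoding.
        assert restored.encode(sample_text) == encoded
        print("Save/load round-trip OK")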