"""
OmniCoreX Custom Tokenizer

A tokenizer utility for OmniCoreX that handles custom tokenization
requirements beyond standard libraries.

Features:
- Subword tokenization using Byte-Pair Encoding (BPE)
- Efficient vocabulary management with encoding and decoding
- Support for special tokens and adaptable vocabulary expansion
- Fast string-to-token and token-to-string translation
- Serialization and deserialization utilities for tokenizer state
"""

import re
import json
from typing import Dict, List, Optional


class BPETokenizer:
    """Byte-Pair Encoding (BPE) tokenizer with special-token support."""

    def __init__(self, vocab: Optional[Dict[str, int]] = None, merges: Optional[List[List[str]]] = None):
        """
        Initialize the BPE tokenizer.

        Args:
            vocab: Dictionary mapping tokens to indices.
            merges: List of token pair merges, in merge-priority order.
        """
        self.vocab = vocab or {}
        self.merges = merges or []

        # Lower rank means the pair is merged earlier.
        self.bpe_ranks = {tuple(pair): i for i, pair in enumerate(self.merges)}
        self.cache = {}

        # Split text into word chunks and individual punctuation marks.
        self.pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
        self.special_tokens = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"]
        for token in self.special_tokens:
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)

    def get_vocab_size(self) -> int:
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize input text into a list of subword tokens using BPE.

        Args:
            text: Input string.

        Returns:
            List of tokens.
        """
        tokens = []
        words = self.pattern.findall(text)
        for word in words:
            word_tokens = self.bpe(word)
            tokens.extend(word_tokens)
        return tokens

    def bpe(self, token: str) -> List[str]:
        """
        Perform Byte-Pair Encoding on a single token.

        Args:
            token: Token string.

        Returns:
            List of BPE sub-tokens, ending with the "</w>" end-of-word
            marker (possibly merged into the final sub-token).
        """
        if token in self.cache:
            return self.cache[token]

        word = list(token) + ["</w>"]
        pairs = self.get_pairs(word)

        while True:
            if not pairs:
                break

            # Pick the adjacent pair with the lowest merge rank.
            min_pair = None
            min_rank = float("inf")
            for pair in pairs:
                rank = self.bpe_ranks.get(pair)
                if rank is not None and rank < min_rank:
                    min_rank = rank
                    min_pair = pair
            if min_pair is None:
                break
            first, second = min_pair

            # Merge every occurrence of (first, second) in the word.
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                if j < len(word) - 1 and word[j + 1] == second:
                    new_word.append(first + second)
                    i = j + 2
                else:
                    new_word.append(word[j])
                    i = j + 1
            word = new_word
            pairs = self.get_pairs(word)

        # Keep the "</w>" marker attached so decode() can restore word
        # boundaries; stripping it here would make round-tripping lossy.
        self.cache[token] = word
        return word

    def get_pairs(self, word: List[str]) -> set:
        """
        Return the set of adjacent symbol pairs in a word.

        Args:
            word: List of symbols.

        Returns:
            Set of adjacent (symbol, symbol) pairs.
        """
        pairs = set()
        prev_char = word[0]
        for char in word[1:]:
            pairs.add((prev_char, char))
            prev_char = char
        return pairs

    def encode(self, text: str) -> List[int]:
        """
        Tokenize text and convert the tokens to vocabulary indices.

        Args:
            text: Input string.

        Returns:
            List of token indices; unknown tokens map to "<UNK>".
        """
        tokens = self.tokenize(text)
        indices = [self.vocab.get(token, self.vocab.get("<UNK>")) for token in tokens]
        return indices

    def decode(self, indices: List[int]) -> str:
        """
        Convert token indices back to a string.

        Args:
            indices: List of token indices.

        Returns:
            Decoded string, with "</w>" end-of-word markers rendered as
            spaces.
        """
        inv_vocab = {v: k for k, v in self.vocab.items()}
        tokens = [inv_vocab.get(idx, "<UNK>") for idx in indices]

        # "</w>" marks a word boundary, so render it as a space.
        text = "".join([token.replace("</w>", " ") for token in tokens])
        return text.strip()

    def save(self, vocab_path: str, merges_path: str):
        """
        Save the vocabulary and merges to JSON files.

        Args:
            vocab_path: Path for the vocab JSON file.
            merges_path: Path for the merges JSON file.
        """
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        with open(merges_path, "w", encoding="utf-8") as f:
            json.dump(self.merges, f, indent=2)

    def load(self, vocab_path: str, merges_path: str):
        """
        Load the vocabulary and merges from JSON files.

        Args:
            vocab_path: Path for the vocab JSON file.
            merges_path: Path for the merges JSON file.
        """
        with open(vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        with open(merges_path, "r", encoding="utf-8") as f:
            self.merges = json.load(f)
        # Rebuild the merge ranks and clear the BPE cache for the new state.
        self.bpe_ranks = {tuple(pair): i for i, pair in enumerate(self.merges)}
        self.cache = {}


if __name__ == "__main__":
    # Small demo vocabulary and merge list.
    dummy_vocab = {
        "<PAD>": 0,
        "<UNK>": 1,
        "a": 2,
        "b": 3,
        "c": 4,
        "ab": 5,
        "bc": 6,
        "abc": 7,
        "</w>": 8
    }
    dummy_merges = [["a", "b"], ["b", "c"], ["ab", "c"]]

    tokenizer = BPETokenizer(vocab=dummy_vocab, merges=dummy_merges)

    sample_text = "abc cab"
    print(f"Encoding text: {sample_text}")
    encoded = tokenizer.encode(sample_text)
    print(f"Encoded tokens: {encoded}")

    decoded = tokenizer.decode(encoded)
    print(f"Decoded text: '{decoded}'")