import os
import re
import unicodedata
import traceback

import tiktoken
from transformers import AutoTokenizer

from mappings import MODEL_MAP, TOKENIZER_INFO
class TokenMonsterTokenizer:
    """Thin wrapper that gives a TokenMonster vocabulary a HF-tokenizer-like interface."""

    def __init__(self, name):
        import tokenmonster  # imported lazily so the dependency stays optional

        self.name = name
        self.vocab = tokenmonster.load(name.split("/")[-1])

    def __call__(self, text, **kwargs):
        ids = list(self.vocab.tokenize(text))
        return {"input_ids": ids}

    def convert_ids_to_tokens(self, ids):
        return [self.vocab.decode(id_) for id_ in ids]
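# Usage sketch (assumes the `tokenmonster` package and the
# "englishcode-32000-consistent-v1" vocabulary are available locally):
#   tok = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
#   enc = tok("Hello world")                       # -> {"input_ids": [...]}
#   tok.convert_ids_to_tokens(enc["input_ids"])    # -> per-token strings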
def get_token_type(token_text):
    # Check special-token markers first so purely punctuational markers
    # like "<|>" are not swallowed by the punctuation branch below.
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    elif re.match(r"^\s+$", token_text):
        return "whitespace"
    elif re.match(r"^[a-zA-Z]+$", token_text):
        return "word"
    elif re.match(r"^\d+$", token_text):
        return "number"
    elif re.match(r"^[^\w\s]+$", token_text):
        return "punctuation"
    else:
        return "mixed"
def is_subword(token_text, model, is_first):
    if not token_text or token_text.isspace():
        return False
    if token_text.startswith("<") and token_text.endswith(">"):
        return False  # special token
    # SentencePiece marks word starts with "▁"; byte-level BPE with "Ġ".
    if model in {
        "llama-2",
        "llama-3",
        "gemma-2",
        "bloom",
        "aya-expanse",
        "comma",
        "qwen3",
        "qwen2.5",
    }:
        return (
            not (token_text.startswith("▁") or token_text.startswith("Ġ"))
            and not is_first
        )
    elif model == "bert":
        # WordPiece marks continuations rather than word starts.
        return token_text.startswith("##")
    elif model in {"gpt-4", "gpt-2", "byt5"}:
        # tiktoken decodes to raw text, so word starts carry a leading space.
        return not token_text.startswith(" ") and not is_first
    else:
        return not is_first
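# Illustrative calls (token strings as HF/tiktoken would surface them):
#   is_subword("▁hello", "llama-2", False) -> False  (word start)
#   is_subword("ing",    "llama-2", False) -> True   (continuation)
#   is_subword("##ing",  "bert",    False) -> True   (WordPiece continuation)
#   is_subword(" world", "gpt-4",   False) -> False  (leading space = word start)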
def tokenize_with_tiktoken(text, model):
    """Tokenize text with tiktoken and return per-token metadata."""
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    token_data = []
    for i, token_id in enumerate(tokens):
        token_text = enc.decode([token_id])
        token_type = get_token_type(token_text)
        subword = is_subword(token_text, model, i == 0)
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": token_type,
                "is_subword": subword,
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
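# Result shape sketch for tokenize_with_tiktoken("Hi there", "gpt-4") —
# typically two tokens with cl100k_base, assuming TOKENIZER_INFO["gpt-4"]
# is defined in mappings as used above:
#   {
#       "model": ..., "token_count": 2,
#       "tokens": [{"text": "Hi", "id": ..., "type": "word",
#                   "is_subword": False, "bytes": 2, "position": 0}, ...],
#       "compression_ratio": 4.0, "encoding": ..., "vocab_size": ...,
#   }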
def tokenize_with_hf(text, model):
    """Tokenize text with a Hugging Face (or TokenMonster) tokenizer."""
    try:
        model_name = MODEL_MAP.get(model, "gpt2")
        # Get token from environment
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            return {
                "model": TOKENIZER_INFO[model]["name"],
                "token_count": 0,
                "tokens": [],
                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
            }
        if "tokenmonster" in model_name:
            tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name, token=hf_token, trust_remote_code=True
            )
        token_data = []
        encoding = tokenizer(
            text,
            return_offsets_mapping=False,
            return_tensors=None,
            add_special_tokens=True,
        )
        token_ids = encoding["input_ids"]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        print(model_name, tokens, token_ids)
        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
            token_type = get_token_type(token_text)
            subword = is_subword(token_text, model, i == 0)
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": token_type,
                    "is_subword": subword,
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(token_ids),
            "tokens": token_data,
            "compression_ratio": len(text) / len(token_ids) if token_ids else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        error_msg = str(e)
        print(f"DEBUG: Error: {error_msg}")
        print(traceback.format_exc())
        # Provide helpful error messages
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": error_msg,
        }
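# Usage sketch (requires HF_TOKEN in the environment and a matching
# MODEL_MAP entry; the model key "bert" here is illustrative):
#   os.environ.setdefault("HF_TOKEN", "hf_...")  # placeholder token
#   result = tokenize_with_hf("Hello world", "bert")
#   result["token_count"], [t["text"] for t in result["tokens"]]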
def normalize_text(text, method):
    """Apply normalization method to text"""
    if method == "none":
        return text
    elif method == "lowercase":
        return text.lower()
    elif method == "nfc":
        return unicodedata.normalize("NFC", text)
    elif method == "nfd":
        return unicodedata.normalize("NFD", text)
    elif method == "nfkc":
        return unicodedata.normalize("NFKC", text)
    elif method == "nfkd":
        return unicodedata.normalize("NFKD", text)
    elif method == "strip_accents":
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
    elif method == "strip_punctuation":
        return re.sub(r"[^\w\s]", "", text)
    elif method == "whitespace_normalize":
        return " ".join(text.split())
    return text
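# Illustrative effects on "Héllò,  Wörld!":
#   lowercase            -> "héllò,  wörld!"
#   strip_accents        -> "Hello,  World!"
#   strip_punctuation    -> "Héllò  Wörld"
#   whitespace_normalize -> "Héllò, Wörld!"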
def get_normalization_methods():
    """Return available normalization methods"""
    return [
        ("none", "No normalization"),
        ("lowercase", "Lowercase"),
        ("nfc", "Unicode NFC (Canonical)"),
        ("nfd", "Unicode NFD (Decomposed)"),
        ("nfkc", "Unicode NFKC (Compatible)"),
        ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
        ("strip_accents", "Remove Accents"),
        ("strip_punctuation", "Remove Punctuation"),
        ("whitespace_normalize", "Normalize Whitespace"),
    ]
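if __name__ == "__main__":
    # Minimal smoke-test sketch: assumes tiktoken is installed and that
    # mappings defines a "gpt-4" entry as the functions above expect.
    sample = normalize_text("Héllò,  Wörld!", "whitespace_normalize")
    result = tokenize_with_tiktoken(sample, "gpt-4")
    print(result["token_count"], [t["text"] for t in result["tokens"]])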