import os
import re
import unicodedata
import traceback

import tiktoken
from transformers import AutoTokenizer

from mappings import MODEL_MAP, TOKENIZER_INFO
class TokenMonsterTokenizer:
    """Thin wrapper that gives a TokenMonster vocabulary a HF-tokenizer-like interface."""

    def __init__(self, name):
        import tokenmonster  # imported lazily so the dependency stays optional

        self.name = name
        self.vocab = tokenmonster.load(name.split("/")[-1])

    def __call__(self, text, **kwargs):
        ids = list(self.vocab.tokenize(text))
        return {"input_ids": ids}

    def convert_ids_to_tokens(self, ids):
        return [self.vocab.decode(id_) for id_ in ids]
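# Usage sketch (assumes the `tokenmonster` package and the
# "englishcode-32000-consistent-v1" vocabulary are available locally):
#   tok = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
#   enc = tok("Hello world")                       # -> {"input_ids": [...]}
#   tok.convert_ids_to_tokens(enc["input_ids"])    # -> per-token strings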
def get_token_type(token_text):
    # Check special-token markers first so purely punctuational markers
    # like "<|>" are not swallowed by the punctuation branch below.
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    elif re.match(r"^\s+$", token_text):
        return "whitespace"
    elif re.match(r"^[a-zA-Z]+$", token_text):
        return "word"
    elif re.match(r"^\d+$", token_text):
        return "number"
    elif re.match(r"^[^\w\s]+$", token_text):
        return "punctuation"
    else:
        return "mixed"
def is_subword(token_text, model, is_first):
    if not token_text or token_text.isspace():
        return False
    if token_text.startswith("<") and token_text.endswith(">"):
        return False  # special token
    # SentencePiece marks word starts with "▁"; byte-level BPE with "Ġ".
    if model in {
        "llama-2",
        "llama-3",
        "gemma-2",
        "bloom",
        "aya-expanse",
        "comma",
        "qwen3",
        "qwen2.5",
    }:
        return (
            not (token_text.startswith("▁") or token_text.startswith("Ġ"))
            and not is_first
        )
    elif model == "bert":
        # WordPiece marks continuations rather than word starts.
        return token_text.startswith("##")
    elif model in {"gpt-4", "gpt-2", "byt5"}:
        # tiktoken decodes to raw text, so word starts carry a leading space.
        return not token_text.startswith(" ") and not is_first
    else:
        return not is_first
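# Illustrative calls (token strings as HF/tiktoken would surface them):
#   is_subword("▁hello", "llama-2", False) -> False  (word start)
#   is_subword("ing",    "llama-2", False) -> True   (continuation)
#   is_subword("##ing",  "bert",    False) -> True   (WordPiece continuation)
#   is_subword(" world", "gpt-4",   False) -> False  (leading space = word start)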
def tokenize_with_tiktoken(text, model):
    """Tokenize text with tiktoken and return per-token metadata."""
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    token_data = []
    for i, token_id in enumerate(tokens):
        token_text = enc.decode([token_id])
        token_type = get_token_type(token_text)
        subword = is_subword(token_text, model, i == 0)
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": token_type,
                "is_subword": subword,
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
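# Result shape sketch for tokenize_with_tiktoken("Hi there", "gpt-4") —
# typically two tokens with cl100k_base, assuming TOKENIZER_INFO["gpt-4"]
# is defined in mappings as used above:
#   {
#       "model": ..., "token_count": 2,
#       "tokens": [{"text": "Hi", "id": ..., "type": "word",
#                   "is_subword": False, "bytes": 2, "position": 0}, ...],
#       "compression_ratio": 4.0, "encoding": ..., "vocab_size": ...,
#   }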
def tokenize_with_hf(text, model):
    """Tokenize text with a Hugging Face (or TokenMonster) tokenizer."""
    try:
        model_name = MODEL_MAP.get(model, "gpt2")
        # Get token from environment
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            return {
                "model": TOKENIZER_INFO[model]["name"],
                "token_count": 0,
                "tokens": [],
                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
            }
        if "tokenmonster" in model_name:
            tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name, token=hf_token, trust_remote_code=True
            )
        token_data = []
        encoding = tokenizer(
            text,
            return_offsets_mapping=False,
            return_tensors=None,
            add_special_tokens=True,
        )
        token_ids = encoding["input_ids"]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        print(model_name, tokens, token_ids)
        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
            token_type = get_token_type(token_text)
            subword = is_subword(token_text, model, i == 0)
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": token_type,
                    "is_subword": subword,
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(token_ids),
            "tokens": token_data,
            "compression_ratio": len(text) / len(token_ids) if token_ids else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        error_msg = str(e)
        print(f"DEBUG: Error: {error_msg}")
        print(traceback.format_exc())
        # Provide helpful error messages
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": error_msg,
        }
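# Usage sketch (requires HF_TOKEN in the environment and a matching
# MODEL_MAP entry; the model key "bert" here is illustrative):
#   os.environ.setdefault("HF_TOKEN", "hf_...")  # placeholder token
#   result = tokenize_with_hf("Hello world", "bert")
#   result["token_count"], [t["text"] for t in result["tokens"]]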
def normalize_text(text, method):
    """Apply normalization method to text"""
    if method == "none":
        return text
    elif method == "lowercase":
        return text.lower()
    elif method == "nfc":
        return unicodedata.normalize("NFC", text)
    elif method == "nfd":
        return unicodedata.normalize("NFD", text)
    elif method == "nfkc":
        return unicodedata.normalize("NFKC", text)
    elif method == "nfkd":
        return unicodedata.normalize("NFKD", text)
    elif method == "strip_accents":
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )
    elif method == "strip_punctuation":
        return re.sub(r"[^\w\s]", "", text)
    elif method == "whitespace_normalize":
        return " ".join(text.split())
    return text
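# Illustrative effects on "Héllò,  Wörld!":
#   lowercase            -> "héllò,  wörld!"
#   strip_accents        -> "Hello,  World!"
#   strip_punctuation    -> "Héllò  Wörld"
#   whitespace_normalize -> "Héllò, Wörld!"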
def get_normalization_methods():
    """Return available normalization methods"""
    return [
        ("none", "No normalization"),
        ("lowercase", "Lowercase"),
        ("nfc", "Unicode NFC (Canonical)"),
        ("nfd", "Unicode NFD (Decomposed)"),
        ("nfkc", "Unicode NFKC (Compatible)"),
        ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
        ("strip_accents", "Remove Accents"),
        ("strip_punctuation", "Remove Punctuation"),
        ("whitespace_normalize", "Normalize Whitespace"),
    ]
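if __name__ == "__main__":
    # Minimal smoke-test sketch: assumes tiktoken is installed and that
    # mappings defines a "gpt-4" entry as the functions above expect.
    sample = normalize_text("Héllò,  Wörld!", "whitespace_normalize")
    result = tokenize_with_tiktoken(sample, "gpt-4")
    print(result["token_count"], [t["text"] for t in result["tokens"]])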