Spaces:

gsaltintas
/

tokenizer-comparison

Sleeping

App Files Files Community

Gül Sena Altıntaş committed on Jul 29

Commit

f58b113

1 Parent(s): 37a99cb

Fixed tokenmonster issue

Browse files

Files changed (4) hide show

app.py +1 -1
mappings.py +1 -0
requirements.txt +2 -1
utils.py +22 -4

app.py CHANGED Viewed

@@ -716,7 +716,7 @@ with gr.Blocks(
     - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
     - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
-    - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
     - **BLOOM**: BigScience's multilingual model with BPE

     - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
     - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
+    - **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
     - **BLOOM**: BigScience's multilingual model with BPE

mappings.py CHANGED Viewed

@@ -14,6 +14,7 @@ MODEL_MAP = {
     "byt5": "google/byt5-small",
 }
 TOKENIZER_INFO = {
     "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
     "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},

     "byt5": "google/byt5-small",
 }
 TOKENIZER_INFO = {
     "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
     "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ tiktoken
 transformers
 torch
 pandas
-plotly

 transformers
 torch
 pandas
+plotly
+tokenmonster

utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import re
 import unicodedata
 import tiktoken
 from transformers import AutoTokenizer
@@ -8,6 +9,20 @@ from transformers import AutoTokenizer
 from mappings import MODEL_MAP, TOKENIZER_INFO
 def get_token_type(token_text):
     if re.match(r"^\s+$", token_text):
         return "whitespace"
@@ -93,7 +108,6 @@ def tokenize_with_tiktoken(text, model):
 def tokenize_with_hf(text, model):
     try:
         model_name = MODEL_MAP.get(model, "gpt2")
         # Get token from environment
         hf_token = os.getenv("HF_TOKEN")
         if not hf_token:
@@ -103,9 +117,11 @@ def tokenize_with_hf(text, model):
                 "tokens": [],
                 "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
             }
-        print(f"DEBUG: Loading model {model_name} with token")
-        tokenizer = AutoTokenizer.from_pretrained(
             model_name, token=hf_token, trust_remote_code=True
         )
         token_data = []
@@ -117,6 +133,7 @@ def tokenize_with_hf(text, model):
         )
         token_ids = encoding["input_ids"]
         tokens = tokenizer.convert_ids_to_tokens(token_ids)
         # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
         for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
@@ -145,6 +162,7 @@ def tokenize_with_hf(text, model):
     except Exception as e:
         error_msg = str(e)
         print(f"DEBUG: Error: {error_msg}")
         # Provide helpful error messages
         if "gated repo" in error_msg.lower():

 import os
 import re
 import unicodedata
+import traceback
 import tiktoken
 from transformers import AutoTokenizer
 from mappings import MODEL_MAP, TOKENIZER_INFO
+class TokenMonsterTokenizer:  # Wraps a tokenmonster vocabulary behind the two calls made on a tokenizer here: tokenizer(text) and convert_ids_to_tokens(ids).
+    def __init__(self, name):  # name: vocabulary identifier; may carry a "/"-separated prefix.
+        import tokenmonster  # Imported locally, so the package is only needed when this class is instantiated.
+        self.name = name
+        self.vocab = tokenmonster.load(name.split("/")[-1])  # Only the segment after the last "/" is passed to tokenmonster.load.
+    def __call__(self, text, **kwargs):  # Extra keyword arguments are accepted but ignored.
+        ids = list(self.vocab.tokenize(text))
+        return {"input_ids": ids}  # Dict with an "input_ids" list, the key the caller reads from the encoding.
+    def convert_ids_to_tokens(self, ids):  # Returns one decoded value per id, in the same order as ids.
+        return [self.vocab.decode(id_) for id_ in ids]  # NOTE(review): each id is decoded on its own; confirm this is safe for characters that span several tokens.
 def get_token_type(token_text):
     if re.match(r"^\s+$", token_text):
         return "whitespace"
 def tokenize_with_hf(text, model):
     try:
         model_name = MODEL_MAP.get(model, "gpt2")
         # Get token from environment
         hf_token = os.getenv("HF_TOKEN")
         if not hf_token:
                 "tokens": [],
                 "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
             }
+        if "tokenmonster" in model_name:
+            tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(
             model_name, token=hf_token, trust_remote_code=True
         )
         token_data = []
         )
         token_ids = encoding["input_ids"]
         tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        print(model_name, tokens, token_ids)
         # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
         for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
     except Exception as e:
         error_msg = str(e)
         print(f"DEBUG: Error: {error_msg}")
+        print(traceback.format_exc())
         # Provide helpful error messages
         if "gated repo" in error_msg.lower():