Upload folder using huggingface_hub
- char_tokenizer.py +21 -1
- special_tokens_map.json +7 -0
- tokenizer_config.json +21 -8
- vocab.json +22 -16
char_tokenizer.py CHANGED

@@ -42,6 +42,11 @@ class CharTokenizer(PreTrainedTokenizer):
         unk_token = kwargs.pop("unk_token", "<unk>")
         bos_token = kwargs.pop("bos_token", "<s>")
         eos_token = kwargs.pop("eos_token", "</s>")
+        user_token = kwargs.pop("user_token", "<|user|>")
+        assistant_token = kwargs.pop("assistant_token", "<|assistant|>")
+        system_token = kwargs.pop("system_token", "<|system|>")
+        eot_token = kwargs.pop("eot_token", "<|end|>")
+        mask_token = kwargs.pop("mask_token", "<|mdm_mask|>")

         # Initialize vocab dictionaries first
         self.char_to_id = {}
@@ -60,7 +65,17 @@ class CharTokenizer(PreTrainedTokenizer):
             }
         elif characters is not None:
            # Build vocabulary from characters
-            special_tokens = […
+            special_tokens = [
+                pad_token,
+                unk_token,
+                bos_token,
+                eos_token,
+                user_token,
+                assistant_token,
+                system_token,
+                eot_token,
+                mask_token,
+            ]
             unique_chars = []
             for char in characters:
                 if char not in unique_chars and char not in special_tokens:
@@ -74,6 +89,11 @@ class CharTokenizer(PreTrainedTokenizer):
                 unk_token=unk_token,
                 bos_token=bos_token,
                 eos_token=eos_token,
+                user_token=user_token,
+                assistant_token=assistant_token,
+                system_token=system_token,
+                eot_token=eot_token,
+                mask_token=mask_token,
                 model_max_length=model_max_length,
                 padding_side=padding_side,
                 **kwargs,
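Taken together, the Python changes register five new control tokens (<|user|>, <|assistant|>, <|system|>, <|end|>, <|mdm_mask|>) alongside the existing specials. A minimal loading sketch for the updated tokenizer; the repo id is a placeholder, and trust_remote_code=True is needed because the class is resolved through the auto_map entry in tokenizer_config.json:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the Hub repository this commit belongs to.
tok = AutoTokenizer.from_pretrained("your-org/char-tokenizer", trust_remote_code=True)

# Each new control token should resolve to a single id (4-8 per vocab.json below).
for t in ["<|user|>", "<|assistant|>", "<|system|>", "<|end|>", "<|mdm_mask|>"]:
    print(t, tok.convert_tokens_to_ids(t))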
special_tokens_map.json CHANGED

@@ -27,6 +27,13 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false
+    },
+    {
+      "content": "<|mdm_mask|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
     }
   ],
   "bos_token": "<|startoftext|>",
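Since <|mdm_mask|> must now appear in both special_tokens_map.json and vocab.json, a quick local consistency check is worth running; this sketch assumes both files sit in the working directory and that the array shown above is additional_special_tokens:

import json

with open("special_tokens_map.json") as f:
    token_map = json.load(f)
with open("vocab.json") as f:
    vocab = json.load(f)

# Entries here are AddedToken-style objects with a "content" field.
for entry in token_map.get("additional_special_tokens", []):
    content = entry["content"] if isinstance(entry, dict) else entry
    assert content in vocab, f"{content} missing from vocab.json"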
tokenizer_config.json CHANGED

@@ -16,55 +16,68 @@
       "single_word": false,
       "special": true
     },
-    "…
-    "content": "<|…
+    "4": {
+      "content": "<|user|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…
-    "content": "<|…
+    "5": {
+      "content": "<|assistant|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…
-    "content": "<|…
+    "6": {
+      "content": "<|system|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "…
+    "7": {
       "content": "<|end|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "8": {
+      "content": "<|mdm_mask|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
     }
   },
   "additional_special_tokens": [
     "<|system|>",
     "<|user|>",
     "<|assistant|>",
-    "<|end|>"
+    "<|end|>",
+    "<|mdm_mask|>"
   ],
+  "assistant_token": "<|assistant|>",
   "bos_token": "<|startoftext|>",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
+  "eot_token": "<|end|>",
   "extra_special_tokens": {},
   "mask_token": "<|mdm_mask|>",
   "model_max_length": 4096,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
+  "system_token": "<|system|>",
   "tokenizer_class": "CharTokenizer",
   "unk_token": null,
+  "user_token": "<|user|>",
   "auto_map": {
     "AutoTokenizer": [
       "char_tokenizer.CharTokenizer",
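With user_token, assistant_token, system_token, and eot_token now declared in the config, a chat-style prompt can be assembled from them. The turn layout below is an assumption (the commit defines no chat template), but the token strings come straight from the config, and every character in the arithmetic part ("2+3=?") is present in vocab.json:

# `tok` is the tokenizer from the loading sketch above. Because these control
# tokens are listed in additional_special_tokens, they should each encode as
# a single atomic id rather than being split character by character.
prompt = "<|user|>2+3=?<|end|><|assistant|>"
print(tok(prompt)["input_ids"])  # expected to begin with 4, the id of <|user|>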
vocab.json CHANGED

@@ -2,20 +2,26 @@
   "<|endoftext|>": 3,
   "null": 1,
   "<|startoftext|>": 2,
-  … (16 old entries on lines 5-20; contents truncated in the page capture)
+  "<|user|>": 4,
+  "<|assistant|>": 5,
+  "<|system|>": 6,
+  "<|end|>": 7,
+  "<|mdm_mask|>": 8,
+  "\n": 9,
+  "*": 10,
+  "+": 11,
+  "-": 12,
+  "/": 13,
+  "0": 14,
+  "1": 15,
+  "2": 16,
+  "3": 17,
+  "4": 18,
+  "5": 19,
+  "6": 20,
+  "7": 21,
+  "8": 22,
+  "9": 23,
+  "=": 24,
+  "?": 25
 }
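The resulting vocabulary is tiny: the control tokens, plus an arithmetic alphabet (digits, + - * / = ?, and newline) at ids 9-25. That makes the character-level mapping easy to verify by hand with a round trip through vocab.json, assuming the file is local:

import json

with open("vocab.json") as f:
    vocab = json.load(f)
id_to_char = {i: c for c, i in vocab.items()}

text = "12+34=?"
ids = [vocab[c] for c in text]
print(ids)                                  # [15, 16, 11, 17, 18, 24, 25]
print("".join(id_to_char[i] for i in ids))  # 12+34=?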