Zeb committed
Commit · 6b0beee
Parent(s): 2e4b4eb

Fix tokenizers

Browse files:
- .gitattributes +1 -0
- frequency_256000/merges.txt +0 -0
- frequency_256000/merges_data.csv +0 -0
- frequency_256000/tokenizer.json +2 -2
- frequency_256000/vocab.json +0 -0
- fw57M_Entropy_thresholdBX_8064/stats.csv → frequencymulti_256000/merges.txt +0 -0
- fw57M_Entropy_thresholdBX_32000/stats.csv → frequencymulti_256000/merges_data.csv +0 -0
- frequencymulti_256000/special_tokens_map.json +4 -0
- frequencymulti_256000/tokenizer.json +3 -0
- {fw57M_Entropy_thresholdBX_32000 → frequencymulti_256000}/tokenizer_config.json +2 -10
- fw57M_Entropy_thresholdBX_16000/stats.csv → frequencymulti_256000/vocab.json +0 -0
- fw57M_Entropy_thresholdBX_16000/special_tokens_map.json +0 -6
- fw57M_Entropy_thresholdBX_16000/tokenizer.json +0 -0
- fw57M_Entropy_thresholdBX_16000/tokenizer_config.json +0 -37
- fw57M_Entropy_thresholdBX_16000/vocab.json +0 -0
- fw57M_Entropy_thresholdBX_32000/special_tokens_map.json +0 -6
- fw57M_Entropy_thresholdBX_32000/tokenizer.json +0 -0
- fw57M_Entropy_thresholdBX_32000/vocab.json +0 -0
- fw57M_Entropy_thresholdBX_8064/special_tokens_map.json +0 -6
- fw57M_Entropy_thresholdBX_8064/tokenizer.json +0 -0
- fw57M_Entropy_thresholdBX_8064/tokenizer_config.json +0 -37
- fw57M_Entropy_thresholdBX_8064/vocab.json +0 -0

.gitattributes
CHANGED

@@ -54,3 +54,4 @@ mutual-information_256000/special_tokens_map.json filter=lfs diff=lfs merge=lfs
 mutual-information_256000/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
 mutual-information_256000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 fw57M_Entropy_threshold_128000/stats.csv filter=lfs diff=lfs merge=lfs -text
+frequencymulti_256000/tokenizer.json filter=lfs diff=lfs merge=lfs -text

frequency_256000/merges.txt
CHANGED
The diff for this file is too large to render. See raw diff.

frequency_256000/merges_data.csv
CHANGED
The diff for this file is too large to render. See raw diff.

frequency_256000/tokenizer.json
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:47a19d76ccd8400f78de8cb1bb0cc81e948596a5a9337c90d739c81a7c59ce8b
+size 19623518

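Note that the two added lines above form a Git LFS pointer, not the tokenizer itself: a fixed three-line record of spec version, sha256 oid, and byte size that LFS swaps for the real content on checkout. Below is a minimal sketch for reading such a pointer; the path is just this repo's layout, and nothing in the snippet is part of the commit.

import re
from pathlib import Path

# Matches the three-line Git LFS pointer layout shown in the diff above:
# a version line, "oid sha256:<64 hex chars>", and "size <bytes>".
POINTER_RE = re.compile(
    r"version https://git-lfs\.github\.com/spec/v1\n"
    r"oid sha256:(?P<oid>[0-9a-f]{64})\n"
    r"size (?P<size>\d+)\s*"
)

def parse_lfs_pointer(path: str) -> dict:
    # Raises if the file is already the smudged (real) content
    # rather than a pointer.
    text = Path(path).read_text()
    match = POINTER_RE.fullmatch(text)
    if match is None:
        raise ValueError(f"{path} does not look like a Git LFS pointer")
    return {"oid": match.group("oid"), "size": int(match.group("size"))}

# Example, assuming the pointer (not the pulled file) is checked out:
# parse_lfs_pointer("frequency_256000/tokenizer.json")
# -> oid 47a19d76..., size 19623518
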
frequency_256000/vocab.json
CHANGED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_8064/stats.csv → frequencymulti_256000/merges.txt
RENAMED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_32000/stats.csv → frequencymulti_256000/merges_data.csv
RENAMED
The diff for this file is too large to render. See raw diff.

frequencymulti_256000/special_tokens_map.json
ADDED

@@ -0,0 +1,4 @@
+{
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|padding|>"
+}

frequencymulti_256000/tokenizer.json
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed37c183a6fdf658a259f23b64ca57aab65a1b96dbcb8343b06e108fe6fbec55
+size 20274248

{fw57M_Entropy_thresholdBX_32000 → frequencymulti_256000}/tokenizer_config.json
RENAMED

@@ -16,22 +16,14 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "259": {
-      "content": "<|unk|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
-  "bos_token": "<|endoftext|>",
+  "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<|padding|>",
   "tokenizer_class": "PreTrainedTokenizer",
-  "unk_token": "<|unk|>"
+  "unk_token": null
 }

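Net effect of this rename-plus-edit: the "<|unk|>" added token (id 259) is dropped from added_tokens_decoder, and bos_token/unk_token become null, in line with the new frequencymulti_256000/special_tokens_map.json above, which declares only an eos and a pad token. A quick sanity check, sketched under the assumption that transformers is installed and the LFS-backed tokenizer.json has been pulled; the local path mirrors this repo's layout.

from transformers import AutoTokenizer

# Load the fixed tokenizer from its directory in the working tree.
tok = AutoTokenizer.from_pretrained("frequencymulti_256000")

# After this commit only eos/pad should be set; bos and unk are null.
assert tok.eos_token == "<|endoftext|>"
assert tok.pad_token == "<|padding|>"
assert tok.bos_token is None
assert tok.unk_token is None
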
fw57M_Entropy_thresholdBX_16000/stats.csv → frequencymulti_256000/vocab.json
RENAMED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_16000/special_tokens_map.json
DELETED

@@ -1,6 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_16000/tokenizer.json
DELETED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_16000/tokenizer_config.json
DELETED

@@ -1,37 +0,0 @@
-{
-  "add_prefix_space": true,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<|padding|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "259": {
-      "content": "<|unk|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|padding|>",
-  "tokenizer_class": "PreTrainedTokenizer",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_16000/vocab.json
DELETED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_32000/special_tokens_map.json
DELETED

@@ -1,6 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_32000/tokenizer.json
DELETED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_32000/vocab.json
DELETED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_8064/special_tokens_map.json
DELETED

@@ -1,6 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_8064/tokenizer.json
DELETED
The diff for this file is too large to render. See raw diff.

fw57M_Entropy_thresholdBX_8064/tokenizer_config.json
DELETED

@@ -1,37 +0,0 @@
-{
-  "add_prefix_space": true,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<|padding|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "259": {
-      "content": "<|unk|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|padding|>",
-  "tokenizer_class": "PreTrainedTokenizer",
-  "unk_token": "<|unk|>"
-}

fw57M_Entropy_thresholdBX_8064/vocab.json
DELETED
The diff for this file is too large to render. See raw diff.