Lakoc committed (verified)
Commit 436f6a2 · 1 Parent(s): 1865892

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +0 -1
  2. tokenizer.json +65 -74
  3. tokenizer_config.json +0 -9
special_tokens_map.json CHANGED
@@ -3,6 +3,5 @@
  "eos_token": "([eos])",
  "mask_token": "([mask])",
  "pad_token": "([pad])",
- "sep_token": " ",
  "unk_token": "([unk])"
  }
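
After this change, a tokenizer loaded from the repository should no longer report a separator token. A minimal sketch with the transformers library (the repository id below is a placeholder, not taken from the commit):

from transformers import AutoTokenizer

# Placeholder repository id -- substitute the repo this commit belongs to.
tok = AutoTokenizer.from_pretrained("Lakoc/example-tokenizer")

# "sep_token" was removed from special_tokens_map.json, so none is configured.
print(tok.sep_token)           # expected: None
print(tok.special_tokens_map)  # eos, mask, pad, unk (plus bos from tokenizer_config.json)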
tokenizer.json CHANGED
@@ -47,21 +47,12 @@
  "rstrip": false,
  "normalized": false,
  "special": true
- },
- {
-   "id": 100,
-   "content": " ",
-   "single_word": false,
-   "lstrip": false,
-   "rstrip": false,
-   "normalized": false,
-   "special": true
  }
  ],
  "normalizer": null,
  "pre_tokenizer": {
  "type": "ByteLevel",
- "add_prefix_space": false,
+ "add_prefix_space": true,
  "trim_offsets": true,
  "use_regex": true
  },
@@ -179,139 +170,139 @@
  "Ġt": 33,
  "he": 34,
  "Ġa": 35,
- "in": 36,
- "Ġthe": 37,
+ "Ġthe": 36,
+ "in": 37,
  "Ġs": 38,
  "Ġw": 39,
  "Ġo": 40,
  "re": 41,
  "nd": 42,
- "Ġh": 43,
- "Ġb": 44,
+ "Ġb": 43,
+ "Ġh": 44,
  "er": 45,
  "Ġm": 46,
- "ou": 47,
- "Ġi": 48,
+ "Ġi": 47,
+ "ou": 48,
  "Ġc": 49,
  "Ġf": 50,
  "at": 51,
  "ed": 52,
- "en": 53,
- "on": 54,
- "Ġof": 55,
- "Ġto": 56,
- "Ġand": 57,
+ "Ġand": 53,
+ "en": 54,
+ "Ġto": 55,
+ "Ġof": 56,
+ "on": 57,
  "is": 58,
- "ing": 59,
- "Ġd": 60,
- "Ġp": 61,
- "Ġth": 62,
- "or": 63,
- "Ġhe": 64,
- "es": 65,
- "it": 66,
- "Ġl": 67,
- "as": 68,
- "ar": 69,
- "ll": 70,
- "an": 71,
- "Ġin": 72,
+ "Ġd": 59,
+ "ing": 60,
+ "Ġth": 61,
+ "Ġp": 62,
+ "Ġhe": 63,
+ "or": 64,
+ "Ġl": 65,
+ "es": 66,
+ "Ġin": 67,
+ "ll": 68,
+ "it": 69,
+ "ar": 70,
+ "as": 71,
+ "an": 72,
  "Ġn": 73,
  "Ġg": 74,
  "om": 75,
  "Ġbe": 76,
  "Ġha": 77,
- "le": 78,
- "Ġe": 79,
+ "Ġe": 78,
+ "le": 79,
  "ot": 80,
- "ut": 81,
- "ow": 82,
- "ic": 83,
- "Ġy": 84,
- "ld": 85,
- "ve": 86,
- "ly": 87,
- "Ġit": 88,
- "Ġwh": 89,
- "Ġwas": 90,
- "se": 91,
- "Ġthat": 92,
- "id": 93,
+ "Ġy": 81,
+ "ut": 82,
+ "ow": 83,
+ "ic": 84,
+ "Ġwh": 85,
+ "Ġit": 86,
+ "ld": 87,
+ "ve": 88,
+ "Ġthat": 89,
+ "ly": 90,
+ "Ġwas": 91,
+ "id": 92,
+ "se": 93,
  "st": 94,
- "gh": 95,
- "Ġon": 96,
+ "Ġon": 95,
+ "gh": 96,
  "ent": 97,
  "Ġre": 98,
- "im": 99
+ "Ġyou": 99
  },
  "merges": [
  "Ġ t",
  "h e",
  "Ġ a",
- "i n",
  "Ġt he",
+ "i n",
  "Ġ s",
  "Ġ w",
  "Ġ o",
  "r e",
  "n d",
- "Ġ h",
  "Ġ b",
+ "Ġ h",
  "e r",
  "Ġ m",
- "o u",
  "Ġ i",
+ "o u",
  "Ġ c",
  "Ġ f",
  "a t",
  "e d",
+ "Ġa nd",
  "e n",
- "o n",
- "Ġo f",
  "Ġt o",
- "Ġa nd",
+ "Ġo f",
+ "o n",
  "i s",
- "in g",
  "Ġ d",
- "Ġ p",
+ "in g",
  "Ġt h",
- "o r",
+ "Ġ p",
  "Ġ he",
+ "o r",
+ "Ġ l",
  "e s",
+ "Ġ in",
+ "l l",
  "i t",
- "Ġ l",
- "a s",
  "a r",
- "l l",
+ "a s",
  "a n",
- "Ġ in",
  "Ġ n",
  "Ġ g",
  "o m",
  "Ġb e",
  "Ġh a",
- "l e",
  "Ġ e",
+ "l e",
  "o t",
+ "Ġ y",
  "u t",
  "o w",
  "i c",
- "Ġ y",
+ "Ġw h",
+ "Ġi t",
  "l d",
  "v e",
+ "Ġth at",
  "l y",
- "Ġi t",
- "Ġw h",
  "Ġw as",
- "s e",
- "Ġth at",
  "i d",
+ "s e",
  "s t",
- "g h",
  "Ġo n",
+ "g h",
  "en t",
  "Ġ re",
- "i m"
+ "Ġy ou"
  ]
  }
  }
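
Besides dropping the whitespace special token (id 100) and re-ranking the BPE vocabulary and merges, the key behavioural change in tokenizer.json is flipping the ByteLevel pre-tokenizer's "add_prefix_space" flag from false to true. A minimal sketch of what that flag does, using the standalone tokenizers library (illustration only, not part of the commit):

from tokenizers.pre_tokenizers import ByteLevel

text = "the cat"

# Previous setting: the first word carries no leading-space marker, e.g. ('the', 'Ġcat').
print(ByteLevel(add_prefix_space=False).pre_tokenize_str(text))

# New setting: a space is prepended before pre-tokenization, so the leading word
# also becomes a 'Ġ'-prefixed piece, e.g. ('Ġthe', 'Ġcat'), matching re-ranked
# vocabulary entries such as "Ġthe": 36.
print(ByteLevel(add_prefix_space=True).pre_tokenize_str(text))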
tokenizer_config.json CHANGED
@@ -39,14 +39,6 @@
  "rstrip": false,
  "single_word": false,
  "special": true
- },
- "100": {
-   "content": " ",
-   "lstrip": false,
-   "normalized": false,
-   "rstrip": false,
-   "single_word": false,
-   "special": true
  }
  },
  "bos_token": "([bos])",
@@ -55,7 +47,6 @@
  "mask_token": "([mask])",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "([pad])",
- "sep_token": " ",
  "tokenizer_class": "PreTrainedTokenizerFast",
  "unk_token": "([unk])"
  }
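
With the stray added token (id 100) and "sep_token" also removed from tokenizer_config.json, the fast tokenizer should load and round-trip text cleanly. A minimal sketch (placeholder repository id, illustration only):

from transformers import AutoTokenizer

# Placeholder repository id -- substitute the repo this commit belongs to.
tok = AutoTokenizer.from_pretrained("Lakoc/example-tokenizer")

ids = tok("the cat was on the mat")["input_ids"]
print(ids)                         # ids follow the re-ranked vocabulary (e.g. "Ġthe" is now 36)
print(tok.decode(ids))             # should round-trip (possibly with a leading space from add_prefix_space)
print(100 in tok.all_special_ids)  # expected: False, id 100 is no longer a special token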