Upload custom Unigram tokenizer (v1)

Files changed (4)

special_tokens_map.json ADDED

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "[UNK]",
+  "pad_token": "[PAD]",
+  "cls_token": "[CLS]",
+  "sep_token": "[SEP]",
+  "mask_token": "[MASK]"
+}

tokenizer.json CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c875bd75614e5b2d086e18fe7f361258c259c948afde8a0092b30a9a29c5164
-size 12734617
+oid sha256:bad7b64abeae95c1df5d624cabdfd9ea723157bc626b75f2960a39ac889eefa4
+size 4316356

tokenizer.model ADDED

+version https://git-lfs.github.com/spec/v1
+oid sha256:22946ba034a75aebd68458ce149c7039392dca9c7c99c8fda9d8c21a2c61a205
+size 1262639

tokenizer_config.json CHANGED

@@ -1,20 +1,11 @@
 {
-  "tokenizer_class": "DFArcTokenizer",
-  "auto_map": {
-    "AutoTokenizer": [
-      "tokenization_df_arc.DFArcTokenizer",
-      null
-    ]
-  },
-  "phrases_file": "phrase_vocab.json",
-  "vocab_file": "tokenizer.json",
-  "min_stem_length": 2,
-  "normalization_config": {
-    "unify_alef": true,
-    "unify_yeh": true,
-    "unify_teh_marbuta": true,
-    "remove_diacritics": true,
-    "remove_tatweel": true,
-    "remove_repeats": true
-  }
+  "tokenizer_class": "LlamaTokenizer",
+  "model_max_length": 32768,
+  "clean_up_tokenization_spaces": false,
+  "unk_token": "[UNK]",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "[PAD]",
+  "add_bos_token": true,
+  "add_eos_token": false
 }