suchirsalhan committed on
Commit ba90aac · verified · Parent: 06b07bb

Upload folder using huggingface_hub

Files changed (2)
  1. tokenizer.json +91 -0
  2. tokenizer_config.json +11 -8
tokenizer.json ADDED
@@ -0,0 +1,91 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<s>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "</s>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "Metaspace",
+     "replacement": "▁",
+     "prepend_scheme": "first",
+     "split": false
+   },
+   "post_processor": null,
+   "decoder": {
+     "type": "Sequence",
+     "decoders": [
+       {
+         "type": "Replace",
+         "pattern": {
+           "String": "▁"
+         },
+         "content": " "
+       },
+       {
+         "type": "ByteFallback"
+       },
+       {
+         "type": "Fuse"
+       },
+       {
+         "type": "Strip",
+         "content": " ",
+         "start": 1,
+         "stop": 0
+       }
+     ]
+   },
+   "model": {
+     "type": "BPE",
+     "dropout": null,
+     "unk_token": null,
+     "continuing_subword_prefix": null,
+     "end_of_word_suffix": null,
+     "fuse_unk": true,
+     "byte_fallback": true,
+     "ignore_merges": false,
+     "vocab": {
+       "<unk>": 0,
+       "<s>": 1,
+       "</s>": 2
+     },
+     "merges": []
+   }
+ }
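
The added tokenizer.json wires a Metaspace pre-tokenizer to a byte-fallback BPE model with a Llama-style decode pipeline. As a sanity check, the file can be loaded directly with the `tokenizers` library; a minimal sketch follows (note the vocab shipped here is only a three-entry stub with empty merges, so it is a placeholder rather than a trained vocabulary):

```python
from tokenizers import Tokenizer

# Load the serialized tokenizer added in this commit.
tok = Tokenizer.from_file("tokenizer.json")

# The BPE vocab holds <unk>/<s>/</s>; <pad> (id 3) exists only as an
# added token, so the count with added tokens is 4.
print(tok.get_vocab_size(with_added_tokens=True))

# Metaspace marks word boundaries with "▁"; the decoder sequence
# (Replace -> ByteFallback -> Fuse -> Strip) undoes that on decode.
print(tok.pre_tokenizer)
print(tok.decoder)
```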
tokenizer_config.json CHANGED
@@ -1,9 +1,12 @@
  {
-   "tokenizer_class": "SentencePieceTokenizer",
-   "model_type": "bpe",
-   "unk_token": "<unk>",
-   "bos_token": "<s>",
-   "eos_token": "</s>",
-   "pad_token": "<pad>",
-   "use_fast": false
- }
+   "add_prefix_space": null,
+   "backend": "tokenizers",
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
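
On the `transformers` side, tokenizer_config.json is what `AutoTokenizer` consults for the tokenizer class and the special-token map. A minimal sketch, assuming both files sit in a local checkpoint directory (the path below is a placeholder):

```python
from transformers import AutoTokenizer

# Resolves tokenizer_class ("LlamaTokenizer") and the special tokens
# from tokenizer_config.json; "path/to/checkpoint" is a placeholder.
tok = AutoTokenizer.from_pretrained("path/to/checkpoint")

print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)

# model_max_length of ~1e30 is the transformers VERY_LARGE_INTEGER
# sentinel, meaning "no length limit recorded", not a real context size.
print(tok.model_max_length)
```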