lanstat0123 committed on
Commit
c2cd4b6
·
verified ·
1 Parent(s): fcb1516

Upload converted tokenizer

Browse files
Files changed (3) hide show
  1. README.md +25 -0
  2. tokenizer.json +125 -0
  3. tokenizer_config.json +13 -0
README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - tokenizer
5
+ - sentencepiece
6
+ ---
7
+
8
+ # SentencePiece-based Hugging Face tokenizer
9
+
10
+ This repository contains a Hugging Face tokenizer converted from a SentencePiece model.
11
+
12
+ ## Load
13
+
14
+ ```python
15
+ from transformers import AutoTokenizer
16
+
17
+ tokenizer = AutoTokenizer.from_pretrained("lanstat0123/multilingual_tokenizer")
18
+ ```
19
+
20
+ ## Special tokens
21
+
22
+ ```python
23
+ print(tokenizer.special_tokens_map)
24
+ print(len(tokenizer))
25
+ ```
tokenizer.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<s>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "</s>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<pad>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<|mdm_mask|>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "Metaspace",
55
+ "replacement": "▁",
56
+ "prepend_scheme": "first",
57
+ "split": false
58
+ },
59
+ "post_processor": {
60
+ "type": "TemplateProcessing",
61
+ "single": [
62
+ {
63
+ "Sequence": {
64
+ "id": "A",
65
+ "type_id": 0
66
+ }
67
+ }
68
+ ],
69
+ "pair": [
70
+ {
71
+ "Sequence": {
72
+ "id": "A",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "Sequence": {
78
+ "id": "B",
79
+ "type_id": 1
80
+ }
81
+ }
82
+ ],
83
+ "special_tokens": {}
84
+ },
85
+ "decoder": {
86
+ "type": "Sequence",
87
+ "decoders": [
88
+ {
89
+ "type": "Replace",
90
+ "pattern": {
91
+ "String": "▁"
92
+ },
93
+ "content": " "
94
+ },
95
+ {
96
+ "type": "ByteFallback"
97
+ },
98
+ {
99
+ "type": "Fuse"
100
+ },
101
+ {
102
+ "type": "Strip",
103
+ "content": " ",
104
+ "start": 1,
105
+ "stop": 0
106
+ }
107
+ ]
108
+ },
109
+ "model": {
110
+ "type": "BPE",
111
+ "dropout": null,
112
+ "unk_token": null,
113
+ "continuing_subword_prefix": null,
114
+ "end_of_word_suffix": null,
115
+ "fuse_unk": true,
116
+ "byte_fallback": true,
117
+ "ignore_merges": false,
118
+ "vocab": {
119
+ "<unk>": 0,
120
+ "<s>": 1,
121
+ "</s>": 2
122
+ },
123
+ "merges": []
124
+ }
125
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "mask_token": "<|mdm_mask|>",
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "pad_token": "<pad>",
10
+ "tokenizer_class": "LlamaTokenizer",
11
+ "unk_token": "<unk>",
12
+ "use_default_system_prompt": false
13
+ }