protgpt3 committed
Commit 49b896a · verified · 1 parent: fa69ff4

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +30 -0
  2. tokenizer.json +91 -0
  3. tokenizer_config.json +44 -0
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|bos|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|eos|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
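Each entry in special_tokens_map.json is an AddedToken-style record; the four flags disable left/right stripping, normalization, and single-word matching for that token. A minimal sketch of how transformers materializes one such entry (the variable name is illustrative):

```python
from transformers import AddedToken

# Mirrors the "bos_token" entry above; every flag comes straight
# from the JSON fields in this file.
bos = AddedToken("<|bos|>", lstrip=False, rstrip=False,
                 normalized=False, single_word=False)
```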
tokenizer.json ADDED
@@ -0,0 +1,91 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<|pad|>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<|bos|>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<|eos|>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": {
+     "type": "WhitespaceSplit"
+   },
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "WordLevel",
+     "vocab": {
+       "<|pad|>": 0,
+       "<|bos|>": 1,
+       "<|eos|>": 2,
+       "<unk>": 3,
+       "<gap>": 4,
+       "<no_gap>": 5,
+       "<query>": 6,
+       "<s>": 7,
+       "-": 8,
+       "1": 9,
+       "2": 10,
+       "A": 11,
+       "B": 12,
+       "C": 13,
+       "D": 14,
+       "E": 15,
+       "F": 16,
+       "G": 17,
+       "H": 18,
+       "I": 19,
+       "K": 20,
+       "L": 21,
+       "M": 22,
+       "N": 23,
+       "O": 24,
+       "P": 25,
+       "Q": 26,
+       "R": 27,
+       "S": 28,
+       "T": 29,
+       "U": 30,
+       "V": 31,
+       "W": 32,
+       "X": 33,
+       "Y": 34,
+       "Z": 35
+     },
+     "unk_token": "<unk>"
+   }
+ }
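tokenizer.json defines a WordLevel model behind a WhitespaceSplit pre-tokenizer, so each residue (or marker token such as <gap>) must arrive as its own whitespace-separated word; an unspaced sequence like "MKTAY" collapses to a single out-of-vocabulary word and maps to <unk>. A minimal sketch with the tokenizers library, assuming the file has been downloaded locally:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# Residues must be space-separated; ids follow the vocab above.
enc = tok.encode("<|bos|> M K T A Y <|eos|>")
print(enc.tokens)  # ['<|bos|>', 'M', 'K', 'T', 'A', 'Y', '<|eos|>']
print(enc.ids)     # [1, 22, 20, 29, 11, 34, 2]

# Without whitespace the whole string is one unknown word:
print(tok.encode("MKTAY").tokens)  # ['<unk>']
```

Note that post_processor is null, so <|bos|> and <|eos|> are not added automatically; callers must include them in the input text as above.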
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|bos|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|eos|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|bos|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|eos|>",
+   "extra_special_tokens": {},
+   "model_max_length": 16384,
+   "pad_token": "<|pad|>",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<unk>"
+ }
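tokenizer_config.json ties the three files together for transformers: tokenizer_class selects PreTrainedTokenizerFast, model_max_length caps inputs at 16384 tokens, and padding uses <|pad|> (id 0). A hedged loading sketch, assuming the three JSON files from this commit sit in a local directory (the path is illustrative, not the actual repo id):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./protgpt3-tokenizer")  # hypothetical local path
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)
print(tok.model_max_length)  # 16384, per tokenizer_config.json

# Padding uses <|pad|> (id 0) from the config; residues stay space-separated.
batch = tok(["M K T A Y", "M K"], padding=True)
print(batch["input_ids"])  # [[22, 20, 29, 11, 34], [22, 20, 0, 0, 0]]
```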