guyhadad01 committed on
Commit 82e8401 · 1 Parent(s): ea4f962

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +8 -44
  2. tokenizer.json +32 -87
  3. tokenizer_config.json +11 -13
  4. vocab.json +0 -0
special_tokens_map.json CHANGED
@@ -1,51 +1,15 @@
 {
-  "bos_token": {
-    "content": "[CLS]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "cls_token": {
-    "content": "[CLS]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "[SEP]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
   "mask_token": {
-    "content": "[MASK]",
+    "content": "<mask>",
     "lstrip": true,
-    "normalized": true,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": {
-    "content": "[PAD]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "sep_token": {
-    "content": "[SEP]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "[UNK]",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
 }
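
The six wrapper objects collapse to plain strings, and every special token moves from DeBERTa-style bracket names to RoBERTa-style tags; only the mask token keeps its full form (lstrip true, normalized now false) so it still absorbs the space before it. A minimal sanity check with transformers, assuming the working directory is a local clone of this repo:

from transformers import AutoTokenizer

# Assumes the current directory holds the uploaded files
# (tokenizer.json, tokenizer_config.json, special_tokens_map.json, vocab.json).
tok = AutoTokenizer.from_pretrained(".")

# RoBERTa-style specials from the updated special_tokens_map.json:
print(tok.bos_token, tok.cls_token)                  # <s> <s>
print(tok.eos_token, tok.sep_token)                  # </s> </s>
print(tok.pad_token, tok.unk_token, tok.mask_token)  # <pad> <unk> <mask>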
tokenizer.json CHANGED
@@ -1,51 +1,58 @@
 {
   "version": "1.0",
   "truncation": null,
-  "padding": null,
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 1,
+    "pad_type_id": 0,
+    "pad_token": "<pad>"
+  },
   "added_tokens": [
     {
       "id": 0,
-      "content": "[PAD]",
+      "content": "<s>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": true,
+      "normalized": false,
       "special": true
     },
     {
       "id": 1,
-      "content": "[CLS]",
+      "content": "<pad>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": true,
+      "normalized": false,
       "special": true
     },
     {
       "id": 2,
-      "content": "[SEP]",
+      "content": "</s>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": true,
+      "normalized": false,
       "special": true
     },
     {
       "id": 3,
-      "content": "[UNK]",
+      "content": "<unk>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
-      "normalized": true,
+      "normalized": false,
       "special": true
     },
     {
       "id": 50264,
-      "content": "[MASK]",
+      "content": "<mask>",
       "single_word": false,
       "lstrip": true,
       "rstrip": false,
-      "normalized": true,
+      "normalized": false,
       "special": true
     }
   ],
@@ -57,79 +64,17 @@
     "use_regex": true
   },
   "post_processor": {
-    "type": "TemplateProcessing",
-    "single": [
-      {
-        "SpecialToken": {
-          "id": "[CLS]",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 0
-        }
-      }
+    "type": "RobertaProcessing",
+    "sep": [
+      "</s>",
+      2
     ],
-    "pair": [
-      {
-        "SpecialToken": {
-          "id": "[CLS]",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "A",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 0
-        }
-      },
-      {
-        "Sequence": {
-          "id": "B",
-          "type_id": 1
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "[SEP]",
-          "type_id": 1
-        }
-      }
+    "cls": [
+      "<s>",
+      0
     ],
-    "special_tokens": {
-      "[CLS]": {
-        "id": "[CLS]",
-        "ids": [
-          1
-        ],
-        "tokens": [
-          "[CLS]"
-        ]
-      },
-      "[SEP]": {
-        "id": "[SEP]",
-        "ids": [
-          2
-        ],
-        "tokens": [
-          "[SEP]"
-        ]
-      }
-    }
+    "trim_offsets": true,
+    "add_prefix_space": false
   },
   "decoder": {
     "type": "ByteLevel",
@@ -146,10 +91,10 @@
   "fuse_unk": false,
   "byte_fallback": false,
   "vocab": {
-    "[PAD]": 0,
-    "[CLS]": 1,
-    "[SEP]": 2,
-    "[UNK]": 3,
+    "<s>": 0,
+    "<pad>": 1,
+    "</s>": 2,
+    "<unk>": 3,
     ".": 4,
     "Ġthe": 5,
     ",": 6,
@@ -50410,7 +50355,7 @@
   "madeupword0000": 50261,
   "madeupword0001": 50262,
   "madeupword0002": 50263,
-  "[MASK]": 50264
+  "<mask>": 50264
   },
   "merges": [
     "Ġ t",
tokenizer_config.json CHANGED
@@ -1,9 +1,8 @@
 {
-  "add_bos_token": false,
   "add_prefix_space": false,
   "bos_token": {
     "__type": "AddedToken",
-    "content": "[CLS]",
+    "content": "<s>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
@@ -12,16 +11,15 @@
   "clean_up_tokenization_spaces": true,
   "cls_token": {
     "__type": "AddedToken",
-    "content": "[CLS]",
+    "content": "<s>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
     "single_word": false
   },
-  "do_lower_case": false,
   "eos_token": {
     "__type": "AddedToken",
-    "content": "[SEP]",
+    "content": "</s>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
@@ -30,16 +28,16 @@
   "errors": "replace",
   "mask_token": {
     "__type": "AddedToken",
-    "content": "[MASK]",
+    "content": "<mask>",
     "lstrip": true,
     "normalized": true,
     "rstrip": false,
     "single_word": false
   },
-  "model_max_length": 512,
+  "model_max_length": 1000000000000000019884624838656,
   "pad_token": {
     "__type": "AddedToken",
-    "content": "[PAD]",
+    "content": "<pad>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
@@ -47,20 +45,20 @@
   },
   "sep_token": {
     "__type": "AddedToken",
-    "content": "[SEP]",
+    "content": "</s>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
     "single_word": false
   },
-  "tokenizer_class": "DebertaTokenizer",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
   "unk_token": {
     "__type": "AddedToken",
-    "content": "[UNK]",
+    "content": "<unk>",
     "lstrip": false,
     "normalized": true,
     "rstrip": false,
     "single_word": false
-  },
-  "vocab_type": "gpt2"
+  }
 }
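
Note that model_max_length is now 1000000000000000019884624838656, which is transformers' int(1e30) sentinel for "no limit configured", so truncation needs an explicit max_length from the caller; tokenizer_class likewise flips from DebertaTokenizer to RobertaTokenizer. A hedged sketch, again assuming a local clone of this repo:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # local clone, as above

# tokenizer_class "RobertaTokenizer" selects the RoBERTa byte-level
# BPE implementation (the fast variant by default).
print(type(tok).__name__)  # RobertaTokenizerFast

# model_max_length is the "unset" sentinel (~1e30), so request
# truncation with a concrete max_length yourself.
enc = tok("some long input text", truncation=True, max_length=512)
print(len(enc["input_ids"]))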
vocab.json CHANGED
The diff for this file is too large to render. See raw diff