styalai committed
Commit 35bfebb · verified · 1 Parent(s): 7173544

Upload tokenizer

added_tokens.json CHANGED
@@ -3,5 +3,10 @@
   "<mask>": 8004,
   "<pad>": 8003,
   "<s>": 8000,
-  "<unk>": 8002
+  "<unk>": 8002,
+  "<|assistant|>": 8008,
+  "<|endoftext|>": 8005,
+  "<|end|>": 8007,
+  "<|system|>": 8009,
+  "<|user|>": 8006
 }
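
added_tokens.json maps each token string added on top of the base vocabulary to its id. One plausible way to produce the ids and flags seen in this commit through the transformers API, sketched under the assumption of a hypothetical checkpoint path:

from transformers import AutoTokenizer

# Hypothetical source checkpoint; the repo this commit belongs to
# would be the real target.
tok = AutoTokenizer.from_pretrained("styalai/base-checkpoint")

# Tokens registered via add_special_tokens get "special": true and
# "normalized": false, matching ids 8005-8007 in this diff.
tok.add_special_tokens(
    {"additional_special_tokens": ["<|endoftext|>", "<|user|>", "<|end|>"]}
)

# Plain add_tokens leaves "special": false and "normalized": true,
# matching ids 8008-8009.
tok.add_tokens(["<|assistant|>", "<|system|>"])

tok.save_pretrained("out/")  # rewrites added_tokens.json among other files
print(tok.convert_tokens_to_ids("<|user|>"))  # 8006 in this commit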
merges.txt CHANGED
@@ -7739,3 +7739,7 @@ bal anced
 Ġoverwhel med
 Ġlic ense
 Ġflood ing
+Ġcrow d
+Ġtow ns
+ĠEle phant
+Ġwond ers
special_tokens_map.json CHANGED
@@ -1,8 +1,8 @@
 {
   "bos_token": {
-    "content": "<s>",
+    "content": "<|user|>",
     "lstrip": false,
-    "normalized": true,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   },
@@ -14,9 +14,9 @@
     "single_word": false
   },
   "eos_token": {
-    "content": "</s>",
+    "content": "<|endoftext|>",
     "lstrip": false,
-    "normalized": true,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   },
@@ -35,9 +35,9 @@
     "single_word": false
   },
   "sep_token": {
-    "content": "</s>",
+    "content": "<|end|>",
     "lstrip": false,
-    "normalized": true,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   },
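
This side of the change repoints bos, eos, and sep at the new markers and flips their normalized flag to false, so they are matched against raw text rather than normalized input. A minimal sketch of the equivalent remap through the transformers API (checkpoint path hypothetical):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("styalai/base-checkpoint")  # hypothetical path

# Tokens passed through add_special_tokens are registered with
# normalized=False by default, matching the flags in this diff.
tok.add_special_tokens({
    "bos_token": "<|user|>",
    "eos_token": "<|endoftext|>",
    "sep_token": "<|end|>",
})
tok.save_pretrained("out/")  # rewrites special_tokens_map.json and tokenizer_config.json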
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -40,17 +40,57 @@
       "rstrip": false,
       "single_word": false,
       "special": true
+    },
+    "8005": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8006": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8008": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "8009": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
     }
   },
-  "bos_token": "<s>",
+  "bos_token": "<|user|>",
   "clean_up_tokenization_spaces": true,
   "cls_token": "<s>",
-  "eos_token": "</s>",
+  "eos_token": "<|endoftext|>",
   "errors": "replace",
   "mask_token": "<mask>",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
-  "sep_token": "</s>",
+  "sep_token": "<|end|>",
   "tokenizer_class": "RobertaTokenizer",
   "trim_offsets": true,
   "unk_token": "<unk>"
vocab.json CHANGED
The diff for this file is too large to render. See raw diff