Jlonge4 committed
Commit 6aa4ffd · verified · 1 Parent(s): dcc9c8d

Upload tokenizer

added_tokens.json CHANGED
@@ -8,5 +8,6 @@
  "<|tool_call|>": 200025,
  "<|tool_response|>": 200027,
  "<|tool|>": 200023,
- "<|user|>": 200021
+ "<|user|>": 200021,
+ "<|PAD▁TOKEN|>": 200029
  }
chat_template.jinja CHANGED
@@ -1 +1 @@
- {% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
+ {% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}
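For reference, a minimal sketch (the repo id below is a placeholder, not taken from this commit) of how the revised template behaves when applied with transformers: with add_generation_prompt=True the rendered prompt still ends in <|assistant|>, while without it the string now stops after the final <|end|> instead of appending eos_token.

from transformers import AutoTokenizer

# Placeholder repo id; point this at the repository that received this commit.
tok = AutoTokenizer.from_pretrained("Jlonge4/model-with-updated-tokenizer")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Ends with '<|assistant|>' when a generation prompt is requested.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Without a generation prompt, the old template appended eos_token here;
# the new one stops after the last '<|end|>'.
rendered = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)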
special_tokens_map.json CHANGED
@@ -7,21 +7,21 @@
  "single_word": false
  },
  "eos_token": {
- "content": "<|endoftext|>",
+ "content": "<|end|>",
  "lstrip": false,
  "normalized": false,
- "rstrip": false,
+ "rstrip": true,
  "single_word": false
  },
  "pad_token": {
- "content": "<|endoftext|>",
+ "content": "<|PAD▁TOKEN|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
  "unk_token": {
- "content": "<|endoftext|>",
+ "content": "�",
  "lstrip": false,
  "normalized": false,
  "rstrip": false
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:382cc235b56c725945e149cc25f191da667c836655efd0857b004320e90e91ea
- size 15524095
+ oid sha256:37b10016a39382ff2d24acc20a291ed83243a26c4549ab01f6240e72c6291d56
+ size 15524472
tokenizer_config.json CHANGED
@@ -3,6 +3,14 @@
  "add_eos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
+ "3251": {
+ "content": "�",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
  "199999": {
  "content": "<|endoftext|>",
  "lstrip": false,
@@ -98,14 +106,23 @@
  "rstrip": true,
  "single_word": false,
  "special": true
+ },
+ "200029": {
+ "content": "<|PAD▁TOKEN|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
  }
  },
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": false,
- "eos_token": "<|endoftext|>",
+ "eos_token": "<|end|>",
  "extra_special_tokens": {},
  "model_max_length": 131072,
- "pad_token": "<|endoftext|>",
+ "pad_token": "<|PAD▁TOKEN|>",
+ "padding_side": "left",
  "tokenizer_class": "GPT2Tokenizer",
- "unk_token": "<|endoftext|>"
+ "unk_token": "�"
  }
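A minimal verification sketch, assuming the updated files are loaded from this repository (the repo id below is a placeholder): it checks the new eos/pad/unk mapping, the dedicated pad token id 200029, and left-sided padding.

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual repository name.
tok = AutoTokenizer.from_pretrained("Jlonge4/model-with-updated-tokenizer")

assert tok.eos_token == "<|end|>"
assert tok.pad_token == "<|PAD▁TOKEN|>"
assert tok.unk_token == "�"
assert tok.convert_tokens_to_ids("<|PAD▁TOKEN|>") == 200029
assert tok.padding_side == "left"

# Batch padding now uses the dedicated pad token on the left
# instead of reusing <|endoftext|>.
batch = tok(["short", "a slightly longer input"], padding=True)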