iperbole committed on
Commit
ca0b697
·
verified ·
1 Parent(s): 56de9e0

Upload tokenizer

Browse files
chat_template.jinja CHANGED
@@ -1,4 +1,4 @@
1
- {{ '<s>' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>
2
 
3
  ' + system_message + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>
4
 
 
1
+ {{ '<|begin_of_text|>' }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>
2
 
3
  ' + system_message + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>
4
 
special_tokens_map.json CHANGED
@@ -1,4 +1,7 @@
1
  {
 
 
 
2
  "bos_token": {
3
  "content": "<|begin_of_text|>",
4
  "lstrip": false,
@@ -7,14 +10,14 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<|end_of_text|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "<|end_of_text|>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<|eom_id|>"
4
+ ],
5
  "bos_token": {
6
  "content": "<|begin_of_text|>",
7
  "lstrip": false,
 
10
  "single_word": false
11
  },
12
  "eos_token": {
13
+ "content": "<|eot_id|>",
14
  "lstrip": false,
15
  "normalized": false,
16
  "rstrip": false,
17
  "single_word": false
18
  },
19
  "pad_token": {
20
+ "content": "<|eot_id|>",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -2049,16 +2049,19 @@
2049
  "special": true
2050
  }
2051
  },
 
 
 
2052
  "bos_token": "<|begin_of_text|>",
2053
  "clean_up_tokenization_spaces": true,
2054
- "eos_token": "<|end_of_text|>",
2055
  "extra_special_tokens": {},
2056
  "model_input_names": [
2057
  "input_ids",
2058
  "attention_mask"
2059
  ],
2060
  "model_max_length": 8192,
2061
- "pad_token": "<|end_of_text|>",
2062
  "padding_side": "right",
2063
  "split_special_tokens": false,
2064
  "tokenizer_class": "PreTrainedTokenizerFast"
 
2049
  "special": true
2050
  }
2051
  },
2052
+ "additional_special_tokens": [
2053
+ "<|eom_id|>"
2054
+ ],
2055
  "bos_token": "<|begin_of_text|>",
2056
  "clean_up_tokenization_spaces": true,
2057
+ "eos_token": "<|eot_id|>",
2058
  "extra_special_tokens": {},
2059
  "model_input_names": [
2060
  "input_ids",
2061
  "attention_mask"
2062
  ],
2063
  "model_max_length": 8192,
2064
+ "pad_token": "<|eot_id|>",
2065
  "padding_side": "right",
2066
  "split_special_tokens": false,
2067
  "tokenizer_class": "PreTrainedTokenizerFast"