danielhanchen committed on
Commit
cdd432a
·
verified ·
1 Parent(s): bc9634c

Upload tokenizer

Browse files
chat_template.jinja CHANGED
@@ -1,37 +1,43 @@
1
- {% if tools -%}
2
- {{ '<|im_system|>tool_declare<|im_middle|>' -}}
3
- {{- tools | tojson -}}
4
- {{ '<|im_end|>' -}}
5
  {%- endif -%}
6
-
7
  {%- for message in messages -%}
8
  {%- if loop.first and messages[0]['role'] != 'system' -%}
9
- {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}
10
  {%- endif -%}
11
  {%- if message['role'] == 'system' -%}
12
- {{ '<|im_system|>system<|im_middle|>' }}
13
  {%- elif message['role'] == 'user' -%}
14
- {{ '<|im_user|>user<|im_middle|>' }}
15
  {%- elif message['role'] == 'assistant' -%}
16
- {{ '<|im_assistant|>assistant<|im_middle|>' }}
17
  {%- elif message['role'] == 'tool' -%}
18
- {{ '<|im_system|>tool<|im_middle|>' }}
19
  {%- endif -%}
20
-
21
- {%- if message['content'] is string -%}
22
- {{- message['content'] + '<|im_end|>' -}}
23
- {%- else -%}
24
- {%- for content in message['content'] -%}
25
- {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
26
- {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}
27
- {%- else -%}
 
 
 
 
 
 
 
 
 
 
28
  {{ content['text'] }}
29
  {%- endif -%}
30
  {%- endfor -%}
31
- {{ '<|im_end|>' }}
32
  {%- endif -%}
 
33
  {%- endfor -%}
34
-
35
  {%- if add_generation_prompt -%}
36
- {{ '<|im_assistant|>assistant<|im_middle|>' }}
37
  {%- endif -%}
 
1
+ {%- if tools -%}
2
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
 
 
3
  {%- endif -%}
 
4
  {%- for message in messages -%}
5
  {%- if loop.first and messages[0]['role'] != 'system' -%}
6
+ <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
7
  {%- endif -%}
8
  {%- if message['role'] == 'system' -%}
9
+ <|im_system|>system<|im_middle|>
10
  {%- elif message['role'] == 'user' -%}
11
+ <|im_user|>user<|im_middle|>
12
  {%- elif message['role'] == 'assistant' -%}
13
+ <|im_assistant|>assistant<|im_middle|>
14
  {%- elif message['role'] == 'tool' -%}
15
+ <|im_system|>tool<|im_middle|>
16
  {%- endif -%}
17
+ {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
18
+ {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}
19
+ <|tool_calls_section_begin|>
20
+ {%- for tool_call in message['tool_calls'] -%}
21
+ {%- set func_name = tool_call['function']['name'] -%}
22
+ {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}
23
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>
24
+ {%- endfor -%}
25
+ <|tool_calls_section_end|>
26
+ {%- elif message['role'] == 'tool' -%}
27
+ ## Return of {{ message.tool_call_id }}\n{{ message['content'] }}
28
+ {%- elif message['content'] is string -%}
29
+ {{ message['content'] }}
30
+ {%- elif message['content'] is not none -%}
31
+ {% for content in message['content'] -%}
32
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
33
+ <|media_start|>image<|media_content|><|media_pad|><|media_end|>
34
+ {% else -%}
35
  {{ content['text'] }}
36
  {%- endif -%}
37
  {%- endfor -%}
 
38
  {%- endif -%}
39
+ <|im_end|>
40
  {%- endfor -%}
 
41
  {%- if add_generation_prompt -%}
42
+ <|im_assistant|>assistant<|im_middle|>
43
  {%- endif -%}
tokenization_kimi.py CHANGED
@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
158
  def encode(
159
  self,
160
  text: str,
 
161
  **kwargs
162
  ) -> List[int]:
163
  """
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
203
 
204
  t: List[int] = []
205
  for substr in all_substrs:
206
- t.extend(
207
- # we should consider special token as a common token
208
- self.model.encode(
209
- substr,
210
- disallowed_special=(),
 
 
 
 
 
 
 
 
 
 
211
  )
212
- )
213
 
214
  return t
215
 
 
158
  def encode(
159
  self,
160
  text: str,
161
+ allow_special_tokens: bool = True,
162
  **kwargs
163
  ) -> List[int]:
164
  """
 
204
 
205
  t: List[int] = []
206
  for substr in all_substrs:
207
+ if allow_special_tokens:
208
+ t.extend(
209
+ # we should consider special token as a common token
210
+ self.model.encode(
211
+ substr,
212
+ allowed_special="all",
213
+ )
214
+ )
215
+ else:
216
+ t.extend(
217
+ # we should consider special token as a common token
218
+ self.model.encode(
219
+ substr,
220
+ disallowed_special=(),
221
+ )
222
  )
 
223
 
224
  return t
225
 
tokenizer_config.json CHANGED
@@ -121,6 +121,5 @@
121
  "pad_token": "[PAD]",
122
  "padding_side": "left",
123
  "tokenizer_class": "TikTokenTokenizer",
124
- "unk_token": "[UNK]",
125
- "chat_template": "{% if tools -%}\n {{ '<|im_system|>tool_declare<|im_middle|>' -}}\n {{- tools | tojson -}}\n {{ '<|im_end|>' -}}\n{%- endif -%}\n\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n {{ '<|im_system|>system<|im_middle|>' }}\n {%- elif message['role'] == 'user' -%}\n {{ '<|im_user|>user<|im_middle|>' }}\n {%- elif message['role'] == 'assistant' -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n {%- elif message['role'] == 'tool' -%}\n {{ '<|im_system|>tool<|im_middle|>' }}\n {%- endif -%}\n\n {%- if message['content'] is string -%}\n {{- message['content'] + '<|im_end|>' -}}\n {%- else -%}\n {%- for content in message['content'] -%}\n {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}\n {%- else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {{ '<|im_end|>' }}\n {%- endif -%}\n{%- endfor -%}\n\n{%- if add_generation_prompt -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n{%- endif -%}"
126
- }
 
121
  "pad_token": "[PAD]",
122
  "padding_side": "left",
123
  "tokenizer_class": "TikTokenTokenizer",
124
+ "unk_token": "[UNK]"
125
+ }