bdubayah committed on
Commit
80761a1
·
1 Parent(s): 2551973
Files changed (2) hide show
  1. tokenization_kimi.py +16 -6
  2. tokenizer_config.json +1 -1
tokenization_kimi.py CHANGED
@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
158
  def encode(
159
  self,
160
  text: str,
 
161
  **kwargs
162
  ) -> List[int]:
163
  """
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
203
 
204
  t: List[int] = []
205
  for substr in all_substrs:
206
- t.extend(
207
- # we should consider special token as a common token
208
- self.model.encode(
209
- substr,
210
- disallowed_special=(),
 
 
 
 
 
 
 
 
 
 
211
  )
212
- )
213
 
214
  return t
215
 
 
158
  def encode(
159
  self,
160
  text: str,
161
+ allow_special_tokens: bool = True,
162
  **kwargs
163
  ) -> List[int]:
164
  """
 
204
 
205
  t: List[int] = []
206
  for substr in all_substrs:
207
+ if allow_special_tokens:
208
+ t.extend(
209
+ # special tokens found in the text are encoded as special tokens here
210
+ self.model.encode(
211
+ substr,
212
+ allowed_special="all",
213
+ )
214
+ )
215
+ else:
216
+ t.extend(
217
+ # we should consider special token as a common token
218
+ self.model.encode(
219
+ substr,
220
+ disallowed_special=(),
221
+ )
222
  )
 
223
 
224
  return t
225
 
tokenizer_config.json CHANGED
@@ -110,7 +110,7 @@
110
  "clean_up_tokenization_spaces": false,
111
  "eos_token": "<|im_end|>",
112
  "extra_special_tokens": {},
113
- "chat_template": "{% if tools -%}\n {{ '<|im_system|>tool_declare<|im_middle|>' -}}\n {{- tools | tojson -}}\n {{ '<|im_end|>' -}}\n{%- endif -%}\n\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n {{ '<|im_system|>system<|im_middle|>' }}\n {%- elif message['role'] == 'user' -%}\n {{ '<|im_user|>user<|im_middle|>' }}\n {%- elif message['role'] == 'assistant' -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n {%- elif message['role'] == 'tool' -%}\n {{ '<|im_system|>tool<|im_middle|>' }}\n {%- endif -%}\n\n {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}\n {%- if message['content'] -%}\n {{ message['content'] }}\n {%- endif -%}\n {{ '<|tool_calls_section_begin|>' }}\n {%- for tool_call in message['tool_calls'] -%}\n {%- set func_name = tool_call['function']['name'] -%}\n {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}\n {{ '<|tool_call_begin|>' }}{{ formatted_id }}{{ '<|tool_call_argument_begin|>' }}{{ tool_call['function']['arguments'] }}{{ '<|tool_call_end|>' }}\n {%- endfor -%}\n {{ '<|tool_calls_section_end|>' }}\n {%- elif message['role'] == 'tool' -%}\n {{ '## Return of ' + message['tool_call_id'] + '\\n' + message['content'] }}\n {%- elif message['content'] is string -%}\n {{- message['content'] -}}\n {%- elif message['content'] is not none -%}\n {%- for content in message['content'] -%}\n {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}\n {%- else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {{ '<|im_end|>' }}\n{%- endfor -%}\n\n{%- if add_generation_prompt -%}\n {{ '<|im_assistant|>assistant<|im_middle|>' }}\n{%- endif -%}",
114
  "model_max_length": 131072,
115
  "pad_token": "<|im_end|>",
116
  "tokenizer_class": "TikTokenTokenizer",
 
110
  "clean_up_tokenization_spaces": false,
111
  "eos_token": "<|im_end|>",
112
  "extra_special_tokens": {},
113
+ "chat_template": "{%- if tools -%}\n <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>\n{%- endif -%}\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n <|im_system|>system<|im_middle|>\n {%- elif message['role'] == 'user' -%}\n <|im_user|>user<|im_middle|>\n {%- elif message['role'] == 'assistant' -%}\n <|im_assistant|>assistant<|im_middle|>\n {%- elif message['role'] == 'tool' -%}\n <|im_system|>tool<|im_middle|>\n {%- endif -%}\n {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}\n {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}\n <|tool_calls_section_begin|>\n {%- for tool_call in message['tool_calls'] -%}\n {%- set func_name = tool_call['function']['name'] -%}\n {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}\n <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>\n {%- endfor -%}\n <|tool_calls_section_end|>\n {%- elif message['role'] == 'tool' -%}\n ## Return of {{ message.tool_call_id }}\\n{{ message['content'] }}\n {%- elif message['content'] is string -%}\n {{ message['content'] }}\n {%- elif message['content'] is not none -%}\n {% for content in message['content'] -%}\n {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n <|media_start|>image<|media_content|><|media_pad|><|media_end|>\n {% else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n <|im_end|>\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n <|im_assistant|>assistant<|im_middle|>\n{%- endif -%}",
114
  "model_max_length": 131072,
115
  "pad_token": "<|im_end|>",
116
  "tokenizer_class": "TikTokenTokenizer",