Upload tokenizer
Browse files
- chat_template.jinja +27 -21
- tokenization_kimi.py +16 -6
- tokenizer_config.json +2 -3
chat_template.jinja
CHANGED
|
@@ -1,37 +1,43 @@
|
|
| 1 |
-
{% if tools -%}
|
| 2 |
-
|
| 3 |
-
{{- tools | tojson -}}
|
| 4 |
-
{{ '<|im_end|>' -}}
|
| 5 |
{%- endif -%}
|
| 6 |
-
|
| 7 |
{%- for message in messages -%}
|
| 8 |
{%- if loop.first and messages[0]['role'] != 'system' -%}
|
| 9 |
-
|
| 10 |
{%- endif -%}
|
| 11 |
{%- if message['role'] == 'system' -%}
|
| 12 |
-
|
| 13 |
{%- elif message['role'] == 'user' -%}
|
| 14 |
-
|
| 15 |
{%- elif message['role'] == 'assistant' -%}
|
| 16 |
-
|
| 17 |
{%- elif message['role'] == 'tool' -%}
|
| 18 |
-
|
| 19 |
{%- endif -%}
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
{%-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
{{ content['text'] }}
|
| 29 |
{%- endif -%}
|
| 30 |
{%- endfor -%}
|
| 31 |
-
{{ '<|im_end|>' }}
|
| 32 |
{%- endif -%}
|
|
|
|
| 33 |
{%- endfor -%}
|
| 34 |
-
|
| 35 |
{%- if add_generation_prompt -%}
|
| 36 |
-
|
| 37 |
{%- endif -%}
|
|
|
|
| 1 |
+
{%- if tools -%}
|
| 2 |
+
<|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
|
|
|
|
|
|
|
| 3 |
{%- endif -%}
|
|
|
|
| 4 |
{%- for message in messages -%}
|
| 5 |
{%- if loop.first and messages[0]['role'] != 'system' -%}
|
| 6 |
+
<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
|
| 7 |
{%- endif -%}
|
| 8 |
{%- if message['role'] == 'system' -%}
|
| 9 |
+
<|im_system|>system<|im_middle|>
|
| 10 |
{%- elif message['role'] == 'user' -%}
|
| 11 |
+
<|im_user|>user<|im_middle|>
|
| 12 |
{%- elif message['role'] == 'assistant' -%}
|
| 13 |
+
<|im_assistant|>assistant<|im_middle|>
|
| 14 |
{%- elif message['role'] == 'tool' -%}
|
| 15 |
+
<|im_system|>tool<|im_middle|>
|
| 16 |
{%- endif -%}
|
| 17 |
+
{%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
|
| 18 |
+
{%- if message['content'] -%}{{ message['content'] }}{%- endif -%}
|
| 19 |
+
<|tool_calls_section_begin|>
|
| 20 |
+
{%- for tool_call in message['tool_calls'] -%}
|
| 21 |
+
{%- set func_name = tool_call['function']['name'] -%}
|
| 22 |
+
{%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}
|
| 23 |
+
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>
|
| 24 |
+
{%- endfor -%}
|
| 25 |
+
<|tool_calls_section_end|>
|
| 26 |
+
{%- elif message['role'] == 'tool' -%}
|
| 27 |
+
## Return of {{ message.tool_call_id }}\n{{ message['content'] }}
|
| 28 |
+
{%- elif message['content'] is string -%}
|
| 29 |
+
{{ message['content'] }}
|
| 30 |
+
{%- elif message['content'] is not none -%}
|
| 31 |
+
{% for content in message['content'] -%}
|
| 32 |
+
{% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
|
| 33 |
+
<|media_start|>image<|media_content|><|media_pad|><|media_end|>
|
| 34 |
+
{% else -%}
|
| 35 |
{{ content['text'] }}
|
| 36 |
{%- endif -%}
|
| 37 |
{%- endfor -%}
|
|
|
|
| 38 |
{%- endif -%}
|
| 39 |
+
<|im_end|>
|
| 40 |
{%- endfor -%}
|
|
|
|
| 41 |
{%- if add_generation_prompt -%}
|
| 42 |
+
<|im_assistant|>assistant<|im_middle|>
|
| 43 |
{%- endif -%}
|
tokenization_kimi.py
CHANGED
|
@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
| 158 |
def encode(
|
| 159 |
self,
|
| 160 |
text: str,
|
|
|
|
| 161 |
**kwargs
|
| 162 |
) -> List[int]:
|
| 163 |
"""
|
|
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
| 203 |
|
| 204 |
t: List[int] = []
|
| 205 |
for substr in all_substrs:
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
)
|
| 212 |
-
)
|
| 213 |
|
| 214 |
return t
|
| 215 |
|
|
|
|
| 158 |
def encode(
|
| 159 |
self,
|
| 160 |
text: str,
|
| 161 |
+
allow_special_tokens: bool = True,
|
| 162 |
**kwargs
|
| 163 |
) -> List[int]:
|
| 164 |
"""
|
|
|
|
| 204 |
|
| 205 |
t: List[int] = []
|
| 206 |
for substr in all_substrs:
|
| 207 |
+
if allow_special_tokens:
|
| 208 |
+
t.extend(
|
| 209 |
+
# special-token strings in the text are recognized and encoded as their special token ids
|
| 210 |
+
self.model.encode(
|
| 211 |
+
substr,
|
| 212 |
+
allowed_special="all",
|
| 213 |
+
)
|
| 214 |
+
)
|
| 215 |
+
else:
|
| 216 |
+
t.extend(
|
| 217 |
+
# treat special-token strings in the text as ordinary text rather than special tokens
|
| 218 |
+
self.model.encode(
|
| 219 |
+
substr,
|
| 220 |
+
disallowed_special=(),
|
| 221 |
+
)
|
| 222 |
)
|
|
|
|
| 223 |
|
| 224 |
return t
|
| 225 |
|
tokenizer_config.json
CHANGED
|
@@ -121,6 +121,5 @@
|
|
| 121 |
"pad_token": "[PAD]",
|
| 122 |
"padding_side": "left",
|
| 123 |
"tokenizer_class": "TikTokenTokenizer",
|
| 124 |
-
"unk_token": "[UNK]"
|
| 125 |
-
|
| 126 |
-
}
|
|
|
|
| 121 |
"pad_token": "[PAD]",
|
| 122 |
"padding_side": "left",
|
| 123 |
"tokenizer_class": "TikTokenTokenizer",
|
| 124 |
+
"unk_token": "[UNK]"
|
| 125 |
+
}
|
|
|