unsloth
/

Kimi-K2-Instruct

@@ -1,37 +1,43 @@
-{% if tools -%}
-    {{ '<|im_system|>tool_declare<|im_middle|>' -}}
-    {{- tools | tojson -}}
-    {{ '<|im_end|>' -}}
 {%- endif -%}
 {%- for message in messages -%}
   {%- if loop.first and messages[0]['role'] != 'system' -%}
-    {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}
   {%- endif -%}
   {%- if message['role'] == 'system' -%}
-    {{ '<|im_system|>system<|im_middle|>' }}
   {%- elif message['role'] == 'user' -%}
-    {{ '<|im_user|>user<|im_middle|>' }}
   {%- elif message['role'] == 'assistant' -%}
-    {{ '<|im_assistant|>assistant<|im_middle|>' }}
   {%- elif message['role'] == 'tool' -%}
-    {{ '<|im_system|>tool<|im_middle|>' }}
   {%- endif -%}
-  {%- if message['content'] is string -%}
-    {{- message['content'] + '<|im_end|>' -}}
-  {%- else -%}
-    {%- for content in message['content'] -%}
-      {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
-        {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}
-      {%- else -%}
         {{ content['text'] }}
       {%- endif -%}
     {%- endfor -%}
-    {{ '<|im_end|>' }}
   {%- endif -%}
 {%- endfor -%}
 {%- if add_generation_prompt -%}
-  {{ '<|im_assistant|>assistant<|im_middle|>' }}
 {%- endif -%}

+{#- Kimi-K2 chat template: optional tool declaration, then one role-tagged turn per message, each closed by <|im_end|>. -#}
+{%- if tools -%}
+  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>
 {%- endif -%}
 {%- for message in messages -%}
+  {#- Inject the default system prompt when the conversation does not start with one. -#}
   {%- if loop.first and messages[0]['role'] != 'system' -%}
+    <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
   {%- endif -%}
   {%- if message['role'] == 'system' -%}
+    <|im_system|>system<|im_middle|>
   {%- elif message['role'] == 'user' -%}
+    <|im_user|>user<|im_middle|>
   {%- elif message['role'] == 'assistant' -%}
+    <|im_assistant|>assistant<|im_middle|>
   {%- elif message['role'] == 'tool' -%}
+    <|im_system|>tool<|im_middle|>
   {%- endif -%}
+  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
+    {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}
+    <|tool_calls_section_begin|>
+    {%- for tool_call in message['tool_calls'] -%}
+      {%- set func_name = tool_call['function']['name'] -%}
+      {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}
+      {#- Arguments may arrive as a dict or as an already-serialized JSON string; only serialize the former so strings are not double-encoded. -#}
+      {%- set raw_args = tool_call['function']['arguments'] -%}
+      {%- set rendered_args = raw_args if raw_args is string else (raw_args | tojson) -%}
+      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ rendered_args }}<|tool_call_end|>
+    {%- endfor -%}
+    <|tool_calls_section_end|>
+  {%- elif message['role'] == 'tool' -%}
+    {#- A backslash-n in raw template text renders literally; emit the line break through a string expression instead. -#}
+    ## Return of {{ message.tool_call_id }}{{ '\n' }}{{ message['content'] }}
+  {%- elif message['content'] is string -%}
+    {{ message['content'] }}
+  {%- elif message['content'] is not none -%}
+    {#- Multi-part content: image parts become a fixed placeholder, every other part contributes its text. -#}
+    {% for content in message['content'] -%}
+      {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+        <|media_start|>image<|media_content|><|media_pad|><|media_end|>
+      {%- else -%}
         {{ content['text'] }}
       {%- endif -%}
     {%- endfor -%}
   {%- endif -%}
+  <|im_end|>
 {%- endfor -%}
 {%- if add_generation_prompt -%}
+  <|im_assistant|>assistant<|im_middle|>
 {%- endif -%}

tokenization_kimi.py CHANGED Viewed

@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
     def encode(
         self,
         text: str,
         **kwargs
     ) -> List[int]:
         """
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
         t: List[int] = []
         for substr in all_substrs:
-            t.extend(
-                # we should consider special token as a common token
-                self.model.encode(
-                    substr,
-                    disallowed_special=(),
                 )
-            )
         return t

     def encode(
         self,
         text: str,
+        allow_special_tokens: bool = True,
         **kwargs
     ) -> List[int]:
         """
         t: List[int] = []
         for substr in all_substrs:
+            if allow_special_tokens:
+                t.extend(
+                    # special-token strings in the text are encoded as their dedicated special token ids
+                    self.model.encode(
+                        substr,
+                        allowed_special="all",
+                    )
+                )
+            else:
+                t.extend(
+                    # we should consider special token as a common token
+                    self.model.encode(
+                        substr,
+                        disallowed_special=(),
+                    )
                 )
         return t

tokenizer_config.json CHANGED Viewed

@@ -121,6 +121,5 @@
   "pad_token": "[PAD]",
   "padding_side": "left",
   "tokenizer_class": "TikTokenTokenizer",
-  "unk_token": "[UNK]",
-  "chat_template": "{% if tools -%}\n    {{ '<|im_system|>tool_declare<|im_middle|>' -}}\n    {{- tools | tojson -}}\n    {{ '<|im_end|>' -}}\n{%- endif -%}\n\n{%- for message in messages -%}\n  {%- if loop.first and messages[0]['role'] != 'system' -%}\n    {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}\n  {%- endif -%}\n  {%- if message['role'] == 'system' -%}\n    {{ '<|im_system|>system<|im_middle|>' }}\n  {%- elif message['role'] == 'user' -%}\n    {{ '<|im_user|>user<|im_middle|>' }}\n  {%- elif message['role'] == 'assistant' -%}\n    {{ '<|im_assistant|>assistant<|im_middle|>' }}\n  {%- elif message['role'] == 'tool' -%}\n    {{ '<|im_system|>tool<|im_middle|>' }}\n  {%- endif -%}\n\n  {%- if message['content'] is string -%}\n    {{- message['content'] + '<|im_end|>' -}}\n  {%- else -%}\n    {%- for content in message['content'] -%}\n      {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n        {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}\n      {%- else -%}\n        {{ content['text'] }}\n      {%- endif -%}\n    {%- endfor -%}\n    {{ '<|im_end|>' }}\n  {%- endif -%}\n{%- endfor -%}\n\n{%- if add_generation_prompt -%}\n  {{ '<|im_assistant|>assistant<|im_middle|>' }}\n{%- endif -%}"
-}

   "pad_token": "[PAD]",
   "padding_side": "left",
   "tokenizer_class": "TikTokenTokenizer",
+  "unk_token": "[UNK]"
+}