Commit: upd

Changed files:
- tokenization_kimi.py (+16 −6)
- tokenizer_config.json (+1 −1)
tokenization_kimi.py
CHANGED
|
@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
| 158 |
def encode(
|
| 159 |
self,
|
| 160 |
text: str,
|
|
|
|
| 161 |
**kwargs
|
| 162 |
) -> List[int]:
|
| 163 |
"""
|
|
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
|
|
| 203 |
|
| 204 |
t: List[int] = []
|
| 205 |
for substr in all_substrs:
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
)
|
| 212 |
-
)
|
| 213 |
|
| 214 |
return t
|
| 215 |
|
|
|
|
| 158 |
def encode(
|
| 159 |
self,
|
| 160 |
text: str,
|
| 161 |
+
allow_special_tokens: bool = True,
|
| 162 |
**kwargs
|
| 163 |
) -> List[int]:
|
| 164 |
"""
|
|
|
|
| 204 |
|
| 205 |
t: List[int] = []
|
| 206 |
for substr in all_substrs:
|
| 207 |
+
if allow_special_tokens:
|
| 208 |
+
t.extend(
|
| 209 |
+
# we should consider special token as a common token
|
| 210 |
+
self.model.encode(
|
| 211 |
+
substr,
|
| 212 |
+
allowed_special="all",
|
| 213 |
+
)
|
| 214 |
+
)
|
| 215 |
+
else:
|
| 216 |
+
t.extend(
|
| 217 |
+
# we should consider special token as a common token
|
| 218 |
+
self.model.encode(
|
| 219 |
+
substr,
|
| 220 |
+
disallowed_special=(),
|
| 221 |
+
)
|
| 222 |
)
|
|
|
|
| 223 |
|
| 224 |
return t
|
| 225 |
|
tokenizer_config.json
CHANGED
|
@@ -110,7 +110,7 @@
|
|
| 110 |
"clean_up_tokenization_spaces": false,
|
| 111 |
"eos_token": "<|im_end|>",
|
| 112 |
"extra_special_tokens": {},
|
| 113 |
-
"chat_template": "{% if tools -%}\n … [old template value truncated in this diff rendering]
|
| 114 |
"model_max_length": 131072,
|
| 115 |
"pad_token": "<|im_end|>",
|
| 116 |
"tokenizer_class": "TikTokenTokenizer",
|
|
|
|
| 110 |
"clean_up_tokenization_spaces": false,
|
| 111 |
"eos_token": "<|im_end|>",
|
| 112 |
"extra_special_tokens": {},
|
| 113 |
+
"chat_template": "{%- if tools -%}\n <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>\n{%- endif -%}\n{%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>\n {%- endif -%}\n {%- if message['role'] == 'system' -%}\n <|im_system|>system<|im_middle|>\n {%- elif message['role'] == 'user' -%}\n <|im_user|>user<|im_middle|>\n {%- elif message['role'] == 'assistant' -%}\n <|im_assistant|>assistant<|im_middle|>\n {%- elif message['role'] == 'tool' -%}\n <|im_system|>tool<|im_middle|>\n {%- endif -%}\n {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}\n {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}\n <|tool_calls_section_begin|>\n {%- for tool_call in message['tool_calls'] -%}\n {%- set func_name = tool_call['function']['name'] -%}\n {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}\n <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>\n {%- endfor -%}\n <|tool_calls_section_end|>\n {%- elif message['role'] == 'tool' -%}\n ## Return of {{ message.tool_call_id }}\\n{{ message['content'] }}\n {%- elif message['content'] is string -%}\n {{ message['content'] }}\n {%- elif message['content'] is not none -%}\n {% for content in message['content'] -%}\n {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n <|media_start|>image<|media_content|><|media_pad|><|media_end|>\n {% else -%}\n {{ content['text'] }}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n <|im_end|>\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n <|im_assistant|>assistant<|im_middle|>\n{%- endif -%}",
|
| 114 |
"model_max_length": 131072,
|
| 115 |
"pad_token": "<|im_end|>",
|
| 116 |
"tokenizer_class": "TikTokenTokenizer",
|