Add files using upload-large-folder tool

Browse files

Files changed (2) hide show

tokenization_kimi.py +16 -6
tokenizer_config.json +48 -8

tokenization_kimi.py CHANGED Viewed

@@ -158,6 +158,7 @@ class TikTokenTokenizer(PreTrainedTokenizer):
     def encode(
         self,
         text: str,
         **kwargs
     ) -> List[int]:
         """
@@ -203,13 +204,22 @@ class TikTokenTokenizer(PreTrainedTokenizer):
         t: List[int] = []
         for substr in all_substrs:
-            t.extend(
-                # we should consider special token as a common token
-                self.model.encode(
-                    substr,
-                    disallowed_special=(),
                 )
-            )
         return t

     def encode(
         self,
         text: str,
+        allow_special_tokens: bool = True,
         **kwargs
     ) -> List[int]:
         """
         t: List[int] = []
         for substr in all_substrs:
+            if allow_special_tokens:
+                t.extend(
+                    # allowed_special="all": special-token text is encoded as its dedicated special token id
+                    self.model.encode(
+                        substr,
+                        allowed_special="all",
+                    )
+                )
+            else:
+                t.extend(
+                    # disallowed_special=(): treat special-token text as ordinary text instead of raising
+                    self.model.encode(
+                        substr,
+                        disallowed_special=(),
+                    )
                 )
         return t

tokenizer_config.json CHANGED Viewed

@@ -72,6 +72,46 @@
       "single_word": false,
       "special": true
     },
     "163601": {
       "content": "<|im_middle|>",
       "lstrip": false,
@@ -107,19 +147,19 @@
     "<|im_system|>",
     "<|im_middle|>"
   ],
-  "auto_map": {
-    "AutoTokenizer": [
-      "tokenization_kimi.TikTokenTokenizer",
-      null
-    ]
-  },
   "bos_token": "[BOS]",
-  "chat_template": "{% if tools -%}\n    {{ '<|im_system|>tool_declare<|im_middle|>' -}}\n    {{- tools | tojson -}}\n    {{ '<|im_end|>' -}}\n{%- endif -%}\n\n{%- for message in messages -%}\n  {%- if loop.first and messages[0]['role'] != 'system' -%}\n    {{ '<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>' }}\n  {%- endif -%}\n  {%- if message['role'] == 'system' -%}\n    {{ '<|im_system|>system<|im_middle|>' }}\n  {%- elif message['role'] == 'user' -%}\n    {{ '<|im_user|>user<|im_middle|>' }}\n  {%- elif message['role'] == 'assistant' -%}\n    {{ '<|im_assistant|>assistant<|im_middle|>' }}\n  {%- elif message['role'] == 'tool' -%}\n    {{ '<|im_system|>tool<|im_middle|>' }}\n  {%- endif -%}\n\n  {%- if message['content'] is string -%}\n    {{- message['content'] + '<|im_end|>' -}}\n  {%- else -%}\n    {%- for content in message['content'] -%}\n      {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n        {{ '<|media_start|>image<|media_content|><|media_pad|><|media_end|>' }}\n      {%- else -%}\n        {{ content['text'] }}\n      {%- endif -%}\n    {%- endfor -%}\n    {{ '<|im_end|>' }}\n  {%- endif -%}\n{%- endfor -%}\n\n{%- if add_generation_prompt -%}\n  {{ '<|im_assistant|>assistant<|im_middle|>' }}\n{%- endif -%}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "[EOS]",
   "extra_special_tokens": {},
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "tokenizer_class": "TikTokenTokenizer",
-  "unk_token": "[UNK]"
 }

       "single_word": false,
       "special": true
     },
+    "163595": {
+      "content": "<|tool_calls_section_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163596": {
+      "content": "<|tool_calls_section_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163597": {
+      "content": "<|tool_call_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163598": {
+      "content": "<|tool_call_argument_begin|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "163599": {
+      "content": "<|tool_call_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
     "163601": {
       "content": "<|im_middle|>",
       "lstrip": false,
     "<|im_system|>",
     "<|im_middle|>"
   ],
   "bos_token": "[BOS]",
   "clean_up_tokenization_spaces": false,
   "eos_token": "[EOS]",
   "extra_special_tokens": {},
+  "chat_template": "{%- if tools -%}\n  <|im_system|>tool_declare<|im_middle|>{{ tools | tojson }}<|im_end|>\n{%- endif -%}\n{%- for message in messages -%}\n  {%- if loop.first and messages[0]['role'] != 'system' -%}\n    <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>\n  {%- endif -%}\n  {%- if message['role'] == 'system' -%}\n    <|im_system|>system<|im_middle|>\n  {%- elif message['role'] == 'user' -%}\n    <|im_user|>user<|im_middle|>\n  {%- elif message['role'] == 'assistant' -%}\n    <|im_assistant|>assistant<|im_middle|>\n  {%- elif message['role'] == 'tool' -%}\n    <|im_system|>tool<|im_middle|>\n  {%- endif -%}\n  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}\n    {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}\n    <|tool_calls_section_begin|>\n    {%- for tool_call in message['tool_calls'] -%}\n      {%- set func_name = tool_call['function']['name'] -%}\n      {%- set formatted_id = 'functions.' + func_name + ':' + loop.index0|string -%}\n      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{{ tool_call['function']['arguments'] | tojson}}<|tool_call_end|>\n    {%- endfor -%}\n    <|tool_calls_section_end|>\n  {%- elif message['role'] == 'tool' -%}\n    ## Return of {{ message.tool_call_id }}\\n{{ message['content'] }}\n  {%- elif message['content'] is string -%}\n    {{ message['content'] }}\n  {%- elif message['content'] is not none -%}\n    {% for content in message['content'] -%}\n      {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}\n        <|media_start|>image<|media_content|><|media_pad|><|media_end|>\n      {% else -%}\n        {{ content['text'] }}\n      {%- endif -%}\n    {%- endfor -%}\n  {%- endif -%}\n  <|im_end|>\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n  <|im_assistant|>assistant<|im_middle|>\n{%- endif -%}",
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "tokenizer_class": "TikTokenTokenizer",
+  "unk_token": "[UNK]",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_kimi.TikTokenTokenizer",
+      null
+    ]
+  }
 }