Upload processor

Browse files

Files changed (6) hide show

added_tokens.json +24 -0
chat_template.jinja +39 -48
merges.txt +0 -0
special_tokens_map.json +13 -21
tokenizer_config.json +12 -2
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja CHANGED Viewed

@@ -1,54 +1,45 @@
-{%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- messages[0]['content'] }}
-    {%- else %}
-        {{- 'You are a helpful assistant.' }}
     {%- endif %}
-    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0]['role'] == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
-    {%- else %}
-        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
     {%- endif %}
-{%- endif %}
-{%- for message in messages %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
-        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
-    {%- elif message.role == "assistant" %}
-        {{- '<|im_start|>' + message.role }}
-        {%- if message.content %}
-            {{- '\n' + message.content }}
-        {%- endif %}
-        {%- for tool_call in message.tool_calls %}
-            {%- if tool_call.function is defined %}
-                {%- set tool_call = tool_call.function %}
-            {%- endif %}
-            {{- '\n<tool_call>\n{"name": "' }}
-            {{- tool_call.name }}
-            {{- '", "arguments": ' }}
-            {{- tool_call.arguments | tojson }}
-            {{- '}\n</tool_call>' }}
-        {%- endfor %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
         {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {{- message.content }}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
         {%- endif %}
     {%- endif %}
 {%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-{%- endif %}

+{%- for message in messages %}
+    {#-- Validate role is a stringified integer --#}
+    {%- if not message['role'] is string or not message['role'].isdigit() %}
+        {{- raise_exception("The role must be an integer or a stringified integer (e.g. '0') designating the speaker id") }}
     {%- endif %}
+    {#-- Validate content is a list --#}
+    {%- set content = message['content'] %}
+    {%- if content is not iterable or content is string %}
+        {{- raise_exception("The content must be a list") }}
     {%- endif %}
+    {#-- Collect content types --#}
+    {%- set content_types = content | map(attribute='type') | list %}
+    {%- set is_last = loop.last %}
+    {#-- Last message validation --#}
+    {%- if is_last %}
+        {%- if 'text' not in content_types %}
+            {{- raise_exception("The last message must include one item of type 'text'") }}
+        {%- elif (content_types | select('equalto', 'text') | list | length > 1) or (content_types | select('equalto', 'audio') | list | length > 1) %}
+            {{- raise_exception("At most two items are allowed in the last message: one 'text' and one 'audio'") }}
         {%- endif %}
+    {#-- All other messages validation --#}
+    {%- else %}
+        {%- if content_types | select('equalto', 'text') | list | length != 1
+              or content_types | select('equalto', 'audio') | list | length != 1 %}
+            {{- raise_exception("Each message (except the last) must contain exactly one 'text' and one 'audio' item") }}
+        {%- elif content_types | reject('in', ['text', 'audio']) | list | length > 0 %}
+            {{- raise_exception("Only 'text' and 'audio' types are allowed in content") }}
         {%- endif %}
     {%- endif %}
 {%- endfor %}
+{%- for message in messages %}
+    {{- bos_token }}
+    {{- '[' + message['role'] + ']' }}
+    {{- message['content'][0]['text'] }}
+    {{- eos_token }}
+    {%- if message['content']|length > 1 %}
+        {{- '<|vision_start|><|vision_end|>' }}
+    {%- endif %}
+{%- endfor %}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json CHANGED Viewed

@@ -1,26 +1,18 @@
 {
   "additional_special_tokens": [
-    {
-      "content": "<|vision_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<|vision_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "<|vision_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    }
   ],
   "eos_token": {
     "content": "<|endoftext|>",

 {
   "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
   ],
   "eos_token": {
     "content": "<|endoftext|>",

tokenizer_config.json CHANGED Viewed

@@ -180,9 +180,19 @@
     }
   },
   "additional_special_tokens": [
     "<|vision_start|>",
     "<|vision_end|>",
-    "<|vision_pad|>"
   ],
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
@@ -193,6 +203,6 @@
   "pad_token": "<|endoftext|>",
   "processor_class": "VibeVoiceProcessor",
   "split_special_tokens": false,
-  "tokenizer_class": "VibeVoiceTokenizerFast",
   "unk_token": null
 }

     }
   },
   "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
     "<|vision_start|>",
     "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
   ],
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "pad_token": "<|endoftext|>",
   "processor_class": "VibeVoiceProcessor",
   "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null
 }

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff