bezzam HF Staff committed on
Commit
00a2946
·
verified ·
1 Parent(s): f970dc6

Upload processor

Browse files
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja CHANGED
@@ -1,54 +1,45 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'You are a helpful assistant.' }}
7
  {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
19
  {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
- {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
32
- {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
- {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
  {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
 
 
 
49
  {%- endif %}
50
  {%- endif %}
51
  {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
 
 
 
 
 
 
 
1
+
2
+ {%- for message in messages %}
3
+ {#-- Validate role is a stringified integer --#}
4
+ {%- if not message['role'] is string or not message['role'].isdigit() %}
5
+ {{- raise_exception("The role must be an integer or a stringified integer (e.g. '0') designating the speaker id") }}
 
6
  {%- endif %}
7
+
8
+ {#-- Validate content is a list --#}
9
+ {%- set content = message['content'] %}
10
+ {%- if content is not iterable or content is string %}
11
+ {{- raise_exception("The content must be a list") }}
 
 
 
 
 
 
12
  {%- endif %}
13
+
14
+ {#-- Collect content types --#}
15
+ {%- set content_types = content | map(attribute='type') | list %}
16
+ {%- set is_last = loop.last %}
17
+
18
+ {#-- Last message validation --#}
19
+ {%- if is_last %}
20
+ {%- if 'text' not in content_types %}
21
+ {{- raise_exception("The last message must include one item of type 'text'") }}
22
+ {%- elif (content_types | select('equalto', 'text') | list | length > 1) or (content_types | select('equalto', 'audio') | list | length > 1) %}
23
+ {{- raise_exception("At most two items are allowed in the last message: one 'text' and one 'audio'") }}
 
 
 
 
 
 
 
 
 
 
 
 
24
  {%- endif %}
25
+
26
+ {#-- All other messages validation --#}
27
+ {%- else %}
28
+ {%- if content_types | select('equalto', 'text') | list | length != 1
29
+ or content_types | select('equalto', 'audio') | list | length != 1 %}
30
+ {{- raise_exception("Each message (except the last) must contain exactly one 'text' and one 'audio' item") }}
31
+ {%- elif content_types | reject('in', ['text', 'audio']) | list | length > 0 %}
32
+ {{- raise_exception("Only 'text' and 'audio' types are allowed in content") }}
33
  {%- endif %}
34
  {%- endif %}
35
  {%- endfor %}
36
+
37
+ {%- for message in messages %}
38
+ {{- bos_token }}
39
+ {{- '[' + message['role'] + ']' }}
40
+ {{- message['content'][0]['text'] }}
41
+ {{- eos_token }}
42
+ {%- if message['content']|length > 1 %}
43
+ {{- '<|vision_start|><|vision_end|>' }}
44
+ {%- endif %}
45
+ {%- endfor %}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,26 +1,18 @@
1
  {
2
  "additional_special_tokens": [
3
- {
4
- "content": "<|vision_start|>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "<|vision_end|>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- {
18
- "content": "<|vision_pad|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
  ],
25
  "eos_token": {
26
  "content": "<|endoftext|>",
 
1
  {
2
  "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
 
 
 
 
 
 
 
 
16
  ],
17
  "eos_token": {
18
  "content": "<|endoftext|>",
tokenizer_config.json CHANGED
@@ -180,9 +180,19 @@
180
  }
181
  },
182
  "additional_special_tokens": [
 
 
 
 
 
 
 
 
183
  "<|vision_start|>",
184
  "<|vision_end|>",
185
- "<|vision_pad|>"
 
 
186
  ],
187
  "bos_token": null,
188
  "clean_up_tokenization_spaces": false,
@@ -193,6 +203,6 @@
193
  "pad_token": "<|endoftext|>",
194
  "processor_class": "VibeVoiceProcessor",
195
  "split_special_tokens": false,
196
- "tokenizer_class": "VibeVoiceTokenizerFast",
197
  "unk_token": null
198
  }
 
180
  }
181
  },
182
  "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
  "<|vision_start|>",
192
  "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
  ],
197
  "bos_token": null,
198
  "clean_up_tokenization_spaces": false,
 
203
  "pad_token": "<|endoftext|>",
204
  "processor_class": "VibeVoiceProcessor",
205
  "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
  "unk_token": null
208
  }
vocab.json ADDED
The diff for this file is too large to render. See raw diff