HoangHa committed on
Commit
087bc62
·
verified ·
1 Parent(s): 74d21bd

Upload tokenizer

Browse files
Files changed (3) hide show
  1. chat_template.jinja +50 -0
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +20 -0
chat_template.jinja ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set ns = namespace(system_prompt="") -%}
3
+
4
+ {# --- 1. Extract system prompt if provided; the `messages and` guard keeps an empty conversation from raising on messages[0] --- #}
5
+ {%- if messages and messages[0]["role"] == "system" -%}
6
+ {%- set ns.system_prompt = messages[0]["content"] -%}
7
+ {%- set messages = messages[1:] -%}
8
+ {%- else -%}
9
+ {# --- 2. Default system prompt if none provided --- #}
10
+ {%- set ns.system_prompt = "Extract <address>, <company_name>, <email_address>, <human_name>, <phone_number>" -%}
11
+ {%- endif -%}
12
+
13
+ {# --- 3. Add tool list if any --- #}
14
+ {%- if tools -%}
15
+ {%- set ns.system_prompt = ns.system_prompt + ("
16
+ " if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
17
+ {%- for tool in tools -%}
18
+ {%- if tool is not string -%}
19
+ {%- set tool = tool | tojson -%}
20
+ {%- endif -%}
21
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
22
+ {%- if not loop.last -%}
23
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
24
+ {%- endif -%}
25
+ {%- endfor -%}
26
+ {%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
27
+ {%- endif -%}
28
+
29
+ {# --- 4. Render system prompt --- #}
30
+ {%- if ns.system_prompt -%}
31
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
32
+ {%- endif -%}
33
+
34
+ {# --- 5. Render all conversation messages --- #}
35
+ {%- for message in messages -%}
36
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
37
+ {%- set content = message["content"] -%}
38
+ {%- if content is not string -%}
39
+ {%- set content = content | tojson -%}
40
+ {%- endif -%}
41
+ {%- if message["role"] == "tool" -%}
42
+ {%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
43
+ {%- endif -%}
44
+ {{- content + "<|im_end|>\n" -}}
45
+ {%- endfor -%}
46
+
47
+ {# --- 6. Append generation prompt for assistant --- #}
48
+ {%- if add_generation_prompt -%}
49
+ {{- "<|im_start|>assistant\n" -}}
50
+ {%- endif -%}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "padding_side": "left",
15
+ "sp_model_kwargs": {},
16
+ "spaces_between_special_tokens": false,
17
+ "tokenizer_class": "TokenizersBackend",
18
+ "use_default_system_prompt": false,
19
+ "use_fast": true
20
+ }