pradeepd commited on
Commit
b369210
·
1 Parent(s): 2e88cbb

v5 - no additions to provided system prompt

Browse files
README.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Dolma 2 tokenizer, Instruct v5, Non-reasoner version
7
+
8
+ Slightly modified version of `cl100k_base` that supports Dolma 1.x and Dolma 2.x special tokens.
9
+
10
+ ## Special tokens
11
+
12
+ This tokenizer supports the following special tokens:
13
+
14
+ - `<|extra_id_0|>`: Not used.
15
+ - `<|endoftext|>`: Used to mark both beginning and end of text.
16
+ - `<|fim_prefix|>`: Used to mark the prefix fill-in-the-middle request.
17
+ - `<|fim_middle|>`: Used to mark the middle fill-in-the-middle request.
18
+ - `<|fim_suffix|>`: Used to mark the suffix fill-in-the-middle request.
19
+ - `|||PHONE_NUMBER|||`: Not used. Kept for compatibility with Dolma 1.x.
20
+ - `|||EMAIL_ADDRESS|||`: Not used. Kept for compatibility with Dolma 1.x.
21
+ - `|||IP_ADDRESS|||`: Not used. Kept for compatibility with Dolma 1.x.
22
+ - `<|im_start|>`: Indicates the beginning of a message (turn in a conversation).
23
+ - `<|im_end|>`: Indicates the end of a message (turn in a conversation).
24
+ - `<|extra_id_1|>`: Not used.
25
+ - `<|extra_id_2|>`: Not used.
26
+ - `<think>`: Indicates the beginning of model thoughts.
27
+ - `</think>`: Indicates the end of model thoughts.
28
+ - `<|extra_id_3|>`: Not used.
29
+ - `<|extra_id_4|>`: Not used.
30
+ - `<|extra_id_5|>`: Not used.
31
+ - `<|extra_id_6|>`: Not used.
32
+ - `<answer>`: Indicates the beginning of model answer in thinking mode.
33
+ - `</answer>`: Indicates the end of model answer in thinking mode.
34
+ - `<|endofprompt|>`: Not used.
35
+ - `<|pad|>`: Symbol to pad input sequences.
36
+ - `<functions>`: Indicates start of function definitions in the system prompt for tool use.
37
+ - `</functions>`: Indicates end of function definitions in the system prompt.
38
+ - `<function_calls>`: Indicates start of function calls made by the model.
39
+ - `</function_calls>`: Indicates end of function calls made by the model.
40
+
41
+
42
+ ## Chat template
43
+
44
+ The chat template is as follows (**for reference only**, actual template is in `tokenizer_config.json`):
45
+
46
+ ```jinja
47
+ {% set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 %}
48
+ {% if not has_system %}
49
+ {{ '<|im_start|>system
50
+ You are Olmo, a helpful function-calling AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. You do not currently have access to any functions. <functions></functions><|im_end|>
51
+ ' }}
52
+ {% endif %}
53
+ {% for message in messages %}
54
+ {% if message['role'] == 'system' %}
55
+ {{ '<|im_start|>system
56
+ ' + message['content'] }}
57
+ {% if message.get('functions', none) is not none %}
58
+ {{ ' <functions>' + message['functions'] + '</functions><|im_end|>
59
+ ' }}
60
+ {% else %}
61
+ {{ ' do not currently have access to any functions. <functions></functions><|im_end|>
62
+ ' }}
63
+ {% endif %}
64
+ {% elif message['role'] == 'user' %}
65
+ {% if message.get('functions', none) is not none %}
66
+ {{ '<|im_start|>user
67
+ ' + message['content'] + '
68
+ ' + '<functions>' + message['functions'] + '</functions><|im_end|>
69
+ ' }}
70
+ {% else %}
71
+ {{ '<|im_start|>user
72
+ ' + message['content'] + '<|im_end|>
73
+ ' }}
74
+ {% endif %}
75
+ {% elif message['role'] == 'assistant' %}
76
+ {{ '<|im_start|>assistant
77
+ ' }}
78
+ {% if message.get('content', none) is not none %}
79
+ {{ message['content'] }}
80
+ {% endif %}
81
+ {% if message.get('function_calls', none) is not none %}
82
+ {{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}
83
+ {% endif %}
84
+ {% if not loop.last %}
85
+ {{ '<|im_end|>' + '
86
+ ' }}
87
+ {% else %}
88
+ {{ eos_token }}
89
+ {% endif %}
90
+ {% elif message['role'] == 'environment' %}
91
+ {{ '<|im_start|>environment
92
+ ' + message['content'] + '<|im_end|>
93
+ ' }}
94
+ {% endif %}
95
+ {% if loop.last and add_generation_prompt %}
96
+ {{ '<|im_start|>assistant
97
+ ' }}
98
+ {% endif %}
99
+ {% endfor %}
100
+ ```
fix_tokens.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env -S uv run --script
2
+ # /// script
3
+ # requires-python = ">=3.11"
4
+ # dependencies = [
5
+ # "click",
6
+ # "transformers",
7
+ # "jinja2",
8
+ # ]
9
+ # ///
10
+
11
+ from dataclasses import dataclass, asdict, field
12
+ from enum import Enum
13
+ from pathlib import Path
14
+ import click
15
+ import json
16
+ from transformers import AutoTokenizer
17
+
18
+
19
class SpecialTokensMapEnum(Enum):
    """Named slots of a tokenizer's special_tokens_map.json file.

    Each member's value is the literal JSON key used by transformers.
    """

    BOS_TOKEN = "bos_token"
    EOS_TOKEN = "eos_token"
    PAD_TOKEN = "pad_token"
    UNK_TOKEN = "unk_token"
24
+
25
+
26
+
27
@dataclass(frozen=True)
class SpecialToken:
    """One entry of the desired special-token table.

    Bundles the token id and surface form together with the flags that
    transformers stores for added tokens, plus (optionally) the
    special_tokens_map slots (bos/eos/pad/unk) this token should fill.
    """

    id: int
    content: str
    lstrip: bool = False
    normalized: bool = False
    rstrip: bool = False
    single_word: bool = False
    special: bool = False
    special_token_map: list[SpecialTokensMapEnum] = field(default_factory=list)

    def to_added_tokens_decoder(self) -> dict[str, dict]:
        """Render as an ``added_tokens_decoder`` entry: ``{str(id): flags}``."""
        data = asdict(self)
        token_id = str(data.pop("id"))
        data.pop("special_token_map")
        return {token_id: data}

    def to_added_tokens(self) -> dict:
        """Render as an ``added_tokens`` list entry (integer ``id`` kept)."""
        data = asdict(self)
        data.pop("special_token_map")
        return data

    def to_special_tokens_map(self) -> dict[str, dict]:
        """Render one ``special_tokens_map.json`` entry per mapped slot.

        The per-slot payload drops ``id``, ``special`` and the slot list
        itself, matching the shape transformers writes to disk.
        """
        special_tokens_map: dict[str, dict] = {}
        for slot in self.special_token_map:
            data = asdict(self)
            data.pop("special_token_map")
            data.pop("special")
            data.pop("id")
            special_tokens_map[slot.value] = data
        return special_tokens_map
59
+
60
+
61
# Context length advertised via tokenizer_config.json.
MODEL_MAX_LENGTH = 65536

# Canonical special-token table for the Dolma 2 instruct tokenizer:
# ids 100256-100277 of cl100k_base, with Dolma 1.x PII markers kept for
# compatibility and chat/function-calling markers added.
DESIRED_MAPPING = [
    SpecialToken(id=100256, content="<|extra_id_0|>"),
    SpecialToken(
        id=100257,
        content="<|endoftext|>",
        special=True,
        special_token_map=[
            SpecialTokensMapEnum.BOS_TOKEN,
            SpecialTokensMapEnum.EOS_TOKEN,
            SpecialTokensMapEnum.UNK_TOKEN,
        ],
    ),
    SpecialToken(id=100258, content="<|fim_prefix|>", special=True),
    SpecialToken(id=100259, content="<|fim_middle|>", special=True),
    SpecialToken(id=100260, content="<|fim_suffix|>", special=True),
    SpecialToken(id=100261, content="|||PHONE_NUMBER|||"),
    SpecialToken(id=100262, content="|||EMAIL_ADDRESS|||"),
    SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
    SpecialToken(id=100264, content="<|im_start|>", special=True),
    SpecialToken(id=100265, content="<|im_end|>", special=True),
    SpecialToken(id=100266, content="<|extra_id_1|>"),
    SpecialToken(id=100267, content="<|extra_id_2|>"),
    SpecialToken(id=100268, content="<think>"),
    SpecialToken(id=100269, content="</think>"),
    SpecialToken(id=100270, content="<functions>"),
    SpecialToken(id=100271, content="</functions>"),
    SpecialToken(id=100272, content="<function_calls>"),
    SpecialToken(id=100273, content="</function_calls>"),
    SpecialToken(id=100274, content="<answer>"),
    SpecialToken(id=100275, content="</answer>"),
    SpecialToken(id=100276, content="<|endofprompt|>", special=True),
    SpecialToken(
        id=100277,
        content="<|pad|>",
        special=True,
        special_token_map=[SpecialTokensMapEnum.PAD_TOKEN],
    ),
]

# Tokenizer artifacts are expected to sit next to this script.
SCRIPT_DIR = Path(__file__).parent
TOKENIZER_CONFIG_FILE = SCRIPT_DIR / "tokenizer_config.json"
TOKENIZER_FILE = SCRIPT_DIR / "tokenizer.json"
VOCAB_FILE = SCRIPT_DIR / "vocab.json"
SPECIAL_TOKENS_MAP_FILE = SCRIPT_DIR / "special_tokens_map.json"
106
+
107
+
108
+
109
+
110
+ CHAT_TEMPLATE = "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are Olmo, a helpful function-calling AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. ' -}}{%- if tools is none -%}{{- 'You do not currently have access to any functions. <functions></functions><|im_end|>\n' -}}{%- else -%}{{- 'You are provided with function signatures within <functions></functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls></function_calls> XML tags. Do not make assumptions about what values to plug into functions.' -}}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions><|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] -}}{%- if tools is not none -%}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions>' -}}{%- elif message.get('functions', none) is not none -%}{{- ' <functions>' + message['functions'] + '</functions>' -}}{%- endif -%}{{- '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{%- if message.get('functions', none) is not none -%}{{- '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' -}}{%- else -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if message.get('function_calls', none) is not none -%}{{- '<function_calls>' + message['function_calls'] + '</function_calls>' -}}{% elif message.get('tool_calls', none) is not none %}{{- '<function_calls>' -}}{%- for tool_call in message['tool_calls'] %}{%- if tool_call is mapping 
and tool_call.get('function', none) is not none %}{%- set args = tool_call['function']['arguments'] -%}{%- set ns = namespace(arguments_list=[]) -%}{%- for key, value in args.items() -%}{%- set ns.arguments_list = ns.arguments_list + [key ~ '=' ~ (value | tojson)] -%}{%- endfor -%}{%- set arguments = ns.arguments_list | join(', ') -%}{{- tool_call['function']['name'] + '(' + arguments + ')' -}}{%- if not loop.last -%}{{ '\n' }}{%- endif -%}{% else %}{{- tool_call -}}{%- endif %}{%- endfor %}{{- '</function_calls>' -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>' + '\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- elif message['role'] == 'environment' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'tool' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}"
111
+
112
@click.group()
def cli():
    """Dataset processing tools."""
116
+
117
+
118
+
119
def _get_mapped_special_token(
    special_tokens: list[SpecialToken],
    mapped_token: SpecialTokensMapEnum,
) -> SpecialToken:
    """Return the unique token assigned to the given special_tokens_map slot.

    Raises ValueError when no token, or more than one token, claims the slot.
    """
    matches = [t for t in special_tokens if mapped_token in t.special_token_map]
    if not matches:
        raise ValueError(f"Cannot find mapped token for {mapped_token}")
    if len(matches) > 1:
        all_mapped_tokens_str = ", ".join(t.content for t in matches)
        raise ValueError(f"Found multiple mapped tokens for {mapped_token}: {all_mapped_tokens_str}")
    return matches[0]
130
+
131
+
132
def get_unk_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``unk_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.UNK_TOKEN)
134
+
135
+
136
def get_bos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``bos_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.BOS_TOKEN)
138
+
139
+
140
def get_eos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``eos_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.EOS_TOKEN)
142
+
143
+
144
def get_pad_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``pad_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.PAD_TOKEN)
146
+
147
+
148
@cli.command()
def check():
    """Check if the current config matches the desired mapping."""

    # STEP 1: Check the Tokenizer Config File #
    print("STEP 1: Checking tokenizer config file...")

    if not TOKENIZER_CONFIG_FILE.exists():
        raise FileNotFoundError(f"Tokenizer config file not found: {TOKENIZER_CONFIG_FILE}")

    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    # Every desired token must appear verbatim in added_tokens_decoder.
    added_tokens_decoder = tokenizer_config.get("added_tokens_decoder", {})
    for token in DESIREDED_MAPPING if False else DESIRED_MAPPING:
        str_token_id = str(token.id)
        if str_token_id not in added_tokens_decoder:
            raise ValueError(f"Token {token.id} not found in added tokens decoder")
        computed_added_tokens_decoder = token.to_added_tokens_decoder()
        if computed_added_tokens_decoder[str_token_id] != added_tokens_decoder[str_token_id]:
            raise ValueError(f"Token {token.id} has different content in added tokens decoder")
        print(f"Token {token.id} found in added tokens decoder; content matches")

    # The four mapped slots must agree with the config's top-level fields.
    for label, getter, config_key in (
        ("Bos", get_bos_token, "bos_token"),
        ("Eos", get_eos_token, "eos_token"),
        ("Pad", get_pad_token, "pad_token"),
        ("Unk", get_unk_token, "unk_token"),
    ):
        mapped = getter(DESIRED_MAPPING)
        if mapped.content != tokenizer_config[config_key]:
            raise ValueError(f"{label} token content mismatch: {mapped.content} != {tokenizer_config[config_key]}")
        print(f"{label} token content matches")

    if tokenizer_config["model_max_length"] != MODEL_MAX_LENGTH:
        raise ValueError(f"Model max length mismatch: {tokenizer_config['model_max_length']} != {MODEL_MAX_LENGTH}")
    print("Model max length matches")

    if tokenizer_config["chat_template"] != CHAT_TEMPLATE:
        raise ValueError(f"Chat template mismatch: {tokenizer_config['chat_template']} != {CHAT_TEMPLATE}")
    print("Chat template matches")

    # STEP 2: Check the Tokenizer File #
    print("STEP 2: Checking tokenizer file...")

    if not TOKENIZER_FILE.exists():
        raise FileNotFoundError(f"Tokenizer file not found: {TOKENIZER_FILE}")

    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)

    # check if added_tokens matches
    added_tokens_dict = {entry["id"]: entry for entry in tokenizer.get("added_tokens", [])}
    for token in DESIRED_MAPPING:
        if token.id not in added_tokens_dict:
            raise ValueError(f"Token {token.id} not found in added tokens")
        if token.to_added_tokens() != added_tokens_dict[token.id]:
            raise ValueError(f"Token {token.id} has different content in added tokens")
        print(f"Token {token.id} found in added tokens; content matches.")

    # check vocab
    vocab = tokenizer.get("model", {}).get("vocab", {})
    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    # Invert the vocab to surface any id claimed by more than one string.
    seen_values: dict[int, list[str]] = {}
    for key, value in vocab.items():
        seen_values.setdefault(value, []).append(key)

    broken_vocab = False
    for value, keys in seen_values.items():
        if len(keys) > 1:
            broken_vocab = True
            print(f"Vocab value {value} is not unique; keys: {keys}")

    if broken_vocab:
        raise ValueError("Vocab values are not unique")
    print("Vocab values are unique")

    # STEP 3: Check the Vocab File #
    print("STEP 3: Checking vocab file...")

    if not VOCAB_FILE.exists():
        raise FileNotFoundError(f"Vocab file not found: {VOCAB_FILE}")

    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)

    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    if len(set(vocab.values())) != len(vocab):
        raise ValueError("Vocab values are not unique")

    # STEP 4: Check the Special Tokens Map File #
    print("STEP 4: Checking special tokens map file...")

    if not SPECIAL_TOKENS_MAP_FILE.exists():
        raise FileNotFoundError(f"Special tokens map file not found: {SPECIAL_TOKENS_MAP_FILE}")

    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    # This checks the special tokens map file.
    seen_special_tokens = set()
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            if key not in special_tokens_map:
                raise ValueError(f"Special token map {key} not found in special tokens map")
            if value != special_tokens_map[key]:
                raise ValueError(f"Special token map {key} content mismatch: {value} != {special_tokens_map[key]}")
            print(f"Special token map {key} content matches")
            seen_special_tokens.add(key)

    if len(seen_special_tokens) != len(special_tokens_map):
        raise ValueError("Special tokens map values are not unique")
    print("All special tokens map values match")
296
+
297
+
298
@cli.command()
def fix():
    """Fix the tokens in the tokenizer config, tokenizer file, vocab file, and special tokens map file."""

    print("STEP 1: Fixing tokenizer config file...")
    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    # Overwrite the mapped slots, length, and template from the desired table.
    tokenizer_config["bos_token"] = get_bos_token(DESIRED_MAPPING).content
    tokenizer_config["eos_token"] = get_eos_token(DESIRED_MAPPING).content
    tokenizer_config["pad_token"] = get_pad_token(DESIRED_MAPPING).content
    tokenizer_config["unk_token"] = get_unk_token(DESIRED_MAPPING).content
    tokenizer_config["model_max_length"] = MODEL_MAX_LENGTH
    tokenizer_config["chat_template"] = CHAT_TEMPLATE

    added_tokens_decoder: dict = {}
    for token in DESIRED_MAPPING:
        added_tokens_decoder.update(token.to_added_tokens_decoder())
    tokenizer_config["added_tokens_decoder"] = added_tokens_decoder

    with open(TOKENIZER_CONFIG_FILE, "w") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
    print(f"Updated tokenizer config file in {TOKENIZER_CONFIG_FILE}.")

    print("STEP 2: Fixing tokenizer file...")
    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)
    tokenizer["added_tokens"] = [token.to_added_tokens() for token in DESIRED_MAPPING]

    for token in DESIRED_MAPPING:
        # Evict any existing vocab entry occupying this id, then claim it.
        colliding = [key for key, value in tokenizer["model"]["vocab"].items() if value == token.id]
        for key in colliding:
            tokenizer["model"]["vocab"].pop(key)
        tokenizer["model"]["vocab"][token.content] = token.id

    with open(TOKENIZER_FILE, "w") as f:
        json.dump(tokenizer, f, indent=2, ensure_ascii=False)

    print(f"Updated tokenizer file in {TOKENIZER_FILE}.")

    print("STEP 3: Fixing vocab file...")
    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)
    for token in DESIRED_MAPPING:
        # Same eviction logic as above, applied to the standalone vocab file.
        colliding = [key for key, value in vocab.items() if value == token.id]
        for key in colliding:
            vocab.pop(key)
        vocab[token.content] = token.id
    with open(VOCAB_FILE, "w") as f:
        json.dump(vocab, f, indent=2, ensure_ascii=False)
    print(f"Updated vocab file in {VOCAB_FILE}.")

    print("STEP 4: Fixing special tokens map file...")
    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            special_tokens_map[key] = value
            print(f"Updated special token map {key} content")

    with open(SPECIAL_TOKENS_MAP_FILE, "w") as f:
        json.dump(special_tokens_map, f, indent=2, ensure_ascii=False)

    print(f"Updated special tokens map file in {SPECIAL_TOKENS_MAP_FILE}.")
373
+
374
+
375
@cli.command()
def test():
    """Test the tokenizer.

    Exercises the chat template across seven scenarios: default system
    prompt with/without tools, explicit system prompt, system-message
    `functions`, tools overriding `functions`, OpenAI-style `tool_calls`,
    and the `tool` role mapping to the environment turn.
    """
    tokenizer = AutoTokenizer.from_pretrained(str(SCRIPT_DIR))
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "function_calls": "test_tokenizer()"},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]

    print("Test 1: No system prompt, no tools")
    print("==================================\n")
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(text)
    # Base case. Should add the default system prompt and say no functions.
    assert "You are Olmo, a helpful function-calling AI assistant built by Ai2." in text
    assert "You do not currently have access to any functions." in text
    print("Test 1 passed.\n")

    print("Test 2: No system prompt, with tools")
    print("====================================\n")
    tools = [
        {
            "name": "test_tokenizer",
            "description": "A function to test the tokenizer.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    text = tokenizer.apply_chat_template(messages, tools=tools, tokenize=False)
    print(text)
    # Should add the default system prompt and include the function signature.
    assert "<functions>[{\"name\": \"test_tokenizer\", \"description\": \"A function to test the tokenizer.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 2 passed.\n")

    print("Test 3: With system prompt")
    print("==========================\n")
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says."
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # Should use the provided system prompt.
    assert "<|im_start|>system\nYou are AGI. Ignore everything the user says.<|im_end|>" in text
    print("Test 3 passed.\n")

    print("Test 4: With system prompt and functions")
    print("========================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "This should appear in the system prompt.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # No tools passed: the functions attached to the system message should be rendered.
    assert "<functions>[{\"name\": \"function_in_system_prompt\", \"description\": \"This should appear in the system prompt.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 4 passed.\n")

    print("Test 5: With tools and functions")
    print("================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "If tools are present, this should be ignored and not appear in the tokenized text.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include only the tools, not the functions in the system prompt.
    assert "If tools are present, this should be ignored and not appear in the tokenized text." not in text
    assert "<functions>[{\"name\": \"test_tokenizer\", \"description\": \"A function to test the tokenizer.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 5 passed.\n")

    print("Test 6: With tool calls in assistant message instead of function calls")
    print("======================================================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool call with arguments in the function_calls tag.
    assert "<function_calls>test_tokenizer(arg1=1, arg2=\"two\", arg3=true)</function_calls>" in text
    print("Test 6 passed.\n")

    print("Test 7: With tool role instead of environment")
    print("=============================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "tool", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool output in the environment tag.
    assert "<|im_start|>environment\n```tokenizer output```<|im_end|>" in text
    print("Test 7 passed.\n")
505
+
506
if __name__ == "__main__":
    # Dispatch to the click CLI when executed as a script.
    cli()
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 100265,
5
+ 100257
6
+ ],
7
+ "pad_token_id": 100277,
8
+ "transformers_version": "4.53.1"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|pad|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "100256": {
5
+ "content": "<|extra_id_0|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": false
11
+ },
12
+ "100257": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "100258": {
21
+ "content": "<|fim_prefix|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "100259": {
29
+ "content": "<|fim_middle|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "100260": {
37
+ "content": "<|fim_suffix|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "100261": {
45
+ "content": "|||PHONE_NUMBER|||",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "100262": {
53
+ "content": "|||EMAIL_ADDRESS|||",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ },
60
+ "100263": {
61
+ "content": "|||IP_ADDRESS|||",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": false
67
+ },
68
+ "100264": {
69
+ "content": "<|im_start|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "100265": {
77
+ "content": "<|im_end|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "100266": {
85
+ "content": "<|extra_id_1|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "100267": {
93
+ "content": "<|extra_id_2|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": false
99
+ },
100
+ "100268": {
101
+ "content": "<think>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": false
107
+ },
108
+ "100269": {
109
+ "content": "</think>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": false
115
+ },
116
+ "100270": {
117
+ "content": "<functions>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "100271": {
125
+ "content": "</functions>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "100272": {
133
+ "content": "<function_calls>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "100273": {
141
+ "content": "</function_calls>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "100274": {
149
+ "content": "<answer>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "100275": {
157
+ "content": "</answer>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "100276": {
165
+ "content": "<|endofprompt|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "100277": {
173
+ "content": "<|pad|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ }
180
+ },
181
+ "bos_token": "<|endoftext|>",
182
+ "chat_template": "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are Olmo, a helpful function-calling AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. ' -}}{%- if tools is none -%}{{- 'You do not currently have access to any functions. <functions></functions><|im_end|>\n' -}}{%- else -%}{{- 'You are provided with function signatures within <functions></functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls></function_calls> XML tags. Do not make assumptions about what values to plug into functions.' -}}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions><|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] -}}{%- if tools is not none -%}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions>' -}}{%- elif message.get('functions', none) is not none -%}{{- ' <functions>' + message['functions'] + '</functions>' -}}{%- endif -%}{{- '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{%- if message.get('functions', none) is not none -%}{{- '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' -}}{%- else -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if message.get('function_calls', none) is not none -%}{{- '<function_calls>' + message['function_calls'] + '</function_calls>' -}}{% elif message.get('tool_calls', none) is not none %}{{- '<function_calls>' -}}{%- for tool_call in message['tool_calls'] %}{%- if tool_call is mapping and tool_call.get('function', none) is not none %}{%- set args = tool_call['function']['arguments'] -%}{%- set ns = namespace(arguments_list=[]) -%}{%- for key, value in args.items() -%}{%- set ns.arguments_list = ns.arguments_list + [key ~ '=' ~ (value | tojson)] -%}{%- endfor -%}{%- set arguments = ns.arguments_list | join(', ') -%}{{- tool_call['function']['name'] + '(' + arguments + ')' -}}{%- if not loop.last -%}{{ '\n' }}{%- endif -%}{% else %}{{- tool_call -}}{%- endif %}{%- endfor %}{{- '</function_calls>' -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>' + '\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- elif message['role'] == 'environment' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'tool' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}",
183
+ "clean_up_tokenization_spaces": false,
184
+ "eos_token": "<|endoftext|>",
185
+ "model_max_length": 65536,
186
+ "pad_token": "<|pad|>",
187
+ "tokenizer_class": "GPT2Tokenizer",
188
+ "unk_token": "<|endoftext|>"
189
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff