pradeepd commited on
Commit
b369210
·
1 Parent(s): 2e88cbb

v5 - no additions to provided system prompt

Browse files
README.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Dolma 2 tokenizer, Instruct v5, Non-reasoner version
7
+
8
+ Slightly modified version of `cl100k_base` that supports Dolma 1.x and Dolma 2.x special tokens.
9
+
10
+ ## Special tokens
11
+
12
+ This tokenizer supports the following special tokens:
13
+
14
+ - `<|extra_id_0|>`: Not used.
15
+ - `<|endoftext|>`: Used to mark both beginning and end of text.
16
+ - `<|fim_prefix|>`: Used to mark the prefix fill-in-the-middle request.
17
+ - `<|fim_middle|>`: Used to mark the middle fill-in-the-middle request.
18
+ - `<|fim_suffix|>`: Used to mark the suffix fill-in-the-middle request.
19
+ - `|||PHONE_NUMBER|||`: Not used. Kept for compatibility with Dolma 1.x.
20
+ - `|||EMAIL_ADDRESS|||`: Not used. Kept for compatibility with Dolma 1.x.
21
+ - `|||IP_ADDRESS|||`: Not used. Kept for compatibility with Dolma 1.x.
22
+ - `<|im_start|>`: Indicates the beginning of a message (turn in a conversation).
23
+ - `<|im_end|>`: Indicates the end of a message (turn in a conversation).
24
+ - `<|extra_id_1|>`: Not used.
25
+ - `<|extra_id_2|>`: Not used.
26
+ - `<think>`: Indicates the beginning of model thoughts.
27
+ - `</think>`: Indicates the end of model thoughts.
28
+ - `<|extra_id_3|>`: Not used.
29
+ - `<|extra_id_4|>`: Not used.
30
+ - `<|extra_id_5|>`: Not used.
31
+ - `<|extra_id_6|>`: Not used.
32
+ - `<answer>`: Indicates the beginning of model answer in thinking mode.
33
+ - `</answer>`: Indicates the end of model answer in thinking mode.
34
+ - `<|endofprompt|>`: Not used.
35
+ - `<|pad|>`: Symbol to pad input sequences.
36
+ - `<functions>`: Indicates start of function definitions in the system prompt for tool use.
37
+ - `</functions>`: Indicates end of function definitions in the system prompt.
38
+ - `<function_calls>`: Indicates start of function calls made by the model.
39
+ - `</function_calls>`: Indicates end of function calls made by the model.
40
+
41
+
42
+ ## Chat template
43
+
44
+ The chat template is as follows (**for reference only**, actual template is in `tokenizer_config.json`):
45
+
46
+ ```jinja
47
+ {% set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 %}
48
+ {% if not has_system %}
49
+ {{ '<|im_start|>system
50
+ You are Olmo, a helpful function-calling AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. You do not currently have access to any functions. <functions></functions><|im_end|>
51
+ ' }}
52
+ {% endif %}
53
+ {% for message in messages %}
54
+ {% if message['role'] == 'system' %}
55
+ {{ '<|im_start|>system
56
+ ' + message['content'] }}
57
+ {% if message.get('functions', none) is not none %}
58
+ {{ ' <functions>' + message['functions'] + '</functions><|im_end|>
59
+ ' }}
60
+ {% else %}
61
+ {{ ' do not currently have access to any functions. <functions></functions><|im_end|>
62
+ ' }}
63
+ {% endif %}
64
+ {% elif message['role'] == 'user' %}
65
+ {% if message.get('functions', none) is not none %}
66
+ {{ '<|im_start|>user
67
+ ' + message['content'] + '
68
+ ' + '<functions>' + message['functions'] + '</functions><|im_end|>
69
+ ' }}
70
+ {% else %}
71
+ {{ '<|im_start|>user
72
+ ' + message['content'] + '<|im_end|>
73
+ ' }}
74
+ {% endif %}
75
+ {% elif message['role'] == 'assistant' %}
76
+ {{ '<|im_start|>assistant
77
+ ' }}
78
+ {% if message.get('content', none) is not none %}
79
+ {{ message['content'] }}
80
+ {% endif %}
81
+ {% if message.get('function_calls', none) is not none %}
82
+ {{ '<function_calls>' + message['function_calls'] + '</function_calls>' }}
83
+ {% endif %}
84
+ {% if not loop.last %}
85
+ {{ '<|im_end|>' + '
86
+ ' }}
87
+ {% else %}
88
+ {{ eos_token }}
89
+ {% endif %}
90
+ {% elif message['role'] == 'environment' %}
91
+ {{ '<|im_start|>environment
92
+ ' + message['content'] + '<|im_end|>
93
+ ' }}
94
+ {% endif %}
95
+ {% if loop.last and add_generation_prompt %}
96
+ {{ '<|im_start|>assistant
97
+ ' }}
98
+ {% endif %}
99
+ {% endfor %}
100
+ ```
fix_tokens.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env -S uv run --script
2
+ # /// script
3
+ # requires-python = ">=3.11"
4
+ # dependencies = [
5
+ # "click",
6
+ # "transformers",
7
+ # "jinja2",
8
+ # ]
9
+ # ///
10
+
11
+ from dataclasses import dataclass, asdict, field
12
+ from enum import Enum
13
+ from pathlib import Path
14
+ import click
15
+ import json
16
+ from transformers import AutoTokenizer
17
+
18
+
19
class SpecialTokensMapEnum(Enum):
    """Named slots of a tokenizer's special_tokens_map.json file.

    Each member's value is the literal JSON key used by transformers.
    """

    BOS_TOKEN = "bos_token"
    EOS_TOKEN = "eos_token"
    PAD_TOKEN = "pad_token"
    UNK_TOKEN = "unk_token"
24
+
25
+
26
+
27
@dataclass(frozen=True)
class SpecialToken:
    """One entry of the desired special-token table.

    Bundles the token id and surface form together with the flags that
    transformers stores for added tokens, plus (optionally) the
    special_tokens_map slots (bos/eos/pad/unk) this token should fill.
    """

    id: int
    content: str
    lstrip: bool = False
    normalized: bool = False
    rstrip: bool = False
    single_word: bool = False
    special: bool = False
    special_token_map: list[SpecialTokensMapEnum] = field(default_factory=list)

    def to_added_tokens_decoder(self) -> dict[str, dict]:
        """Render as an ``added_tokens_decoder`` entry: ``{str(id): flags}``."""
        data = asdict(self)
        token_id = str(data.pop("id"))
        data.pop("special_token_map")
        return {token_id: data}

    def to_added_tokens(self) -> dict:
        """Render as an ``added_tokens`` list entry (integer ``id`` kept)."""
        data = asdict(self)
        data.pop("special_token_map")
        return data

    def to_special_tokens_map(self) -> dict[str, dict]:
        """Render one ``special_tokens_map.json`` entry per mapped slot.

        The per-slot payload drops ``id``, ``special`` and the slot list
        itself, matching the shape transformers writes to disk.
        """
        special_tokens_map: dict[str, dict] = {}
        for slot in self.special_token_map:
            data = asdict(self)
            data.pop("special_token_map")
            data.pop("special")
            data.pop("id")
            special_tokens_map[slot.value] = data
        return special_tokens_map
59
+
60
+
61
# Context length advertised via tokenizer_config.json.
MODEL_MAX_LENGTH = 65536

# Canonical special-token table for the Dolma 2 instruct tokenizer:
# ids 100256-100277 of cl100k_base, with Dolma 1.x PII markers kept for
# compatibility and chat/function-calling markers added.
DESIRED_MAPPING = [
    SpecialToken(id=100256, content="<|extra_id_0|>"),
    SpecialToken(
        id=100257,
        content="<|endoftext|>",
        special=True,
        special_token_map=[
            SpecialTokensMapEnum.BOS_TOKEN,
            SpecialTokensMapEnum.EOS_TOKEN,
            SpecialTokensMapEnum.UNK_TOKEN,
        ],
    ),
    SpecialToken(id=100258, content="<|fim_prefix|>", special=True),
    SpecialToken(id=100259, content="<|fim_middle|>", special=True),
    SpecialToken(id=100260, content="<|fim_suffix|>", special=True),
    SpecialToken(id=100261, content="|||PHONE_NUMBER|||"),
    SpecialToken(id=100262, content="|||EMAIL_ADDRESS|||"),
    SpecialToken(id=100263, content="|||IP_ADDRESS|||"),
    SpecialToken(id=100264, content="<|im_start|>", special=True),
    SpecialToken(id=100265, content="<|im_end|>", special=True),
    SpecialToken(id=100266, content="<|extra_id_1|>"),
    SpecialToken(id=100267, content="<|extra_id_2|>"),
    SpecialToken(id=100268, content="<think>"),
    SpecialToken(id=100269, content="</think>"),
    SpecialToken(id=100270, content="<functions>"),
    SpecialToken(id=100271, content="</functions>"),
    SpecialToken(id=100272, content="<function_calls>"),
    SpecialToken(id=100273, content="</function_calls>"),
    SpecialToken(id=100274, content="<answer>"),
    SpecialToken(id=100275, content="</answer>"),
    SpecialToken(id=100276, content="<|endofprompt|>", special=True),
    SpecialToken(
        id=100277,
        content="<|pad|>",
        special=True,
        special_token_map=[SpecialTokensMapEnum.PAD_TOKEN],
    ),
]

# Tokenizer artifacts are expected to sit next to this script.
SCRIPT_DIR = Path(__file__).parent
TOKENIZER_CONFIG_FILE = SCRIPT_DIR / "tokenizer_config.json"
TOKENIZER_FILE = SCRIPT_DIR / "tokenizer.json"
VOCAB_FILE = SCRIPT_DIR / "vocab.json"
SPECIAL_TOKENS_MAP_FILE = SCRIPT_DIR / "special_tokens_map.json"
106
+
107
+
108
+
109
+
110
+ CHAT_TEMPLATE = "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are Olmo, a helpful function-calling AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. ' -}}{%- if tools is none -%}{{- 'You do not currently have access to any functions. <functions></functions><|im_end|>\n' -}}{%- else -%}{{- 'You are provided with function signatures within <functions></functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls></function_calls> XML tags. Do not make assumptions about what values to plug into functions.' -}}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions><|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] -}}{%- if tools is not none -%}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions>' -}}{%- elif message.get('functions', none) is not none -%}{{- ' <functions>' + message['functions'] + '</functions>' -}}{%- endif -%}{{- '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{%- if message.get('functions', none) is not none -%}{{- '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' -}}{%- else -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if message.get('function_calls', none) is not none -%}{{- '<function_calls>' + message['function_calls'] + '</function_calls>' -}}{% elif message.get('tool_calls', none) is not none %}{{- '<function_calls>' -}}{%- for tool_call in message['tool_calls'] %}{%- if tool_call is mapping 
and tool_call.get('function', none) is not none %}{%- set args = tool_call['function']['arguments'] -%}{%- set ns = namespace(arguments_list=[]) -%}{%- for key, value in args.items() -%}{%- set ns.arguments_list = ns.arguments_list + [key ~ '=' ~ (value | tojson)] -%}{%- endfor -%}{%- set arguments = ns.arguments_list | join(', ') -%}{{- tool_call['function']['name'] + '(' + arguments + ')' -}}{%- if not loop.last -%}{{ '\n' }}{%- endif -%}{% else %}{{- tool_call -}}{%- endif %}{%- endfor %}{{- '</function_calls>' -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>' + '\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- elif message['role'] == 'environment' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'tool' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}"
111
+
112
@click.group()
def cli():
    """Dataset processing tools."""
116
+
117
+
118
+
119
def _get_mapped_special_token(
    special_tokens: list[SpecialToken],
    mapped_token: SpecialTokensMapEnum,
) -> SpecialToken:
    """Return the unique token assigned to the given special_tokens_map slot.

    Raises ValueError when no token, or more than one token, claims the slot.
    """
    matches = [t for t in special_tokens if mapped_token in t.special_token_map]
    if not matches:
        raise ValueError(f"Cannot find mapped token for {mapped_token}")
    if len(matches) > 1:
        all_mapped_tokens_str = ", ".join(t.content for t in matches)
        raise ValueError(f"Found multiple mapped tokens for {mapped_token}: {all_mapped_tokens_str}")
    return matches[0]
130
+
131
+
132
def get_unk_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``unk_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.UNK_TOKEN)
134
+
135
+
136
def get_bos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``bos_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.BOS_TOKEN)
138
+
139
+
140
def get_eos_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``eos_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.EOS_TOKEN)
142
+
143
+
144
def get_pad_token(special_tokens: list[SpecialToken]) -> SpecialToken:
    """Return the token mapped to the ``pad_token`` slot."""
    return _get_mapped_special_token(special_tokens, SpecialTokensMapEnum.PAD_TOKEN)
146
+
147
+
148
@cli.command()
def check():
    """Check if the current config matches the desired mapping."""

    # STEP 1: Check the Tokenizer Config File #
    print("STEP 1: Checking tokenizer config file...")

    if not TOKENIZER_CONFIG_FILE.exists():
        raise FileNotFoundError(f"Tokenizer config file not found: {TOKENIZER_CONFIG_FILE}")

    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    # Every desired token must appear verbatim in added_tokens_decoder.
    added_tokens_decoder = tokenizer_config.get("added_tokens_decoder", {})
    for token in DESIREDED_MAPPING if False else DESIRED_MAPPING:
        str_token_id = str(token.id)
        if str_token_id not in added_tokens_decoder:
            raise ValueError(f"Token {token.id} not found in added tokens decoder")
        computed_added_tokens_decoder = token.to_added_tokens_decoder()
        if computed_added_tokens_decoder[str_token_id] != added_tokens_decoder[str_token_id]:
            raise ValueError(f"Token {token.id} has different content in added tokens decoder")
        print(f"Token {token.id} found in added tokens decoder; content matches")

    # The four mapped slots must agree with the config's top-level fields.
    for label, getter, config_key in (
        ("Bos", get_bos_token, "bos_token"),
        ("Eos", get_eos_token, "eos_token"),
        ("Pad", get_pad_token, "pad_token"),
        ("Unk", get_unk_token, "unk_token"),
    ):
        mapped = getter(DESIRED_MAPPING)
        if mapped.content != tokenizer_config[config_key]:
            raise ValueError(f"{label} token content mismatch: {mapped.content} != {tokenizer_config[config_key]}")
        print(f"{label} token content matches")

    if tokenizer_config["model_max_length"] != MODEL_MAX_LENGTH:
        raise ValueError(f"Model max length mismatch: {tokenizer_config['model_max_length']} != {MODEL_MAX_LENGTH}")
    print("Model max length matches")

    if tokenizer_config["chat_template"] != CHAT_TEMPLATE:
        raise ValueError(f"Chat template mismatch: {tokenizer_config['chat_template']} != {CHAT_TEMPLATE}")
    print("Chat template matches")

    # STEP 2: Check the Tokenizer File #
    print("STEP 2: Checking tokenizer file...")

    if not TOKENIZER_FILE.exists():
        raise FileNotFoundError(f"Tokenizer file not found: {TOKENIZER_FILE}")

    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)

    # check if added_tokens matches
    added_tokens_dict = {entry["id"]: entry for entry in tokenizer.get("added_tokens", [])}
    for token in DESIRED_MAPPING:
        if token.id not in added_tokens_dict:
            raise ValueError(f"Token {token.id} not found in added tokens")
        if token.to_added_tokens() != added_tokens_dict[token.id]:
            raise ValueError(f"Token {token.id} has different content in added tokens")
        print(f"Token {token.id} found in added tokens; content matches.")

    # check vocab
    vocab = tokenizer.get("model", {}).get("vocab", {})
    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    # Invert the vocab to surface any id claimed by more than one string.
    seen_values: dict[int, list[str]] = {}
    for key, value in vocab.items():
        seen_values.setdefault(value, []).append(key)

    broken_vocab = False
    for value, keys in seen_values.items():
        if len(keys) > 1:
            broken_vocab = True
            print(f"Vocab value {value} is not unique; keys: {keys}")

    if broken_vocab:
        raise ValueError("Vocab values are not unique")
    print("Vocab values are unique")

    # STEP 3: Check the Vocab File #
    print("STEP 3: Checking vocab file...")

    if not VOCAB_FILE.exists():
        raise FileNotFoundError(f"Vocab file not found: {VOCAB_FILE}")

    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)

    for token in DESIRED_MAPPING:
        if token.content not in vocab:
            raise ValueError(f"Token `{token.content}` not found in vocab")
        if token.id != vocab[token.content]:
            raise ValueError(f"Token `{token.content}`: vocab=`{vocab[token.content]}` provided=`{token.id}`")
        print(f"Token `{token.content}` found in vocab; id `{token.id}` matches.")

    if len(set(vocab.values())) != len(vocab):
        raise ValueError("Vocab values are not unique")

    # STEP 4: Check the Special Tokens Map File #
    print("STEP 4: Checking special tokens map file...")

    if not SPECIAL_TOKENS_MAP_FILE.exists():
        raise FileNotFoundError(f"Special tokens map file not found: {SPECIAL_TOKENS_MAP_FILE}")

    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    # This checks the special tokens map file.
    seen_special_tokens = set()
    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            if key not in special_tokens_map:
                raise ValueError(f"Special token map {key} not found in special tokens map")
            if value != special_tokens_map[key]:
                raise ValueError(f"Special token map {key} content mismatch: {value} != {special_tokens_map[key]}")
            print(f"Special token map {key} content matches")
            seen_special_tokens.add(key)

    if len(seen_special_tokens) != len(special_tokens_map):
        raise ValueError("Special tokens map values are not unique")
    print("All special tokens map values match")
296
+
297
+
298
@cli.command()
def fix():
    """Fix the tokens in the tokenizer config, tokenizer file, vocab file, and special tokens map file."""

    print("STEP 1: Fixing tokenizer config file...")
    with open(TOKENIZER_CONFIG_FILE, "r") as f:
        tokenizer_config = json.load(f)

    # Overwrite the mapped slots, length, and template from the desired table.
    tokenizer_config["bos_token"] = get_bos_token(DESIRED_MAPPING).content
    tokenizer_config["eos_token"] = get_eos_token(DESIRED_MAPPING).content
    tokenizer_config["pad_token"] = get_pad_token(DESIRED_MAPPING).content
    tokenizer_config["unk_token"] = get_unk_token(DESIRED_MAPPING).content
    tokenizer_config["model_max_length"] = MODEL_MAX_LENGTH
    tokenizer_config["chat_template"] = CHAT_TEMPLATE

    added_tokens_decoder: dict = {}
    for token in DESIRED_MAPPING:
        added_tokens_decoder.update(token.to_added_tokens_decoder())
    tokenizer_config["added_tokens_decoder"] = added_tokens_decoder

    with open(TOKENIZER_CONFIG_FILE, "w") as f:
        json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
    print(f"Updated tokenizer config file in {TOKENIZER_CONFIG_FILE}.")

    print("STEP 2: Fixing tokenizer file...")
    with open(TOKENIZER_FILE, "r") as f:
        tokenizer = json.load(f)
    tokenizer["added_tokens"] = [token.to_added_tokens() for token in DESIRED_MAPPING]

    for token in DESIRED_MAPPING:
        # Evict any existing vocab entry occupying this id, then claim it.
        colliding = [key for key, value in tokenizer["model"]["vocab"].items() if value == token.id]
        for key in colliding:
            tokenizer["model"]["vocab"].pop(key)
        tokenizer["model"]["vocab"][token.content] = token.id

    with open(TOKENIZER_FILE, "w") as f:
        json.dump(tokenizer, f, indent=2, ensure_ascii=False)

    print(f"Updated tokenizer file in {TOKENIZER_FILE}.")

    print("STEP 3: Fixing vocab file...")
    with open(VOCAB_FILE, "r") as f:
        vocab = json.load(f)
    for token in DESIRED_MAPPING:
        # Same eviction logic as above, applied to the standalone vocab file.
        colliding = [key for key, value in vocab.items() if value == token.id]
        for key in colliding:
            vocab.pop(key)
        vocab[token.content] = token.id
    with open(VOCAB_FILE, "w") as f:
        json.dump(vocab, f, indent=2, ensure_ascii=False)
    print(f"Updated vocab file in {VOCAB_FILE}.")

    print("STEP 4: Fixing special tokens map file...")
    with open(SPECIAL_TOKENS_MAP_FILE, "r") as f:
        special_tokens_map = json.load(f)

    for token in DESIRED_MAPPING:
        for key, value in token.to_special_tokens_map().items():
            special_tokens_map[key] = value
            print(f"Updated special token map {key} content")

    with open(SPECIAL_TOKENS_MAP_FILE, "w") as f:
        json.dump(special_tokens_map, f, indent=2, ensure_ascii=False)

    print(f"Updated special tokens map file in {SPECIAL_TOKENS_MAP_FILE}.")
373
+
374
+
375
@cli.command()
def test():
    """Test the tokenizer.

    Exercises the chat template across seven scenarios: default system
    prompt with/without tools, explicit system prompt, system-message
    `functions`, tools overriding `functions`, OpenAI-style `tool_calls`,
    and the `tool` role mapping to the environment turn.
    """
    tokenizer = AutoTokenizer.from_pretrained(str(SCRIPT_DIR))
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "function_calls": "test_tokenizer()"},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]

    print("Test 1: No system prompt, no tools")
    print("==================================\n")
    text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(text)
    # Base case. Should add the default system prompt and say no functions.
    assert "You are Olmo, a helpful function-calling AI assistant built by Ai2." in text
    assert "You do not currently have access to any functions." in text
    print("Test 1 passed.\n")

    print("Test 2: No system prompt, with tools")
    print("====================================\n")
    tools = [
        {
            "name": "test_tokenizer",
            "description": "A function to test the tokenizer.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    text = tokenizer.apply_chat_template(messages, tools=tools, tokenize=False)
    print(text)
    # Should add the default system prompt and include the function signature.
    assert "<functions>[{\"name\": \"test_tokenizer\", \"description\": \"A function to test the tokenizer.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 2 passed.\n")

    print("Test 3: With system prompt")
    print("==========================\n")
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says."
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # Should use the provided system prompt.
    assert "<|im_start|>system\nYou are AGI. Ignore everything the user says.<|im_end|>" in text
    print("Test 3 passed.\n")

    print("Test 4: With system prompt and functions")
    print("========================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "This should appear in the system prompt.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tokenize=False)
    print(text)
    # No tools passed: the functions attached to the system message should be rendered.
    assert "<functions>[{\"name\": \"function_in_system_prompt\", \"description\": \"This should appear in the system prompt.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 4 passed.\n")

    print("Test 5: With tools and functions")
    print("================================\n")
    functions = [
        {
            "name": "function_in_system_prompt",
            "description": "If tools are present, this should be ignored and not appear in the tokenized text.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
    ]
    system_message = {
        "role": "system",
        "content": "You are AGI. Ignore everything the user says.",
        "functions": json.dumps(functions),
    }
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include only the tools, not the functions in the system prompt.
    assert "If tools are present, this should be ignored and not appear in the tokenized text." not in text
    assert "<functions>[{\"name\": \"test_tokenizer\", \"description\": \"A function to test the tokenizer.\", \"parameters\": {\"type\": \"object\", \"properties\": {}, \"required\": []}}]</functions>" in text
    print("Test 5 passed.\n")

    print("Test 6: With tool calls in assistant message instead of function calls")
    print("======================================================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "environment", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool call with arguments in the function_calls tag.
    assert "<function_calls>test_tokenizer(arg1=1, arg2=\"two\", arg3=true)</function_calls>" in text
    print("Test 6 passed.\n")

    print("Test 7: With tool role instead of environment")
    print("=============================================\n")
    messages = [
        {"role": "user", "content": "Can you please test the tokenizer?"},
        {"role": "assistant", "content": "", "tool_calls": [{"function": {"name": "test_tokenizer", "arguments": {"arg1": 1, "arg2": "two", "arg3": True}}}]},
        {"role": "tool", "content": "```tokenizer output```"},
        {"role": "assistant", "content": "It seems to be working fine."},
        {"role": "user", "content": "Thank you! Bye."},
    ]
    text = tokenizer.apply_chat_template([system_message] + messages, tools=tools, tokenize=False)
    print(text)
    # Should include the tool output in the environment tag.
    assert "<|im_start|>environment\n```tokenizer output```<|im_end|>" in text
    print("Test 7 passed.\n")
505
+
506
if __name__ == "__main__":
    # Dispatch to the click CLI when executed as a script.
    cli()
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 100265,
5
+ 100257
6
+ ],
7
+ "pad_token_id": 100277,
8
+ "transformers_version": "4.53.1"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|pad|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "100256": {
5
+ "content": "<|extra_id_0|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": false
11
+ },
12
+ "100257": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "100258": {
21
+ "content": "<|fim_prefix|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "100259": {
29
+ "content": "<|fim_middle|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "100260": {
37
+ "content": "<|fim_suffix|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "100261": {
45
+ "content": "|||PHONE_NUMBER|||",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "100262": {
53
+ "content": "|||EMAIL_ADDRESS|||",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ },
60
+ "100263": {
61
+ "content": "|||IP_ADDRESS|||",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": false
67
+ },
68
+ "100264": {
69
+ "content": "<|im_start|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "100265": {
77
+ "content": "<|im_end|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "100266": {
85
+ "content": "<|extra_id_1|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "100267": {
93
+ "content": "<|extra_id_2|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": false
99
+ },
100
+ "100268": {
101
+ "content": "<think>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": false
107
+ },
108
+ "100269": {
109
+ "content": "</think>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": false
115
+ },
116
+ "100270": {
117
+ "content": "<functions>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "100271": {
125
+ "content": "</functions>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "100272": {
133
+ "content": "<function_calls>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "100273": {
141
+ "content": "</function_calls>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "100274": {
149
+ "content": "<answer>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "100275": {
157
+ "content": "</answer>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "100276": {
165
+ "content": "<|endofprompt|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "100277": {
173
+ "content": "<|pad|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ }
180
+ },
181
+ "bos_token": "<|endoftext|>",
182
+ "chat_template": "{%- set has_system = messages|selectattr('role', 'equalto', 'system')|list|length > 0 -%}{%- if not has_system -%}{{- '<|im_start|>system\nYou are Olmo, a helpful function-calling AI assistant built by Ai2. Your date cutoff is December 2024, and your model weights are available at https://huggingface.co/allenai. ' -}}{%- if tools is none -%}{{- 'You do not currently have access to any functions. <functions></functions><|im_end|>\n' -}}{%- else -%}{{- 'You are provided with function signatures within <functions></functions> XML tags. You may call one or more functions to assist with the user query. Output any function calls within <function_calls></function_calls> XML tags. Do not make assumptions about what values to plug into functions.' -}}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions><|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'] -}}{%- if tools is not none -%}{{- '<functions>' -}}{{- tools | tojson -}}{{- '</functions>' -}}{%- elif message.get('functions', none) is not none -%}{{- ' <functions>' + message['functions'] + '</functions>' -}}{%- endif -%}{{- '<|im_end|>\n' -}}{%- elif message['role'] == 'user' -%}{%- if message.get('functions', none) is not none -%}{{- '<|im_start|>user\n' + message['content'] + '\n' + '<functions>' + message['functions'] + '</functions><|im_end|>\n' -}}{%- else -%}{{- '<|im_start|>user\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- elif message['role'] == 'assistant' -%}{{- '<|im_start|>assistant\n' -}}{%- if message.get('content', none) is not none -%}{{- message['content'] -}}{%- endif -%}{%- if message.get('function_calls', none) is not none -%}{{- '<function_calls>' + message['function_calls'] + '</function_calls>' -}}{% elif message.get('tool_calls', none) is not none %}{{- '<function_calls>' -}}{%- for tool_call in message['tool_calls'] %}{%- if tool_call is mapping and tool_call.get('function', none) is not none %}{%- set args = tool_call['function']['arguments'] -%}{%- set ns = namespace(arguments_list=[]) -%}{%- for key, value in args.items() -%}{%- set ns.arguments_list = ns.arguments_list + [key ~ '=' ~ (value | tojson)] -%}{%- endfor -%}{%- set arguments = ns.arguments_list | join(', ') -%}{{- tool_call['function']['name'] + '(' + arguments + ')' -}}{%- if not loop.last -%}{{ '\n' }}{%- endif -%}{% else %}{{- tool_call -}}{%- endif %}{%- endfor %}{{- '</function_calls>' -}}{%- endif -%}{%- if not loop.last -%}{{- '<|im_end|>' + '\n' -}}{%- else -%}{{- eos_token -}}{%- endif -%}{%- elif message['role'] == 'environment' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- elif message['role'] == 'tool' -%}{{- '<|im_start|>environment\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- if loop.last and add_generation_prompt -%}{{- '<|im_start|>assistant\n' -}}{%- endif -%}{%- endfor -%}",
183
+ "clean_up_tokenization_spaces": false,
184
+ "eos_token": "<|endoftext|>",
185
+ "model_max_length": 65536,
186
+ "pad_token": "<|pad|>",
187
+ "tokenizer_class": "GPT2Tokenizer",
188
+ "unk_token": "<|endoftext|>"
189
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff