llama3-reasoning-template-single / replace_reserved_tokens.py
#!/usr/bin/env python3
"""
replace_reserved_tokens.py
This script updates the reserved special tokens in a Hugging Face tokenizer directory.
It replaces the tokens with the following mapping:
ID 128013: "<|think|>"
ID 128014: "<|/think|>"
ID 128015: "<|answer|>"
ID 128016: "<|/answer|>"
It updates all key files if they exist: tokenizer_config.json, tokenizer.json,
added_tokens.json, and special_tokens_map.json.
Usage:
python3 replace_reserved_tokens.py --tokenizer_dir /path/to/tokenizer_dir
A backup (.backup) of each file updated is created.
"""
import argparse
import json
import os
import sys
# Define the replacement mapping as a dictionary with keys as strings
# (these are the token IDs as stored in the JSON) and values as the new token content.
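# NOTE (assumption): in the stock Llama 3 vocabulary these four IDs belong to
# unused "<|reserved_special_token_N|>" placeholders (the exact N depends on
# the Llama 3 release), which is what makes them safe to repurpose as
# reasoning-structure tags.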
REPLACEMENT_TOKENS = {
    "128013": "<|think|>",
    "128014": "<|/think|>",
    "128015": "<|answer|>",
    "128016": "<|/answer|>",
}
def update_json_file(file_path, updater_func):
    """
    Load a JSON file, update it using updater_func(data), and if changes occur,
    back up the original file and write out the modified JSON.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
        return False
    changed = updater_func(data)
    if changed:
        backup_path = file_path + ".backup"
        os.rename(file_path, backup_path)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Updated '{file_path}'. Backup saved to '{backup_path}'.")
    else:
        print(f"No changes needed for '{file_path}'.")
    return changed
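# For reference (assumed shape; exact flags may vary by transformers version),
# an "added_tokens_decoder" entry in tokenizer_config.json looks roughly like:
#   "128013": {"content": "<|reserved_special_token_N|>", "lstrip": false,
#              "normalized": false, "rstrip": false, "single_word": false,
#              "special": true}
# Only the "content" field needs to change for a rename.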
def update_tokenizer_config(data):
    """
    Update the "added_tokens_decoder" field in tokenizer_config.json.
    """
    changed = False
    if "added_tokens_decoder" in data:
        for token_id, new_content in REPLACEMENT_TOKENS.items():
            if token_id in data["added_tokens_decoder"]:
                current = data["added_tokens_decoder"][token_id].get("content", "")
                if current != new_content:
                    print(f"[tokenizer_config.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    data["added_tokens_decoder"][token_id]["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer_config.json] Token id {token_id} already set to '{new_content}'.")
            else:
                print(f"[tokenizer_config.json] Warning: token id {token_id} not found.", file=sys.stderr)
    else:
        print("Key 'added_tokens_decoder' not found in tokenizer_config.json.", file=sys.stderr)
    return changed
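# For reference (assumed shape): entries in the "added_tokens" list of
# tokenizer.json look roughly like
#   {"id": 128013, "content": "<|reserved_special_token_N|>", "special": true, ...}
# so, as in tokenizer_config.json, only "content" is rewritten.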
def update_tokenizer_json(data):
    """
    Update the "added_tokens" list in tokenizer.json.
    The structure is assumed to be a dictionary that includes an "added_tokens" key.
    """
    changed = False
    if "added_tokens" in data and isinstance(data["added_tokens"], list):
        for token in data["added_tokens"]:
            # The token "id" might be an integer or a string; compare as strings.
            token_id = str(token.get("id"))
            if token_id in REPLACEMENT_TOKENS:
                current = token.get("content", "")
                new_content = REPLACEMENT_TOKENS[token_id]
                if current != new_content:
                    print(f"[tokenizer.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    token["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer.json] Token id {token_id} already set to '{new_content}'.")
    else:
        print("Key 'added_tokens' not found or not a list in tokenizer.json.", file=sys.stderr)
    return changed
def update_added_tokens_json(data):
    """
    Update the added_tokens.json file if it exists.
    Hugging Face normally writes this file as a flat mapping from token string
    to token ID (e.g. {"<|some_token|>": 128013}), so we rename the keys whose
    IDs match the replacement mapping.
    """
    changed = False
    wanted = {int(token_id): content for token_id, content in REPLACEMENT_TOKENS.items()}
    found = set()
    for old_content, token_id in list(data.items()):
        if not isinstance(token_id, int) or token_id not in wanted:
            continue
        found.add(token_id)
        new_content = wanted[token_id]
        if old_content != new_content:
            print(f"[added_tokens.json] Replacing token id {token_id}: '{old_content}' -> '{new_content}'")
            del data[old_content]
            data[new_content] = token_id
            changed = True
        else:
            print(f"[added_tokens.json] Token id {token_id} already set to '{new_content}'.")
    for token_id in sorted(wanted):
        if token_id not in found:
            print(f"[added_tokens.json] Warning: token id {token_id} not found.", file=sys.stderr)
    return changed
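# For reference (assumed shape): special_tokens_map.json usually looks like
#   {"bos_token": "<|begin_of_text|>", "eos_token": "<|end_of_text|>", ...}
# i.e. it maps roles to token strings rather than IDs.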
def update_special_tokens_map(data):
    """
    Update special_tokens_map.json if needed.
    This file maps roles (e.g. bos_token) to token strings. The reserved
    tokens being renamed are normally not referenced here, so without the
    original token strings there is nothing to rewrite and this function
    is a deliberate no-op.
    """
    # The role -> token-string entries are left untouched; updating them would
    # require knowing which original reserved-token strings (if any) were used.
    return False
def main():
    parser = argparse.ArgumentParser(
        description="Replace reserved tokens in a Hugging Face tokenizer directory."
    )
    parser.add_argument(
        "--tokenizer_dir",
        type=str,
        required=True,
        help="Directory containing the tokenizer files (e.g., tokenizer_config.json, tokenizer.json, etc.)",
    )
    args = parser.parse_args()
    tokenizer_dir = args.tokenizer_dir
    if not os.path.isdir(tokenizer_dir):
        print(f"Error: Directory '{tokenizer_dir}' not found.", file=sys.stderr)
        sys.exit(1)
    # List of (filename, updater_function) pairs, processed in order.
    files_to_update = [
        ("tokenizer_config.json", update_tokenizer_config),
        ("tokenizer.json", update_tokenizer_json),
        ("added_tokens.json", update_added_tokens_json),
        ("special_tokens_map.json", update_special_tokens_map),
    ]
    for filename, updater in files_to_update:
        file_path = os.path.join(tokenizer_dir, filename)
        if os.path.exists(file_path):
            print(f"\nProcessing '{filename}'...")
            update_json_file(file_path, updater)
        else:
            print(f"Skipping '{filename}': not found.")


if __name__ == "__main__":
    main()
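# Example sanity check (assumed workflow, not part of this script): after running
# it, the renamed tokens should round-trip through the tokenizer, e.g.
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("/path/to/tokenizer_dir")
#   print(tok.convert_ids_to_tokens([128013, 128014, 128015, 128016]))
#   # expected: ['<|think|>', '<|/think|>', '<|answer|>', '<|/answer|>']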