"""
replace_reserved_tokens.py

This script updates the reserved special tokens in a Hugging Face tokenizer
directory, replacing them with the following mapping:

    ID 128013: "<|think|>"
    ID 128014: "<|/think|>"
    ID 128015: "<|answer|>"
    ID 128016: "<|/answer|>"

It updates all key files if they exist: tokenizer_config.json, tokenizer.json,
added_tokens.json, and special_tokens_map.json.

Usage:
    python3 replace_reserved_tokens.py --tokenizer_dir /path/to/tokenizer_dir

A backup (.backup) of each updated file is created.
"""

import argparse
import json
import os
import sys

# Maps reserved token IDs (as strings, to match the JSON keys) to the new
# token text.
REPLACEMENT_TOKENS = {
    "128013": "<|think|>",
    "128014": "<|/think|>",
    "128015": "<|answer|>",
    "128016": "<|/answer|>",
}
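
# For reference, the JSON shapes the updaters below expect; the old
# reserved-token names (e.g. "<|reserved_special_token_8|>") are illustrative
# placeholders, not values read from any particular tokenizer:
#   tokenizer_config.json   : "added_tokens_decoder": {"128013": {"content": ...}}
#   tokenizer.json          : "added_tokens": [{"id": 128013, "content": ...}]
#   added_tokens.json       : {"<|reserved_special_token_8|>": 128013, ...}
#   special_tokens_map.json : {"bos_token": ..., "eos_token": ...}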


def update_json_file(file_path, updater_func):
    """
    Load a JSON file, update it using updater_func(data), and if changes occur,
    back up the original file and write out the modified JSON.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
        return False

    changed = updater_func(data)
    if changed:
        backup_path = file_path + ".backup"
        # os.replace overwrites an existing .backup; os.rename would raise
        # FileExistsError on Windows if a stale backup is already present.
        os.replace(file_path, backup_path)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Updated '{file_path}'. Backup saved to '{backup_path}'.")
    else:
        print(f"No changes needed for '{file_path}'.")
    return changed


def update_tokenizer_config(data):
    """
    Update the "added_tokens_decoder" field in tokenizer_config.json.
    """
    changed = False
    if "added_tokens_decoder" in data:
        for token_id, new_content in REPLACEMENT_TOKENS.items():
            if token_id in data["added_tokens_decoder"]:
                current = data["added_tokens_decoder"][token_id].get("content", "")
                if current != new_content:
                    print(f"[tokenizer_config.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    data["added_tokens_decoder"][token_id]["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer_config.json] Token id {token_id} already set to '{new_content}'.")
            else:
                print(f"[tokenizer_config.json] Warning: token id {token_id} not found.", file=sys.stderr)
    else:
        print("Key 'added_tokens_decoder' not found in tokenizer_config.json.", file=sys.stderr)
    return changed


def update_tokenizer_json(data):
    """
    Update the "added_tokens" list in tokenizer.json.
    The structure is assumed to be a dictionary that includes an "added_tokens" key.
    """
    changed = False
    if "added_tokens" in data and isinstance(data["added_tokens"], list):
        for token in data["added_tokens"]:
            # IDs are integers in tokenizer.json; normalize to string so they
            # match the keys of REPLACEMENT_TOKENS.
            token_id = str(token.get("id"))
            if token_id in REPLACEMENT_TOKENS:
                current = token.get("content", "")
                new_content = REPLACEMENT_TOKENS[token_id]
                if current != new_content:
                    print(f"[tokenizer.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    token["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer.json] Token id {token_id} already set to '{new_content}'.")
    else:
        print("Key 'added_tokens' not found or not a list in tokenizer.json.", file=sys.stderr)
    return changed
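
# Note: in Llama-3-style tokenizers the reserved special tokens typically live
# only in "added_tokens" (the base vocab in model.vocab stops below them), so
# tokenizer.json needs no vocab rewrite; verify this holds for your tokenizer.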


def update_added_tokens_json(data):
    """
    Update the added_tokens.json file if it exists.
    Hugging Face typically writes this file as a mapping from token string to
    token ID, so entries are looked up by ID and their keys renamed.
    """
    changed = False
    # Invert the mapping so each ID can be traced back to its current string.
    id_to_token = {str(v): k for k, v in data.items()}
    for token_id, new_content in REPLACEMENT_TOKENS.items():
        if token_id in id_to_token:
            current = id_to_token[token_id]
            if current != new_content:
                print(f"[added_tokens.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                del data[current]
                data[new_content] = int(token_id)
                changed = True
            else:
                print(f"[added_tokens.json] Token id {token_id} already set to '{new_content}'.")
        else:
            print(f"[added_tokens.json] Warning: token id {token_id} not found.", file=sys.stderr)
    return changed


def update_special_tokens_map(data):
    """
    Update special_tokens_map.json if needed.
    This file maps roles (e.g. bos_token) to token strings and carries no
    token IDs, so there is nothing to match our ID-keyed mapping against.
    The reserved tokens being replaced are rarely referenced here, so this
    is a deliberate no-op; extend it if your map names the old strings.
    """
    # Values may be plain strings or serialized AddedToken dicts; neither
    # records an ID, so no generic ID-based replacement is possible.
    return False


def main():
    parser = argparse.ArgumentParser(
        description="Replace reserved tokens in a Hugging Face tokenizer directory."
    )
    parser.add_argument(
        "--tokenizer_dir",
        type=str,
        required=True,
        help="Directory containing the tokenizer files (e.g., tokenizer_config.json, tokenizer.json, etc.)",
    )
    args = parser.parse_args()

    tokenizer_dir = args.tokenizer_dir
    if not os.path.isdir(tokenizer_dir):
        print(f"Error: Directory '{tokenizer_dir}' not found.", file=sys.stderr)
        sys.exit(1)
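
    # Each file stores token data in a different shape, so each gets its own
    # updater; files missing from the directory are skipped below.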
    files_to_update = [
        ("tokenizer_config.json", update_tokenizer_config),
        ("tokenizer.json", update_tokenizer_json),
        ("added_tokens.json", update_added_tokens_json),
        ("special_tokens_map.json", update_special_tokens_map),
    ]

    for filename, updater in files_to_update:
        file_path = os.path.join(tokenizer_dir, filename)
        if os.path.exists(file_path):
            print(f"\nProcessing '{filename}'...")
            update_json_file(file_path, updater)
        else:
            print(f"Skipping '{filename}': not found.")


if __name__ == "__main__":
    main()