"""
replace_reserved_tokens.py

This script updates the reserved special tokens in a Hugging Face tokenizer
directory, replacing them with the following mapping:

    ID 128013: "<|think|>"
    ID 128014: "<|/think|>"
    ID 128015: "<|answer|>"
    ID 128016: "<|/answer|>"

It updates all key files if they exist: tokenizer_config.json, tokenizer.json,
added_tokens.json, and special_tokens_map.json.

Usage:
    python3 replace_reserved_tokens.py --tokenizer_dir /path/to/tokenizer_dir

A backup (.backup) of each updated file is created.
"""

import argparse
import json
import os
import sys

# Maps reserved token IDs (as strings, to match the JSON keys) to the new
# token text.
REPLACEMENT_TOKENS = {
    "128013": "<|think|>",
    "128014": "<|/think|>",
    "128015": "<|answer|>",
    "128016": "<|/answer|>",
}
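
# For reference, the JSON shapes the updaters below expect; the old
# reserved-token names (e.g. "<|reserved_special_token_8|>") are illustrative
# placeholders, not values read from any particular tokenizer:
#   tokenizer_config.json   : "added_tokens_decoder": {"128013": {"content": ...}}
#   tokenizer.json          : "added_tokens": [{"id": 128013, "content": ...}]
#   added_tokens.json       : {"<|reserved_special_token_8|>": 128013, ...}
#   special_tokens_map.json : {"bos_token": ..., "eos_token": ...}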


def update_json_file(file_path, updater_func):
    """
    Load a JSON file, update it using updater_func(data), and if changes occur,
    back up the original file and write out the modified JSON.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
        return False

    changed = updater_func(data)
    if changed:
        backup_path = file_path + ".backup"
        # os.replace overwrites an existing .backup; os.rename would raise
        # FileExistsError on Windows if a stale backup is already present.
        os.replace(file_path, backup_path)
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Updated '{file_path}'. Backup saved to '{backup_path}'.")
    else:
        print(f"No changes needed for '{file_path}'.")
    return changed


def update_tokenizer_config(data):
    """
    Update the "added_tokens_decoder" field in tokenizer_config.json.
    """
    changed = False
    if "added_tokens_decoder" in data:
        for token_id, new_content in REPLACEMENT_TOKENS.items():
            if token_id in data["added_tokens_decoder"]:
                current = data["added_tokens_decoder"][token_id].get("content", "")
                if current != new_content:
                    print(f"[tokenizer_config.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    data["added_tokens_decoder"][token_id]["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer_config.json] Token id {token_id} already set to '{new_content}'.")
            else:
                print(f"[tokenizer_config.json] Warning: token id {token_id} not found.", file=sys.stderr)
    else:
        print("Key 'added_tokens_decoder' not found in tokenizer_config.json.", file=sys.stderr)
    return changed


def update_tokenizer_json(data):
    """
    Update the "added_tokens" list in tokenizer.json.
    The structure is assumed to be a dictionary that includes an "added_tokens" key.
    """
    changed = False
    if "added_tokens" in data and isinstance(data["added_tokens"], list):
        for token in data["added_tokens"]:
            # IDs are integers in tokenizer.json; normalize to string so they
            # match the keys of REPLACEMENT_TOKENS.
            token_id = str(token.get("id"))
            if token_id in REPLACEMENT_TOKENS:
                current = token.get("content", "")
                new_content = REPLACEMENT_TOKENS[token_id]
                if current != new_content:
                    print(f"[tokenizer.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                    token["content"] = new_content
                    changed = True
                else:
                    print(f"[tokenizer.json] Token id {token_id} already set to '{new_content}'.")
    else:
        print("Key 'added_tokens' not found or not a list in tokenizer.json.", file=sys.stderr)
    return changed
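
# Note: in Llama-3-style tokenizers the reserved special tokens typically live
# only in "added_tokens" (the base vocab in model.vocab stops below them), so
# tokenizer.json needs no vocab rewrite; verify this holds for your tokenizer.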


def update_added_tokens_json(data):
    """
    Update the added_tokens.json file if it exists.
    Hugging Face typically writes this file as a mapping from token string to
    token ID, so entries are looked up by ID and their keys renamed.
    """
    changed = False
    # Invert the mapping so each ID can be traced back to its current string.
    id_to_token = {str(v): k for k, v in data.items()}
    for token_id, new_content in REPLACEMENT_TOKENS.items():
        if token_id in id_to_token:
            current = id_to_token[token_id]
            if current != new_content:
                print(f"[added_tokens.json] Replacing token id {token_id}: '{current}' -> '{new_content}'")
                del data[current]
                data[new_content] = int(token_id)
                changed = True
            else:
                print(f"[added_tokens.json] Token id {token_id} already set to '{new_content}'.")
        else:
            print(f"[added_tokens.json] Warning: token id {token_id} not found.", file=sys.stderr)
    return changed


def update_special_tokens_map(data):
    """
    Update special_tokens_map.json if needed.
    This file maps roles (e.g. bos_token) to token strings and carries no
    token IDs, so there is nothing to match our ID-keyed mapping against.
    The reserved tokens being replaced are rarely referenced here, so this
    is a deliberate no-op; extend it if your map names the old strings.
    """
    # Values may be plain strings or serialized AddedToken dicts; neither
    # records an ID, so no generic ID-based replacement is possible.
    return False


def main():
    parser = argparse.ArgumentParser(
        description="Replace reserved tokens in a Hugging Face tokenizer directory."
    )
    parser.add_argument(
        "--tokenizer_dir",
        type=str,
        required=True,
        help="Directory containing the tokenizer files (e.g., tokenizer_config.json, tokenizer.json, etc.)",
    )
    args = parser.parse_args()

    tokenizer_dir = args.tokenizer_dir
    if not os.path.isdir(tokenizer_dir):
        print(f"Error: Directory '{tokenizer_dir}' not found.", file=sys.stderr)
        sys.exit(1)
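
    # Each file stores token data in a different shape, so each gets its own
    # updater; files missing from the directory are skipped below.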
    files_to_update = [
        ("tokenizer_config.json", update_tokenizer_config),
        ("tokenizer.json", update_tokenizer_json),
        ("added_tokens.json", update_added_tokens_json),
        ("special_tokens_map.json", update_special_tokens_map),
    ]

    for filename, updater in files_to_update:
        file_path = os.path.join(tokenizer_dir, filename)
        if os.path.exists(file_path):
            print(f"\nProcessing '{filename}'...")
            update_json_file(file_path, updater)
        else:
            print(f"Skipping '{filename}': not found.")


if __name__ == "__main__":
    main()