File size: 5,011 Bytes
24c2665 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#!/usr/bin/env python3
import subprocess
import json
import os
import shutil
import sys
import argparse
def run_huggingface_download(model_name):
    """Download a model with ``huggingface-cli`` and return its local path.

    Runs ``huggingface-cli download <model_name>`` with a copy of the current
    environment and captures its output. The last line of the command's
    standard output is taken to be the download location.

    Args:
        model_name: Value passed as the argument of
            ``huggingface-cli download``.

    Returns:
        The last line of the command's stripped standard output.

    Raises:
        SystemExit: With status 1, after printing a diagnostic, when the
            command cannot be started, exits with a non-zero status, or
            prints nothing on standard output.
    """
    command = ['huggingface-cli', 'download', model_name]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            env=os.environ.copy(),
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error downloading model: {e}")
        print(f"Error output: {e.stderr}")
        sys.exit(1)
    except OSError as e:
        # Raised when the executable is missing or cannot be run; report it
        # the same way as a failed download instead of leaking a traceback.
        print(f"Error downloading model: {e}")
        print("Error output: could not run 'huggingface-cli'; is it installed and on PATH?")
        sys.exit(1)

    output = result.stdout.strip()
    if not output:
        # An empty path would later be joined into a path relative to the
        # current working directory, so refuse to continue.
        print("Error downloading model: huggingface-cli printed no model path")
        print(f"Error output: {result.stderr}")
        sys.exit(1)

    # The path is typically the last line of output.
    model_path = output.split('\n')[-1]
    print(f"Model downloaded to: {model_path}")
    return model_path
def _backup_tokenizer_config(source_path, backup_path):
    """Copy ``source_path`` to ``backup_path``, replacing any earlier backup.

    Best effort: a failure is reported on stdout and otherwise ignored.
    """
    try:
        if os.path.exists(backup_path):
            os.remove(backup_path)
            print(f"Removed existing backup: {backup_path}")
        shutil.copy2(source_path, backup_path)
        print(f"Backup created: {backup_path}")
    except OSError as e:
        # Don't exit, just warn and continue.
        print(f"Error creating backup: {e}")
        print("Attempting to continue without backup...")


def backup_and_modify_tokenizer_config(model_path, revert=False):
    """Remove or restore the ``<think>`` entries of ``tokenizer_config.json``.

    Operates on ``<model_path>/tokenizer_config.json``. By default the keys
    ``"151667"`` and ``"151668"`` are deleted from its
    ``added_tokens_decoder`` mapping; with ``revert=True`` they are (re)set
    to the ``<think>`` / ``</think>`` entries instead.

    The file is backed up to ``tokenizer_config.json.old`` and rewritten only
    when a change is actually going to be made. When there is nothing to do
    (file missing, no ``added_tokens_decoder`` key, or neither key present
    while removing) a message is printed and both the config and any existing
    backup are left untouched, so a repeated removal run does not replace the
    backup with the already-modified file.

    Args:
        model_path: Directory containing ``tokenizer_config.json``.
        revert: When true, insert the two token entries instead of removing
            them.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the file cannot be parsed as JSON or
            any other error occurs while reading, modifying or writing it.
    """
    tokenizer_config_path = os.path.join(model_path, 'tokenizer_config.json')
    backup_path = os.path.join(model_path, 'tokenizer_config.json.old')

    if not os.path.exists(tokenizer_config_path):
        print(f"Warning: tokenizer_config.json not found in {model_path}")
        return

    # Token id -> token text for the entries this script removes / restores.
    think_tokens = {"151667": "<think>", "151668": "</think>"}

    try:
        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)

        if 'added_tokens_decoder' not in config:
            print("Warning: 'added_tokens_decoder' key not found in tokenizer_config.json")
            return
        decoder = config['added_tokens_decoder']

        removed_keys = []
        if not revert:
            removed_keys = [key for key in think_tokens if key in decoder]
            if not removed_keys:
                # Nothing to change: keep the file and the backup as they are.
                print("Keys 151667 and 151668 not found in added_tokens_decoder")
                return

        # A modification is certain from here on, so take the backup now.
        _backup_tokenizer_config(tokenizer_config_path, backup_path)

        if revert:
            for key, content in think_tokens.items():
                decoder[key] = {
                    "content": content,
                    "lstrip": False,
                    "normalized": False,
                    "rstrip": False,
                    "single_word": False,
                    "special": False,
                }
            print("Reverted tokenizer config to the original")
        else:
            for key in removed_keys:
                del decoder[key]
            print(f"Removed keys from added_tokens_decoder: {removed_keys}")

        # Write the modified config back in place.
        with open(tokenizer_config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2, ensure_ascii=False)
        print("Modified tokenizer_config.json saved")
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of the script: report and exit non-zero.
        print(f"Error modifying tokenizer config: {e}")
        sys.exit(1)
def main():
    """Command-line entry point.

    Parses ``--model_name``, ``--model_path`` and ``--revert``. An explicit
    ``--model_path`` takes precedence; otherwise the model named by
    ``--model_name`` is downloaded first. The resulting directory is then
    handed to ``backup_and_modify_tokenizer_config``. Exits with status 1
    when neither a path nor a name is supplied.
    """
    cli = argparse.ArgumentParser(
        description='Download HuggingFace model and fix tokenizer config'
    )
    cli.add_argument(
        '--model_name',
        help='HuggingFace model name (e.g., Qwen/Qwen3-4B-Base)',
    )
    cli.add_argument(
        '--model_path',
        help='Direct path to already downloaded model directory',
    )
    cli.add_argument(
        '--revert',
        action='store_true',
        help='Revert the tokenizer config to the original',
    )
    options = cli.parse_args()

    # Guard clause: at least one way of locating the model is required.
    if not options.model_path and not options.model_name:
        print("Error: Either --model_name or --model_path must be provided")
        sys.exit(1)

    if options.model_path:
        # An already-downloaded directory wins over downloading.
        target_dir = options.model_path
        print(f"Using existing model path: {target_dir}")
    else:
        print(f"Downloading model: {options.model_name}")
        target_dir = run_huggingface_download(options.model_name)

    print(f"Processing tokenizer config in: {target_dir}")
    backup_and_modify_tokenizer_config(target_dir, options.revert)
    print("Done!")
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()