File size: 5,011 Bytes
24c2665
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
import subprocess
import json
import os
import shutil
import sys
import argparse

def run_huggingface_download(model_name):
    """Download *model_name* via huggingface-cli and return its local snapshot path.

    Args:
        model_name: HuggingFace repo id, e.g. "Qwen/Qwen3-4B-Base".

    Returns:
        The filesystem path printed by huggingface-cli (its last stdout line).

    Exits:
        With status 1 if huggingface-cli is missing, fails, or prints nothing.
    """
    try:
        # Inherit the caller's environment (HF_HOME, HF_TOKEN, proxies, ...).
        env = os.environ.copy()

        result = subprocess.run(
            ['huggingface-cli', 'download', model_name],
            capture_output=True,
            text=True,
            env=env,
            check=True
        )
    except FileNotFoundError:
        # Previously this raised an uncaught traceback when the CLI was absent.
        print("Error: 'huggingface-cli' not found on PATH. "
              "Install it with 'pip install -U huggingface_hub'.")
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading model: {e}")
        print(f"Error output: {e.stderr}")
        sys.exit(1)

    # huggingface-cli prints the snapshot directory as the last line of stdout.
    output = result.stdout.strip()
    if not output:
        # Guard against returning '' as a "path" and corrupting downstream joins.
        print("Error: huggingface-cli produced no output; cannot determine model path")
        sys.exit(1)

    model_path = output.split('\n')[-1]
    print(f"Model downloaded to: {model_path}")
    return model_path

def backup_and_modify_tokenizer_config(model_path, revert=False):
    """Backup tokenizer_config.json and remove (or restore) the think-token entries.

    In normal mode, backs the file up to tokenizer_config.json.old and deletes
    the added_tokens_decoder entries "151667"/"151668" (the <think>/</think>
    tokens). With ``revert=True`` the two entries are re-inserted and the
    existing backup is left untouched.

    Args:
        model_path: Directory containing tokenizer_config.json.
        revert: When True, re-add the token entries instead of removing them.

    Exits:
        With status 1 on JSON parse or write errors.
    """
    tokenizer_config_path = os.path.join(model_path, 'tokenizer_config.json')
    backup_path = os.path.join(model_path, 'tokenizer_config.json.old')

    # Check if tokenizer_config.json exists
    if not os.path.exists(tokenizer_config_path):
        print(f"Warning: tokenizer_config.json not found in {model_path}")
        return

    # Only back up when modifying. BUG FIX: the previous version also ran this
    # in revert mode, deleting the pristine backup and replacing it with the
    # already-modified config — destroying the only copy of the original.
    if not revert:
        try:
            # Remove existing backup if it exists
            if os.path.exists(backup_path):
                os.remove(backup_path)
                print(f"Removed existing backup: {backup_path}")

            # Create new backup
            shutil.copy2(tokenizer_config_path, backup_path)
            print(f"Backup created: {backup_path}")
        except Exception as e:
            print(f"Error creating backup: {e}")
            print(f"Attempting to continue without backup...")
            # Don't exit, just warn and continue

    # Load and modify the JSON
    try:
        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)

        # Check if added_tokens_decoder exists
        if 'added_tokens_decoder' not in config:
            print("Warning: 'added_tokens_decoder' key not found in tokenizer_config.json")
            return

        # Token ids of <think> and </think> in the Qwen3 vocabulary.
        keys_to_remove = ["151667", "151668"]
        removed_keys = []

        if revert:
            # Re-insert the two entries exactly as they appear upstream.
            config['added_tokens_decoder']['151667'] = {
                "content": "<think>",
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": False
            }
            config['added_tokens_decoder']['151668'] = {
                "content": "</think>",
                "lstrip": False,
                "normalized": False,
                "rstrip": False,
                "single_word": False,
                "special": False
            }

        else:
            for key in keys_to_remove:
                if key in config['added_tokens_decoder']:
                    del config['added_tokens_decoder'][key]
                    removed_keys.append(key)

        if removed_keys:
            print(f"Removed keys from added_tokens_decoder: {removed_keys}")
        elif revert:
            print("Reverted tokenizer config to the original")
        else:
            print("Keys 151667 and 151668 not found in added_tokens_decoder")

        # Write the modified config back
        with open(tokenizer_config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2, ensure_ascii=False)

        print(f"Modified tokenizer_config.json saved")

    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error modifying tokenizer config: {e}")
        sys.exit(1)

def _resolve_model_path(args):
    """Return the local model directory for the parsed CLI arguments.

    Prefers --model_path; otherwise downloads via --model_name. Exits with
    status 1 when neither option was supplied.
    """
    if args.model_path:
        model_path = args.model_path
        print(f"Using existing model path: {model_path}")
        return model_path
    if args.model_name:
        print(f"Downloading model: {args.model_name}")
        return run_huggingface_download(args.model_name)
    print("Error: Either --model_name or --model_path must be provided")
    sys.exit(1)


def main():
    """CLI entry point: locate the model, then apply or revert the tokenizer fix."""
    parser = argparse.ArgumentParser(
        description='Download HuggingFace model and fix tokenizer config')
    parser.add_argument(
        '--model_name', help='HuggingFace model name (e.g., Qwen/Qwen3-4B-Base)')
    parser.add_argument(
        '--model_path', help='Direct path to already downloaded model directory')
    parser.add_argument(
        '--revert', action='store_true',
        help='Revert the tokenizer config to the original')

    args = parser.parse_args()
    model_path = _resolve_model_path(args)

    print(f"Processing tokenizer config in: {model_path}")
    backup_and_modify_tokenizer_config(model_path, args.revert)

    print("Done!")


if __name__ == "__main__":
    main()