| |
| import torch |
| import os |
| import argparse |
| from tqdm import tqdm |
| from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer |
| from accelerate import init_empty_weights |
|
|
| def main(foundation_model_id, donor_model_id, output_path): |
| """ |
| Creates the definitive 'Aligned' donor model by manually handling all architectural mismatches. |
| This version MANUALLY INSTANTIATES new tensors to defeat memory sharing optimizations. |
| """ |
| print("--- Phase 1: Building the target Qwen3 80-Layer Architecture ---") |
| |
| foundation_config = AutoConfig.from_pretrained(foundation_model_id, trust_remote_code=True) |
| |
| target_config = foundation_config |
| target_config.num_hidden_layers = 80 |
| target_config.hidden_size = 8192 |
| target_config.intermediate_size = 29568 |
| target_config.vocab_size = 151936 |
| target_config.torch_dtype = torch.bfloat16 |
| |
| print("Creating empty Qwen3 80-layer model shell...") |
| with init_empty_weights(): |
| aligned_model = AutoModelForCausalLM.from_config(target_config, trust_remote_code=True) |
| print("Empty shell created successfully.") |
|
|
| print("\n--- Phase 2: Loading and Manually Aligning Donor Weights ---") |
| print(f"Loading weights from donor: {donor_model_id}") |
| |
| donor_state_dict = AutoModelForCausalLM.from_pretrained( |
| donor_model_id, torch_dtype=torch.bfloat16, device_map="cpu", trust_remote_code=True |
| ).state_dict() |
|
|
| target_state_dict = aligned_model.state_dict() |
| new_state_dict = {} |
|
|
| print("Copying and aligning tensors one-by-one...") |
| for name, target_tensor in tqdm(target_state_dict.items(), desc="Aligning Tensors"): |
| if name in donor_state_dict: |
| |
| donor_tensor = donor_state_dict[name] |
| if donor_tensor.shape == target_tensor.shape: |
| new_state_dict[name] = donor_tensor.clone() |
| else: |
| vocab_dim = target_tensor.shape[0] |
| new_state_dict[name] = donor_tensor[:vocab_dim, :].clone() |
| else: |
| |
| |
| |
| |
| new_state_dict[name] = torch.ones(target_tensor.shape, dtype=torch.bfloat16) |
| |
| print("Loading the fully aligned state_dict into the Qwen3 shell...") |
| aligned_model.load_state_dict(new_state_dict, strict=True, assign=True) |
| |
| |
| aligned_model.tie_weights() |
|
|
| print("\n--- Phase 3: Saving the Aligned Donor ---") |
| tokenizer = AutoTokenizer.from_pretrained(foundation_model_id, trust_remote_code=True) |
| |
| print(f"Saving the architecturally aligned model to: {output_path}") |
| os.makedirs(output_path, exist_ok=True) |
| aligned_model.save_pretrained(output_path) |
| tokenizer.save_pretrained(output_path) |
|
|
| print("\nDonor preparation complete! This is the definitive donor model.") |
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser(description="Prepare a Qwen2.5 donor model for merging with Qwen3.") |
| parser.add_argument("--foundation_model", type=str, default="Qwen/Qwen3-32B", help="Model to use for the Qwen3 architecture blueprint.") |
| parser.add_argument("--donor_model", type=str, default="Qwen/Qwen2.5-72B-Instruct", help="The donor model providing the weights.") |
| parser.add_argument("--output_path", type=str, required=True, help="The local directory path to save the prepared donor model.") |
| args = parser.parse_args() |
| |
| main(args.foundation_model, args.donor_model, args.output_path) |
|
|