|
|
|
|
|
""" |
|
|
Graft INTELLECT-3 language model weights into GLM-4.6V vision-language model. |
|
|
|
|
|
This script: |
|
|
1. Loads both models into CPU memory |
|
|
2. Copies model.layers.* from INTELLECT-3 to model.language_model.layers.* in GLM-4.6V |
|
|
3. Copies model.norm.weight from INTELLECT-3 to model.language_model.norm.weight in GLM-4.6V |
|
|
4. Saves the resulting model to a new directory |
|
|
|
|
|
Does NOT touch: |
|
|
- model.language_model.embed_tokens (needed for vision token compatibility) |
|
|
- lm_head (kept aligned with embed_tokens) |
|
|
- model.visual.* (vision encoder preserved) |
|
|
""" |
|
|
|
|
|
import os |
|
|
import argparse |
|
|
import json |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
from safetensors import safe_open |
|
|
from safetensors.torch import save_file |
|
|
import torch |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
def get_safetensor_files(model_dir: Path) -> list[Path]:
    """Return every ``*.safetensors`` file in *model_dir*, sorted by name.

    Raises:
        FileNotFoundError: if the directory holds no safetensor files.
    """
    matches = sorted(model_dir.glob("*.safetensors"))
    if matches:
        return matches
    raise FileNotFoundError(f"No safetensor files found in {model_dir}")
|
|
|
|
|
|
|
|
def load_state_dict_from_safetensors(model_dir: Path) -> dict[str, torch.Tensor]:
    """Read every safetensor shard in *model_dir* into one flat state dict.

    All tensors are materialized on the CPU; if a key appears in more than
    one shard, the tensor from the later (lexicographically) shard wins.
    """
    tensors: dict[str, torch.Tensor] = {}

    for shard_path in tqdm(get_safetensor_files(model_dir), desc=f"Loading {model_dir.name}"):
        with safe_open(shard_path, framework="pt", device="cpu") as handle:
            for name in handle.keys():
                tensors[name] = handle.get_tensor(name)

    return tensors
|
|
|
|
|
|
|
|
def graft_weights(
    intellect3_state: dict[str, torch.Tensor],
    glm_state: dict[str, torch.Tensor]
) -> dict[str, torch.Tensor]:
    """
    Return a copy of the GLM-4.6V state dict with INTELLECT-3 weights grafted in.

    Key mapping:
    - model.layers.*     -> model.language_model.layers.*
    - model.norm.weight  -> model.language_model.norm.weight

    embed_tokens / lm_head tensors are skipped deliberately (they must stay
    aligned with GLM's vision-token vocabulary), as is any key with no
    mapping. A tensor whose target key is missing or whose shape disagrees
    is reported via a warning and left untouched.
    """
    # Shallow copy: every GLM tensor not overwritten below survives as-is.
    result = dict(glm_state)
    copied = 0
    skipped: list[str] = []

    for src_key, weight in tqdm(intellect3_state.items(), desc="Grafting weights"):
        # Embedding and output head are never transplanted.
        if "embed_tokens" in src_key or "lm_head" in src_key:
            skipped.append(src_key)
            continue

        # Translate the INTELLECT-3 key into GLM's language_model namespace.
        if src_key.startswith("model.layers."):
            dst_key = src_key.replace("model.layers.", "model.language_model.layers.")
        elif src_key == "model.norm.weight":
            dst_key = "model.language_model.norm.weight"
        else:
            skipped.append(src_key)
            continue

        if dst_key not in result:
            print(f"WARNING: {dst_key} not found in GLM-4.6V state dict!")
            continue

        if result[dst_key].shape != weight.shape:
            print(f"WARNING: Shape mismatch for {dst_key}:")
            print(f"  INTELLECT-3: {weight.shape}")
            print(f"  GLM-4.6V: {result[dst_key].shape}")
            continue

        result[dst_key] = weight
        copied += 1

    print(f"\nGrafted {copied} tensors from INTELLECT-3")
    print(f"Skipped {len(skipped)} tensors: {skipped[:5]}{'...' if len(skipped) > 5 else ''}")

    return result
|
|
|
|
|
|
|
|
def save_state_dict_to_safetensors(
    state_dict: dict[str, torch.Tensor],
    output_dir: Path,
    max_shard_size: int = 5 * 1024 ** 3
):
    """Write *state_dict* to *output_dir* as one or more safetensor shards.

    Tensors are packed greedily, in dict insertion order, into shards of at
    most *max_shard_size* bytes (a single oversized tensor still gets a
    shard of its own). When more than one shard results, a HuggingFace-style
    ``model.safetensors.index.json`` is written alongside them.

    Returns the key -> shard-filename mapping.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    byte_counts = {key: t.numel() * t.element_size() for key, t in state_dict.items()}
    total_size = sum(byte_counts.values())
    print(f"\nTotal model size: {total_size / 1024**3:.2f} GB")

    # Greedy first-fit packing: start a new shard once the current one
    # (non-empty) would overflow.
    shards: list[dict[str, torch.Tensor]] = []
    bucket: dict[str, torch.Tensor] = {}
    bucket_bytes = 0
    for key, tensor in state_dict.items():
        nbytes = byte_counts[key]
        if bucket and bucket_bytes + nbytes > max_shard_size:
            shards.append(bucket)
            bucket, bucket_bytes = {}, 0
        bucket[key] = tensor
        bucket_bytes += nbytes
    if bucket:
        shards.append(bucket)

    print(f"Saving to {len(shards)} shard(s)...")

    weight_map: dict[str, str] = {}
    n_shards = len(shards)
    for idx, shard in enumerate(tqdm(shards, desc="Saving shards"), start=1):
        # Single-shard models use the plain filename convention.
        filename = (
            "model.safetensors"
            if n_shards == 1
            else f"model-{idx:05d}-of-{n_shards:05d}.safetensors"
        )
        save_file(shard, output_dir / filename)
        weight_map.update({key: filename for key in shard})

    # The index file is only expected when the model is actually sharded.
    if n_shards > 1:
        index = {
            "metadata": {"total_size": total_size},
            "weight_map": weight_map
        }
        with open(output_dir / "model.safetensors.index.json", "w") as f:
            json.dump(index, f, indent=2)

    return weight_map
|
|
|
|
|
|
|
|
def copy_config_files(src_dir: Path, dst_dir: Path):
    """Copy tokenizer/config/preprocessor files from *src_dir* to *dst_dir*.

    Only files that exist in *src_dir* are copied; missing ones are skipped
    silently. ``shutil.copy2`` is used so file metadata is preserved.
    """
    config_files = [
        "config.json",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "generation_config.json",
        "preprocessor_config.json",
        "chat_template.json",
    ]

    for filename in config_files:
        src_file = src_dir / filename
        if src_file.exists():
            shutil.copy2(src_file, dst_dir / filename)
            # Bug fix: the original printed the literal text "Copied (unknown)"
            # (an f-string with no placeholder); report the actual file copied.
            print(f"Copied {filename}")
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: parse args, validate paths, then load, graft,
    save, and copy config files."""
    parser = argparse.ArgumentParser(
        description="Graft INTELLECT-3 weights into GLM-4.6V"
    )
    parser.add_argument(
        "--intellect3",
        type=Path,
        default=Path.home() / "models" / "INTELLECT-3",
        help="Path to INTELLECT-3 model directory",
    )
    parser.add_argument(
        "--glm",
        type=Path,
        default=Path.home() / "models" / "GLM-4.6V",
        help="Path to GLM-4.6V model directory",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path.home() / "models" / "INTELLECT-3-V",
        help="Path to output directory",
    )
    parser.add_argument(
        "--shard-size",
        type=int,
        default=5,
        help="Maximum shard size in GB (default: 5)",
    )
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("INTELLECT-3 -> GLM-4.6V Weight Grafting")
    print(banner)
    print(f"INTELLECT-3 source: {args.intellect3}")
    print(f"GLM-4.6V source: {args.glm}")
    print(f"Output directory: {args.output}")
    print(banner)

    # Fail fast on missing inputs before loading anything into memory.
    for path, label in ((args.intellect3, "INTELLECT-3"), (args.glm, "GLM-4.6V")):
        if not path.exists():
            raise FileNotFoundError(f"{label} directory not found: {path}")

    print("\nStep 1: Loading models into CPU memory...")
    intellect3_state = load_state_dict_from_safetensors(args.intellect3)
    glm_state = load_state_dict_from_safetensors(args.glm)

    print(f"\nINTELLECT-3 tensors: {len(intellect3_state)}")
    print(f"GLM-4.6V tensors: {len(glm_state)}")

    print("\nStep 2: Grafting INTELLECT-3 weights into GLM-4.6V...")
    grafted_state = graft_weights(intellect3_state, glm_state)

    # Release the source dicts so peak RSS stays near one model's worth.
    del intellect3_state
    del glm_state

    print("\nStep 3: Saving grafted model...")
    save_state_dict_to_safetensors(
        grafted_state,
        args.output,
        max_shard_size=args.shard_size * 1024 ** 3,
    )

    print("\nStep 4: Copying config files from GLM-4.6V...")
    copy_config_files(args.glm, args.output)

    print("\n" + banner)
    print("Done! Grafted model saved to:", args.output)
    print(banner)
|
|
|
|
|
|
|
|
# Script entry point: run the full grafting pipeline when invoked directly.
if __name__ == "__main__":


    main()
|
|
|