Upload convert_hf_to_scm.py with huggingface_hub
convert_hf_to_scm.py (ADDED, +179 -0)

"""
Convert GLM-4.7-Flash from HuggingFace format to ScatterMoE (SCM) format.

Usage:
    python convert_hf_to_scm.py <input_model_path> <output_model_path>

Example:
    python convert_hf_to_scm.py ~/.cache/huggingface/hub/models--zai-org--GLM-4.7-Flash/snapshots/<hash> ./GLM-4.7-Flash-SCM
"""
import glob
import json
import os
import re
import shutil
import sys

import accelerate
import torch
from safetensors import safe_open

# Make the bundled configuration/modeling modules importable regardless of the
# working directory
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from configuration_glm_scm import Glm4MoeLiteSCMConfig
from modeling_glm_scm import Glm4MoeLiteSCMForCausalLM

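# Minimal argument check (a sketch): fail fast with the usage string rather
# than an IndexError from the positional sys.argv reads below
if len(sys.argv) != 3:
    sys.exit(__doc__)
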
input_model = sys.argv[1]
output_model_path = sys.argv[2]

# auto_map lets AutoConfig/AutoModel* resolve the custom classes when the
# converted model is loaded with trust_remote_code=True
auto_map = {
    "AutoConfig": "configuration_glm_scm.Glm4MoeLiteSCMConfig",
    "AutoModel": "modeling_glm_scm.Glm4MoeLiteSCMModel",
    "AutoModelForCausalLM": "modeling_glm_scm.Glm4MoeLiteSCMForCausalLM",
}

# Load the original config; our config class can parse the original format
with open(os.path.join(input_model, "config.json")) as f:
    orig_config = json.load(f)

cfg_scm = Glm4MoeLiteSCMConfig(
    auto_map=auto_map,
    architectures=["Glm4MoeLiteSCMForCausalLM"],
    vocab_size=orig_config["vocab_size"],
    hidden_size=orig_config["hidden_size"],
    intermediate_size=orig_config["intermediate_size"],
    moe_intermediate_size=orig_config["moe_intermediate_size"],
    num_hidden_layers=orig_config["num_hidden_layers"],
    num_attention_heads=orig_config["num_attention_heads"],
    num_key_value_heads=orig_config["num_key_value_heads"],
    n_shared_experts=orig_config["n_shared_experts"],
    n_routed_experts=orig_config["n_routed_experts"],
    routed_scaling_factor=orig_config["routed_scaling_factor"],
    kv_lora_rank=orig_config["kv_lora_rank"],
    q_lora_rank=orig_config["q_lora_rank"],
    qk_rope_head_dim=orig_config["qk_rope_head_dim"],
    v_head_dim=orig_config["v_head_dim"],
    qk_nope_head_dim=orig_config["qk_nope_head_dim"],
    n_group=orig_config["n_group"],
    topk_group=orig_config["topk_group"],
    num_experts_per_tok=orig_config["num_experts_per_tok"],
    norm_topk_prob=orig_config["norm_topk_prob"],
    topk_method=orig_config["topk_method"],
    first_k_dense_replace=orig_config.get("first_k_dense_replace", 1),
    num_nextn_predict_layers=orig_config.get("num_nextn_predict_layers", 1),
    hidden_act=orig_config["hidden_act"],
    max_position_embeddings=orig_config["max_position_embeddings"],
    rms_norm_eps=orig_config["rms_norm_eps"],
    rope_theta=orig_config["rope_theta"],
    rope_scaling=orig_config.get("rope_scaling", None),
    rope_interleave=orig_config.get("rope_interleave", True),
    attention_bias=orig_config.get("attention_bias", False),
    attention_dropout=orig_config.get("attention_dropout", 0.0),
    tie_word_embeddings=orig_config.get("tie_word_embeddings", False),
    bos_token_id=orig_config.get("bos_token_id", 0),
    eos_token_id=orig_config.get("eos_token_id", 1),
    pad_token_id=orig_config.get("pad_token_id", None),
    torch_dtype=orig_config.get("dtype", "bfloat16"),
)

num_experts = cfg_scm.n_routed_experts
num_layers = cfg_scm.num_hidden_layers

# Create an empty (meta-device) model; real weights are attached later via
# load_state_dict(..., assign=True)
with accelerate.init_empty_weights():
    model_scm = Glm4MoeLiteSCMForCausalLM(cfg_scm)

model_scm = model_scm.to(torch.bfloat16)

# Find the checkpoint shards ("model-*-of-*.safetensors"), falling back to a
# single-file "model.safetensors"
pattern = f"{input_model}/model-*-of-*.safetensors"
files = sorted(glob.glob(pattern))
if len(files) == 0:
    pattern = f"{input_model}/model.safetensors"
    files = sorted(glob.glob(pattern))
if len(files) == 0:
    raise FileNotFoundError(f"No safetensors files found in {input_model}")

# Load all tensors from safetensors files
tensors = {}
for file_path in files:
    print(f"Loading {file_path}")
    with safe_open(file_path, framework="pt", device="cpu") as f:
        for key in f.keys():
            tensors[key] = f.get_tensor(key)

print(f"Loaded {len(tensors)} tensors")

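# Note: the loop above keeps the whole checkpoint in host RAM (about 2 bytes
# per parameter at bf16), and the expert stacking below temporarily adds
# copies of the expert weights, so budget host memory accordingly
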
# Filter out MTP (multi-/next-token prediction) tensors if present: any layer
# index >= num_hidden_layers belongs to the extra prediction head(s), which the
# SCM model does not include
filtered_tensors = {}
for key in tensors:
    layer_match = re.search(r"layers\.(\d+)", key)
    if layer_match and int(layer_match.group(1)) >= num_layers:
        print(f"Skipping next-token prediction layer key: {key}")
        continue
    filtered_tensors[key] = tensors[key]
tensors = filtered_tensors

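# Key layout produced below, per MoE layer (n = n_routed_experts):
#   experts.{i}.down_proj.weight         --stack--> moe_mlp.output_experts.weight [n, hidden, moe_inter]
#   cat(experts.{i}.up_proj, gate_proj)  --stack--> moe_mlp.experts.weight        [n, 2*moe_inter, hidden]
# Every other key (attention, router gate, shared experts, norms) is copied unchanged.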
# Convert weights
new_state_dict = {}
processed_layers = set()
for key in tensors:
    if "mlp.experts" not in key or "shared_experts" in key:
        # Non-expert weights: copy directly
        new_state_dict[key] = tensors[key]
    elif "experts.0." in key:
        # First expert triggers conversion for the whole layer; keys for the
        # remaining experts fall through and are consumed by the stacking below
        layer_num = int(re.search(r"layers\.(\d+)", key).group(1))
        if layer_num in processed_layers:
            continue
        processed_layers.add(layer_num)

        print(f"Converting experts for layer {layer_num}")

        # Stack down_proj -> output_experts.weight [n_experts, hidden_size, moe_intermediate_size]
        new_state_dict[
            f"model.layers.{layer_num}.mlp.moe_mlp.output_experts.weight"
        ] = torch.stack(
            [
                tensors[f"model.layers.{layer_num}.mlp.experts.{i}.down_proj.weight"]
                for i in range(num_experts)
            ]
        )

        # Stack cat(up_proj, gate_proj) -> experts.weight [n_experts, 2*moe_intermediate_size, hidden_size]
        new_state_dict[
            f"model.layers.{layer_num}.mlp.moe_mlp.experts.weight"
        ] = torch.stack(
            [
                torch.cat(
                    [
                        tensors[f"model.layers.{layer_num}.mlp.experts.{i}.up_proj.weight"],
                        tensors[f"model.layers.{layer_num}.mlp.experts.{i}.gate_proj.weight"],
                    ],
                    dim=0,
                )
                for i in range(num_experts)
            ]
        )

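# Optional shape sanity check on the first MoE layer (a sketch; "probe" is our
# name, and we assume first_k_dense_replace is the index of the first MoE layer,
# with all earlier layers dense)
probe = cfg_scm.first_k_dense_replace
if probe in processed_layers:
    w_in = new_state_dict[f"model.layers.{probe}.mlp.moe_mlp.experts.weight"]
    w_out = new_state_dict[f"model.layers.{probe}.mlp.moe_mlp.output_experts.weight"]
    assert w_in.shape == (num_experts, 2 * cfg_scm.moe_intermediate_size, cfg_scm.hidden_size)
    assert w_out.shape == (num_experts, cfg_scm.hidden_size, cfg_scm.moe_intermediate_size)
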
print(f"Converted state dict has {len(new_state_dict)} keys")

# Load and save. strict=True verifies that every parameter is covered;
# assign=True swaps the meta-device parameters from init_empty_weights for the
# real tensors instead of copying into them
model_scm.load_state_dict(new_state_dict, strict=True, assign=True)
model_scm.save_pretrained(output_model_path)
cfg_scm.save_pretrained(output_model_path)

# Copy modeling and config files
script_dir = os.path.dirname(os.path.abspath(__file__))
for fname in ["modeling_glm_scm.py", "configuration_glm_scm.py"]:
    shutil.copy(os.path.join(script_dir, fname), os.path.join(output_model_path, fname))

# Copy tokenizer files
for fname in os.listdir(input_model):
    if fname.startswith("tokenizer") or fname in [
        "special_tokens_map.json",
        "chat_template.jinja",
    ]:
        src = os.path.join(input_model, fname)
        if os.path.isfile(src):
            shutil.copy(src, os.path.join(output_model_path, fname))

print(f"Model saved to {output_model_path}")
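
# Optional reload smoke test (a sketch; the SCM_SMOKE_TEST env var is our
# opt-in flag, and reloading assumes trust_remote_code resolves the auto_map
# entries written above)
if os.environ.get("SCM_SMOKE_TEST"):
    from transformers import AutoConfig, AutoModelForCausalLM

    cfg_check = AutoConfig.from_pretrained(output_model_path, trust_remote_code=True)
    assert cfg_check.architectures == ["Glm4MoeLiteSCMForCausalLM"]
    reloaded = AutoModelForCausalLM.from_pretrained(
        output_model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
    )
    print(f"Smoke test OK: reloaded {type(reloaded).__name__}")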