"""Generate a minimized ~20B Kimi-K2.5-NVFP4 model for architecture testing.

This creates random weights with the correct tensor names, shapes, and dtypes
to match the NVFP4 quantization format used by the original model.

Mini model specs (TP=2 compatible):
    hidden_size=4096, heads=32, layers=12, experts=64, moe_intermediate=2048
"""
|
|
import json
import os
import struct
from pathlib import Path

import numpy as np
from safetensors.numpy import save_file
|
|
|
|
| |
| |
| |
# Language-model dimensions of the mini checkpoint.  update_config() copies
# most of these into config["text_config"]; the tensor builders use them to
# size the generated arrays.
HIDDEN = 4096
NUM_HEADS = 32
NUM_KV_HEADS = 32
NUM_LAYERS = 12
INTERMEDIATE = 11008  # width of the dense (non-MoE) MLP projections
VOCAB = 163840  # row count of embed_tokens and lm_head
N_ROUTED_EXPERTS = 64
N_SHARED_EXPERTS = 1
NUM_EXPERTS_PER_TOK = 8
MOE_INTERMEDIATE = 2048  # width of each MoE expert's projections
Q_LORA_RANK = 1536
KV_LORA_RANK = 512
QK_NOPE_HEAD_DIM = 128
QK_ROPE_HEAD_DIM = 64
V_HEAD_DIM = 128
FIRST_K_DENSE_REPLACE = 1  # layers with index below this use the dense MLP
GROUP_SIZE = 16  # input elements covered by one weight_scale entry
|
|
| |
# Vision-tower and multimodal-projector dimensions.
VT_HIDDEN = 1152
VT_LAYERS = 4  # number of encoder blocks generated; also written to vision_config
VT_HEADS = 16
VT_INTERMEDIATE = 4304  # width of each encoder block's MLP
PATCH_SIZE = 14  # spatial size of the patch-embedding kernel
MERGE_KERNEL = [2, 2]
MM_HIDDEN = VT_HIDDEN
# The projector works on MERGE_KERNEL[0] * MERGE_KERNEL[1] vision features
# concatenated together -- presumably one merged patch group; TODO confirm.
MM_PROJECTED = MM_HIDDEN * MERGE_KERNEL[0] * MERGE_KERNEL[1]
|
|
|
|
def make_bf16(shape):
    """Return random, finite BF16 bit patterns stored in a uint16 array.

    NumPy has no bfloat16 dtype, so every element is the raw 16-bit encoding
    (1 sign bit, 8 exponent bits, 7 mantissa bits).

    Args:
        shape: Shape of the array to create (int or tuple of ints).

    Returns:
        ``np.ndarray`` of dtype ``uint16`` with the requested shape.
    """
    # ``high`` is exclusive, so 1 << 16 covers the full 16-bit range.
    bits = np.random.randint(0, 1 << 16, size=shape, dtype=np.uint16)
    # Clearing the top exponent bit (bit 14) caps the exponent field at 0x7F.
    # No element can then be Inf/NaN (exponent 0xFF) and every magnitude stays
    # below 2.0; fully random bits would make roughly 1 in 256 weights
    # non-finite.
    bits &= np.uint16(0xBFFF)
    return bits
|
|
|
|
def make_fp4_weight(out_features, in_features):
    """Return random packed FP4 weights of shape ``[out, in // 2]`` as uint8.

    Two 4-bit values share one byte, which is why the last dimension is
    halved.

    Args:
        out_features: Number of output rows.
        in_features: Number of (unpacked) input columns; must be even.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` and shape
        ``(out_features, in_features // 2)``.

    Raises:
        ValueError: If ``in_features`` is odd and cannot be packed in pairs.
    """
    if in_features % 2:
        raise ValueError(
            f"in_features must be even to pack two FP4 values per byte, got {in_features}"
        )
    # ``high`` is exclusive: 256 is required so that byte 0xFF is also drawn.
    return np.random.randint(
        0, 256, size=(out_features, in_features // 2), dtype=np.uint8
    )
|
|
|
|
def make_fp8_scale(out_features, in_features):
    """Return random FP8 E4M3 block scales of shape ``[out, in // GROUP_SIZE]``.

    The scales are raw FP8 bit patterns held in a uint8 array, one per group
    of ``GROUP_SIZE`` input elements.

    Args:
        out_features: Number of output rows.
        in_features: Number of input columns; must be a multiple of
            ``GROUP_SIZE``.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` and shape
        ``(out_features, in_features // GROUP_SIZE)``.

    Raises:
        ValueError: If ``in_features`` is not a multiple of ``GROUP_SIZE``.
    """
    if in_features % GROUP_SIZE:
        raise ValueError(
            f"in_features must be a multiple of GROUP_SIZE={GROUP_SIZE}, got {in_features}"
        )
    # Only codes 0x00..0x77 are drawn: sign bit clear and exponent field below
    # 0b1111.  That keeps every scale non-negative and finite; in the E4M3FN
    # encoding 0x7F/0xFF are NaN, and IEEE-style layouts reserve the whole
    # all-ones exponent, so unrestricted random bytes could yield NaN scales.
    return np.random.randint(
        0, 0x78, size=(out_features, in_features // GROUP_SIZE), dtype=np.uint8
    )
|
|
|
|
def make_scalar_f32():
    """Return a zero-dimensional float32 array holding the value 1.0."""
    return np.ones((), dtype=np.float32)
|
|
|
|
def add_quantized_linear(tensors, prefix, out_features, in_features):
    """Register the four tensors of one NVFP4-quantized linear layer.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
        prefix: Module path of the layer (keys are ``prefix`` + suffix).
        out_features: Output dimension of the layer.
        in_features: Input dimension of the layer.
    """
    tensors.update(
        {
            # Packed 4-bit weights and their per-group FP8 scales.
            f"{prefix}.weight": make_fp4_weight(out_features, in_features),
            f"{prefix}.weight_scale": make_fp8_scale(out_features, in_features),
            # Two float32 scalars, both fixed at 1.0.
            f"{prefix}.weight_scale_2": make_scalar_f32(),
            f"{prefix}.input_scale": make_scalar_f32(),
        }
    )
|
|
|
|
def add_bf16_linear(tensors, prefix, out_features, in_features, bias=False):
    """Register an unquantized BF16 linear layer under ``prefix``.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
        prefix: Module path of the layer.
        out_features: Output dimension (rows of the weight).
        in_features: Input dimension (columns of the weight).
        bias: When true, also add a ``.bias`` vector of length ``out_features``.
    """
    weight_shape = (out_features, in_features)
    tensors[f"{prefix}.weight"] = make_bf16(weight_shape)
    if not bias:
        return
    tensors[f"{prefix}.bias"] = make_bf16((out_features,))
|
|
|
|
def add_attention(tensors, layer_prefix):
    """Register the MLA self-attention tensors of one decoder layer.

    All projections and layernorms are BF16 (the config updaters list
    ``self_attn*`` as excluded from quantization); the two KV scales are
    float32 scalars.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
        layer_prefix: Module path of the decoder layer.
    """
    attn = f"{layer_prefix}.self_attn"
    qk_head_dim = QK_NOPE_HEAD_DIM + QK_ROPE_HEAD_DIM

    # (name suffix, shape) in the order the tensors are created.
    bf16_layout = (
        # Low-rank query path.
        ("q_a_proj.weight", (Q_LORA_RANK, HIDDEN)),
        ("q_a_layernorm.weight", (Q_LORA_RANK,)),
        ("q_b_proj.weight", (NUM_HEADS * qk_head_dim, Q_LORA_RANK)),
        # Low-rank key/value path.
        ("kv_a_proj_with_mqa.weight", (KV_LORA_RANK + QK_ROPE_HEAD_DIM, HIDDEN)),
        ("kv_a_layernorm.weight", (KV_LORA_RANK,)),
        (
            "kv_b_proj.weight",
            (NUM_HEADS * (QK_NOPE_HEAD_DIM + V_HEAD_DIM), KV_LORA_RANK),
        ),
        # Output projection.
        ("o_proj.weight", (HIDDEN, NUM_HEADS * V_HEAD_DIM)),
    )
    for suffix, shape in bf16_layout:
        tensors[f"{attn}.{suffix}"] = make_bf16(shape)

    for suffix in ("k_proj.k_scale", "v_proj.v_scale"):
        tensors[f"{attn}.{suffix}"] = make_scalar_f32()
|
|
|
|
def add_dense_mlp(tensors, layer_prefix):
    """Register the quantized dense MLP of one decoder layer.

    generate_all_tensors() uses this for layers whose index is below
    ``FIRST_K_DENSE_REPLACE``.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
        layer_prefix: Module path of the decoder layer.
    """
    mlp = f"{layer_prefix}.mlp"
    projections = (
        ("gate_proj", INTERMEDIATE, HIDDEN),
        ("up_proj", INTERMEDIATE, HIDDEN),
        ("down_proj", HIDDEN, INTERMEDIATE),
    )
    for proj_name, out_dim, in_dim in projections:
        add_quantized_linear(tensors, f"{mlp}.{proj_name}", out_dim, in_dim)
|
|
|
|
def add_moe_mlp(tensors, layer_prefix):
    """Register the MoE MLP of one decoder layer.

    Adds the BF16 router (``gate``), the quantized shared expert and
    ``N_ROUTED_EXPERTS`` quantized routed experts.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
        layer_prefix: Module path of the decoder layer.
    """
    mlp = f"{layer_prefix}.mlp"

    # Router: one row / one bias entry per routed expert.
    tensors[f"{mlp}.gate.weight"] = make_bf16((N_ROUTED_EXPERTS, HIDDEN))
    tensors[f"{mlp}.gate.e_score_correction_bias"] = make_bf16((N_ROUTED_EXPERTS,))

    def add_expert(prefix, width):
        # gate/up project HIDDEN -> width, down projects width -> HIDDEN.
        add_quantized_linear(tensors, f"{prefix}.gate_proj", width, HIDDEN)
        add_quantized_linear(tensors, f"{prefix}.up_proj", width, HIDDEN)
        add_quantized_linear(tensors, f"{prefix}.down_proj", HIDDEN, width)

    # The shared experts are stored as a single fused MLP.  Its width scales
    # with N_SHARED_EXPERTS so the tensors keep matching the n_shared_experts
    # value written to config.json; with N_SHARED_EXPERTS == 1 this equals
    # MOE_INTERMEDIATE.
    # NOTE(review): follows the DeepSeek-V3 convention
    # (moe_intermediate_size * n_shared_experts) -- confirm against the loader.
    if N_SHARED_EXPERTS:
        add_expert(f"{mlp}.shared_experts", MOE_INTERMEDIATE * N_SHARED_EXPERTS)

    for expert_idx in range(N_ROUTED_EXPERTS):
        add_expert(f"{mlp}.experts.{expert_idx}", MOE_INTERMEDIATE)
|
|
|
|
def add_vision_tower(tensors):
    """Register every vision-tower tensor (all BF16).

    Covers the patch embedding, ``VT_LAYERS`` encoder blocks and the final
    layernorm.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
    """
    root = "vision_tower"

    # Patch embedding: conv-style projection plus a 64x64 position table.
    embed = f"{root}.patch_embed"
    tensors[f"{embed}.proj.weight"] = make_bf16(
        (VT_HIDDEN, 3, PATCH_SIZE, PATCH_SIZE)
    )
    tensors[f"{embed}.proj.bias"] = make_bf16((VT_HIDDEN,))
    tensors[f"{embed}.pos_emb.weight"] = make_bf16((64, 64, VT_HIDDEN))

    # (name suffix, shape) for one encoder block, in creation order.
    block_layout = (
        ("wqkv.weight", (3 * VT_HIDDEN, VT_HIDDEN)),
        ("wqkv.bias", (3 * VT_HIDDEN,)),
        ("wo.weight", (VT_HIDDEN, VT_HIDDEN)),
        ("wo.bias", (VT_HIDDEN,)),
        ("norm0.weight", (VT_HIDDEN,)),
        ("norm0.bias", (VT_HIDDEN,)),
        ("norm1.weight", (VT_HIDDEN,)),
        ("norm1.bias", (VT_HIDDEN,)),
        ("mlp.fc0.weight", (VT_INTERMEDIATE, VT_HIDDEN)),
        ("mlp.fc0.bias", (VT_INTERMEDIATE,)),
        ("mlp.fc1.weight", (VT_HIDDEN, VT_INTERMEDIATE)),
        ("mlp.fc1.bias", (VT_HIDDEN,)),
    )
    for block_idx in range(VT_LAYERS):
        block = f"{root}.encoder.blocks.{block_idx}"
        for suffix, shape in block_layout:
            tensors[f"{block}.{suffix}"] = make_bf16(shape)

    for param in ("weight", "bias"):
        tensors[f"{root}.encoder.final_layernorm.{param}"] = make_bf16((VT_HIDDEN,))
|
|
|
|
def add_mm_projector(tensors):
    """Register the multimodal projector tensors (all BF16).

    The projector is a pre-norm followed by two linear layers stored as
    ``proj.0`` and ``proj.2``; the second one maps to the text ``HIDDEN`` size.

    Args:
        tensors: Dict of tensor name -> array, updated in place.
    """
    layout = (
        ("pre_norm.weight", (MM_HIDDEN,)),
        ("pre_norm.bias", (MM_HIDDEN,)),
        ("proj.0.weight", (MM_PROJECTED, MM_PROJECTED)),
        ("proj.0.bias", (MM_PROJECTED,)),
        ("proj.2.weight", (HIDDEN, MM_PROJECTED)),
        ("proj.2.bias", (HIDDEN,)),
    )
    for suffix, shape in layout:
        tensors[f"mm_projector.{suffix}"] = make_bf16(shape)
|
|
|
|
def generate_all_tensors():
    """Build and return the complete name -> array dict for the mini model.

    Order of creation: token embedding, the decoder layers, final norm,
    LM head, vision tower, multimodal projector.

    Returns:
        Dict mapping tensor names to NumPy arrays.
    """
    lm = "language_model"
    tensors = {f"{lm}.model.embed_tokens.weight": make_bf16((VOCAB, HIDDEN))}

    for idx in range(NUM_LAYERS):
        layer = f"{lm}.model.layers.{idx}"
        for norm in ("input_layernorm", "post_attention_layernorm"):
            tensors[f"{layer}.{norm}.weight"] = make_bf16((HIDDEN,))

        add_attention(tensors, layer)

        # The first FIRST_K_DENSE_REPLACE layers are dense; the rest are MoE.
        add_mlp = add_moe_mlp if idx >= FIRST_K_DENSE_REPLACE else add_dense_mlp
        add_mlp(tensors, layer)

    tensors[f"{lm}.model.norm.weight"] = make_bf16((HIDDEN,))
    tensors[f"{lm}.lm_head.weight"] = make_bf16((VOCAB, HIDDEN))

    add_vision_tower(tensors)
    add_mm_projector(tensors)
    return tensors
|
|
|
|
def compute_total_params(tensors):
    """Return the approximate number of model parameters in ``tensors``.

    Only weights and biases are counted; quantization metadata
    (``weight_scale``, ``weight_scale_2``, ``input_scale``, ``k_scale``,
    ``v_scale``) is skipped because those names match neither suffix below.

    Args:
        tensors: Dict mapping tensor names to NumPy arrays.

    Returns:
        Total parameter count as an int.
    """
    total = 0
    for name, arr in tensors.items():
        if name.endswith(".weight"):
            # uint8 weights are packed FP4: each byte carries two parameters.
            total += arr.size * 2 if arr.dtype == np.uint8 else arr.size
        elif name.endswith((".bias", ".e_score_correction_bias")):
            # The router's correction bias ends in "_bias", not ".bias", so it
            # needs its own suffix to be counted.
            total += arr.size
    return total
|
|
|
|
def save_sharded(tensors, output_dir, max_shard_bytes=5_000_000_000):
    """Write ``tensors`` as sharded safetensors files plus an index file.

    Tensors are visited in sorted-name order and packed greedily: a new shard
    starts whenever adding the next tensor would exceed ``max_shard_bytes``.
    A single tensor larger than the limit still gets a shard of its own.

    Args:
        tensors: Dict mapping tensor names to NumPy arrays.
        output_dir: Directory to write into; created if it does not exist.
        max_shard_bytes: Soft upper bound on the payload size of one shard.

    Returns:
        The number of shard files written.
    """
    output_dir = Path(output_dir)
    # Create the target so the function also works on a fresh directory.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Greedy packing in deterministic (sorted) order.
    shards = []
    current_shard = {}
    current_size = 0
    for name in sorted(tensors):
        arr = tensors[name]
        if current_shard and current_size + arr.nbytes > max_shard_bytes:
            shards.append(current_shard)
            current_shard, current_size = {}, 0
        current_shard[name] = arr
        current_size += arr.nbytes
    if current_shard:
        shards.append(current_shard)

    num_shards = len(shards)
    weight_map = {}
    total_size = 0

    # NOTE(review): safetensors.numpy records each array's NumPy dtype, so the
    # BF16/FP8 bit patterns held in uint16/uint8 arrays are presumably tagged
    # as plain integers in the file header -- confirm the consumer handles it.
    # NOTE(review): shard files left over from an earlier run with a different
    # shard count are not removed here.
    for i, shard in enumerate(shards, 1):
        filename = f"model-{i:05d}-of-{num_shards:05d}.safetensors"
        save_file(shard, str(output_dir / filename))

        shard_bytes = sum(arr.nbytes for arr in shard.values())
        print(f" Saved {filename} ({len(shard)} tensors, {shard_bytes / 1e9:.2f} GB)")

        weight_map.update(dict.fromkeys(shard, filename))
        total_size += shard_bytes

    index = {
        "metadata": {
            "total_size": total_size,
        },
        "weight_map": weight_map,
    }

    index_path = output_dir / "model.safetensors.index.json"
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, sort_keys=True)
    print(f" Saved index ({len(weight_map)} tensors, {num_shards} shards, {total_size / 1e9:.2f} GB total)")

    return num_shards
|
|
|
|
def update_config(output_dir):
    """Rewrite ``config.json`` in ``output_dir`` with the mini-model dimensions.

    The file must already exist and contain ``text_config``,
    ``vision_config`` and ``quantization_config`` sections; every other entry
    is left untouched.

    Args:
        output_dir: Directory that holds ``config.json``.
    """
    config_path = Path(output_dir) / "config.json"
    config = json.loads(config_path.read_text())

    # Text model dimensions.
    config["text_config"].update(
        {
            "hidden_size": HIDDEN,
            "num_attention_heads": NUM_HEADS,
            "num_key_value_heads": NUM_KV_HEADS,
            "num_hidden_layers": NUM_LAYERS,
            "intermediate_size": INTERMEDIATE,
            "n_routed_experts": N_ROUTED_EXPERTS,
            "n_shared_experts": N_SHARED_EXPERTS,
            "num_experts_per_tok": NUM_EXPERTS_PER_TOK,
            "moe_intermediate_size": MOE_INTERMEDIATE,
            "q_lora_rank": Q_LORA_RANK,
            "kv_lora_rank": KV_LORA_RANK,
            "qk_nope_head_dim": QK_NOPE_HEAD_DIM,
            "qk_rope_head_dim": QK_ROPE_HEAD_DIM,
            "v_head_dim": V_HEAD_DIM,
            "first_k_dense_replace": FIRST_K_DENSE_REPLACE,
        }
    )

    # Vision tower depth and the text width it projects into.
    config["vision_config"].update(
        {
            "vt_num_hidden_layers": VT_LAYERS,
            "text_hidden_size": HIDDEN,
        }
    )

    # Modules left unquantized: LM head, projector, vision tower and the
    # self-attention of every (now fewer) decoder layer.
    attn_patterns = [
        f"language_model.model.layers.{i}.self_attn*" for i in range(NUM_LAYERS)
    ]
    config["quantization_config"]["ignore"] = sorted(
        ["language_model.lm_head", "mm_projector*", "vision_tower*", *attn_patterns]
    )

    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)
    print(" Updated config.json")
|
|
|
|
def update_hf_quant_config(output_dir):
    """Rewrite the ``exclude_modules`` list of ``hf_quant_config.json``.

    The file must already exist in ``output_dir`` and contain a
    ``quantization`` section; only its ``exclude_modules`` entry is replaced.

    Args:
        output_dir: Directory that holds ``hf_quant_config.json``.
    """
    quant_path = Path(output_dir) / "hf_quant_config.json"
    config = json.loads(quant_path.read_text())

    # Same unquantized modules as in config.json: LM head, projector, vision
    # tower and the self-attention of each decoder layer.
    attn_patterns = [
        f"language_model.model.layers.{i}.self_attn*" for i in range(NUM_LAYERS)
    ]
    config["quantization"]["exclude_modules"] = sorted(
        ["language_model.lm_head", "mm_projector*", "vision_tower*", *attn_patterns]
    )

    with open(quant_path, "w") as f:
        json.dump(config, f, indent=4)
    print(" Updated hf_quant_config.json")
|
|
|
|
def main(output_dir="/home/ubuntu/.cache/huggingface/kimi-mini"):
    """Generate the mini model and write it into ``output_dir``.

    The directory must already contain ``config.json`` and
    ``hf_quant_config.json``; both are rewritten in place before any tensor
    is generated.

    Args:
        output_dir: Target directory.  Defaults to the path the script was
            originally hard-wired to.
    """
    print("Generating mini Kimi-K2.5-NVFP4 model...")
    print(f" Dimensions: hidden={HIDDEN}, heads={NUM_HEADS}, layers={NUM_LAYERS}")
    print(f" MoE: {N_ROUTED_EXPERTS} experts, {NUM_EXPERTS_PER_TOK} per token")
    print(f" Vision: {VT_LAYERS} layers, hidden={VT_HIDDEN}")
    print()

    # Configs go first: a missing or malformed template then fails before the
    # expensive tensor generation starts.
    print("Updating configs...")
    update_config(output_dir)
    update_hf_quant_config(output_dir)
    print()

    print("Generating tensors...")
    tensors = generate_all_tensors()
    total_params = compute_total_params(tensors)
    print(f" Total tensors: {len(tensors)}")
    print(f" Approx total params: {total_params / 1e9:.1f}B")
    print()

    print("Saving sharded safetensors...")
    save_sharded(tensors, output_dir)
    print()

    print("Done! Mini model saved to:", output_dir)


if __name__ == "__main__":
    main()
|
|