#!/usr/bin/env python3
"""Generate a minimized ~20B Kimi-K2.5-NVFP4 model for architecture testing.

This creates random weights with the correct tensor names, shapes, and dtypes
to match the NVFP4 quantization format used by the original model.

Mini model specs (TP=2 compatible):
    hidden_size=4096, heads=32, layers=12, experts=64, moe_intermediate=2048
"""

import json
import os
import struct
from pathlib import Path

import numpy as np

try:
    from safetensors.numpy import save_file
except ImportError:
    # Keep the shape/config helpers importable without safetensors. Every code
    # path that writes shards calls _require_safetensors() before doing work.
    save_file = None

# ============================================================
# Mini model dimensions
# ============================================================
HIDDEN = 4096
NUM_HEADS = 32
NUM_KV_HEADS = 32  # MLA uses same as heads
NUM_LAYERS = 12
INTERMEDIATE = 11008
VOCAB = 163840
N_ROUTED_EXPERTS = 64
N_SHARED_EXPERTS = 1
NUM_EXPERTS_PER_TOK = 8
MOE_INTERMEDIATE = 2048
Q_LORA_RANK = 1536  # keep original to match FlashInfer MLA head_size
KV_LORA_RANK = 512  # keep original: head_size = 512+64 = 576
QK_NOPE_HEAD_DIM = 128
QK_ROPE_HEAD_DIM = 64
V_HEAD_DIM = 128
FIRST_K_DENSE_REPLACE = 1
GROUP_SIZE = 16

# Vision tower
VT_HIDDEN = 1152
VT_LAYERS = 4  # reduced from 27
VT_HEADS = 16
VT_INTERMEDIATE = 4304
PATCH_SIZE = 14
MERGE_KERNEL = [2, 2]
MM_HIDDEN = VT_HIDDEN  # 1152
MM_PROJECTED = MM_HIDDEN * MERGE_KERNEL[0] * MERGE_KERNEL[1]  # 4608

# Directory main() writes to when no explicit path is given.
DEFAULT_OUTPUT_DIR = "/home/ubuntu/.cache/huggingface/kimi-mini"


def _require_safetensors() -> None:
    """Raise ImportError if the safetensors writer could not be imported."""
    if save_file is None:
        raise ImportError(
            "safetensors is required to write model shards "
            "(pip install safetensors)"
        )


def make_bf16(shape) -> np.ndarray:
    """Return random 16-bit patterns of ``shape`` standing in for BF16 data.

    The values are held as ``np.uint16``. ``randint`` treats its upper bound
    as exclusive, so the pattern 0xFFFF is never drawn.
    """
    return np.random.randint(0, 65535, size=shape, dtype=np.uint16)


def make_fp4_weight(out_features: int, in_features: int) -> np.ndarray:
    """Return a random FP4-packed weight of shape [out, in // 2] as uint8."""
    return np.random.randint(
        0, 255, size=(out_features, in_features // 2), dtype=np.uint8
    )


def make_fp8_scale(out_features: int, in_features: int) -> np.ndarray:
    """Return a random FP8 E4M3 scale of shape [out, in // GROUP_SIZE] as uint8."""
    return np.random.randint(
        0, 255, size=(out_features, in_features // GROUP_SIZE), dtype=np.uint8
    )


def make_scalar_f32() -> np.ndarray:
    """Return a 0-d float32 array holding 1.0."""
    return np.array(1.0, dtype=np.float32)


def add_quantized_linear(tensors, prefix, out_features, in_features):
    """Add the four tensors of an NVFP4 quantized linear layer under ``prefix``.

    Writes ``weight`` (packed FP4), ``weight_scale`` (per-group FP8 scale) and
    the scalar ``weight_scale_2`` / ``input_scale`` entries into ``tensors``.
    """
    tensors[f"{prefix}.weight"] = make_fp4_weight(out_features, in_features)
    tensors[f"{prefix}.weight_scale"] = make_fp8_scale(out_features, in_features)
    tensors[f"{prefix}.weight_scale_2"] = make_scalar_f32()
    tensors[f"{prefix}.input_scale"] = make_scalar_f32()


def add_bf16_linear(tensors, prefix, out_features, in_features, bias=False):
    """Add a BF16 linear layer (weight, plus bias when ``bias`` is true)."""
    tensors[f"{prefix}.weight"] = make_bf16((out_features, in_features))
    if bias:
        tensors[f"{prefix}.bias"] = make_bf16((out_features,))


def _add_gated_mlp(tensors, prefix, intermediate):
    """Add quantized gate/up/down projections of a gated MLP under ``prefix``."""
    add_quantized_linear(tensors, f"{prefix}.gate_proj", intermediate, HIDDEN)
    add_quantized_linear(tensors, f"{prefix}.up_proj", intermediate, HIDDEN)
    add_quantized_linear(tensors, f"{prefix}.down_proj", HIDDEN, intermediate)


def add_attention(tensors, layer_prefix):
    """Add MLA attention tensors (all BF16, excluded from quantization)."""
    p = f"{layer_prefix}.self_attn"

    # q path
    tensors[f"{p}.q_a_proj.weight"] = make_bf16((Q_LORA_RANK, HIDDEN))
    tensors[f"{p}.q_a_layernorm.weight"] = make_bf16((Q_LORA_RANK,))
    q_b_out = NUM_HEADS * (QK_NOPE_HEAD_DIM + QK_ROPE_HEAD_DIM)  # 32*192=6144
    tensors[f"{p}.q_b_proj.weight"] = make_bf16((q_b_out, Q_LORA_RANK))

    # kv path
    kv_a_out = KV_LORA_RANK + QK_ROPE_HEAD_DIM  # 512+64=576
    tensors[f"{p}.kv_a_proj_with_mqa.weight"] = make_bf16((kv_a_out, HIDDEN))
    tensors[f"{p}.kv_a_layernorm.weight"] = make_bf16((KV_LORA_RANK,))
    kv_b_out = NUM_HEADS * (QK_NOPE_HEAD_DIM + V_HEAD_DIM)  # 32*256=8192
    tensors[f"{p}.kv_b_proj.weight"] = make_bf16((kv_b_out, KV_LORA_RANK))

    # output
    o_in = NUM_HEADS * V_HEAD_DIM  # 32*128=4096
    tensors[f"{p}.o_proj.weight"] = make_bf16((HIDDEN, o_in))

    # KV cache scales
    tensors[f"{p}.k_proj.k_scale"] = make_scalar_f32()
    tensors[f"{p}.v_proj.v_scale"] = make_scalar_f32()


def add_dense_mlp(tensors, layer_prefix):
    """Add the quantized dense MLP used by the first dense layer(s)."""
    _add_gated_mlp(tensors, f"{layer_prefix}.mlp", INTERMEDIATE)


def add_moe_mlp(tensors, layer_prefix):
    """Add a MoE MLP: BF16 router, quantized shared and routed experts."""
    p = f"{layer_prefix}.mlp"

    # Router gate
    tensors[f"{p}.gate.weight"] = make_bf16((N_ROUTED_EXPERTS, HIDDEN))
    tensors[f"{p}.gate.e_score_correction_bias"] = make_bf16((N_ROUTED_EXPERTS,))

    # Shared experts
    _add_gated_mlp(tensors, f"{p}.shared_experts", MOE_INTERMEDIATE)

    # Routed experts
    for e in range(N_ROUTED_EXPERTS):
        _add_gated_mlp(tensors, f"{p}.experts.{e}", MOE_INTERMEDIATE)


def add_vision_tower(tensors):
    """Add vision tower tensors (all BF16)."""
    # Patch embedding
    tensors["vision_tower.patch_embed.proj.weight"] = make_bf16(
        (VT_HIDDEN, 3, PATCH_SIZE, PATCH_SIZE)
    )
    tensors["vision_tower.patch_embed.proj.bias"] = make_bf16((VT_HIDDEN,))
    tensors["vision_tower.patch_embed.pos_emb.weight"] = make_bf16((64, 64, VT_HIDDEN))

    # Transformer blocks
    for b in range(VT_LAYERS):
        bp = f"vision_tower.encoder.blocks.{b}"
        # QKV fused
        tensors[f"{bp}.wqkv.weight"] = make_bf16((3 * VT_HIDDEN, VT_HIDDEN))
        tensors[f"{bp}.wqkv.bias"] = make_bf16((3 * VT_HIDDEN,))
        # Output proj
        tensors[f"{bp}.wo.weight"] = make_bf16((VT_HIDDEN, VT_HIDDEN))
        tensors[f"{bp}.wo.bias"] = make_bf16((VT_HIDDEN,))
        # Norms
        tensors[f"{bp}.norm0.weight"] = make_bf16((VT_HIDDEN,))
        tensors[f"{bp}.norm0.bias"] = make_bf16((VT_HIDDEN,))
        tensors[f"{bp}.norm1.weight"] = make_bf16((VT_HIDDEN,))
        tensors[f"{bp}.norm1.bias"] = make_bf16((VT_HIDDEN,))
        # MLP
        tensors[f"{bp}.mlp.fc0.weight"] = make_bf16((VT_INTERMEDIATE, VT_HIDDEN))
        tensors[f"{bp}.mlp.fc0.bias"] = make_bf16((VT_INTERMEDIATE,))
        tensors[f"{bp}.mlp.fc1.weight"] = make_bf16((VT_HIDDEN, VT_INTERMEDIATE))
        tensors[f"{bp}.mlp.fc1.bias"] = make_bf16((VT_HIDDEN,))

    # Final layernorm
    tensors["vision_tower.encoder.final_layernorm.weight"] = make_bf16((VT_HIDDEN,))
    tensors["vision_tower.encoder.final_layernorm.bias"] = make_bf16((VT_HIDDEN,))


def add_mm_projector(tensors):
    """Add multimodal projector tensors (BF16)."""
    tensors["mm_projector.pre_norm.weight"] = make_bf16((MM_HIDDEN,))
    tensors["mm_projector.pre_norm.bias"] = make_bf16((MM_HIDDEN,))
    tensors["mm_projector.proj.0.weight"] = make_bf16((MM_PROJECTED, MM_PROJECTED))
    tensors["mm_projector.proj.0.bias"] = make_bf16((MM_PROJECTED,))
    tensors["mm_projector.proj.2.weight"] = make_bf16((HIDDEN, MM_PROJECTED))
    tensors["mm_projector.proj.2.bias"] = make_bf16((HIDDEN,))


def generate_all_tensors():
    """Build and return a dict mapping every tensor name to its random array.

    All tensors are materialized in memory at once.
    """
    tensors = {}

    # Embeddings
    tensors["language_model.model.embed_tokens.weight"] = make_bf16((VOCAB, HIDDEN))

    # Language model layers
    for layer_idx in range(NUM_LAYERS):
        lp = f"language_model.model.layers.{layer_idx}"
        tensors[f"{lp}.input_layernorm.weight"] = make_bf16((HIDDEN,))
        tensors[f"{lp}.post_attention_layernorm.weight"] = make_bf16((HIDDEN,))

        # Attention (always MLA, always BF16)
        add_attention(tensors, lp)

        # MLP: dense for the first FIRST_K_DENSE_REPLACE layers, MoE for rest
        if layer_idx < FIRST_K_DENSE_REPLACE:
            add_dense_mlp(tensors, lp)
        else:
            add_moe_mlp(tensors, lp)

    # Final norm
    tensors["language_model.model.norm.weight"] = make_bf16((HIDDEN,))

    # LM head (BF16, excluded from quant)
    tensors["language_model.lm_head.weight"] = make_bf16((VOCAB, HIDDEN))

    # Vision tower
    add_vision_tower(tensors)

    # MM projector
    add_mm_projector(tensors)

    return tensors


def compute_total_params(tensors) -> int:
    """Return the logical parameter count of ``tensors``.

    Only entries whose name ends in ``.weight`` or ``.bias`` are counted, so
    scale tensors are skipped. A uint8 ``.weight`` is treated as FP4-packed
    and contributes two parameters per stored byte.
    """
    total = 0
    for name, arr in tensors.items():
        if name.endswith(".weight"):
            # FP4 packed: two 4-bit values live in every uint8 element.
            total += arr.size * 2 if arr.dtype == np.uint8 else arr.size
        elif name.endswith(".bias"):
            total += arr.size
    return total


def save_sharded(tensors, output_dir, max_shard_bytes=5_000_000_000):
    """Save ``tensors`` as sharded safetensors files plus an index file.

    Tensors are visited in sorted-name order and packed greedily: a new shard
    starts when adding the next tensor would exceed ``max_shard_bytes``. A
    single tensor larger than the limit still gets a shard of its own.
    ``output_dir`` is created if it does not exist.

    Returns:
        The number of shard files written.

    Raises:
        ImportError: if safetensors is not installed.
    """
    _require_safetensors()
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Greedy packing over sorted names keeps the sharding deterministic.
    shards = []
    current_shard = {}
    current_size = 0
    for name in sorted(tensors):
        arr = tensors[name]
        if current_shard and current_size + arr.nbytes > max_shard_bytes:
            shards.append(current_shard)
            current_shard = {}
            current_size = 0
        current_shard[name] = arr
        current_size += arr.nbytes
    if current_shard:
        shards.append(current_shard)

    num_shards = len(shards)
    weight_map = {}
    total_size = 0

    # NOTE(review): safetensors.numpy presumably tags each array with its
    # numpy dtype, so the BF16 / FP8 placeholders would be recorded as
    # uint16 / uint8 in the shard headers - confirm the consumer accepts
    # that, or patch the header dtypes after writing.
    for i, shard in enumerate(shards, 1):
        filename = f"model-{i:05d}-of-{num_shards:05d}.safetensors"
        save_file(shard, str(output_dir / filename))
        shard_bytes = sum(a.nbytes for a in shard.values())
        print(f" Saved {filename} ({len(shard)} tensors, {shard_bytes / 1e9:.2f} GB)")
        for name in shard:
            weight_map[name] = filename
        total_size += shard_bytes

    # Write index file
    index = {
        "metadata": {
            "total_size": total_size,
        },
        "weight_map": weight_map,
    }
    index_path = output_dir / "model.safetensors.index.json"
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, sort_keys=True)
    print(
        f" Saved index ({len(weight_map)} tensors, {num_shards} shards, "
        f"{total_size / 1e9:.2f} GB total)"
    )
    return num_shards


def _quant_excluded_modules():
    """Return the sorted module patterns that are left unquantized."""
    patterns = [
        "language_model.lm_head",
        "mm_projector*",
        "vision_tower*",
    ]
    patterns.extend(
        f"language_model.model.layers.{i}.self_attn*" for i in range(NUM_LAYERS)
    )
    return sorted(patterns)


def update_config(output_dir):
    """Rewrite ``config.json`` in ``output_dir`` with the mini dimensions.

    The file must already exist and contain ``text_config``,
    ``vision_config`` and ``quantization_config`` sections.
    """
    config_path = Path(output_dir) / "config.json"
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    # Update text config
    tc = config["text_config"]
    tc["hidden_size"] = HIDDEN
    tc["num_attention_heads"] = NUM_HEADS
    tc["num_key_value_heads"] = NUM_KV_HEADS
    tc["num_hidden_layers"] = NUM_LAYERS
    tc["intermediate_size"] = INTERMEDIATE
    tc["n_routed_experts"] = N_ROUTED_EXPERTS
    tc["n_shared_experts"] = N_SHARED_EXPERTS
    tc["num_experts_per_tok"] = NUM_EXPERTS_PER_TOK
    tc["moe_intermediate_size"] = MOE_INTERMEDIATE
    tc["q_lora_rank"] = Q_LORA_RANK
    tc["kv_lora_rank"] = KV_LORA_RANK
    tc["qk_nope_head_dim"] = QK_NOPE_HEAD_DIM
    tc["qk_rope_head_dim"] = QK_ROPE_HEAD_DIM
    tc["v_head_dim"] = V_HEAD_DIM
    tc["first_k_dense_replace"] = FIRST_K_DENSE_REPLACE

    # Update vision config
    vc = config["vision_config"]
    vc["vt_num_hidden_layers"] = VT_LAYERS
    vc["text_hidden_size"] = HIDDEN

    # Update quantization ignore list for new layer count
    config["quantization_config"]["ignore"] = _quant_excluded_modules()

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4)
    print(" Updated config.json")


def update_hf_quant_config(output_dir):
    """Rewrite the exclude list of ``hf_quant_config.json`` in ``output_dir``.

    The file must already exist and contain a ``quantization`` section.
    """
    path = Path(output_dir) / "hf_quant_config.json"
    with open(path, encoding="utf-8") as f:
        config = json.load(f)

    config["quantization"]["exclude_modules"] = _quant_excluded_modules()

    with open(path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4)
    print(" Updated hf_quant_config.json")


def main(output_dir=DEFAULT_OUTPUT_DIR):
    """Update the configs in ``output_dir``, then generate and save the model.

    ``output_dir`` must already hold ``config.json`` and
    ``hf_quant_config.json``; both are modified in place.
    """
    # Fail before touching any config or allocating tensors.
    _require_safetensors()

    print("Generating mini Kimi-K2.5-NVFP4 model...")
    print(f" Dimensions: hidden={HIDDEN}, heads={NUM_HEADS}, layers={NUM_LAYERS}")
    print(f" MoE: {N_ROUTED_EXPERTS} experts, {NUM_EXPERTS_PER_TOK} per token")
    print(f" Vision: {VT_LAYERS} layers, hidden={VT_HIDDEN}")
    print()

    print("Updating configs...")
    update_config(output_dir)
    update_hf_quant_config(output_dir)
    print()

    print("Generating tensors...")
    tensors = generate_all_tensors()
    total_params = compute_total_params(tensors)
    print(f" Total tensors: {len(tensors)}")
    print(f" Approx total params: {total_params / 1e9:.1f}B")
    print()

    print("Saving sharded safetensors...")
    save_sharded(tensors, output_dir)
    print()

    print("Done! Mini model saved to:", output_dir)


if __name__ == "__main__":
    main()