kimi-2.5-random-20B / generate_mini_model.py

Upload folder using huggingface_hub

255d759 verified 2 months ago

13.7 kB

	#!/usr/bin/env python3
	"""Generate a minimized ~20B Kimi-K2.5-NVFP4 model for architecture testing.

	This creates random weights with the correct tensor names, shapes, and dtypes
	to match the NVFP4 quantization format used by the original model.

	Mini model specs (TP=2 compatible):
	hidden_size=4096, heads=32, layers=12, experts=64, moe_intermediate=2048
	"""

	import json
	import os
	import struct
	from pathlib import Path

	import numpy as np
	from safetensors.numpy import save_file


	# ============================================================
	# Mini model dimensions
	# ============================================================
	# Language-model sizes. Most of these are written into
	# config.json["text_config"] by update_config(); VOCAB and GROUP_SIZE are
	# only used to size tensors and are never written to any config here.
	HIDDEN = 4096
	NUM_HEADS = 32
	# Only written to the config; no tensor shape in this file depends on it.
	NUM_KV_HEADS = 32 # MLA uses same as heads
	NUM_LAYERS = 12
	# Width of the dense (non-MoE) MLP used by the first FIRST_K_DENSE_REPLACE layers.
	INTERMEDIATE = 11008
	# Row count of embed_tokens and lm_head.
	VOCAB = 163840
	N_ROUTED_EXPERTS = 64
	N_SHARED_EXPERTS = 1
	NUM_EXPERTS_PER_TOK = 8
	# Width of each routed / shared expert MLP.
	MOE_INTERMEDIATE = 2048
	Q_LORA_RANK = 1536 # keep original to match FlashInfer MLA head_size
	KV_LORA_RANK = 512 # keep original: head_size = 512+64 = 576
	QK_NOPE_HEAD_DIM = 128
	QK_ROPE_HEAD_DIM = 64
	V_HEAD_DIM = 128
	# Layers with index < FIRST_K_DENSE_REPLACE get a dense MLP, the rest get MoE.
	FIRST_K_DENSE_REPLACE = 1
	# Number of input features covered by one FP8 block scale (see make_fp8_scale).
	GROUP_SIZE = 16

	# Vision tower
	VT_HIDDEN = 1152
	VT_LAYERS = 4 # reduced from 27
	# NOTE(review): VT_HEADS is not referenced by any tensor shape or config
	# update in this file.
	VT_HEADS = 16
	VT_INTERMEDIATE = 4304
	# Side length of the square patch-embedding kernel.
	PATCH_SIZE = 14
	MERGE_KERNEL = [2, 2]
	MM_HIDDEN = VT_HIDDEN # 1152
	# Projector width: vision hidden size times the merge-kernel area.
	MM_PROJECTED = MM_HIDDEN * MERGE_KERNEL[0] * MERGE_KERNEL[1] # 4608


	def make_bf16(shape):
	    """Return random, finite BF16 bit patterns stored in a ``uint16`` array.

	    NumPy has no bfloat16 dtype, so every element is the raw 16-bit encoding
	    (1 sign bit, 8 exponent bits, 7 mantissa bits).

	    Args:
	        shape: Shape of the array to generate (int or tuple of ints).

	    Returns:
	        ``np.ndarray`` of dtype ``uint16`` whose elements all decode to finite
	        BF16 values with magnitude below 2.0.
	    """
	    # `high` is exclusive, so 1 << 16 is needed to cover the full 16-bit range.
	    bits = np.random.randint(0, 1 << 16, size=shape, dtype=np.uint16)
	    # Clear the most significant exponent bit (0x4000). The biased exponent
	    # can then never be 0xFF (Inf/NaN) and never exceeds 127, which keeps
	    # every weight finite and small instead of up to ~3e38.
	    bits &= np.uint16(0xBFFF)
	    return bits


	def make_fp4_weight(out_features, in_features):
	    """Return a random packed FP4 weight of shape ``[out, in // 2]``.

	    Two 4-bit codes are packed into each ``uint8``, so the second dimension
	    is ``in_features // 2`` (an odd ``in_features`` is floor-divided).

	    Args:
	        out_features: Number of output rows.
	        in_features: Number of logical (unpacked) input features.

	    Returns:
	        ``np.ndarray`` of dtype ``uint8`` with values drawn from 0..255.
	    """
	    # `high` is exclusive: 256 is required so that byte 0xFF (both nibbles
	    # set to 0xF) can be generated as well.
	    return np.random.randint(
	        0, 256, size=(out_features, in_features // 2), dtype=np.uint8
	    )


	def make_fp8_scale(out_features, in_features):
	    """Return random FP8 E4M3 block scales of shape ``[out, in // GROUP_SIZE]``.

	    One scale byte is produced per group of ``GROUP_SIZE`` input features.

	    Args:
	        out_features: Number of output rows.
	        in_features: Number of logical input features of the layer.

	    Returns:
	        ``np.ndarray`` of dtype ``uint8`` holding raw FP8 encodings in the
	        range 0x00..0x7E.
	    """
	    # Assuming the E4M3FN variant: 0x7F and 0xFF encode NaN, and any byte
	    # with the top bit set is negative. Drawing from [0x00, 0x7E] (`high` is
	    # exclusive) keeps every block scale non-negative and finite, so a single
	    # bad scale cannot turn a whole weight group into NaN.
	    return np.random.randint(
	        0, 0x7F, size=(out_features, in_features // GROUP_SIZE), dtype=np.uint8
	    )


	def make_scalar_f32():
	    """Return a zero-dimensional float32 array holding 1.0."""
	    return np.ones((), dtype=np.float32)


	def add_quantized_linear(tensors, prefix, out_features, in_features):
	    """Register the four tensors of one NVFP4-quantized linear layer.

	    Writes ``{prefix}.weight`` (packed FP4), ``{prefix}.weight_scale``
	    (per-group FP8 scales), and the two float32 scalars
	    ``{prefix}.weight_scale_2`` and ``{prefix}.input_scale`` into ``tensors``.
	    """
	    # Factories are invoked in this order so tensors are created and inserted
	    # one at a time, in a fixed sequence.
	    factories = (
	        ("weight", lambda: make_fp4_weight(out_features, in_features)),
	        ("weight_scale", lambda: make_fp8_scale(out_features, in_features)),
	        ("weight_scale_2", make_scalar_f32),
	        ("input_scale", make_scalar_f32),
	    )
	    for suffix, build in factories:
	        tensors[f"{prefix}.{suffix}"] = build()


	def add_bf16_linear(tensors, prefix, out_features, in_features, bias=False):
	    """Register an unquantized BF16 linear layer under ``prefix``.

	    Always writes ``{prefix}.weight`` with shape ``(out, in)``; writes
	    ``{prefix}.bias`` with shape ``(out,)`` only when ``bias`` is truthy.
	    """
	    weight_shape = (out_features, in_features)
	    tensors[f"{prefix}.weight"] = make_bf16(weight_shape)
	    if not bias:
	        return
	    tensors[f"{prefix}.bias"] = make_bf16((out_features,))


	def add_attention(tensors, layer_prefix):
	    """Register the MLA self-attention tensors of one decoder layer.

	    All projection and layernorm weights are created with ``make_bf16``;
	    the two KV-cache scales are float32 scalars. Everything is stored under
	    ``{layer_prefix}.self_attn``.
	    """
	    attn = f"{layer_prefix}.self_attn"

	    q_b_rows = NUM_HEADS * (QK_NOPE_HEAD_DIM + QK_ROPE_HEAD_DIM)  # 32*192=6144
	    kv_a_rows = KV_LORA_RANK + QK_ROPE_HEAD_DIM  # 512+64=576
	    kv_b_rows = NUM_HEADS * (QK_NOPE_HEAD_DIM + V_HEAD_DIM)  # 32*256=8192
	    o_cols = NUM_HEADS * V_HEAD_DIM  # 32*128=4096

	    # (suffix, shape) in creation order: q path, kv path, then output.
	    bf16_layout = (
	        ("q_a_proj.weight", (Q_LORA_RANK, HIDDEN)),
	        ("q_a_layernorm.weight", (Q_LORA_RANK,)),
	        ("q_b_proj.weight", (q_b_rows, Q_LORA_RANK)),
	        ("kv_a_proj_with_mqa.weight", (kv_a_rows, HIDDEN)),
	        ("kv_a_layernorm.weight", (KV_LORA_RANK,)),
	        ("kv_b_proj.weight", (kv_b_rows, KV_LORA_RANK)),
	        ("o_proj.weight", (HIDDEN, o_cols)),
	    )
	    for suffix, shape in bf16_layout:
	        tensors[f"{attn}.{suffix}"] = make_bf16(shape)

	    # KV cache scales
	    for suffix in ("k_proj.k_scale", "v_proj.v_scale"):
	        tensors[f"{attn}.{suffix}"] = make_scalar_f32()


	def add_dense_mlp(tensors, layer_prefix):
	    """Register the quantized dense MLP (gate/up/down) of one layer.

	    gate_proj and up_proj map HIDDEN -> INTERMEDIATE; down_proj maps back.
	    """
	    mlp = f"{layer_prefix}.mlp"
	    projections = (
	        ("gate_proj", INTERMEDIATE, HIDDEN),
	        ("up_proj", INTERMEDIATE, HIDDEN),
	        ("down_proj", HIDDEN, INTERMEDIATE),
	    )
	    for proj_name, out_dim, in_dim in projections:
	        add_quantized_linear(tensors, f"{mlp}.{proj_name}", out_dim, in_dim)


	def add_moe_mlp(tensors, layer_prefix):
	    """Register the MoE MLP tensors of one layer.

	    Creates, under ``{layer_prefix}.mlp``:
	      * the BF16 router (``gate.weight`` and ``gate.e_score_correction_bias``),
	      * one quantized shared-expert MLP,
	      * ``N_ROUTED_EXPERTS`` quantized routed-expert MLPs.

	    Args:
	        tensors: Dict that receives the generated arrays (mutated in place).
	        layer_prefix: Tensor-name prefix of the decoder layer.
	    """
	    p = f"{layer_prefix}.mlp"

	    def add_gate_up_down(prefix, width):
	        # One gated MLP: gate/up map HIDDEN -> width, down maps back.
	        add_quantized_linear(tensors, f"{prefix}.gate_proj", width, HIDDEN)
	        add_quantized_linear(tensors, f"{prefix}.up_proj", width, HIDDEN)
	        add_quantized_linear(tensors, f"{prefix}.down_proj", HIDDEN, width)

	    # Router gate
	    tensors[f"{p}.gate.weight"] = make_bf16((N_ROUTED_EXPERTS, HIDDEN))
	    tensors[f"{p}.gate.e_score_correction_bias"] = make_bf16((N_ROUTED_EXPERTS,))

	    # Shared experts are stored as a single fused MLP. Its width scales with
	    # the number of shared experts (DeepSeek-V3 style layout), so the tensors
	    # stay consistent with the n_shared_experts value written to the config.
	    # With N_SHARED_EXPERTS == 1 this equals MOE_INTERMEDIATE.
	    add_gate_up_down(f"{p}.shared_experts", MOE_INTERMEDIATE * N_SHARED_EXPERTS)

	    # Routed experts
	    for e in range(N_ROUTED_EXPERTS):
	        add_gate_up_down(f"{p}.experts.{e}", MOE_INTERMEDIATE)


	def add_vision_tower(tensors):
	    """Register every vision-tower tensor (all created with ``make_bf16``).

	    Covers the patch embedding, ``VT_LAYERS`` encoder blocks and the final
	    layernorm, all under the ``vision_tower`` prefix.
	    """
	    root = "vision_tower"

	    # Patch embedding
	    patch = f"{root}.patch_embed"
	    tensors[f"{patch}.proj.weight"] = make_bf16(
	        (VT_HIDDEN, 3, PATCH_SIZE, PATCH_SIZE)
	    )
	    tensors[f"{patch}.proj.bias"] = make_bf16((VT_HIDDEN,))
	    tensors[f"{patch}.pos_emb.weight"] = make_bf16((64, 64, VT_HIDDEN))

	    # Per-block tensors as (suffix, shape), listed in creation order:
	    # fused QKV, output projection, the two norms, then the MLP.
	    block_layout = (
	        ("wqkv.weight", (3 * VT_HIDDEN, VT_HIDDEN)),
	        ("wqkv.bias", (3 * VT_HIDDEN,)),
	        ("wo.weight", (VT_HIDDEN, VT_HIDDEN)),
	        ("wo.bias", (VT_HIDDEN,)),
	        ("norm0.weight", (VT_HIDDEN,)),
	        ("norm0.bias", (VT_HIDDEN,)),
	        ("norm1.weight", (VT_HIDDEN,)),
	        ("norm1.bias", (VT_HIDDEN,)),
	        ("mlp.fc0.weight", (VT_INTERMEDIATE, VT_HIDDEN)),
	        ("mlp.fc0.bias", (VT_INTERMEDIATE,)),
	        ("mlp.fc1.weight", (VT_HIDDEN, VT_INTERMEDIATE)),
	        ("mlp.fc1.bias", (VT_HIDDEN,)),
	    )
	    for index in range(VT_LAYERS):
	        for suffix, shape in block_layout:
	            tensors[f"{root}.encoder.blocks.{index}.{suffix}"] = make_bf16(shape)

	    # Final layernorm
	    for suffix in ("weight", "bias"):
	        tensors[f"{root}.encoder.final_layernorm.{suffix}"] = make_bf16(
	            (VT_HIDDEN,)
	        )


	def add_mm_projector(tensors):
	    """Register the multimodal projector tensors (all via ``make_bf16``).

	    A pre-norm over MM_HIDDEN followed by two linear layers:
	    MM_PROJECTED -> MM_PROJECTED (``proj.0``) and MM_PROJECTED -> HIDDEN
	    (``proj.2``).
	    """
	    layout = (
	        ("pre_norm.weight", (MM_HIDDEN,)),
	        ("pre_norm.bias", (MM_HIDDEN,)),
	        ("proj.0.weight", (MM_PROJECTED, MM_PROJECTED)),
	        ("proj.0.bias", (MM_PROJECTED,)),
	        ("proj.2.weight", (HIDDEN, MM_PROJECTED)),
	        ("proj.2.bias", (HIDDEN,)),
	    )
	    for suffix, shape in layout:
	        tensors[f"mm_projector.{suffix}"] = make_bf16(shape)


	def generate_all_tensors():
	    """Build and return the dict of every tensor in the mini model.

	    Order of creation: token embeddings, each decoder layer (norms,
	    attention, then dense or MoE MLP), final norm, LM head, vision tower and
	    multimodal projector.
	    """
	    lm = "language_model"
	    weights = {}

	    # Embeddings
	    weights[f"{lm}.model.embed_tokens.weight"] = make_bf16((VOCAB, HIDDEN))

	    # Language model layers
	    for idx in range(NUM_LAYERS):
	        layer = f"{lm}.model.layers.{idx}"
	        for norm in ("input_layernorm", "post_attention_layernorm"):
	            weights[f"{layer}.{norm}.weight"] = make_bf16((HIDDEN,))

	        # Attention is identical for every layer.
	        add_attention(weights, layer)

	        # The first FIRST_K_DENSE_REPLACE layers use a dense MLP, the rest MoE.
	        build_mlp = add_dense_mlp if idx < FIRST_K_DENSE_REPLACE else add_moe_mlp
	        build_mlp(weights, layer)

	    # Final norm, then LM head (created as BF16)
	    weights[f"{lm}.model.norm.weight"] = make_bf16((HIDDEN,))
	    weights[f"{lm}.lm_head.weight"] = make_bf16((VOCAB, HIDDEN))

	    # Vision tower, then multimodal projector
	    add_vision_tower(weights)
	    add_mm_projector(weights)

	    return weights


	def compute_total_params(tensors):
	    """Return the number of logical parameters held in ``tensors``.

	    Only names ending in ``.weight`` or ``.bias`` are counted. A ``uint8``
	    ``.weight`` is treated as packed FP4 and counts two parameters per byte;
	    every other counted tensor contributes its element count.
	    """
	    count = 0
	    for key, tensor in tensors.items():
	        if key.endswith(".bias"):
	            count += tensor.size
	            continue
	        if not key.endswith(".weight"):
	            # Scales and any other auxiliary tensors are not parameters.
	            continue
	        if tensor.dtype == np.uint8:
	            # FP4 packed: two values per stored byte.
	            rows, cols = tensor.shape[0], tensor.shape[1]
	            count += rows * cols * 2
	        else:
	            count += tensor.size
	    return count


	def save_sharded(tensors, output_dir, max_shard_bytes=5_000_000_000):
	    """Save tensors as sharded safetensors files plus an index file.

	    Tensors are visited in sorted-name order and packed greedily: a new shard
	    is started whenever adding the next tensor would exceed
	    ``max_shard_bytes``. A single tensor larger than the limit gets a shard
	    of its own.

	    Args:
	        tensors: Mapping of tensor name to NumPy array.
	        output_dir: Directory to write into; created if it does not exist.
	        max_shard_bytes: Soft upper bound on the payload size of one shard.

	    Returns:
	        The number of shard files written.

	    NOTE(review): arrays are handed to ``safetensors.numpy.save_file`` as-is,
	    so the dtype recorded in each file presumably follows the NumPy dtype
	    (uint16 / uint8) rather than BF16 / FP8 -- confirm against the loader.
	    """
	    output_dir = Path(output_dir)
	    # Make the function usable on a fresh directory instead of failing on
	    # the first save_file() call.
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Greedy packing in sorted order keeps the sharding deterministic.
	    shards = []
	    current_shard = {}
	    current_size = 0
	    for name in sorted(tensors):
	        arr = tensors[name]
	        if current_shard and current_size + arr.nbytes > max_shard_bytes:
	            shards.append(current_shard)
	            current_shard = {}
	            current_size = 0
	        current_shard[name] = arr
	        current_size += arr.nbytes
	    if current_shard:
	        shards.append(current_shard)

	    num_shards = len(shards)
	    weight_map = {}
	    total_size = 0

	    for i, shard in enumerate(shards, 1):
	        filename = f"model-{i:05d}-of-{num_shards:05d}.safetensors"
	        shard_bytes = sum(a.nbytes for a in shard.values())

	        save_file(shard, str(output_dir / filename))
	        print(f" Saved {filename} ({len(shard)} tensors, {shard_bytes / 1e9:.2f} GB)")

	        for name in shard:
	            weight_map[name] = filename
	        total_size += shard_bytes

	    # Write index file
	    index = {
	        "metadata": {
	            "total_size": total_size,
	        },
	        "weight_map": weight_map,
	    }

	    index_path = output_dir / "model.safetensors.index.json"
	    with open(index_path, "w", encoding="utf-8") as f:
	        json.dump(index, f, indent=2, sort_keys=True)
	    print(f" Saved index ({len(weight_map)} tensors, {num_shards} shards, {total_size / 1e9:.2f} GB total)")

	    return num_shards


	def update_config(output_dir):
	    """Rewrite ``config.json`` in ``output_dir`` with the mini-model sizes.

	    Overwrites the text-config dimensions, two vision-config fields and the
	    quantization ignore list, then saves the file back in place.
	    """
	    config_path = Path(output_dir) / "config.json"
	    with open(config_path) as src:
	        cfg = json.load(src)

	    # Text config: existing keys keep their position, new keys are appended.
	    cfg["text_config"].update(
	        {
	            "hidden_size": HIDDEN,
	            "num_attention_heads": NUM_HEADS,
	            "num_key_value_heads": NUM_KV_HEADS,
	            "num_hidden_layers": NUM_LAYERS,
	            "intermediate_size": INTERMEDIATE,
	            "n_routed_experts": N_ROUTED_EXPERTS,
	            "n_shared_experts": N_SHARED_EXPERTS,
	            "num_experts_per_tok": NUM_EXPERTS_PER_TOK,
	            "moe_intermediate_size": MOE_INTERMEDIATE,
	            "q_lora_rank": Q_LORA_RANK,
	            "kv_lora_rank": KV_LORA_RANK,
	            "qk_nope_head_dim": QK_NOPE_HEAD_DIM,
	            "qk_rope_head_dim": QK_ROPE_HEAD_DIM,
	            "v_head_dim": V_HEAD_DIM,
	            "first_k_dense_replace": FIRST_K_DENSE_REPLACE,
	        }
	    )

	    # Vision config
	    vision = cfg["vision_config"]
	    vision["vt_num_hidden_layers"] = VT_LAYERS
	    vision["text_hidden_size"] = HIDDEN

	    # Quantization ignore list: the fixed module patterns plus one
	    # self-attention pattern for each of the NUM_LAYERS decoder layers.
	    skipped = ["language_model.lm_head", "mm_projector*", "vision_tower*"]
	    skipped += [
	        f"language_model.model.layers.{idx}.self_attn*" for idx in range(NUM_LAYERS)
	    ]
	    cfg["quantization_config"]["ignore"] = sorted(skipped)

	    with open(config_path, "w") as dst:
	        json.dump(cfg, dst, indent=4)
	    print(f" Updated config.json")


	def update_hf_quant_config(output_dir):
	    """Rewrite the ``exclude_modules`` list in ``hf_quant_config.json``.

	    The list holds the fixed module patterns plus one self-attention pattern
	    for each of the NUM_LAYERS decoder layers, sorted before it is stored.
	    """
	    quant_path = Path(output_dir) / "hf_quant_config.json"
	    with open(quant_path) as src:
	        cfg = json.load(src)

	    per_layer = [
	        f"language_model.model.layers.{idx}.self_attn*" for idx in range(NUM_LAYERS)
	    ]
	    fixed = ["language_model.lm_head", "mm_projector*", "vision_tower*"]
	    cfg["quantization"]["exclude_modules"] = sorted(fixed + per_layer)

	    with open(quant_path, "w") as dst:
	        json.dump(cfg, dst, indent=4)
	    print(f" Updated hf_quant_config.json")


	def main(output_dir="/home/ubuntu/.cache/huggingface/kimi-mini"):
	    """Generate the mini model in ``output_dir``.

	    The directory must already contain ``config.json`` and
	    ``hf_quant_config.json``; both are rewritten first, so a missing file
	    fails fast before any tensor generation starts.

	    Args:
	        output_dir: Target directory. Defaults to the path the script was
	            originally hard-wired to, so calling ``main()`` with no
	            arguments behaves as before.
	    """
	    print("Generating mini Kimi-K2.5-NVFP4 model...")
	    print(f" Dimensions: hidden={HIDDEN}, heads={NUM_HEADS}, layers={NUM_LAYERS}")
	    print(f" MoE: {N_ROUTED_EXPERTS} experts, {NUM_EXPERTS_PER_TOK} per token")
	    print(f" Vision: {VT_LAYERS} layers, hidden={VT_HIDDEN}")
	    print()

	    print("Updating configs...")
	    update_config(output_dir)
	    update_hf_quant_config(output_dir)
	    print()

	    print("Generating tensors...")
	    tensors = generate_all_tensors()
	    total_params = compute_total_params(tensors)
	    print(f" Total tensors: {len(tensors)}")
	    print(f" Approx total params: {total_params / 1e9:.1f}B")
	    print()

	    print("Saving sharded safetensors...")
	    save_sharded(tensors, output_dir)
	    print()

	    print("Done! Mini model saved to:", output_dir)


	if __name__ == "__main__":
	    main()