"""
Convert a NanoGPT model checkpoint to GGUF format for use with llama.cpp.

Requires: pip install gguf numpy torch
"""

import os
import sys
import argparse
from typing import Dict

import numpy as np
import torch
import gguf

# GPT-2 124M hyperparameters (NanoGPT defaults; vocab is padded from 50257 to 50304)
ARCH = "gpt2"
BLOCK_SIZE = 1024
VOCAB_SIZE = 50304
N_LAYER = 12
N_HEAD = 12
N_EMBD = 768


def load_nanogpt_model(checkpoint_path: str) -> Dict[str, torch.Tensor]:
    """Load a NanoGPT checkpoint and return its state dict."""
    print(f"Loading checkpoint from {checkpoint_path}")

    import types

    # The checkpoint pickles a GPTConfig defined in train_gpt2.py; register a stand-in
    # module so torch.load can unpickle it without the original training code.
    train_gpt2_module = types.ModuleType('train_gpt2')

    class DummyGPTConfig:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    train_gpt2_module.GPTConfig = DummyGPTConfig
    sys.modules['train_gpt2'] = train_gpt2_module

    try:
        checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    finally:
        if 'train_gpt2' in sys.modules:
            del sys.modules['train_gpt2']

    state_dict = checkpoint['model']
    # Checkpoints saved from a torch.compile()-wrapped model prefix every key with
    # '_orig_mod.'; strip it if present (a no-op for checkpoints without the prefix).
    state_dict = {k.removeprefix('_orig_mod.'): v for k, v in state_dict.items()}
    return state_dict


def convert_key_names(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """Convert NanoGPT key names to GGUF/llama.cpp format."""
    key_map = {
        # Token and position embeddings
        "transformer.wte.weight": "token_embd.weight",
        "transformer.wpe.weight": "position_embd.weight",
        # Final layer norm
        "transformer.ln_f.weight": "output_norm.weight",
        "transformer.ln_f.bias": "output_norm.bias",
        # Output projection
        "lm_head.weight": "output.weight",
    }

    for i in range(N_LAYER):
        # Attention layer norm
        key_map[f"transformer.h.{i}.ln_1.weight"] = f"blk.{i}.attn_norm.weight"
        key_map[f"transformer.h.{i}.ln_1.bias"] = f"blk.{i}.attn_norm.bias"
        # Fused QKV projection
        key_map[f"transformer.h.{i}.attn.c_attn.weight"] = f"blk.{i}.attn_qkv.weight"
        key_map[f"transformer.h.{i}.attn.c_attn.bias"] = f"blk.{i}.attn_qkv.bias"
        # Attention output projection
        key_map[f"transformer.h.{i}.attn.c_proj.weight"] = f"blk.{i}.attn_output.weight"
        key_map[f"transformer.h.{i}.attn.c_proj.bias"] = f"blk.{i}.attn_output.bias"
        # MLP layer norm
        key_map[f"transformer.h.{i}.ln_2.weight"] = f"blk.{i}.ffn_norm.weight"
        key_map[f"transformer.h.{i}.ln_2.bias"] = f"blk.{i}.ffn_norm.bias"
        # MLP up/down projections
        key_map[f"transformer.h.{i}.mlp.c_fc.weight"] = f"blk.{i}.ffn_up.weight"
        key_map[f"transformer.h.{i}.mlp.c_fc.bias"] = f"blk.{i}.ffn_up.bias"
        key_map[f"transformer.h.{i}.mlp.c_proj.weight"] = f"blk.{i}.ffn_down.weight"
        key_map[f"transformer.h.{i}.mlp.c_proj.bias"] = f"blk.{i}.ffn_down.bias"

    converted = {}
    for old_key, tensor in state_dict.items():
        # Skip the causal-mask buffer that some NanoGPT variants register as 'attn.bias'
        if old_key.endswith((".attn.bias", ".attn.masked_bias")):
            continue
        new_key = key_map.get(old_key, old_key)
        converted[new_key] = tensor

    return converted


def create_gguf_model(state_dict: Dict[str, torch.Tensor], output_path: str,
                      quantization: str = "f16"):
    """Create a GGUF file from the state dict."""
    writer = gguf.GGUFWriter(output_path, ARCH)

    # Model metadata
    writer.add_name("ursa-minor-smashed")
    writer.add_description("Ursa Minor Smashed - GPT-2 124M trained on FineWeb-edu dataset")
    writer.add_context_length(BLOCK_SIZE)
    writer.add_embedding_length(N_EMBD)
    writer.add_feed_forward_length(4 * N_EMBD)  # GPT-2's MLP uses a 4x expansion
    writer.add_block_count(N_LAYER)
    writer.add_head_count(N_HEAD)
    writer.add_head_count_kv(N_HEAD)
    writer.add_layer_norm_eps(1e-5)
    writer.add_file_type(gguf.GGMLQuantizationType.F16 if quantization == "f16" else gguf.GGMLQuantizationType.F32)

    # Tokenizer metadata (placeholder vocabulary; swap in the real GPT-2 BPE vocab for actual use)
    writer.add_tokenizer_model("gpt2")
    writer.add_token_list([f"token_{i}" for i in range(VOCAB_SIZE)])
    writer.add_token_types([gguf.TokenType.NORMAL] * VOCAB_SIZE)
    writer.add_bos_token_id(50256)
    writer.add_eos_token_id(50256)
    writer.add_unk_token_id(50256)
    writer.add_sep_token_id(50256)
    writer.add_pad_token_id(50256)

    # Tensor data
    for name, tensor in state_dict.items():
        print(f"Processing {name} with shape {tensor.shape}")

        # Go through float32 first (NumPy cannot represent bfloat16 checkpoints directly)
        data = tensor.to(torch.float32).numpy()

        if quantization == "f16" and data.dtype == np.float32:
            data = data.astype(np.float16)

        # llama.cpp's gpt2 architecture consumes the fused QKV projection
        # (blk.N.attn_qkv), so the combined c_attn tensor is written as-is
        # rather than being split into separate Q/K/V tensors.
        writer.add_tensor(name, data)

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    print(f"GGUF model saved to {output_path}")


def quantize_gguf(input_path: str, output_path: str, quantization: str):
    """Quantize a GGUF model using the llama.cpp quantize tool."""
    import subprocess

    # Map lowercase CLI names to the casing the quantize tool expects
    quant_map = {
        "q4_0": "q4_0",
        "q4_1": "q4_1",
        "q5_0": "q5_0",
        "q5_1": "q5_1",
        "q8_0": "q8_0",
        "q4_k_m": "q4_K_M",
        "q5_k_m": "q5_K_M",
        "q6_k": "q6_K",
    }

    if quantization not in quant_map:
        print(f"Unknown quantization: {quantization}")
        return

    quantize_exe = "./quantize"
    if not os.path.exists(quantize_exe):
        print("quantize tool not found. Build it from llama.cpp:")
        print("  git clone https://github.com/ggerganov/llama.cpp")
        print("  cd llama.cpp && make quantize")
        return

    cmd = [quantize_exe, input_path, output_path, quant_map[quantization]]
    print(f"Running: {' '.join(cmd)}")
    subprocess.run(cmd, check=True)


def main():
    parser = argparse.ArgumentParser(description="Convert NanoGPT to GGUF format")
    parser.add_argument("input", type=str, help="Path to NanoGPT checkpoint (.pt file)")
    parser.add_argument("output", type=str, help="Output GGUF file path")
    parser.add_argument("--quantization", type=str, default="f16",
                        choices=["f32", "f16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "q4_k_m", "q5_k_m", "q6_k"],
                        help="Quantization type")

    args = parser.parse_args()

    state_dict = load_nanogpt_model(args.input)
    converted_state_dict = convert_key_names(state_dict)

    if args.quantization in ["f32", "f16"]:
        create_gguf_model(converted_state_dict, args.output, args.quantization)
    else:
        # Integer quantization goes through an intermediate f16 GGUF file
        temp_path = args.output.replace(".gguf", "_f16.gguf")
        create_gguf_model(converted_state_dict, temp_path, "f16")
        quantize_gguf(temp_path, args.output, args.quantization)
        os.remove(temp_path)

    print(f"Conversion complete! Output saved to {args.output}")

    print("\nTo use with llama.cpp:")
    print(f"  ./main -m {args.output} -p \"Hello, I'm a language model\" -n 100")


if __name__ == "__main__":
    main()