#!/usr/bin/env python3
"""
Convert NanoGPT model to GGUF format for use with llama.cpp
Requires: pip install gguf sentencepiece numpy
"""
import os
import sys
import argparse
import numpy as np
import torch
from typing import Dict

import gguf
# Model architecture constants (GPT-2 124M; vocab padded from 50257 to 50304, as in nanoGPT)
ARCH = "gpt2"
BLOCK_SIZE = 1024
VOCAB_SIZE = 50304
N_LAYER = 12
N_HEAD = 12
N_EMBD = 768
def load_nanogpt_model(checkpoint_path: str) -> Dict[str, torch.Tensor]:
"""Load NanoGPT checkpoint and return state dict"""
print(f"Loading checkpoint from {checkpoint_path}")
    # The checkpoint pickles a train_gpt2.GPTConfig object, so register a
    # stand-in train_gpt2 module before unpickling
    import types

    train_gpt2_module = types.ModuleType('train_gpt2')

    class DummyGPTConfig:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    train_gpt2_module.GPTConfig = DummyGPTConfig
    sys.modules['train_gpt2'] = train_gpt2_module
try:
checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
finally:
# Clean up
if 'train_gpt2' in sys.modules:
del sys.modules['train_gpt2']
    # Checkpoints saved from a torch.compile'd model prefix every key with
    # "_orig_mod."; strip it so the key mapping below matches
    state_dict = checkpoint['model']
    return {k.removeprefix('_orig_mod.'): v for k, v in state_dict.items()}
def convert_key_names(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
"""Convert NanoGPT key names to GGUF/llama.cpp format"""
key_map = {
# Token and position embeddings
"transformer.wte.weight": "token_embd.weight",
"transformer.wpe.weight": "position_embd.weight",
# Final layer norm
"transformer.ln_f.weight": "output_norm.weight",
"transformer.ln_f.bias": "output_norm.bias",
# Output head (shares weight with token embeddings)
"lm_head.weight": "output.weight",
}
# Add layer-specific mappings
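    # e.g. "transformer.h.0.attn.c_attn.weight" -> "blk.0.attn_qkv.weight"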
for i in range(N_LAYER):
# Attention layer norms
key_map[f"transformer.h.{i}.ln_1.weight"] = f"blk.{i}.attn_norm.weight"
key_map[f"transformer.h.{i}.ln_1.bias"] = f"blk.{i}.attn_norm.bias"
# Attention weights (need to be split for GGUF)
key_map[f"transformer.h.{i}.attn.c_attn.weight"] = f"blk.{i}.attn_qkv.weight"
key_map[f"transformer.h.{i}.attn.c_attn.bias"] = f"blk.{i}.attn_qkv.bias"
# Attention output projection
key_map[f"transformer.h.{i}.attn.c_proj.weight"] = f"blk.{i}.attn_output.weight"
key_map[f"transformer.h.{i}.attn.c_proj.bias"] = f"blk.{i}.attn_output.bias"
# FFN layer norms
key_map[f"transformer.h.{i}.ln_2.weight"] = f"blk.{i}.ffn_norm.weight"
key_map[f"transformer.h.{i}.ln_2.bias"] = f"blk.{i}.ffn_norm.bias"
# FFN weights
key_map[f"transformer.h.{i}.mlp.c_fc.weight"] = f"blk.{i}.ffn_up.weight"
key_map[f"transformer.h.{i}.mlp.c_fc.bias"] = f"blk.{i}.ffn_up.bias"
key_map[f"transformer.h.{i}.mlp.c_proj.weight"] = f"blk.{i}.ffn_down.weight"
key_map[f"transformer.h.{i}.mlp.c_proj.bias"] = f"blk.{i}.ffn_down.bias"
# Convert keys
converted = {}
for old_key, tensor in state_dict.items():
new_key = key_map.get(old_key, old_key)
converted[new_key] = tensor
return converted
def create_gguf_model(state_dict: Dict[str, torch.Tensor], output_path: str,
quantization: str = "f16"):
"""Create GGUF file from state dict"""
# Create GGUF writer
writer = gguf.GGUFWriter(output_path, ARCH)
# Add metadata
writer.add_name("ursa-minor-smashed")
writer.add_description("Ursa Minor Smashed - GPT-2 124M trained on FineWeb-edu dataset")
writer.add_context_length(BLOCK_SIZE)
writer.add_embedding_length(N_EMBD)
writer.add_block_count(N_LAYER)
writer.add_head_count(N_HEAD)
    writer.add_head_count_kv(N_HEAD)  # GPT-2 has no grouped-query attention, so KV heads == attention heads
writer.add_layer_norm_eps(1e-5)
writer.add_file_type(gguf.GGMLQuantizationType.F16 if quantization == "f16" else gguf.GGMLQuantizationType.F32)
# Add tokenizer info (using GPT-2 tokenizer)
writer.add_tokenizer_model("gpt2")
    writer.add_token_list([f"token_{i}" for i in range(VOCAB_SIZE)])  # Placeholder vocab; see note below
writer.add_token_types([gguf.TokenType.NORMAL] * VOCAB_SIZE)
writer.add_bos_token_id(50256)
writer.add_eos_token_id(50256)
writer.add_unk_token_id(50256)
writer.add_sep_token_id(50256)
writer.add_pad_token_id(50256)
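    # NOTE: the vocabulary above is a placeholder; llama.cpp's "gpt2" tokenizer
    # needs the real byte-level BPE vocab and merges to produce sensible text.
    # A sketch of how one might supply them (assumes the `transformers` package
    # and a `merges` list are available; both are assumptions, not part of this
    # script):
    #   from transformers import GPT2TokenizerFast
    #   tok = GPT2TokenizerFast.from_pretrained("gpt2")
    #   vocab = sorted(tok.get_vocab().items(), key=lambda kv: kv[1])
    #   writer.add_token_list([t for t, _ in vocab])
    #   writer.add_token_merges(merges)  # merges: list of "a b" strings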
# Process and add tensors
for name, tensor in state_dict.items():
print(f"Processing {name} with shape {tensor.shape}")
        # Convert to numpy via detach/cpu; bfloat16 has no numpy equivalent,
        # so upcast to float32 first
        data = tensor.detach().cpu().float().numpy()
        # Handle quantization
        if quantization == "f16":
            data = data.astype(np.float16)
# Special handling for QKV weights (need to split)
if "attn_qkv.weight" in name:
# Split QKV weight into separate Q, K, V
q, k, v = np.split(data, 3, axis=0)
layer_idx = name.split('.')[1]
writer.add_tensor(f"blk.{layer_idx}.attn_q.weight", q)
writer.add_tensor(f"blk.{layer_idx}.attn_k.weight", k)
writer.add_tensor(f"blk.{layer_idx}.attn_v.weight", v)
elif "attn_qkv.bias" in name:
# Split QKV bias
q, k, v = np.split(data, 3)
layer_idx = name.split('.')[1]
writer.add_tensor(f"blk.{layer_idx}.attn_q.bias", q)
writer.add_tensor(f"blk.{layer_idx}.attn_k.bias", k)
writer.add_tensor(f"blk.{layer_idx}.attn_v.bias", v)
else:
writer.add_tensor(name, data)
# Write file
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
print(f"GGUF model saved to {output_path}")
def quantize_gguf(input_path: str, output_path: str, quantization: str):
"""Quantize GGUF model using llama.cpp quantize tool"""
import subprocess
# Map quantization names
quant_map = {
"q4_0": "q4_0",
"q4_1": "q4_1",
"q5_0": "q5_0",
"q5_1": "q5_1",
"q8_0": "q8_0",
"q4_k_m": "q4_K_M",
"q5_k_m": "q5_K_M",
"q6_k": "q6_K"
}
if quantization not in quant_map:
print(f"Unknown quantization: {quantization}")
return
    # Find the quantize tool (named llama-quantize in recent llama.cpp
    # builds, quantize in older ones)
    quantize_exe = None
    for candidate in ("./llama-quantize", "./quantize"):
        if os.path.exists(candidate):
            quantize_exe = candidate
            break
    if quantize_exe is None:
        print("quantize tool not found. Build it from llama.cpp:")
        print("  git clone https://github.com/ggerganov/llama.cpp")
        print("  cd llama.cpp && cmake -B build && cmake --build build")
        return
# Run quantization
cmd = [quantize_exe, input_path, output_path, quant_map[quantization]]
print(f"Running: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
def main():
parser = argparse.ArgumentParser(description="Convert NanoGPT to GGUF format")
parser.add_argument("input", type=str, help="Path to NanoGPT checkpoint (.pt file)")
parser.add_argument("output", type=str, help="Output GGUF file path")
parser.add_argument("--quantization", type=str, default="f16",
choices=["f32", "f16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "q4_k_m", "q5_k_m", "q6_k"],
help="Quantization type")
args = parser.parse_args()
# Load model
state_dict = load_nanogpt_model(args.input)
# Convert key names
converted_state_dict = convert_key_names(state_dict)
# Create GGUF
if args.quantization in ["f32", "f16"]:
create_gguf_model(converted_state_dict, args.output, args.quantization)
else:
        # Create an f16 intermediate, then quantize it with llama.cpp.
        # (Appending a suffix rather than string-replacing ".gguf" avoids
        # clobbering the output when the name has no .gguf extension.)
        temp_path = args.output + ".f16.tmp.gguf"
        create_gguf_model(converted_state_dict, temp_path, "f16")
        quantize_gguf(temp_path, args.output, args.quantization)
        os.remove(temp_path)
print(f"Conversion complete! Output saved to {args.output}")
    # Print usage instructions (the llama.cpp CLI binary is llama-cli in
    # recent builds, main in older ones)
    print("\nTo use with llama.cpp:")
    print(f"  ./llama-cli -m {args.output} -p \"Hello, I'm a language model\" -n 100")
if __name__ == "__main__":
main()