Codette-Reasoning / adapters /convert_peft_to_gguf.py

Jonathan Harrison

Full Codette codebase sync — transparency release

74f2af5 14 days ago

6.81 kB

	#!/usr/bin/env python3
	"""Convert PEFT LoRA safetensors to llama.cpp GGUF LoRA format.

	Lightweight converter — no torch/transformers dependency.
	Only needs: gguf and numpy, plus the standard library (json, struct);
	the safetensors container itself is parsed by hand.

	Matches the exact format produced by llama.cpp's convert_lora_to_gguf.py.
	"""

	import json
	import struct
	import sys
	from pathlib import Path
	import numpy as np

	# gguf uses its own writer
	from gguf import GGUFWriter, GGMLQuantizationType


	# Lookup table: PEFT attention projection module name -> the tensor-name
	# component llama.cpp uses for the same weight in a Llama model.
	#   PEFT: base_model.model.model.layers.{i}.self_attn.{proj}.lora_{AB}.weight
	#   GGUF: blk.{i}.{mapped_proj}.weight.lora_{ab}
	PROJ_MAP = dict(
	    q_proj="attn_q",
	    k_proj="attn_k",
	    v_proj="attn_v",
	    o_proj="attn_output",
	)


	def bf16_to_f16(data_bytes: bytes) -> np.ndarray:
	    """Convert raw bfloat16 bytes to a float16 numpy array.

	    bf16: sign(1) + exp(8) + mantissa(7)
	    f16:  sign(1) + exp(5) + mantissa(10)

	    A bfloat16 value is the upper 16 bits of the float32 holding the same
	    number, so the conversion goes bf16 -> f32 -> f16: each 16-bit word is
	    widened to 32 bits, shifted into the high half, reinterpreted as
	    float32, and then narrowed by numpy.

	    Args:
	        data_bytes: Packed 16-bit words in little-endian byte order (the
	            byte order safetensors files use). The length must be a
	            multiple of 2.

	    Returns:
	        A 1-D float16 array with one element per input word. Magnitudes
	        beyond float16's range overflow to infinity in the final cast.
	    """
	    # Read explicitly as little-endian so the result does not depend on the
	    # byte order of the machine running the conversion.
	    words = np.frombuffer(data_bytes, dtype="<u2")
	    # astype() converts by value into native uint32; the shift then places
	    # the bf16 bits where float32 keeps its sign/exponent/top mantissa bits.
	    widened = words.astype(np.uint32) << np.uint32(16)
	    return widened.view(np.float32).astype(np.float16)


	def read_safetensors(path: Path) -> dict:
	    """Read every tensor in a safetensors file as a float16 numpy array.

	    The container is parsed by hand: an 8-byte little-endian uint64 giving
	    the header length, a JSON header of that length mapping each tensor
	    name to its ``dtype``, ``shape`` and ``data_offsets``, then the raw
	    tensor bytes. Offsets are relative to the first byte after the header.

	    Args:
	        path: Location of the ``.safetensors`` file.

	    Returns:
	        Dict of tensor name -> float16 array reshaped to the header shape,
	        in header order. The ``__metadata__`` entry is skipped. BF16 and
	        F32 payloads are converted to float16; F16 payloads are returned
	        as a read-only view of the bytes that were read.

	    Raises:
	        ValueError: If a tensor uses a dtype other than BF16, F16 or F32,
	            or if the file ends before a tensor's declared byte range.
	    """
	    with open(path, "rb") as f:
	        # Header: 8-byte little-endian uint64 = header size
	        (header_size,) = struct.unpack("<Q", f.read(8))
	        header = json.loads(f.read(header_size))
	        data_start = 8 + header_size

	        tensors = {}
	        for name, info in header.items():
	            if name == "__metadata__":
	                continue

	            dtype = info["dtype"]
	            start, end = info["data_offsets"]

	            f.seek(data_start + start)
	            raw = f.read(end - start)
	            if len(raw) != end - start:
	                # Fail with a message that names the tensor instead of the
	                # opaque numpy size error a short read would cause later.
	                raise ValueError(
	                    f"Truncated data for tensor {name}: expected "
	                    f"{end - start} bytes, got {len(raw)}"
	                )

	            # Payloads are decoded as little-endian explicitly so the
	            # values do not depend on the host byte order.
	            if dtype == "BF16":
	                arr = bf16_to_f16(raw)
	            elif dtype == "F16":
	                arr = np.frombuffer(raw, dtype="<f2").astype(np.float16, copy=False)
	            elif dtype == "F32":
	                arr = np.frombuffer(raw, dtype="<f4").astype(np.float16)
	            else:
	                raise ValueError(f"Unsupported dtype: {dtype}")

	            tensors[name] = arr.reshape(info["shape"])

	    return tensors


	def peft_name_to_gguf(peft_name: str) -> str \| None:
	"""Map PEFT tensor name to GGUF tensor name.

	Input: base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight
	Output: blk.0.attn_q.weight.lora_a
	"""
	parts = peft_name.split(".")
	# Expected: base_model.model.model.layers.{i}.self_attn.{proj}.lora_{AB}.weight
	try:
	layer_idx = parts[4] # layer number
	proj = parts[6] # q_proj, k_proj, etc.
	lora_part = parts[7] # lora_A or lora_B
	except IndexError:
	return None

	gguf_proj = PROJ_MAP.get(proj)
	if gguf_proj is None:
	return None

	ab = lora_part.lower() # lora_a or lora_b
	return f"blk.{layer_idx}.{gguf_proj}.weight.{ab}"


	def convert(adapter_dir: Path, output_path: Path, adapter_name: str):
	    """Convert a PEFT LoRA adapter directory to a GGUF LoRA file.

	    Reads ``adapter_config.json`` and ``adapter_model.safetensors`` from
	    ``adapter_dir``, renames each tensor with ``peft_name_to_gguf`` (tensors
	    it cannot map are reported and skipped), and writes the result with
	    fixed adapter metadata plus the adapter's ``lora_alpha``.

	    NOTE(review): the base-model metadata is hard-coded to Llama 3.1 8B
	    Instruct regardless of what the adapter config says.
	    NOTE(review): tensor data is written exactly as read. llama.cpp's own
	    LoRA converter appears to route tensors through the base
	    architecture's weight transforms (for Llama, a q/k row permutation) —
	    confirm that adapters produced here behave correctly when loaded.

	    Args:
	        adapter_dir: Directory holding the PEFT adapter files.
	        output_path: Destination ``.gguf`` file.
	        adapter_name: Value stored under ``general.name``.

	    Raises:
	        FileNotFoundError: If either input file is missing.
	        ValueError: If no tensor in the adapter maps to a GGUF name, or a
	            tensor cannot be read (propagated from ``read_safetensors``).
	    """
	    config_path = adapter_dir / "adapter_config.json"
	    safetensors_path = adapter_dir / "adapter_model.safetensors"

	    if not config_path.exists():
	        raise FileNotFoundError(f"No adapter_config.json in {adapter_dir}")
	    if not safetensors_path.exists():
	        raise FileNotFoundError(f"No adapter_model.safetensors in {adapter_dir}")

	    # Read config
	    with open(config_path, encoding="utf-8") as f:
	        config = json.load(f)

	    lora_alpha = config.get("lora_alpha", 32)
	    lora_rank = config.get("r", 16)
	    print(f" Config: rank={lora_rank}, alpha={lora_alpha}")

	    # Read tensors
	    print(" Reading safetensors...")
	    tensors = read_safetensors(safetensors_path)
	    print(f" Loaded {len(tensors)} tensors")

	    # Resolve every name before any output is created, so an adapter with
	    # nothing convertible fails here instead of producing an empty file.
	    mapped = []
	    for peft_name in sorted(tensors):
	        gguf_name = peft_name_to_gguf(peft_name)
	        if gguf_name is None:
	            print(f" SKIP: {peft_name}")
	            continue
	        mapped.append((gguf_name, tensors[peft_name]))

	    if not mapped:
	        raise ValueError(f"No convertible LoRA tensors found in {safetensors_path}")

	    # Only clean up a file this call created; never delete one that was
	    # already present before the conversion started.
	    existed_before = output_path.exists()
	    completed = False

	    # Create GGUF writer
	    writer = GGUFWriter(str(output_path), arch="llama")
	    try:
	        # Write metadata (matching the newton GGUF format exactly)
	        writer.add_string("general.type", "adapter")
	        writer.add_string("adapter.type", "lora")
	        writer.add_string("general.name", adapter_name)
	        writer.add_uint32("general.base_model.count", 1)
	        writer.add_string("general.base_model.0.name", "Llama 3.1 8B Instruct")
	        writer.add_string("general.base_model.0.organization", "Meta Llama")
	        writer.add_string(
	            "general.base_model.0.repo_url",
	            "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
	        )
	        writer.add_array("general.tags", [
	            "base_model:adapter:meta-llama/Llama-3.1-8B-Instruct",
	            "lora", "sft", "transformers", "trl", "text-generation",
	        ])
	        writer.add_float32("adapter.lora.alpha", float(lora_alpha))
	        writer.add_uint32("general.quantization_version", 2)

	        # GGUF LoRA expects F16 (type=1)
	        for gguf_name, data in mapped:
	            writer.add_tensor(gguf_name, data, raw_dtype=GGMLQuantizationType.F16)

	        print(f" Converted {len(mapped)} tensors")

	        # Write file
	        writer.write_header_to_file()
	        writer.write_kv_data_to_file()
	        writer.write_tensors_to_file()
	        completed = True
	    finally:
	        try:
	            writer.close()
	        finally:
	            # A half-written file would otherwise be mistaken for a
	            # finished conversion by callers that test for its existence.
	            if not completed and not existed_before and output_path.exists():
	                output_path.unlink()

	    size_mb = output_path.stat().st_size / 1024 / 1024
	    print(f" Output: {output_path} ({size_mb:.1f} MB)")


	def main():
	adapters_dir = Path("J:/codette-training-lab/adapters")
	hf_dir = adapters_dir / "hf_download"

	# Convert all adapters that have safetensors but no GGUF yet
	to_convert = []
	for name in ["empathy", "philosophy", "quantum",
	"consciousness", "multi_perspective", "systems_architecture"]:
	src = hf_dir / name
	dst = adapters_dir / f"{name}-lora-f16.gguf"
	if src.exists() and (src / "adapter_model.safetensors").exists():
	if dst.exists():
	print(f"SKIP {name}: GGUF already exists")
	else:
	to_convert.append((name, src, dst))
	else:
	print(f"SKIP {name}: no safetensors found")

	if not to_convert:
	print("Nothing to convert!")
	return

	for name, src, dst in to_convert:
	print(f"\nConverting {name}...")
	try:
	convert(src, dst, name)
	print(f"OK: {name}")
	except Exception as e:
	print(f"FAIL: {name}: {e}")


	# Script entry point: hand main()'s result to the interpreter as the exit
	# status (a None result still exits with status 0).
	if __name__ == "__main__":
	    sys.exit(main())