export_gguf.py · AaryanK/ModelGate at main

ModelGate / export_gguf.py

Upload export_gguf.py with huggingface_hub

e6aab9e verified 7 days ago

3.21 kB

	#!/usr/bin/env python3
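	"""Export the stock and no-CoT fine-tuned models to GGUF Q8_0.

	Models are processed one at a time to avoid OOM on 8GB VRAM. Pass "stock",
	"nocot", or "all" (the default) as the first argument to choose what to export.
	"""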

	import torch
	import subprocess
	import shutil
	import sys
	from pathlib import Path

	BASE_MODEL = "katanemo/Arch-Router-1.5B"
	LORA_PATH = "finetuning/modelgate_arch_router_nocot_lora"
	LLAMA_CPP_DIR = Path("finetuning/llama.cpp")

	# Which export to run, taken from the first CLI argument; defaults to "all".
	step = sys.argv[1] if len(sys.argv) > 1 else "all"
	if step not in ("stock", "nocot", "all"):
	    # An unrecognised step would otherwise match neither export branch further
	    # down, so the script would do nothing and still print "Done!".
	    sys.exit(f"Unknown step {step!r}: expected 'stock', 'nocot' or 'all'")

	# Clone llama.cpp for the converter script
	if not LLAMA_CPP_DIR.exists():
	    print("Cloning llama.cpp for GGUF converter...")
	    subprocess.run(
	        ["git", "clone", "--depth=1", "https://github.com/ggerganov/llama.cpp", str(LLAMA_CPP_DIR)],
	        check=True,  # raise CalledProcessError if the clone fails
	    )

	# Converter script expected inside the llama.cpp checkout.
	CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert_hf_to_gguf.py"

	if step in ("stock", "all"):
	    # Stock export: save the base model in HF format, then convert that
	    # snapshot to a GGUF Q8_0 file with the llama.cpp converter.
	    stock_dir = Path("finetuning/stock_hf_temp")  # temporary HF-format snapshot
	    stock_gguf = Path("finetuning/stock_arch_router.Q8_0.gguf")

	    if not stock_gguf.exists():
	        try:
	            print("=== Step 1: Save stock model to HF format ===")
	            from transformers import AutoModelForCausalLM, AutoTokenizer
	            tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
	            # Load in float16 on CPU to avoid GPU OOM
	            model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
	            model.save_pretrained(str(stock_dir))
	            tokenizer.save_pretrained(str(stock_dir))
	            # Drop the in-process references before launching the converter.
	            del model, tokenizer
	            print("Stock HF model saved\n")

	            print("=== Step 2: Convert stock to GGUF Q8_0 ===")
	            subprocess.run(
	                # sys.executable runs the converter with the same interpreter
	                # (and installed packages) as this script, instead of whatever
	                # "python3" happens to resolve to on PATH.
	                [sys.executable, str(CONVERT_SCRIPT), str(stock_dir),
	                 "--outfile", str(stock_gguf), "--outtype", "q8_0"],
	                check=True,
	            )
	            print(f"Stock GGUF saved: {stock_gguf} ({stock_gguf.stat().st_size / 1e6:.0f} MB)\n")
	        except BaseException:
	            # A failed or interrupted conversion (including Ctrl-C) may leave a
	            # truncated output file; remove it so the exists() check above does
	            # not mistake it for a finished export on the next run.
	            if stock_gguf.exists():
	                stock_gguf.unlink()
	            raise
	        finally:
	            # The HF snapshot is only an intermediate; remove it on success and failure.
	            shutil.rmtree(str(stock_dir), ignore_errors=True)
	    else:
	        print(f"Stock GGUF already exists: {stock_gguf}")

	if step in ("nocot", "all"):
	    # No-CoT export: merge the LoRA adapter into the base model, save the merged
	    # model in HF format, then convert that snapshot to a GGUF Q8_0 file.
	    nocot_dir = Path("finetuning/nocot_hf_temp")  # temporary merged HF-format snapshot
	    nocot_gguf = Path("finetuning/nocot_arch_router.Q8_0.gguf")

	    if not nocot_gguf.exists():
	        try:
	            print("=== Step 3: Merge LoRA and save no-CoT model ===")
	            from transformers import AutoModelForCausalLM, AutoTokenizer
	            from peft import PeftModel
	            tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
	            # Same float16-on-CPU loading as the stock step.
	            model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
	            model = PeftModel.from_pretrained(model, LORA_PATH, device_map="cpu")
	            # Merge the adapter so the saved checkpoint is a plain model.
	            model = model.merge_and_unload()
	            model.save_pretrained(str(nocot_dir))
	            tokenizer.save_pretrained(str(nocot_dir))
	            # Drop the in-process references before launching the converter.
	            del model, tokenizer
	            print("No-CoT merged HF model saved\n")

	            print("=== Step 4: Convert no-CoT to GGUF Q8_0 ===")
	            subprocess.run(
	                # sys.executable runs the converter with the same interpreter
	                # (and installed packages) as this script, instead of whatever
	                # "python3" happens to resolve to on PATH.
	                [sys.executable, str(CONVERT_SCRIPT), str(nocot_dir),
	                 "--outfile", str(nocot_gguf), "--outtype", "q8_0"],
	                check=True,
	            )
	            print(f"No-CoT GGUF saved: {nocot_gguf} ({nocot_gguf.stat().st_size / 1e6:.0f} MB)\n")
	        except BaseException:
	            # A failed or interrupted conversion (including Ctrl-C) may leave a
	            # truncated output file; remove it so the exists() check above does
	            # not mistake it for a finished export on the next run.
	            if nocot_gguf.exists():
	                nocot_gguf.unlink()
	            raise
	        finally:
	            # The HF snapshot is only an intermediate; remove it on success and failure.
	            shutil.rmtree(str(nocot_dir), ignore_errors=True)
	    else:
	        print(f"No-CoT GGUF already exists: {nocot_gguf}")

	print("Done!")