training-scripts / convert_to_gguf.py

Upload convert_to_gguf.py with huggingface_hub

59c9a78 verified about 1 month ago

4.54 kB

	# /// script
	# dependencies = ["transformers", "peft", "huggingface_hub", "torch"]
	# ///

	"""
	Convert fine-tuned LoRA model to GGUF format with quantization.
	Merges adapter with base model, converts to GGUF, and uploads to Hub.
	"""

	import os
	import subprocess
	import sys

	import torch
	from peft import PeftModel
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Runtime settings. Each one is read from the environment variable of the
	# same name and falls back to the literal default when that variable is unset.
	ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "nathens/qwen-codeforces-sft")
	BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
	OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "nathens/my-model-gguf")
	QUANTIZATION = os.environ.get("QUANTIZATION", "Q4_K_M")

	# Echo the effective configuration (one item per line) before any work starts.
	print(
	    "🔧 Converting model to GGUF",
	    f" Base model: {BASE_MODEL}",
	    f" Adapter: {ADAPTER_MODEL}",
	    f" Output: {OUTPUT_REPO}",
	    f" Quantization: {QUANTIZATION}",
	    sep="\n",
	)

	# Directory that receives the merged checkpoint; the later conversion and
	# upload steps read from it.
	merged_dir = "./merged_model"

	# Step 1: Load the base checkpoint (as float16) and the base model's tokenizer
	print("\n📦 Loading base model and tokenizer...")
	base_model = AutoModelForCausalLM.from_pretrained(
	    BASE_MODEL,
	    dtype=torch.float16,
	    device_map="auto",
	    trust_remote_code=True,
	)
	tokenizer = AutoTokenizer.from_pretrained(
	    BASE_MODEL,
	    trust_remote_code=True,
	)

	# Step 2: Wrap the base model with the LoRA adapter, then fold the adapter
	# weights into it so a plain (adapter-free) model remains
	print(f"🔀 Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
	model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
	print("⚙️ Merging adapter weights into base model...")
	merged_model = model.merge_and_unload()

	# Step 3: Write the merged weights and the tokenizer files side by side
	print("💾 Saving merged model...")
	for artifact in (merged_model, tokenizer):
	    artifact.save_pretrained(merged_dir)
	print(f"✅ Merged model saved to {merged_dir}")

	# Step 4: Install llama.cpp for conversion
	# The checkout supplies convert_hf_to_gguf.py and, once built, the
	# llama-quantize binary that the next two steps invoke.
	print("\n📥 Installing llama.cpp for GGUF conversion...")
	subprocess.run(["apt-get", "update", "-qq"], check=True)
	subprocess.run(
	    ["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"],
	    check=True,
	)

	# `git clone` aborts when the destination directory already exists, which
	# made a second run of this script fail here; reuse an existing checkout.
	# A shallow clone is enough because only the current tree is built.
	if not os.path.isdir("llama.cpp"):
	    subprocess.run(
	        [
	            "git", "clone", "--depth", "1",
	            "https://github.com/ggerganov/llama.cpp.git",
	        ],
	        check=True,
	    )

	# Get number of processors (kept as a string: it is passed straight to `-j`)
	nproc_result = subprocess.run(["nproc"], capture_output=True, text=True, check=True)
	nproc = nproc_result.stdout.strip()
	print(f"Building llama.cpp with {nproc} cores using CMake...")

	# Use CMake to build (CPU only - CUDA not needed for conversion/quantization)
	os.makedirs("llama.cpp/build", exist_ok=True)
	# NOTE(review): LLAMA_CURL=OFF is passed because nothing in this script needs
	# llama.cpp's model-download support, and some llama.cpp revisions reportedly
	# refuse to configure when libcurl headers are missing - confirm against the
	# revision being cloned.
	subprocess.run(
	    ["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp", "-DLLAMA_CURL=OFF"],
	    check=True,
	)
	# Only the quantizer binary is used afterwards, so build just that target
	# instead of every tool and example in the repository.
	subprocess.run(
	    [
	        "cmake", "--build", "llama.cpp/build",
	        "--config", "Release",
	        "--target", "llama-quantize",
	        "-j", nproc,
	    ],
	    check=True,
	)

	# Step 5: Convert to GGUF format
	print("\n🔄 Converting to GGUF format...")
	f16_path = "./model-f16.gguf"
	# Run the converter with the interpreter that is executing this script so it
	# sees the same installed packages, rather than whichever "python3" happens
	# to come first on PATH. Fall back to "python3" if the interpreter path is
	# unavailable.
	converter_python = sys.executable or "python3"
	subprocess.run(
	    [
	        converter_python, "llama.cpp/convert_hf_to_gguf.py",
	        merged_dir,
	        "--outfile", f16_path,
	        "--outtype", "f16",
	    ],
	    check=True,
	)
	print("✅ Converted to FP16 GGUF")

	# Step 6: Quantize to specified format
	quantized_path = f"./model-{QUANTIZATION}.gguf"
	print(f"\n⚡ Quantizing to {QUANTIZATION}...")
	if quantized_path == f16_path:
	    # QUANTIZATION == "f16" would hand llama-quantize the same file as both
	    # input and output; the FP16 file written above already is the result.
	    print("ℹ️ Requested type matches the FP16 export; skipping quantization")
	else:
	    subprocess.run(
	        [
	            "./llama.cpp/build/bin/llama-quantize",
	            f16_path,
	            quantized_path,
	            QUANTIZATION,
	        ],
	        check=True,
	    )
	    print(f"✅ Quantized to {QUANTIZATION}")

	# Step 7: Upload to Hub
	print(f"\n📤 Uploading to {OUTPUT_REPO}...")
	from huggingface_hub import HfApi
	api = HfApi()

	# Create repo if it doesn't exist. exist_ok=True is passed, so an exception
	# here signals some other problem; it is reported and the script carries on.
	try:
	    api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
	except Exception as e:
	    print(f"Note: {e}")

	# Upload the quantized GGUF file
	api.upload_file(
	    path_or_fileobj=f"./model-{QUANTIZATION}.gguf",
	    path_in_repo=f"model-{QUANTIZATION}.gguf",
	    repo_id=OUTPUT_REPO,
	    repo_type="model",
	)

	# Also upload the original FP16 version
	api.upload_file(
	    path_or_fileobj="./model-f16.gguf",
	    path_in_repo="model-f16.gguf",
	    repo_id=OUTPUT_REPO,
	    repo_type="model",
	)

	# Upload tokenizer files (best effort)
	for filename in [
	    "tokenizer.json",
	    "tokenizer_config.json",
	    "vocab.json",
	    "merges.txt",
	    "special_tokens_map.json",
	]:
	    local_path = f"{merged_dir}/{filename}"
	    # Some of these files may not have been written for this tokenizer;
	    # skip those explicitly instead of relying on a swallowed exception.
	    if not os.path.isfile(local_path):
	        continue
	    try:
	        api.upload_file(
	            path_or_fileobj=local_path,
	            path_in_repo=filename,
	            repo_id=OUTPUT_REPO,
	            repo_type="model",
	        )
	    except Exception as e:
	        # Still non-fatal, but a genuine upload failure is now visible
	        # rather than being silently discarded.
	        print(f"Warning: could not upload {filename}: {e}")

	# Closing summary: where the result was published, followed by brief
	# instructions for using the quantized file with Ollama.
	print(
	    "\n✅ Conversion complete!",
	    f"📁 GGUF model available at: https://huggingface.co/{OUTPUT_REPO}",
	    "\n💡 To use with Ollama:",
	    f" 1. Download: huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf",
	    " 2. Create Modelfile with the downloaded GGUF",
	    " 3. Run: ollama create my-model -f Modelfile",
	    " 4. Use: ollama run my-model",
	    sep="\n",
	)