training-scripts / convert_to_gguf.py
nathens's picture
Upload convert_to_gguf.py with huggingface_hub
59c9a78 verified
# /// script
# dependencies = ["transformers", "peft", "huggingface_hub", "torch"]
# ///
"""
Convert fine-tuned LoRA model to GGUF format with quantization.
Merges adapter with base model, converts to GGUF, and uploads to Hub.
"""
import os
import subprocess
import sys

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Runtime settings. Each one is read from the environment variable of the
# same name and falls back to the default given here when it is unset.
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "nathens/qwen-codeforces-sft")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "nathens/my-model-gguf")
QUANTIZATION = os.environ.get("QUANTIZATION", "Q4_K_M")

# Echo the effective settings so they show up at the top of the run log.
print(
    "πŸ”§ Converting model to GGUF",
    f" Base model: {BASE_MODEL}",
    f" Adapter: {ADAPTER_MODEL}",
    f" Output: {OUTPUT_REPO}",
    f" Quantization: {QUANTIZATION}",
    sep="\n",
)
# Step 1: Fetch the base checkpoint and its tokenizer
print("\nπŸ“¦ Loading base model and tokenizer...")
# Options for the model load: half-precision weights, automatic device
# placement, and permission to use code shipped with the model repo.
base_load_options = {
    "dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **base_load_options)
# The tokenizer comes from the base model repo, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Step 2: Attach the LoRA adapter to the base model and fold it in
print(f"πŸ”€ Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
lora_model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
print("βš™οΈ Merging adapter weights into base model...")
merged_model = lora_model.merge_and_unload()

# Step 3: Write the merged weights and the tokenizer to one directory,
# which the later conversion and upload steps read from.
merged_dir = "./merged_model"
print("πŸ’Ύ Saving merged model...")
for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(merged_dir)
print(f"βœ… Merged model saved to {merged_dir}")
# Step 4: Install llama.cpp for conversion
print("\nπŸ“₯ Installing llama.cpp for GGUF conversion...")
subprocess.run(["apt-get", "update", "-qq"], check=True)
subprocess.run(
    ["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"],
    check=True,
)
# `git clone` refuses to write into an existing non-empty directory, so a
# re-run in the same working directory reuses the checkout already there.
# Only the current tree is needed, hence the shallow clone.
if not os.path.isdir("llama.cpp/.git"):
    subprocess.run(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git"],
        check=True,
    )
# Number of CPUs this process is allowed to run on (the figure `nproc`
# reports), obtained without spawning a helper process.
nproc = str(len(os.sched_getaffinity(0)))
print(f"Building llama.cpp with {nproc} cores using CMake...")
# Use CMake to build (CPU only - CUDA not needed for conversion/quantization)
os.makedirs("llama.cpp/build", exist_ok=True)
# NOTE: some llama.cpp revisions enable LLAMA_CURL by default and abort the
# configure step when libcurl development files are missing (none are
# installed above). Model downloading is not needed here, so turn it off.
subprocess.run(
    ["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp", "-DLLAMA_CURL=OFF"],
    check=True,
)
# Only the quantizer binary is run afterwards, so build just that target
# instead of every tool and example in the repository.
subprocess.run(
    [
        "cmake", "--build", "llama.cpp/build",
        "--config", "Release",
        "--target", "llama-quantize",
        "-j", nproc,
    ],
    check=True,
)
# Step 5: Convert to GGUF format
print("\nπŸ”„ Converting to GGUF format...")
# Launch the converter with the interpreter that is running this script
# rather than whatever "python3" resolves to on PATH, so it executes in the
# same environment (and with the same installed packages) as this script.
subprocess.run(
    [
        sys.executable, "llama.cpp/convert_hf_to_gguf.py",
        merged_dir,
        "--outfile", "./model-f16.gguf",
        "--outtype", "f16",
    ],
    check=True,
)
print("βœ… Converted to FP16 GGUF")
# Step 6: Quantize to specified format
print(f"\n⚑ Quantizing to {QUANTIZATION}...")
# Arguments: input GGUF, output GGUF, quantization type.
subprocess.run(
    [
        "./llama.cpp/build/bin/llama-quantize",
        "./model-f16.gguf",
        f"./model-{QUANTIZATION}.gguf",
        QUANTIZATION,
    ],
    check=True,
)
print(f"βœ… Quantized to {QUANTIZATION}")
# Step 7: Publish the GGUF artifacts to the Hub
print(f"\nπŸ“€ Uploading to {OUTPUT_REPO}...")
from huggingface_hub import HfApi

api = HfApi()
# Make sure the destination repository exists. A failure here is only
# reported; the uploads below are still attempted.
try:
    api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
except Exception as err:
    print(f"Note: {err}")
# Quantized file first, then the original FP16 file; both are stored at the
# repository root under their local file names.
for gguf_name in (f"model-{QUANTIZATION}.gguf", "model-f16.gguf"):
    api.upload_file(
        path_or_fileobj=f"./{gguf_name}",
        path_in_repo=gguf_name,
        repo_id=OUTPUT_REPO,
        repo_type="model",
    )
# Upload tokenizer files
# Not every file in this list is necessarily written by the tokenizer, so a
# missing file is skipped explicitly. This step stays best-effort: a failed
# upload does not abort the script, but it is reported instead of being
# silently discarded.
for filename in [
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json",
    "merges.txt",
    "special_tokens_map.json",
]:
    local_path = f"{merged_dir}/{filename}"
    if not os.path.isfile(local_path):
        continue
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=OUTPUT_REPO,
            repo_type="model",
        )
    except Exception as err:
        print(f"Warning: could not upload {filename}: {err}")
# Closing summary: where the result was published and how to load it in Ollama.
print(
    "\nβœ… Conversion complete!",
    f"πŸ“ GGUF model available at: https://huggingface.co/{OUTPUT_REPO}",
    "\nπŸ’‘ To use with Ollama:",
    f" 1. Download: huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf",
    " 2. Create Modelfile with the downloaded GGUF",
    " 3. Run: ollama create my-model -f Modelfile",
    " 4. Use: ollama run my-model",
    sep="\n",
)