# training-scripts / convert_to_gguf.py
# Uploaded by albertlieadrian with huggingface_hub (commit 253f8fb, verified).
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "transformers>=4.36.0",
# "peft>=0.7.0",
# "torch>=2.0.0",
# "accelerate>=0.24.0",
# "huggingface_hub>=0.20.0",
# "sentencepiece>=0.1.99",
# "protobuf>=3.20.0",
# "numpy",
# "gguf",
# ]
# system_dependencies = ["build-essential", "cmake", "git"]
# ///
"""
GGUF Conversion - Q4_K_M Only
Converts fine-tuned model to GGUF with Q4_K_M quantization.
"""
import os
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi
import subprocess
def install_build_tools():
    """Install the system packages needed to build llama.cpp.

    Runs ``apt-get update`` and then ``apt-get install`` for
    ``build-essential``, ``cmake`` and ``git``, capturing their output so a
    successful run stays quiet.

    Returns:
        bool: True if both apt-get commands exited with status 0; False if
        either exited non-zero or ``apt-get`` could not be started.
    """
    print(" Installing build tools...")
    try:
        # Update and install build tools
        subprocess.run(["apt-get", "update", "-qq"], check=True, capture_output=True)
        subprocess.run(
            ["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        print(f" ❌ Failed to install build tools: {e}")
        if e.stderr:
            # Output was captured as bytes; show the tail, where apt-get
            # reports the actual error, instead of discarding it.
            stderr_text = e.stderr.decode("utf-8", errors="replace")
            print(f" STDERR: {stderr_text[-500:]}")
        return False
    except OSError as e:
        # apt-get missing or not executable (e.g. a non-Debian image).
        print(f" ❌ Failed to install build tools: {e}")
        return False
    print(" ✅ Build tools installed")
    return True
def run_command(cmd, description):
    """Run an external command, reporting progress and failures on stdout.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell is used).
        description: Short label printed before the command starts.

    Returns:
        bool: True if the command exited with status 0; False if it exited
        non-zero or could not be started at all.
    """
    print(f" {description}...")
    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True,
            # Tool output is not guaranteed to be valid text in the locale
            # encoding; never let a decode error mask the real outcome.
            errors="replace",
        )
    except subprocess.CalledProcessError as e:
        print(f" ❌ Command failed: {' '.join(map(str, cmd))}")
        if e.stderr:
            # Show the tail: compilers and Python tracebacks put the actual
            # error at the end of stderr, not the beginning.
            print(f" STDERR: {e.stderr[-500:]}")
        return False
    except FileNotFoundError:
        print(f" ❌ Command not found: {cmd[0]}")
        return False
    except OSError as e:
        # e.g. PermissionError when the binary exists but is not executable.
        print(f" ❌ Could not run {cmd[0]}: {e}")
        return False
    if result.stdout:
        # Only a short preview, to keep the log readable.
        print(f" {result.stdout[:200]}")
    return True
print("🔄 GGUF Conversion - Q4_K_M")
print("=" * 60)

# Install build tools FIRST (before cloning llama.cpp).
# A failure here is deliberately non-fatal, but it is reported rather than
# ignored: the later git/cmake steps will fail on their own if the tools
# are really missing.
if not install_build_tools():
    print(" ⚠️ Build tools were not installed; continuing with whatever is already available")

# Configuration from environment variables; each falls back to a built-in default.
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "albertlieadrian/qwen3-0.6b-codeforces-sft")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen3-0.6B")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "albertlieadrian/qwen3-0.6b-codeforces-sft-gguf")
# NOTE(review): HF_USERNAME is not read anywhere in the visible script; confirm before removing.
HF_USERNAME = os.environ.get("HF_USERNAME", "albertlieadrian")

print("\n📦 Configuration:")
print(f" Base model: {BASE_MODEL}")
print(f" Adapter model: {ADAPTER_MODEL}")
print(f" Output repo: {OUTPUT_REPO}")
# Step 1: Load base model and adapter
print("\n🔧 Step 1: Loading base model and LoRA adapter...")

# Load the base checkpoint in float16. Any failure aborts with exit code 1.
# NOTE(review): recent transformers releases accept `dtype=`; older ones
# expected `torch_dtype=`. Confirm against the minimum version declared in
# the script header.
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    print(" ✅ Base model loaded")
except Exception as e:
    print(f" ❌ Failed to load base model: {e}")
    sys.exit(1)

# Attach the LoRA adapter, then fold its weights into the base model so the
# result can be saved as a plain checkpoint.
try:
    print(" Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
    print(" ✅ Adapter loaded")
    print(" Merging adapter with base model...")
    merged_model = model.merge_and_unload()
    print(" ✅ Models merged!")
except Exception as e:
    print(f" ❌ Failed to merge models: {e}")
    sys.exit(1)

# The tokenizer is taken from the adapter repository, not the base model.
try:
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
    print(" ✅ Tokenizer loaded")
except Exception as e:
    print(f" ❌ Failed to load tokenizer: {e}")
    sys.exit(1)
# Step 2: Save merged model
print("\n💾 Step 2: Saving merged model...")
# Model weights (safetensors) and tokenizer files go into the same directory,
# which is later passed to the GGUF converter as its input.
merged_dir = "/tmp/merged_model"
try:
    merged_model.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)
    print(f" ✅ Merged model saved to {merged_dir}")
except Exception as e:
    print(f" ❌ Failed to save merged model: {e}")
    sys.exit(1)
# Step 3: Setup llama.cpp
print("\n📥 Step 3: Setting up llama.cpp...")
llama_cpp_dir = "/tmp/llama.cpp"
if os.path.isdir(os.path.join(llama_cpp_dir, ".git")):
    # `git clone` refuses to write into an existing non-empty directory, so a
    # second run in the same environment reuses the existing checkout.
    print(f" Reusing existing checkout at {llama_cpp_dir}")
elif not run_command(
    ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", llama_cpp_dir],
    "Cloning llama.cpp",
):
    sys.exit(1)

print(" Installing Python dependencies...")
# Install through the running interpreter so the packages land in the same
# environment that later executes the conversion script. Both installs are
# best-effort: their result is intentionally not checked.
pip_install = [sys.executable, "-m", "pip", "install"]
run_command(pip_install + ["-r", f"{llama_cpp_dir}/requirements.txt"], "Installing requirements")
run_command(pip_install + ["sentencepiece", "protobuf"], "Installing tokenizer deps")
# Step 4: Convert to GGUF (FP16)
print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)
convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
# Output files are named after the last component of the adapter id.
# A trailing slash is stripped first so "org/name/" does not yield an empty name.
model_name = ADAPTER_MODEL.rstrip("/").split("/")[-1]
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
if not run_command(
    [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
    "Converting to FP16",
):
    print(" ❌ Conversion failed!")
    sys.exit(1)
print(f" ✅ FP16 GGUF created: {gguf_file}")
# Step 5: Quantize to Q4_K_M
print("\n⚙️ Step 5: Quantizing to Q4_K_M...")

# Build quantize tool with CMake (CUDA disabled; only the llama-quantize
# target is built, not the whole project).
print(" Building quantize tool with CMake...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
if not run_command(
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    "Configuring with CMake",
):
    sys.exit(1)
if not run_command(
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
    "Building llama-quantize",
):
    sys.exit(1)
print(" ✅ Quantize tool built")

quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
quant_file = f"{gguf_output_dir}/{model_name}-q4_k_m.gguf"
print(" Creating Q4_K_M quantization...")
if not run_command([quantize_bin, gguf_file, quant_file, "Q4_K_M"], "Quantizing to Q4_K_M"):
    print(" ❌ Quantization failed!")
    sys.exit(1)

# Size in MiB of the quantized file; also interpolated into the README table.
size_mb = os.path.getsize(quant_file) / (1024 * 1024)
print(f" ✅ Q4_K_M: {size_mb:.1f} MB")
# Step 6: Upload to Hub
print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
api = HfApi()
print(f" Creating repository: {OUTPUT_REPO}")
try:
    api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
    print(" ✅ Repository ready")
except Exception as e:
    # exist_ok=True already covers "repository exists", so an exception here
    # is some other failure. Stay best-effort: report it and let the upload
    # below decide whether the run can succeed.
    print(f" ⚠️ Could not create or verify repository (upload will still be attempted): {e}")

# Upload Q4_K_M
print(" Uploading Q4_K_M GGUF...")
try:
    api.upload_file(
        path_or_fileobj=quant_file,
        path_in_repo=f"{model_name}-q4_k_m.gguf",
        repo_id=OUTPUT_REPO,
    )
    print(" ✅ Q4_K_M uploaded")
except Exception as e:
    print(f" ❌ Upload failed: {e}")
    sys.exit(1)
# Create README
print("\n📝 Creating README...")
# Model card: YAML front matter followed by Markdown. The blank line before
# the closing `---` keeps it a horizontal rule; directly under a text line,
# Markdown would turn that line into a setext heading.
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- trl
- sft
---
# {model_name}-gguf
This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), which is a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).
## Model Details
- **Base Model:** {BASE_MODEL}
- **Fine-tuned Model:** {ADAPTER_MODEL}
- **Training:** Supervised Fine-Tuning (SFT) with TRL
- **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)
## Quantization
| File | Quant | Size | Description |
|------|-------|------|-------------|
| {model_name}-q4_k_m.gguf | Q4_K_M | ~{size_mb:.0f}MB | 4-bit medium (recommended) |
## Usage
### With llama.cpp
```bash
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf
./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt"
```
### With Ollama
1. Create a `Modelfile`:
```
FROM ./{model_name}-q4_k_m.gguf
```
2. Create and run:
```bash
ollama create my-model -f Modelfile
ollama run my-model
```
### With LM Studio
1. Download the `.gguf` file
2. Import into LM Studio
3. Start chatting!
## License
Inherits the license from the base model: {BASE_MODEL}

---
*Converted to GGUF format using llama.cpp*
"""
# The README is best-effort: the model file is already uploaded at this
# point, so a failure here is reported but does not abort the script.
try:
    api.upload_file(
        path_or_fileobj=readme_content.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
    )
    print(" ✅ README uploaded")
except Exception as e:
    print(f" ❌ README upload failed: {e}")
# Final summary: where the result lives and how to fetch and run it.
summary_lines = [
    "\n" + "=" * 60,
    "✅ GGUF Conversion Complete!",
    f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}",
    "\n📥 Download with:",
    f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf",
    "\n🚀 Use with Ollama:",
    " 1. Download the GGUF file",
    f" 2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf",
    " 3. ollama create my-model -f Modelfile",
    " 4. ollama run my-model",
    "=" * 60,
]
# One print of the joined lines emits the same text as printing each line separately.
print("\n".join(summary_lines))