# NOTE: removed code-viewer gutter artifacts (file size, git-blame hashes,
# line-number strip) that were accidentally captured with the source — they
# are not part of the script and are not valid Python.
# /// script
# dependencies = ["transformers", "peft", "huggingface_hub", "torch"]
# ///
"""
Convert fine-tuned LoRA model to GGUF format with quantization.
Merges adapter with base model, converts to GGUF, and uploads to Hub.
"""
import os
import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
# Configuration from environment variables or defaults.
#   ADAPTER_MODEL: Hub repo id of the fine-tuned LoRA adapter.
#   BASE_MODEL:    Hub repo id of the base model the adapter was trained on.
#   OUTPUT_REPO:   Hub repo id the GGUF files are uploaded to.
#   QUANTIZATION:  llama.cpp quantization type name (e.g. Q4_K_M, Q5_K_M, Q8_0).
ADAPTER_MODEL = os.getenv("ADAPTER_MODEL", "nathens/qwen-codeforces-sft")
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
OUTPUT_REPO = os.getenv("OUTPUT_REPO", "nathens/my-model-gguf")
QUANTIZATION = os.getenv("QUANTIZATION", "Q4_K_M")
# Banner: the original emoji was mojibake from a bad UTF-8 decode; restored.
print("🔧 Converting model to GGUF")
print(f" Base model: {BASE_MODEL}")
print(f" Adapter: {ADAPTER_MODEL}")
print(f" Output: {OUTPUT_REPO}")
print(f" Quantization: {QUANTIZATION}")
# Step 1: Load base model and tokenizer.
print("\n📦 Loading base model and tokenizer...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    # fp16 halves memory vs fp32 — fine here since we only convert, not train.
    # NOTE(review): `dtype=` requires a recent transformers; older releases
    # use `torch_dtype=` — confirm against the pinned transformers version.
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Step 2: Load the LoRA adapter and fold its weights into the base model so
# the result is a plain HF checkpoint that llama.cpp can convert.
print(f"🔄 Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
print("⚙️ Merging adapter weights into base model...")
merged_model = model.merge_and_unload()

# Step 3: Save merged model + tokenizer to a local directory for conversion.
print("💾 Saving merged model...")
merged_dir = "./merged_model"
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)
# Fix: the original print's garbled emoji split the f-string across two lines
# (an unterminated string literal — SyntaxError); rejoined on one line.
print(f"✅ Merged model saved to {merged_dir}")
# Step 4: Install build tooling and fetch llama.cpp for GGUF conversion.
print("\n📥 Installing llama.cpp for GGUF conversion...")
subprocess.run(["apt-get", "update", "-qq"], check=True)
subprocess.run(["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"], check=True)
# Shallow clone: full history is not needed just to build the tools.
subprocess.run(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git"], check=True)

# Build parallelism: os.cpu_count() avoids shelling out to `nproc`, which
# does not exist on non-Linux hosts; falls back to 1 if it returns None.
nproc = str(os.cpu_count() or 1)
print(f"Building llama.cpp with {nproc} cores using CMake...")
# CPU-only build — CUDA is not needed for conversion/quantization.
os.makedirs("llama.cpp/build", exist_ok=True)
subprocess.run(["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp"], check=True)
subprocess.run(["cmake", "--build", "llama.cpp/build", "--config", "Release", "-j", nproc], check=True)
# Step 5: Convert the merged HF checkpoint to GGUF (FP16 — no quantization yet).
print("\n🔄 Converting to GGUF format...")
subprocess.run([
    "python3", "llama.cpp/convert_hf_to_gguf.py",
    merged_dir,
    "--outfile", "./model-f16.gguf",
    "--outtype", "f16",
], check=True)
# Fix: garbled emoji had split this f-string across two lines (SyntaxError);
# rejoined on one line.
print("✅ Converted to FP16 GGUF")

# Step 6: Quantize the FP16 GGUF down to the requested format.
print(f"\n⚡ Quantizing to {QUANTIZATION}...")
subprocess.run([
    "./llama.cpp/build/bin/llama-quantize",
    "./model-f16.gguf",
    f"./model-{QUANTIZATION}.gguf",
    QUANTIZATION,
], check=True)
# Fix: same split-string SyntaxError as above; rejoined.
print(f"✅ Quantized to {QUANTIZATION}")
# Step 7: Upload the GGUF artifacts to the Hugging Face Hub.
print(f"\n📤 Uploading to {OUTPUT_REPO}...")
from huggingface_hub import HfApi
api = HfApi()
# Create the repo if needed; exist_ok makes this idempotent, so any error
# here is treated as non-fatal and merely logged.
try:
    api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
except Exception as e:
    print(f"Note: {e}")
# Upload the quantized GGUF file.
api.upload_file(
    path_or_fileobj=f"./model-{QUANTIZATION}.gguf",
    path_in_repo=f"model-{QUANTIZATION}.gguf",
    repo_id=OUTPUT_REPO,
    repo_type="model",
)
# Also upload the original FP16 version for users who want full precision.
api.upload_file(
    path_or_fileobj="./model-f16.gguf",
    path_in_repo="model-f16.gguf",
    repo_id=OUTPUT_REPO,
    repo_type="model",
)
# Upload tokenizer files alongside the GGUFs. Not every tokenizer produces
# every file (e.g. merges.txt exists only for BPE tokenizers), so missing
# local files are skipped explicitly instead of relying on a blanket except.
for file in ["tokenizer.json", "tokenizer_config.json", "vocab.json", "merges.txt", "special_tokens_map.json"]:
    local_path = os.path.join(merged_dir, file)
    if not os.path.isfile(local_path):
        continue  # optional file not produced by this tokenizer
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=file,
            repo_id=OUTPUT_REPO,
            repo_type="model",
        )
    except Exception as e:
        # Best-effort: one failed tokenizer file should not abort the run
        # after the model uploads succeeded — but surface the error instead
        # of the original silent `pass`.
        print(f"Warning: could not upload {file}: {e}")
# Fix: garbled emoji had split this f-string across two lines (SyntaxError);
# rejoined on one line.
print("\n✅ Conversion complete!")
print(f"🎉 GGUF model available at: https://huggingface.co/{OUTPUT_REPO}")
print("\n💡 To use with Ollama:")
print(f" 1. Download: huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf")
print(" 2. Create Modelfile with the downloaded GGUF")
print(" 3. Run: ollama create my-model -f Modelfile")
print(" 4. Use: ollama run my-model")
|