|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Convert fine-tuned LoRA model to GGUF format with Q4_K_M quantization. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import subprocess |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
from peft import PeftModel |
|
|
import torch |
|
|
|
|
|
|
|
|
# Conversion settings. Each value may be overridden through an environment
# variable of the same name; with nothing set, the defaults below reproduce
# the original hard-coded run.
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "nathens/qwen-codeforces-sft")  # LoRA adapter repo
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")  # model the adapter is applied to
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "nathens/my-model-gguf")  # Hub repo that receives the GGUF files
QUANTIZATION = os.environ.get("QUANTIZATION", "Q4_K_M")  # type name passed to llama-quantize

# Echo the effective configuration before any slow work starts.
print("🔧 Converting model to GGUF")
print(f" Base model: {BASE_MODEL}")
print(f" Adapter: {ADAPTER_MODEL}")
print(f" Output: {OUTPUT_REPO}")
print(f" Quantization: {QUANTIZATION}")
|
|
|
|
|
|
|
|
# Load the base checkpoint in half precision, plus the tokenizer from the same
# repo so it can be saved next to the merged weights later.
# NOTE(review): trust_remote_code=True lets the model repo ship Python that is
# executed locally - confirm it is actually required for BASE_MODEL.
print("\n📦 Loading base model and tokenizer...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
|
|
|
|
|
|
|
# Attach the LoRA adapter to the base model, then merge it so that
# `merged_model` is what gets saved and converted in the following steps.
print(f"🔀 Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
print("⚙️ Merging adapter weights into base model...")
merged_model = model.merge_and_unload()
|
|
|
|
|
|
|
|
# Write the merged weights and the tokenizer into one directory; that
# directory is later handed to the GGUF converter as its input.
print("💾 Saving merged model...")
merged_dir = "./merged_model"
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)
print(f"✅ Merged model saved to {merged_dir}")
|
|
|
|
|
|
|
|
# Install the build toolchain and fetch llama.cpp, whose checkout supplies the
# converter script and the quantizer source used by the later steps.
print("\n📥 Installing llama.cpp for GGUF conversion...")
subprocess.run(["apt-get", "update", "-qq"], check=True)
subprocess.run(
    ["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"],
    check=True,
)

# `git clone` aborts when the destination already exists and is non-empty, so
# reuse a checkout left behind by an earlier run instead of failing on re-run.
if not os.path.isdir("llama.cpp/.git"):
    subprocess.run(
        [
            "git", "clone",
            "--depth", "1",  # history is not needed; only the current tree is built
            "https://github.com/ggerganov/llama.cpp.git",
        ],
        check=True,
    )
|
|
|
|
|
|
|
|
# Ask the system how many CPUs are available so the build can run in parallel.
nproc_result = subprocess.run(["nproc"], capture_output=True, text=True, check=True)
nproc = nproc_result.stdout.strip()
print(f"Building llama.cpp with {nproc} cores using CMake...")

# Configure. `cmake -B` creates the build directory itself, so no explicit
# makedirs is needed. libcurl headers are not among the packages installed
# earlier in this script; llama.cpp's build docs give -DLLAMA_CURL=OFF as the
# way to configure without them (an unused -D variable only produces a warning).
subprocess.run(
    ["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp", "-DLLAMA_CURL=OFF"],
    check=True,
)

# Only the quantizer binary (build/bin/llama-quantize) is invoked later in
# this script, so build just that target rather than every tool and example.
subprocess.run(
    [
        "cmake", "--build", "llama.cpp/build",
        "--config", "Release",
        "--target", "llama-quantize",
        "-j", nproc,
    ],
    check=True,
)
|
|
|
|
|
|
|
|
import sys  # local import: only needed here to locate the running interpreter

# Export the merged model as an unquantized FP16 GGUF file; the quantization
# step reads ./model-f16.gguf as its input.
print("\n🔄 Converting to GGUF format...")
subprocess.run(
    [
        # Run the converter with the interpreter that is executing this script
        # so it sees the same installed packages; a bare "python3" on PATH may
        # belong to a different environment. Fall back to "python3" only if
        # the interpreter path cannot be determined.
        sys.executable or "python3", "llama.cpp/convert_hf_to_gguf.py",
        merged_dir,
        "--outfile", "./model-f16.gguf",
        "--outtype", "f16",
    ],
    check=True,
)
print("✅ Converted to FP16 GGUF")
|
|
|
|
|
|
|
|
# Quantize the FP16 GGUF with the llama-quantize binary built earlier.
# Arguments are: input file, output file, quantization type name.
print(f"\n⚡ Quantizing to {QUANTIZATION}...")
subprocess.run(
    [
        "./llama.cpp/build/bin/llama-quantize",
        "./model-f16.gguf",
        f"./model-{QUANTIZATION}.gguf",
        QUANTIZATION,
    ],
    check=True,
)
print(f"✅ Quantized to {QUANTIZATION}")
|
|
|
|
|
|
|
|
print(f"\n📤 Uploading to {OUTPUT_REPO}...")
from huggingface_hub import HfApi

# Hub client shared by every upload call that follows.
api = HfApi()

# Make sure the destination repo exists; exist_ok=True means an already
# existing repo is not treated as an error. Any other failure is only
# reported here - the upload calls that follow are not guarded and will
# raise if the repo is genuinely unusable.
try:
    api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
except Exception as e:
    print(f"Note: {e}")
|
|
|
|
|
|
|
|
# Push both GGUF artifacts - the quantized file first, then the FP16 file.
# Each one keeps its local file name as its path inside the repo.
for gguf_name in (f"model-{QUANTIZATION}.gguf", "model-f16.gguf"):
    api.upload_file(
        path_or_fileobj=f"./{gguf_name}",
        path_in_repo=gguf_name,
        repo_id=OUTPUT_REPO,
        repo_type="model",
    )
|
|
|
|
|
|
|
|
# Tokenizer uploads are best-effort: a failure here must not abort the run
# after the GGUF files have already been pushed.
for file in ["tokenizer.json", "tokenizer_config.json"]:
    try:
        api.upload_file(
            path_or_fileobj=f"{merged_dir}/{file}",
            path_in_repo=file,
            repo_id=OUTPUT_REPO,
            repo_type="model",
        )
    except Exception as e:
        # Keep going, but report what was skipped instead of hiding it.
        print(f"Note: could not upload {file}: {e}")
|
|
|
|
|
# Closing summary: where the result lives and how to fetch the quantized file.
print("\n✅ Conversion complete!")
print(f"🔗 GGUF model available at: https://huggingface.co/{OUTPUT_REPO}")
print("\n💡 To use with Ollama:")
print(f" huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf")
|
|
|