"""Merge LoRA adapter and export to GGUF for llama.cpp."""
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Hub id of the base model the adapter is applied to.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
# Directory holding the LoRA adapter to merge.
ADAPTER_DIR = "./adapter-model"
# Output directory for the merged, standalone Hugging Face checkpoint.
MERGED_DIR = "./merged-model"

# Load tokenizer and base weights from MODEL_ID. The model is loaded in
# float16 and placed on the CPU, so no GPU is needed for the merge.
print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="cpu", trust_remote_code=True
)

# Wrap the base model with the adapter stored in ADAPTER_DIR.
print("Loading adapter...")
model = PeftModel.from_pretrained(model, ADAPTER_DIR)

# Fold the adapter into the base weights; the returned model replaces the
# PEFT wrapper, so what gets saved below is a plain checkpoint.
print("Merging...")
model = model.merge_and_unload()

# Save model and tokenizer side by side: the GGUF converter reads both from
# the same directory.
print(f"Saving merged model to {MERGED_DIR}...")
model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

# NOTE: llama-cpp-python does not provide a `llama_cpp.convert` module, so the
# instructions point at llama.cpp's own convert_hf_to_gguf.py script instead.
print(f"""
Done! Now convert to GGUF with llama.cpp's convert script:

  git clone https://github.com/ggml-org/llama.cpp
  pip install -r llama.cpp/requirements.txt
  python llama.cpp/convert_hf_to_gguf.py {MERGED_DIR} --outfile adapter-q8.gguf --outtype q8_0

Or, if llama.cpp is already checked out at /opt/llama.cpp:
  python /opt/llama.cpp/convert_hf_to_gguf.py {MERGED_DIR} --outfile adapter-q8.gguf --outtype q8_0
""")