"""Merge LoRA adapter and export to GGUF for llama.cpp.""" import torch from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct" ADAPTER_DIR = "./adapter-model" MERGED_DIR = "./merged-model" print("Loading base model...") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.float16, device_map="cpu", trust_remote_code=True ) print("Loading adapter...") model = PeftModel.from_pretrained(model, ADAPTER_DIR) print("Merging...") model = model.merge_and_unload() print(f"Saving merged model to {MERGED_DIR}...") model.save_pretrained(MERGED_DIR) tokenizer.save_pretrained(MERGED_DIR) print(f""" Done! Now convert to GGUF: pip install llama-cpp-python python -m llama_cpp.convert {MERGED_DIR} --outfile adapter-q8.gguf --outtype q8_0 Or use llama.cpp's convert script: python /opt/llama.cpp/convert_hf_to_gguf.py {MERGED_DIR} --outfile adapter-q8.gguf --outtype q8_0 """)