| """Merge LoRA adapter and export to GGUF for llama.cpp.""" |
| import torch |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| from peft import PeftModel |
|
|
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
ADAPTER_DIR = "./adapter-model"   # LoRA adapter produced by PEFT fine-tuning
MERGED_DIR = "./merged-model"     # output: plain HF checkpoint with LoRA folded in


print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# fp16 on CPU: merge_and_unload only does weight arithmetic, so this is fine
# and halves peak memory versus fp32.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="cpu", trust_remote_code=True
)


print("Loading adapter...")
model = PeftModel.from_pretrained(model, ADAPTER_DIR)


print("Merging...")
# Folds the LoRA deltas into the base weights and strips the PEFT wrappers,
# leaving a plain transformers model that converters can read directly.
model = model.merge_and_unload()


print(f"Saving merged model to {MERGED_DIR}...")
model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)


# NOTE(fix): llama-cpp-python does not provide a `llama_cpp.convert` module —
# HF -> GGUF conversion is done with llama.cpp's convert_hf_to_gguf.py script,
# which has its own Python requirements. Also renamed the suggested outfile:
# the export is the fully merged model, not just the adapter.
print(f"""
Done! Now convert to GGUF with llama.cpp's conversion script:

    git clone https://github.com/ggerganov/llama.cpp
    pip install -r llama.cpp/requirements.txt
    python llama.cpp/convert_hf_to_gguf.py {MERGED_DIR} --outfile merged-q8.gguf --outtype q8_0
""")
|
|