| | """ |
| | Merge Codette LoRA Adapter with Base Model β v2 |
| | Merges HuggingFace PEFT adapter into base model using llama.cpp's export tool. |
| | |
| | Run this AFTER training completes and the adapter is on HuggingFace. |
| | |
| | Two paths: |
| | A) HuggingFace format β merged safetensors (for further conversion) |
| | B) GGUF base + GGUF LoRA β merged GGUF (if you have GGUF versions of both) |
| | |
| | Usage: |
| | $env:HF_TOKEN = "your_token" |
| | python merge_lora_adapter.py |
| | """ |

import os
import sys
import subprocess
from pathlib import Path

HF_TOKEN = os.environ.get("HF_TOKEN", "")
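
# Path A (HuggingFace) settings: base model, adapter repo, and merged output directory.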
BASE_MODEL_HF = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_REPO = "Raiff1982/codette-llama-adapter"
MERGED_HF_DIR = Path("J:/TheAI/models/codette-v2-merged")
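
# Path B (GGUF) settings: GGUF base, GGUF LoRA, merged output, and the llama.cpp export tool.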
BASE_GGUF = Path("J:/TheAI/models/codette-v2-gguf/codette-v2.gguf")
LORA_GGUF = Path("J:/TheAI/models/codette-rc-xi-lora.bin")
OUTPUT_GGUF = Path("J:/TheAI/models/codette-v2-merged.gguf")
LLAMA_TOOL = Path("J:/TheAI/llama.cpp/build/bin/Release/llama-export-lora.exe")
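
# Require an HF token up front; Path A downloads the base model and adapter from HuggingFace.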
if not HF_TOKEN:
    print("[!] HF_TOKEN not set. Run:")
    print('    $env:HF_TOKEN = "your_token_here"')
    sys.exit(1)

| | print("=" * 80) |
| | print("MERGE CODETTE v2 LORA ADAPTER WITH BASE MODEL") |
| | print("=" * 80) |
| | print() |
| | print("Select merge path:") |
| | print(" A) HuggingFace format (PEFT adapter + HF base β merged safetensors)") |
| | print(" B) GGUF format (GGUF base + GGUF LoRA β merged GGUF)") |
| | print() |
| |
|
choice = input("Enter A or B [default: A]: ").strip().upper() or "A"

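# Path A: merge the PEFT adapter into the HF base model and save merged safetensors.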
| | if choice == "A": |
| | print() |
| | print("=" * 60) |
| | print("PATH A: HuggingFace PEFT Merge") |
| | print("=" * 60) |
| |
|
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    MERGED_HF_DIR.mkdir(parents=True, exist_ok=True)

| | print(f"[*] Loading tokenizer: {BASE_MODEL_HF}") |
| | tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_HF, token=HF_TOKEN) |
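    # Ensure a pad token is set (the Llama tokenizer may not define one); fall back to EOS.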
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

| | print(f"[*] Loading base model: {BASE_MODEL_HF}") |
| | base = AutoModelForCausalLM.from_pretrained( |
| | BASE_MODEL_HF, |
| | torch_dtype=torch.float16, |
| | low_cpu_mem_usage=True, |
| | token=HF_TOKEN, |
| | ) |
| |
|
| | print(f"[*] Loading LoRA adapter: {ADAPTER_REPO}") |
| | model = PeftModel.from_pretrained(base, ADAPTER_REPO, token=HF_TOKEN) |
| |
|
| | print("[*] Merging and unloading LoRA weights...") |
| | model = model.merge_and_unload() |
| | model = model.to(torch.float16) |
| |
|
| | print(f"[*] Saving merged model to {MERGED_HF_DIR}") |
| | model.save_pretrained(MERGED_HF_DIR, safe_serialization=True) |
| | tokenizer.save_pretrained(MERGED_HF_DIR) |
| |
|
    size_gb = sum(f.stat().st_size for f in MERGED_HF_DIR.rglob("*") if f.is_file()) / (1024**3)
    print(f"[✓] Merged model saved ({size_gb:.2f} GB)")
    print()
    print("[*] Next step → convert to GGUF:")
    print(f"    python J:/TheAI/llama.cpp/convert_hf_to_gguf.py {MERGED_HF_DIR} --outfile J:/TheAI/models/codette-v2-gguf/codette-v2.gguf --outtype q8_0")
    print()
    print("[*] Or run make_codette_gguf.py, which does all steps automatically.")
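
# Path B: bake a GGUF LoRA into a GGUF base with llama.cpp's llama-export-lora tool.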
| | elif choice == "B": |
| | print() |
| | print("=" * 60) |
| | print("PATH B: GGUF LoRA Merge") |
| | print("=" * 60) |
| |
|
| | print("[*] Checking required files...") |
| |
|
    if not BASE_GGUF.exists():
        print(f"[!] Base GGUF not found: {BASE_GGUF}")
        print("[!] Run make_codette_gguf.py first to create the base GGUF.")
        sys.exit(1)
    print(f"[✓] Base GGUF: {BASE_GGUF.stat().st_size / (1024**3):.2f} GB")

    if not LORA_GGUF.exists():
        print(f"[!] LoRA GGUF not found: {LORA_GGUF}")
        print("[!] Note: HuggingFace PEFT adapters are not in GGUF format.")
        print("[!] Use Path A to merge the HuggingFace adapter, then convert the result.")
        sys.exit(1)
    print(f"[✓] LoRA GGUF: {LORA_GGUF.stat().st_size / (1024**2):.2f} MB")

    if not LLAMA_TOOL.exists():
        print(f"[!] Merge tool not found: {LLAMA_TOOL}")
        print("[!] Build llama.cpp first:")
        print("    cd J:/TheAI/llama.cpp")
        print("    cmake -B build && cmake --build build --config Release")
        sys.exit(1)
    print("[✓] Merge tool found")

    OUTPUT_GGUF.parent.mkdir(parents=True, exist_ok=True)

    print()
    print(f"[*] Merging {BASE_GGUF.name} + {LORA_GGUF.name}")
    print(f"[*] Output: {OUTPUT_GGUF}")
    print()

    cmd = [
        str(LLAMA_TOOL),
        "--model", str(BASE_GGUF),
        "--lora", str(LORA_GGUF),
        "--output", str(OUTPUT_GGUF),
    ]

    result = subprocess.run(cmd, cwd="J:/TheAI")

    if result.returncode == 0 and OUTPUT_GGUF.exists():
        size_gb = OUTPUT_GGUF.stat().st_size / (1024**3)
        print(f"[✓] Merge complete: {OUTPUT_GGUF} ({size_gb:.2f} GB)")
        print()
        print("[*] Create Ollama model:")
        print("    ollama create codette-v2 -f J:/TheAI/models/codette-v2-gguf/Modelfile")
        print()
        print("[*] Or load directly in llama.cpp:")
        print(f"    llama-cli.exe -m {OUTPUT_GGUF} -p 'Your prompt here'")
    else:
        print("[!] Merge failed or output not created.")
        print()
        print("[*] Alternative → load the LoRA separately at inference time:")
        print(f"    llama-cli.exe -m {BASE_GGUF} --lora {LORA_GGUF}")
else:
    print(f"[!] Unknown choice: {choice}")
    sys.exit(1)