#!/usr/bin/env python3
"""
Merge Codette LoRA Adapter with Base Model — v2

Merges the HuggingFace PEFT adapter into the base model, either with PEFT's
merge_and_unload (Path A) or with llama.cpp's llama-export-lora tool (Path B).
Run this AFTER training completes and the adapter is on HuggingFace.

Two paths:
  A) HuggingFace format → merged safetensors (for further conversion)
  B) GGUF base + GGUF LoRA → merged GGUF (if you have GGUF versions of both)

Usage:
    $env:HF_TOKEN = "your_token"
    python merge_lora_adapter.py
"""

import os
import sys
import subprocess
from pathlib import Path

# ── Config ─────────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Path A: Merge HuggingFace adapter (use this after training completes)
BASE_MODEL_HF = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_REPO = "Raiff1982/codette-llama-adapter"
MERGED_HF_DIR = Path("J:/TheAI/models/codette-v2-merged")

# Path B: Merge GGUF LoRA into GGUF base (use if you have GGUF-format LoRA)
BASE_GGUF = Path("J:/TheAI/models/codette-v2-gguf/codette-v2.gguf")
LORA_GGUF = Path("J:/TheAI/models/codette-rc-xi-lora.bin")
OUTPUT_GGUF = Path("J:/TheAI/models/codette-v2-merged.gguf")
LLAMA_TOOL = Path("J:/TheAI/llama.cpp/build/bin/Release/llama-export-lora.exe")

# ── Validate token ──────────────────────────────────────────────────────────
if not HF_TOKEN:
    print("[!] HF_TOKEN not set. Run:")
    print('    $env:HF_TOKEN = "your_token_here"')
    sys.exit(1)

print("=" * 80)
print("MERGE CODETTE v2 LORA ADAPTER WITH BASE MODEL")
print("=" * 80)
print()
print("Select merge path:")
print("  A) HuggingFace format (PEFT adapter + HF base → merged safetensors)")
print("  B) GGUF format (GGUF base + GGUF LoRA → merged GGUF)")
print()

choice = input("Enter A or B [default: A]: ").strip().upper() or "A"

# ── Path A: HuggingFace PEFT merge ─────────────────────────────────────────
if choice == "A":
    print()
    print("=" * 60)
    print("PATH A: HuggingFace PEFT Merge")
    print("=" * 60)

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    MERGED_HF_DIR.mkdir(parents=True, exist_ok=True)

    print(f"[*] Loading tokenizer: {BASE_MODEL_HF}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_HF, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"[*] Loading base model: {BASE_MODEL_HF}")
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_HF,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
    )

    print(f"[*] Loading LoRA adapter: {ADAPTER_REPO}")
    model = PeftModel.from_pretrained(base, ADAPTER_REPO, token=HF_TOKEN)

    print("[*] Merging and unloading LoRA weights...")
    # Bake the LoRA deltas into the base weights and drop the PEFT wrappers.
    model = model.merge_and_unload()
    model = model.to(torch.float16)

    print(f"[*] Saving merged model to {MERGED_HF_DIR}")
    model.save_pretrained(MERGED_HF_DIR, safe_serialization=True)
    tokenizer.save_pretrained(MERGED_HF_DIR)

    size_gb = sum(f.stat().st_size for f in MERGED_HF_DIR.rglob("*") if f.is_file()) / (1024**3)
    print(f"[✓] Merged model saved — {size_gb:.2f} GB")
    print()
    print("[*] Next step — convert to GGUF:")
    print(f"    python J:/TheAI/llama.cpp/convert_hf_to_gguf.py {MERGED_HF_DIR} --outfile J:/TheAI/models/codette-v2-gguf/codette-v2.gguf --outtype q8_0")
    print()
    print("[*] Or run make_codette_gguf.py which does all steps automatically.")

# ── Path B: GGUF LoRA merge ─────────────────────────────────────────────────
elif choice == "B":
    print()
    print("=" * 60)
    print("PATH B: GGUF LoRA Merge")
    print("=" * 60)

    print("[*] Checking required files...")

    if not BASE_GGUF.exists():
        print(f"[!] Base GGUF not found: {BASE_GGUF}")
        print("[!] Run make_codette_gguf.py first to create the base GGUF.")
        sys.exit(1)
    print(f"[✓] Base GGUF: {BASE_GGUF.stat().st_size / (1024**3):.2f} GB")

    if not LORA_GGUF.exists():
        print(f"[!] LoRA GGUF not found: {LORA_GGUF}")
        print("[!] Note: HuggingFace PEFT adapters are not GGUF format.")
        print("[!] Use Path A to merge the HuggingFace adapter, then convert the result.")
        sys.exit(1)
    print(f"[✓] LoRA GGUF: {LORA_GGUF.stat().st_size / (1024**2):.2f} MB")

    if not LLAMA_TOOL.exists():
        print(f"[!] Merge tool not found: {LLAMA_TOOL}")
        print("[!] Build llama.cpp first:")
        print("    cd J:/TheAI/llama.cpp")
        print("    cmake -B build && cmake --build build --config Release")
        sys.exit(1)
    print("[✓] Merge tool found")

    OUTPUT_GGUF.parent.mkdir(parents=True, exist_ok=True)

    print()
    print(f"[*] Merging {BASE_GGUF.name} + {LORA_GGUF.name}")
    print(f"[*] Output: {OUTPUT_GGUF}")
    print()

    # llama-export-lora applies the GGUF LoRA to the base GGUF and writes a merged file.
    cmd = [
        str(LLAMA_TOOL),
        "--model", str(BASE_GGUF),
        "--lora", str(LORA_GGUF),
        "--output", str(OUTPUT_GGUF),
    ]
    result = subprocess.run(cmd, cwd="J:/TheAI")

    if result.returncode == 0 and OUTPUT_GGUF.exists():
        size_gb = OUTPUT_GGUF.stat().st_size / (1024**3)
        print(f"[✓] Merge complete: {OUTPUT_GGUF} ({size_gb:.2f} GB)")
        print()
        print("[*] Create Ollama model:")
        print("    ollama create codette-v2 -f J:/TheAI/models/codette-v2-gguf/Modelfile")
        print()
        print("[*] Or load directly in llama.cpp:")
        print(f"    llama-cli.exe -m {OUTPUT_GGUF} -p 'Your prompt here'")
    else:
        print("[!] Merge failed or output not created.")
        print()
        print("[*] Alternative — load LoRA separately at inference time:")
        print(f"    llama-cli.exe -m {BASE_GGUF} --lora {LORA_GGUF}")

else:
    print(f"[!] Unknown choice: {choice}")
    sys.exit(1)
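
# ── Optional: sanity-check the merged model (Path A) ────────────────────────
# A minimal sketch, left commented out so it never runs as part of the merge.
# It assumes MERGED_HF_DIR was populated by Path A above; the prompt and
# generation settings are placeholders, adjust to taste.
#
#   import torch
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tok = AutoTokenizer.from_pretrained(MERGED_HF_DIR)
#   mdl = AutoModelForCausalLM.from_pretrained(MERGED_HF_DIR, torch_dtype=torch.float16)
#   inputs = tok("Hello, Codette.", return_tensors="pt")
#   out = mdl.generate(**inputs, max_new_tokens=32)
#   print(tok.decode(out[0], skip_special_tokens=True))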