#!/usr/bin/env python3
"""
Merge Codette LoRA Adapter with Base Model - v2
Merges the HuggingFace PEFT adapter into the base model, either in HF format
via PEFT's merge_and_unload (Path A) or in GGUF format via llama.cpp's
llama-export-lora tool (Path B).

Run this AFTER training completes and the adapter is on HuggingFace.

Two paths:
  A) HuggingFace format → merged safetensors (for further conversion)
  B) GGUF base + GGUF LoRA → merged GGUF (if you have GGUF versions of both)

Usage:
  $env:HF_TOKEN = "your_token"
  python merge_lora_adapter.py
"""

import os
import sys
import subprocess
from pathlib import Path

# ── Config ─────────────────────────────────────────────────────────────────
HF_TOKEN      = os.environ.get("HF_TOKEN", "")

# Path A: Merge HuggingFace adapter (use this after training completes)
BASE_MODEL_HF  = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_REPO   = "Raiff1982/codette-llama-adapter"
MERGED_HF_DIR  = Path("J:/TheAI/models/codette-v2-merged")

# Path B: Merge GGUF LoRA into GGUF base (use if you have GGUF-format LoRA)
BASE_GGUF      = Path("J:/TheAI/models/codette-v2-gguf/codette-v2.gguf")
LORA_GGUF      = Path("J:/TheAI/models/codette-rc-xi-lora.bin")
OUTPUT_GGUF    = Path("J:/TheAI/models/codette-v2-merged.gguf")
LLAMA_TOOL     = Path("J:/TheAI/llama.cpp/build/bin/Release/llama-export-lora.exe")

# ── Validate token ──────────────────────────────────────────────────────────
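# Strictly speaking only Path A uses the token (it downloads the base model and
# adapter from HuggingFace); the check runs up front for both paths so a
# missing token fails fast rather than mid-run.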
if not HF_TOKEN:
    print("[!] HF_TOKEN not set. Run:")
    print('    $env:HF_TOKEN = "your_token_here"')
    sys.exit(1)

print("=" * 80)
print("MERGE CODETTE v2 LORA ADAPTER WITH BASE MODEL")
print("=" * 80)
print()
print("Select merge path:")
print("  A) HuggingFace format (PEFT adapter + HF base β†’ merged safetensors)")
print("  B) GGUF format (GGUF base + GGUF LoRA β†’ merged GGUF)")
print()

choice = input("Enter A or B [default: A]: ").strip().upper() or "A"

# ── Path A: HuggingFace PEFT merge ─────────────────────────────────────────
if choice == "A":
    print()
    print("=" * 60)
    print("PATH A: HuggingFace PEFT Merge")
    print("=" * 60)

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    MERGED_HF_DIR.mkdir(parents=True, exist_ok=True)

    print(f"[*] Loading tokenizer: {BASE_MODEL_HF}")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_HF, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
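        # The Llama 3.2 tokenizer does not define a pad token by default;
        # reusing EOS is the usual fallback so the saved tokenizer config
        # is complete for downstream tools.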

    print(f"[*] Loading base model: {BASE_MODEL_HF}")
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_HF,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
    )

    print(f"[*] Loading LoRA adapter: {ADAPTER_REPO}")
    model = PeftModel.from_pretrained(base, ADAPTER_REPO, token=HF_TOKEN)

    print("[*] Merging and unloading LoRA weights...")
    model = model.merge_and_unload()
    model = model.to(torch.float16)

    print(f"[*] Saving merged model to {MERGED_HF_DIR}")
    model.save_pretrained(MERGED_HF_DIR, safe_serialization=True)
    tokenizer.save_pretrained(MERGED_HF_DIR)
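
    # Optional smoke test (uncomment to run; on CPU consider model.float() first):
    # generates a short completion from the merged weights already in memory to
    # confirm the merge produced a usable model. Prompt below is just an example.
    # inputs = tokenizer("Hello, Codette.", return_tensors="pt").to(model.device)
    # out = model.generate(**inputs, max_new_tokens=32)
    # print(tokenizer.decode(out[0], skip_special_tokens=True))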

    size_gb = sum(f.stat().st_size for f in MERGED_HF_DIR.rglob("*") if f.is_file()) / (1024**3)
    print(f"[βœ“] Merged model saved β€” {size_gb:.2f} GB")
    print()
    print("[*] Next step β€” convert to GGUF:")
    print(f"    python J:/TheAI/llama.cpp/convert_hf_to_gguf.py {MERGED_HF_DIR} --outfile J:/TheAI/models/codette-v2-gguf/codette-v2.gguf --outtype q8_0")
    print()
    print("[*] Or run make_codette_gguf.py which does all steps automatically.")

# ── Path B: GGUF LoRA merge ─────────────────────────────────────────────────
elif choice == "B":
    print()
    print("=" * 60)
    print("PATH B: GGUF LoRA Merge")
    print("=" * 60)

    print("[*] Checking required files...")

    if not BASE_GGUF.exists():
        print(f"[!] Base GGUF not found: {BASE_GGUF}")
        print("[!] Run make_codette_gguf.py first to create the base GGUF.")
        sys.exit(1)
    print(f"[βœ“] Base GGUF: {BASE_GGUF.stat().st_size / (1024**3):.2f} GB")

    if not LORA_GGUF.exists():
        print(f"[!] LoRA GGUF not found: {LORA_GGUF}")
        print("[!] Note: HuggingFace PEFT adapters are not GGUF format.")
        print("[!] Use Path A to merge the HuggingFace adapter, then convert the result.")
        sys.exit(1)
    print(f"[βœ“] LoRA GGUF: {LORA_GGUF.stat().st_size / (1024**2):.2f} MB")

    if not LLAMA_TOOL.exists():
        print(f"[!] Merge tool not found: {LLAMA_TOOL}")
        print("[!] Build llama.cpp first:")
        print("    cd J:/TheAI/llama.cpp")
        print("    cmake -B build && cmake --build build --config Release")
        sys.exit(1)
    print(f"[βœ“] Merge tool found")

    OUTPUT_GGUF.parent.mkdir(parents=True, exist_ok=True)

    print()
    print(f"[*] Merging {BASE_GGUF.name} + {LORA_GGUF.name}")
    print(f"[*] Output: {OUTPUT_GGUF}")
    print()

    cmd = [
        str(LLAMA_TOOL),
        "--model", str(BASE_GGUF),
        "--lora", str(LORA_GGUF),
        "--output", str(OUTPUT_GGUF),
    ]
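
    # llama-export-lora also accepts --lora-scaled FNAME SCALE to blend the
    # adapter at reduced strength; check `llama-export-lora --help` in your
    # build, as flags have shifted between llama.cpp versions.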

    result = subprocess.run(cmd, cwd="J:/TheAI")

    if result.returncode == 0 and OUTPUT_GGUF.exists():
        size_gb = OUTPUT_GGUF.stat().st_size / (1024**3)
        print(f"[βœ“] Merge complete: {OUTPUT_GGUF} ({size_gb:.2f} GB)")
        print()
        print("[*] Create Ollama model:")
        print(f"    ollama create codette-v2 -f J:/TheAI/models/codette-v2-gguf/Modelfile")
        print()
        print("[*] Or load directly in llama.cpp:")
        print(f"    llama-cli.exe -m {OUTPUT_GGUF} -p 'Your prompt here'")
    else:
        print("[!] Merge failed or output not created.")
        print()
        print("[*] Alternative β€” load LoRA separately at inference time:")
        print(f"    llama-cli.exe -m {BASE_GGUF} --lora {LORA_GGUF}")

else:
    print(f"[!] Unknown choice: {choice}")
    sys.exit(1)