File size: 12,070 Bytes
58392d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97d584e
 
 
58392d5
 
97d584e
 
 
 
 
 
 
58392d5
97d584e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58392d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97d584e
58392d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
"""
export_gguf_windows.py β€” Merge LoRA adapters and export to GGUF on Windows.

Pipeline:
  1. Load base model + LoRA adapters via Unsloth
  2. Merge LoRA into weights, save 16-bit safetensors (HF format)
  3. Download convert_hf_to_gguf.py from llama.cpp (if not cached)
  4. Convert merged model β†’ F16 GGUF
  5. Quantize F16 GGUF β†’ Q4_K_M via llama_cpp.llama_model_quantize
  6. Update Modelfile to point at the Q4_K_M GGUF

Usage (from project root):
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 7b
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 0.5b --push
"""

from __future__ import annotations

import sys
import io
import os

# Windows consoles often use a legacy code page (e.g. cp1252); re-wrap the
# std streams as UTF-8 so the arrows/em-dashes printed below don't raise
# UnicodeEncodeError ("replace" degrades gracefully on anything unmappable).
if sys.platform == "win32":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Disable torch.compile/inductor BEFORE torch gets imported (via unsloth below).
# NOTE(review): presumably works around inductor problems on this Windows box — confirm.
os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1")
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")

# Unsloth must be first
import unsloth  # noqa: F401
import transformers.utils.hub
import transformers.tokenization_utils_base
# Stub transformers' list_repo_templates with a no-op returning an empty list.
# NOTE(review): looks like a workaround to keep transformers from querying the
# HF Hub for chat templates (offline / rate-limit safety) — verify intent.
_noop = lambda *a, **kw: []
transformers.tokenization_utils_base.list_repo_templates = _noop
transformers.utils.hub.list_repo_templates = _noop

import argparse
import subprocess
import urllib.request  # NOTE(review): unused in this file (repo is git-cloned, not downloaded) — candidate for removal
from pathlib import Path

# ── Args ───────────────────────────────────────────────────────────────────────
# CLI for the export pipeline. `args` is consumed by every step below.
_cli = argparse.ArgumentParser(description="Merge LoRA + export GGUF on Windows")
_cli.add_argument(
    "--model",
    default="7b",
    choices=["0.5b", "1.5b", "3b", "7b", "8b"],
    help="Which fine-tuned model to export (default: 7b)",
)
_cli.add_argument(
    "--quant",
    default="q4_k_m",
    choices=["f16", "q4_k_m", "q5_k_m", "q8_0"],
    help="Output quantisation (default: q4_k_m)",
)
_cli.add_argument("--push", action="store_true",
                  help="Push GGUF to HF Hub after export")
_cli.add_argument("--skip-merge", action="store_true",
                  help="Skip merge if merged/ dir already exists")
_cli.add_argument("--skip-quant", action="store_true",
                  help="Skip quantisation, keep F16 GGUF only")
args = _cli.parse_args()

# ── Model profile lookup ──────────────────────────────────────────────────────
# Per-size export profiles: base checkpoint, target HF repo, and the
# max sequence length to use when reloading the adapters.
_PROFILES = {
    "0.5b": {
        "base_id": "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-q4",
        "seq_len": 2048,
    },
    "1.5b": {
        "base_id": "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-1b5",
        "seq_len": 2048,
    },
    "3b": {
        "base_id": "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-3b",
        "seq_len": 2048,
    },
    "7b": {
        "base_id": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-7b",
        "seq_len": 512,
    },
    "8b": {
        "base_id": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-8b",
        "seq_len": 512,
    },
}
PROFILE = _PROFILES[args.model]
HF_REPO = PROFILE["hf_repo"]
SEQ_LEN = PROFILE["seq_len"]

# Directory layout: data/training/<size>/{lora_adapters,merged,gguf}
TRAIN_DIR = Path("data/training")
MODEL_DIR = TRAIN_DIR / args.model
LORA_DIR = MODEL_DIR / "lora_adapters"
MERGED_DIR = MODEL_DIR / "merged"
GGUF_DIR = MODEL_DIR / "gguf"
CONVERT_CACHE = TRAIN_DIR / "_llama_convert"  # shared cache for the convert script

for _out_dir in (GGUF_DIR, CONVERT_CACHE):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Guard: adapters must exist (and be non-empty) before anything else runs.
_adapters_present = LORA_DIR.exists() and any(LORA_DIR.iterdir())
if not _adapters_present:
    print(f"[ERROR] No LoRA adapters found at {LORA_DIR}")
    print(f"  Run: python scripts/finetune_local.py --base-model {args.model}")
    sys.exit(1)

# ── Step 1: Merge LoRA → 16-bit safetensors ──────────────────────────────────
print(f"\n=== Step 1: Merge LoRA adapters ({args.model}) ===")

# Reuse an earlier merge only when explicitly requested AND output is present.
_merge_cached = (
    args.skip_merge
    and MERGED_DIR.exists()
    and any(MERGED_DIR.glob("*.safetensors"))
)

if _merge_cached:
    print(f"  Skipping merge — {MERGED_DIR} already exists.")
else:
    from unsloth import FastLanguageModel

    print(f"  Loading {LORA_DIR} ...")
    # Load base + adapters in 4-bit to keep VRAM usage down during the merge.
    peft_model, tok = FastLanguageModel.from_pretrained(
        model_name=str(LORA_DIR),
        max_seq_length=SEQ_LEN,
        dtype=None,
        load_in_4bit=True,
    )

    print(f"  Merging LoRA and saving 16-bit weights to {MERGED_DIR} ...")
    # merged_16bit folds the adapters into full-precision weights (HF format),
    # which is what the llama.cpp converter expects.
    peft_model.save_pretrained_merged(str(MERGED_DIR), tok, save_method="merged_16bit")
    print("  Merged model saved.")

# ── Step 2: Clone/update llama.cpp repo (shallow) ────────────────────────────
# The full repo is cloned so the convert script runs against its own bundled
# gguf-py, which is always in sync with the script (the PyPI gguf package
# lags behind llama.cpp master).
print(f"\n=== Step 2: Prepare llama.cpp convert script ===")

LLAMA_REPO    = CONVERT_CACHE / "llama.cpp"
CONVERT_SCRIPT = LLAMA_REPO / "convert_hf_to_gguf.py"
LLAMA_GGUF_PY  = LLAMA_REPO / "gguf-py"

_repo_cached = LLAMA_REPO.exists() and CONVERT_SCRIPT.exists()
if _repo_cached:
    print(f"  Repo cached at {LLAMA_REPO} — pulling latest ...")
    # Best-effort update: a failed pull (offline, diverged) keeps the cached copy.
    subprocess.run(["git", "-C", str(LLAMA_REPO), "pull", "--ff-only", "-q"], check=False)
else:
    print(f"  Cloning llama.cpp (shallow) into {LLAMA_REPO} ...")
    clone_cmd = [
        "git", "clone", "--depth=1", "--filter=blob:none",
        "https://github.com/ggml-org/llama.cpp.git",
        str(LLAMA_REPO),
    ]
    subprocess.check_call(clone_cmd)
    print(f"  Installing llama.cpp gguf-py + convert dependencies ...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", str(LLAMA_GGUF_PY)]
    )
    _reqs_file = LLAMA_REPO / "requirements" / "requirements-convert_hf_to_gguf.txt"
    if _reqs_file.exists():
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-q", "-r", str(_reqs_file)]
        )

# Prepend llama.cpp's gguf-py to PYTHONPATH so the convert subprocess picks it
# up ahead of any PyPI-installed gguf.
_convert_env = os.environ.copy()
_convert_env["PYTHONPATH"] = str(LLAMA_GGUF_PY / "src") + os.pathsep + _convert_env.get("PYTHONPATH", "")

print(f"  Convert script: {CONVERT_SCRIPT}")

# ── Step 3: Convert merged model → F16 GGUF ──────────────────────────────────
print(f"\n=== Step 3: Convert to F16 GGUF ===")

GGUF_F16 = GGUF_DIR / f"{args.model}-f16.gguf"

if not GGUF_F16.exists():
    # Run the converter as a subprocess so it sees the PYTHONPATH override.
    convert_cmd = [
        sys.executable,
        str(CONVERT_SCRIPT),
        str(MERGED_DIR),
        "--outfile", str(GGUF_F16),
        "--outtype", "f16",
    ]
    print(f"  Running: {' '.join(convert_cmd)}")
    proc = subprocess.run(convert_cmd, capture_output=False, env=_convert_env)
    if proc.returncode != 0:
        print(f"[ERROR] Conversion failed (exit {proc.returncode})")
        sys.exit(1)
    print(f"  F16 GGUF: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)")
else:
    print(f"  Already exists: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)")

# ── Step 4: Quantise F16 → Q4_K_M (or other) ─────────────────────────────────
# Values are the llama_ftype enum from llama.cpp's llama.h; they must match
# the llama_cpp build that performs the quantisation.
QUANT_TYPE_MAP = {
    "f16":    1,   # LLAMA_FTYPE_MOSTLY_F16 (bugfix: was 4, the removed Q4_1_SOME_F16 slot)
    "q8_0":   7,   # LLAMA_FTYPE_MOSTLY_Q8_0
    "q4_k_m": 15,  # LLAMA_FTYPE_MOSTLY_Q4_K_M
    "q5_k_m": 17,  # LLAMA_FTYPE_MOSTLY_Q5_K_M
}

if args.skip_quant or args.quant == "f16":
    # No quantisation requested — the F16 file from Step 3 is the final artifact.
    GGUF_FINAL = GGUF_F16
    print(f"\n=== Step 4: Skipping quantisation (using F16) ===")
else:
    print(f"\n=== Step 4: Quantise → {args.quant.upper()} ===")
    GGUF_FINAL = GGUF_DIR / f"{args.model}-{args.quant}.gguf"

    if GGUF_FINAL.exists():
        print(f"  Already exists: {GGUF_FINAL} ({GGUF_FINAL.stat().st_size / 1e6:.0f} MB)")
    else:
        # Imported lazily: llama_cpp is only needed on this path.
        import ctypes
        import llama_cpp

        ftype = QUANT_TYPE_MAP[args.quant]
        params = llama_cpp.llama_model_quantize_default_params()
        params.ftype = ftype
        params.nthread = 4
        params.allow_requantize = False  # refuse to re-quantise an already-quantised input

        print(f"  Quantising {GGUF_F16.name} → {GGUF_FINAL.name} ...")
        # C API: llama_model_quantize(in_path, out_path, *params) → 0 on success.
        ret = llama_cpp.llama_model_quantize(
            str(GGUF_F16).encode(),
            str(GGUF_FINAL).encode(),
            ctypes.byref(params),
        )
        if ret != 0:
            print(f"[ERROR] Quantisation failed (return code {ret})")
            sys.exit(1)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f"  {args.quant.upper()} GGUF: {GGUF_FINAL} ({mb:.0f} MB)")

# ── Step 5: Update Modelfile ──────────────────────────────────────────────────
print(f"\n=== Step 5: Update Modelfile ===")

modelfile_path = Path("Modelfile")
if modelfile_path.exists():
    content = modelfile_path.read_text(encoding="utf-8")
    # Comment out any existing FROM lines, then insert real one at top of FROM block
    gguf_rel = GGUF_FINAL.as_posix()   # forward slashes work in Modelfile on Windows
    new_from  = f"FROM ./{gguf_rel}"

    lines = content.splitlines()
    updated = []
    inserted = False
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("FROM ") and not stripped.startswith("#"):
            # Comment out old FROM
            updated.append(f"#{line}")
            if not inserted:
                updated.append(new_from)
                inserted = True
        else:
            updated.append(line)
    if not inserted:
        updated.insert(0, new_from)

    modelfile_path.write_text("\n".join(updated) + "\n", encoding="utf-8")
    print(f"  Modelfile updated: FROM β†’ ./{gguf_rel}")
else:
    print(f"  [WARN] Modelfile not found β€” skipping update")

# ── Step 6: Push GGUF to HF Hub ──────────────────────────────────────────────
if args.push:
    print(f"\n=== Step 6: Push GGUF to {HF_REPO} ===")
    try:
        from dotenv import load_dotenv; load_dotenv()
    except ImportError:
        pass
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_TOKEN:
        env_file = Path(".env")
        if env_file.exists():
            for line in env_file.read_text().splitlines():
                if line.startswith("HF_TOKEN="):
                    HF_TOKEN = line.split("=", 1)[1].strip().strip('"')

    if not HF_TOKEN:
        print("  [WARN] No HF_TOKEN β€” skipping push. Set HF_TOKEN in .env or env var.")
    else:
        from huggingface_hub import login, HfApi
        login(token=HF_TOKEN, add_to_git_credential=False)
        api = HfApi()
        api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f"  Uploading {GGUF_FINAL.name} ({mb:.0f} MB)...")
        api.upload_file(
            path_or_fileobj = str(GGUF_FINAL),
            path_in_repo    = GGUF_FINAL.name,
            repo_id         = HF_REPO,
            repo_type       = "model",
        )
        print(f"  Done: https://huggingface.co/{HF_REPO}/blob/main/{GGUF_FINAL.name}")

# ── Done ──────────────────────────────────────────────────────────────────────
# Final summary. GGUF_FINAL is either the F16 file (with --skip-quant or
# --quant f16) or the quantised file produced in Step 4.
print(f"""
=== Export complete ===
GGUF : {GGUF_FINAL}
Size : {GGUF_FINAL.stat().st_size / 1e6:.0f} MB

To use with Ollama:
  ollama create soci-agent -f Modelfile
  ollama run soci-agent

Or for {args.model}:
  ollama create soci-agent-{args.model} -f Modelfile
  set OLLAMA_MODEL=soci-agent-{args.model}
  set SOCI_PROVIDER=ollama
""")