""" export_gguf_windows.py — Merge LoRA adapters and export to GGUF on Windows. Pipeline: 1. Load base model + LoRA adapters via Unsloth 2. Merge LoRA into weights, save 16-bit safetensors (HF format) 3. Download convert_hf_to_gguf.py from llama.cpp (if not cached) 4. Convert merged model → F16 GGUF 5. Quantize F16 GGUF → Q4_K_M via llama_cpp.llama_model_quantize 6. Update Modelfile to point at the Q4_K_M GGUF Usage (from project root): "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 7b "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 0.5b --push """ from __future__ import annotations import sys import io import os if sys.platform == "win32": sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace") os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1") os.environ.setdefault("TORCH_COMPILE_DISABLE", "1") # Unsloth must be first import unsloth # noqa: F401 import transformers.utils.hub import transformers.tokenization_utils_base _noop = lambda *a, **kw: [] transformers.tokenization_utils_base.list_repo_templates = _noop transformers.utils.hub.list_repo_templates = _noop import argparse import subprocess import urllib.request from pathlib import Path # ── Args ─────────────────────────────────────────────────────────────────────── parser = argparse.ArgumentParser(description="Merge LoRA + export GGUF on Windows") parser.add_argument("--model", default="7b", choices=["0.5b","1.5b","3b","7b","8b"], help="Which fine-tuned model to export (default: 7b)") parser.add_argument("--quant", default="q4_k_m", choices=["f16","q4_k_m","q5_k_m","q8_0"], help="Output quantisation (default: q4_k_m)") parser.add_argument("--push", action="store_true", help="Push GGUF to HF Hub after export") parser.add_argument("--skip-merge", action="store_true", help="Skip merge if merged/ dir already exists") parser.add_argument("--skip-quant", action="store_true", help="Skip quantisation, keep F16 GGUF only") args = parser.parse_args() # ── Model profile lookup ────────────────────────────────────────────────────── _PROFILES = { "0.5b": dict(base_id="unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit", hf_repo="RayMelius/soci-agent-q4", seq_len=2048), "1.5b": dict(base_id="unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit", hf_repo="RayMelius/soci-agent-1b5", seq_len=2048), "3b": dict(base_id="unsloth/Qwen2.5-3B-Instruct-bnb-4bit", hf_repo="RayMelius/soci-agent-3b", seq_len=2048), "7b": dict(base_id="unsloth/Qwen2.5-7B-Instruct-bnb-4bit", hf_repo="RayMelius/soci-agent-7b", seq_len=512), "8b": dict(base_id="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", hf_repo="RayMelius/soci-agent-8b", seq_len=512), } PROFILE = _PROFILES[args.model] HF_REPO = PROFILE["hf_repo"] SEQ_LEN = PROFILE["seq_len"] TRAIN_DIR = Path("data/training") MODEL_DIR = TRAIN_DIR / args.model # e.g. data/training/7b/ LORA_DIR = MODEL_DIR / "lora_adapters" MERGED_DIR = MODEL_DIR / "merged" GGUF_DIR = MODEL_DIR / "gguf" CONVERT_CACHE = TRAIN_DIR / "_llama_convert" # shared cache for the convert script GGUF_DIR.mkdir(parents=True, exist_ok=True) CONVERT_CACHE.mkdir(parents=True, exist_ok=True) if not LORA_DIR.exists() or not any(LORA_DIR.iterdir()): print(f"[ERROR] No LoRA adapters found at {LORA_DIR}") print(f" Run: python scripts/finetune_local.py --base-model {args.model}") sys.exit(1) # ── Step 1: Merge LoRA → 16-bit safetensors ────────────────────────────────── print(f"\n=== Step 1: Merge LoRA adapters ({args.model}) ===") if args.skip_merge and MERGED_DIR.exists() and any(MERGED_DIR.glob("*.safetensors")): print(f" Skipping merge — {MERGED_DIR} already exists.") else: from unsloth import FastLanguageModel print(f" Loading {LORA_DIR} ...") model, tokenizer = FastLanguageModel.from_pretrained( model_name = str(LORA_DIR), max_seq_length = SEQ_LEN, dtype = None, load_in_4bit = True, ) print(f" Merging LoRA and saving 16-bit weights to {MERGED_DIR} ...") model.save_pretrained_merged( str(MERGED_DIR), tokenizer, save_method = "merged_16bit", ) print(f" Merged model saved.") # ── Step 2: Clone/update llama.cpp repo (shallow) ──────────────────────────── # We clone the full repo so the convert script uses its own bundled gguf-py, # which is always in sync with the script (PyPI gguf lags behind llama.cpp master). print(f"\n=== Step 2: Prepare llama.cpp convert script ===") LLAMA_REPO = CONVERT_CACHE / "llama.cpp" CONVERT_SCRIPT = LLAMA_REPO / "convert_hf_to_gguf.py" LLAMA_GGUF_PY = LLAMA_REPO / "gguf-py" if LLAMA_REPO.exists() and CONVERT_SCRIPT.exists(): print(f" Repo cached at {LLAMA_REPO} — pulling latest ...") subprocess.run(["git", "-C", str(LLAMA_REPO), "pull", "--ff-only", "-q"], check=False) else: print(f" Cloning llama.cpp (shallow) into {LLAMA_REPO} ...") subprocess.check_call([ "git", "clone", "--depth=1", "--filter=blob:none", "https://github.com/ggml-org/llama.cpp.git", str(LLAMA_REPO), ]) print(f" Installing llama.cpp gguf-py + convert dependencies ...") subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", str(LLAMA_GGUF_PY)]) reqs = LLAMA_REPO / "requirements" / "requirements-convert_hf_to_gguf.txt" if reqs.exists(): subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-r", str(reqs)]) # Build PYTHONPATH so convert script picks up llama.cpp's gguf-py over PyPI's _convert_env = os.environ.copy() _convert_env["PYTHONPATH"] = str(LLAMA_GGUF_PY / "src") + os.pathsep + _convert_env.get("PYTHONPATH", "") print(f" Convert script: {CONVERT_SCRIPT}") # ── Step 3: Convert merged model → F16 GGUF ────────────────────────────────── print(f"\n=== Step 3: Convert to F16 GGUF ===") GGUF_F16 = GGUF_DIR / f"{args.model}-f16.gguf" if GGUF_F16.exists(): print(f" Already exists: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)") else: cmd = [ sys.executable, str(CONVERT_SCRIPT), str(MERGED_DIR), "--outfile", str(GGUF_F16), "--outtype", "f16", ] print(f" Running: {' '.join(cmd)}") result = subprocess.run(cmd, capture_output=False, env=_convert_env) if result.returncode != 0: print(f"[ERROR] Conversion failed (exit {result.returncode})") sys.exit(1) print(f" F16 GGUF: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)") # ── Step 4: Quantise F16 → Q4_K_M (or other) ───────────────────────────────── QUANT_TYPE_MAP = { "f16": 4, # LLAMA_FTYPE_MOSTLY_F16 "q8_0": 7, # LLAMA_FTYPE_MOSTLY_Q8_0 "q4_k_m": 15, # LLAMA_FTYPE_MOSTLY_Q4_K_M "q5_k_m": 17, # LLAMA_FTYPE_MOSTLY_Q5_K_M } if args.skip_quant or args.quant == "f16": GGUF_FINAL = GGUF_F16 print(f"\n=== Step 4: Skipping quantisation (using F16) ===") else: print(f"\n=== Step 4: Quantise → {args.quant.upper()} ===") GGUF_FINAL = GGUF_DIR / f"{args.model}-{args.quant}.gguf" if GGUF_FINAL.exists(): print(f" Already exists: {GGUF_FINAL} ({GGUF_FINAL.stat().st_size / 1e6:.0f} MB)") else: import ctypes import llama_cpp ftype = QUANT_TYPE_MAP[args.quant] params = llama_cpp.llama_model_quantize_default_params() params.ftype = ftype params.nthread = 4 params.allow_requantize = False print(f" Quantising {GGUF_F16.name} → {GGUF_FINAL.name} ...") ret = llama_cpp.llama_model_quantize( str(GGUF_F16).encode(), str(GGUF_FINAL).encode(), ctypes.byref(params), ) if ret != 0: print(f"[ERROR] Quantisation failed (return code {ret})") sys.exit(1) mb = GGUF_FINAL.stat().st_size / 1e6 print(f" {args.quant.upper()} GGUF: {GGUF_FINAL} ({mb:.0f} MB)") # ── Step 5: Update Modelfile ────────────────────────────────────────────────── print(f"\n=== Step 5: Update Modelfile ===") modelfile_path = Path("Modelfile") if modelfile_path.exists(): content = modelfile_path.read_text(encoding="utf-8") # Comment out any existing FROM lines, then insert real one at top of FROM block gguf_rel = GGUF_FINAL.as_posix() # forward slashes work in Modelfile on Windows new_from = f"FROM ./{gguf_rel}" lines = content.splitlines() updated = [] inserted = False for line in lines: stripped = line.strip() if stripped.startswith("FROM ") and not stripped.startswith("#"): # Comment out old FROM updated.append(f"#{line}") if not inserted: updated.append(new_from) inserted = True else: updated.append(line) if not inserted: updated.insert(0, new_from) modelfile_path.write_text("\n".join(updated) + "\n", encoding="utf-8") print(f" Modelfile updated: FROM → ./{gguf_rel}") else: print(f" [WARN] Modelfile not found — skipping update") # ── Step 6: Push GGUF to HF Hub ────────────────────────────────────────────── if args.push: print(f"\n=== Step 6: Push GGUF to {HF_REPO} ===") try: from dotenv import load_dotenv; load_dotenv() except ImportError: pass HF_TOKEN = os.environ.get("HF_TOKEN", "") if not HF_TOKEN: env_file = Path(".env") if env_file.exists(): for line in env_file.read_text().splitlines(): if line.startswith("HF_TOKEN="): HF_TOKEN = line.split("=", 1)[1].strip().strip('"') if not HF_TOKEN: print(" [WARN] No HF_TOKEN — skipping push. Set HF_TOKEN in .env or env var.") else: from huggingface_hub import login, HfApi login(token=HF_TOKEN, add_to_git_credential=False) api = HfApi() api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True) mb = GGUF_FINAL.stat().st_size / 1e6 print(f" Uploading {GGUF_FINAL.name} ({mb:.0f} MB)...") api.upload_file( path_or_fileobj = str(GGUF_FINAL), path_in_repo = GGUF_FINAL.name, repo_id = HF_REPO, repo_type = "model", ) print(f" Done: https://huggingface.co/{HF_REPO}/blob/main/{GGUF_FINAL.name}") # ── Done ────────────────────────────────────────────────────────────────────── print(f""" === Export complete === GGUF : {GGUF_FINAL} Size : {GGUF_FINAL.stat().st_size / 1e6:.0f} MB To use with Ollama: ollama create soci-agent -f Modelfile ollama run soci-agent Or for {args.model}: ollama create soci-agent-{args.model} -f Modelfile set OLLAMA_MODEL=soci-agent-{args.model} set SOCI_PROVIDER=ollama """)