"""
export_gguf_windows.py — Merge LoRA adapters and export to GGUF on Windows.

Pipeline:
1. Load base model + LoRA adapters via Unsloth
2. Merge LoRA into weights, save 16-bit safetensors (HF format)
3. Download convert_hf_to_gguf.py from llama.cpp (if not cached)
4. Convert merged model → F16 GGUF
5. Quantize F16 GGUF → Q4_K_M via llama_cpp.llama_model_quantize
6. Update Modelfile to point at the Q4_K_M GGUF

Usage (from project root):
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 7b
    "C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 0.5b --push
"""

from __future__ import annotations

import sys
import io
import os

# Windows consoles often default to a legacy code page; rewrap stdout/stderr
# as UTF-8 so Unicode characters in progress messages cannot raise
# UnicodeEncodeError (errors="replace" keeps printing best-effort).
if sys.platform == "win32":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")

# Disable torch.compile / TorchInductor before torch is imported (via unsloth
# below) — not needed for a one-shot export job.
os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1")
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")

# Import unsloth first so it can apply its transformers patches, then replace
# list_repo_templates with a no-op returning an empty list.
# NOTE(review): this monkeypatches transformers internals — presumably to
# avoid a Hub network call during local export; confirm it is still needed
# (and that the attribute still exists) when upgrading transformers.
import unsloth
import transformers.utils.hub
import transformers.tokenization_utils_base
_noop = lambda *a, **kw: []
transformers.tokenization_utils_base.list_repo_templates = _noop
transformers.utils.hub.list_repo_templates = _noop

import argparse
import subprocess
import urllib.request  # NOTE(review): appears unused — the convert script is obtained via `git clone`, not a direct download
from pathlib import Path
| |
|
| | |
# ----------------------------------------------------------------- CLI -----
# Command-line interface: which model to export, target quantisation, and
# optional skip/push switches.
parser = argparse.ArgumentParser(description="Merge LoRA + export GGUF on Windows")
parser.add_argument(
    "--model",
    default="7b",
    choices=["0.5b", "1.5b", "3b", "7b", "8b"],
    help="Which fine-tuned model to export (default: 7b)",
)
parser.add_argument(
    "--quant",
    default="q4_k_m",
    choices=["f16", "q4_k_m", "q5_k_m", "q8_0"],
    help="Output quantisation (default: q4_k_m)",
)
parser.add_argument(
    "--push",
    action="store_true",
    help="Push GGUF to HF Hub after export",
)
parser.add_argument(
    "--skip-merge",
    action="store_true",
    help="Skip merge if merged/ dir already exists",
)
parser.add_argument(
    "--skip-quant",
    action="store_true",
    help="Skip quantisation, keep F16 GGUF only",
)
args = parser.parse_args()
| |
|
| | |
# Per-model-size export profiles: base checkpoint, target HF repo, and the
# max sequence length used when reloading the adapters for merging.
_PROFILES = {
    "0.5b": {
        "base_id": "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-q4",
        "seq_len": 2048,
    },
    "1.5b": {
        "base_id": "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-1b5",
        "seq_len": 2048,
    },
    "3b": {
        "base_id": "unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-3b",
        "seq_len": 2048,
    },
    "7b": {
        "base_id": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-7b",
        "seq_len": 512,
    },
    "8b": {
        "base_id": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        "hf_repo": "RayMelius/soci-agent-8b",
        "seq_len": 512,
    },
}
PROFILE = _PROFILES[args.model]
HF_REPO = PROFILE["hf_repo"]
SEQ_LEN = PROFILE["seq_len"]

# Directory layout under data/training/<model>/.
TRAIN_DIR = Path("data/training")
MODEL_DIR = TRAIN_DIR / args.model
LORA_DIR = MODEL_DIR / "lora_adapters"      # input: fine-tuned adapters
MERGED_DIR = MODEL_DIR / "merged"           # output: merged 16-bit HF model
GGUF_DIR = MODEL_DIR / "gguf"               # output: GGUF files
CONVERT_CACHE = TRAIN_DIR / "_llama_convert"  # cached llama.cpp checkout

GGUF_DIR.mkdir(parents=True, exist_ok=True)
CONVERT_CACHE.mkdir(parents=True, exist_ok=True)

# Bail out early when there is nothing to export.  The `exists()` check must
# come first: iterating a missing directory would raise.
if not (LORA_DIR.exists() and any(LORA_DIR.iterdir())):
    print(f"[ERROR] No LoRA adapters found at {LORA_DIR}")
    print(f" Run: python scripts/finetune_local.py --base-model {args.model}")
    sys.exit(1)
| |
|
| | |
# ---------------------------------------------------------------------------
# Step 1: merge the LoRA adapters into the base weights and save a 16-bit
# HF-format checkpoint that the llama.cpp converter can read.
# ---------------------------------------------------------------------------
print(f"\n=== Step 1: Merge LoRA adapters ({args.model}) ===")

if args.skip_merge and MERGED_DIR.exists() and any(MERGED_DIR.glob("*.safetensors")):
    # FIX: restored "→", which had been mojibake'd to "β" in this message.
    print(f" Skipping merge → {MERGED_DIR} already exists.")
else:
    from unsloth import FastLanguageModel

    print(f" Loading {LORA_DIR} ...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=str(LORA_DIR),
        max_seq_length=SEQ_LEN,
        dtype=None,          # let Unsloth auto-select the dtype
        load_in_4bit=True,   # load base in 4-bit to fit in VRAM during merge
    )

    print(f" Merging LoRA and saving 16-bit weights to {MERGED_DIR} ...")
    model.save_pretrained_merged(
        str(MERGED_DIR),
        tokenizer,
        save_method="merged_16bit",  # full 16-bit weights, not adapters
    )
    print(" Merged model saved.")
| |
|
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# Step 2: obtain llama.cpp's convert_hf_to_gguf.py (shallow clone, cached
# under CONVERT_CACHE) and make its gguf-py package importable.
# ---------------------------------------------------------------------------
print("\n=== Step 2: Prepare llama.cpp convert script ===")

LLAMA_REPO = CONVERT_CACHE / "llama.cpp"
CONVERT_SCRIPT = LLAMA_REPO / "convert_hf_to_gguf.py"
LLAMA_GGUF_PY = LLAMA_REPO / "gguf-py"

if LLAMA_REPO.exists() and CONVERT_SCRIPT.exists():
    # Cached checkout: best-effort fast-forward (check=False so an offline
    # machine still proceeds with the cached copy).
    # FIX: restored "→", which had been mojibake'd to "β" in this message.
    print(f" Repo cached at {LLAMA_REPO} → pulling latest ...")
    subprocess.run(["git", "-C", str(LLAMA_REPO), "pull", "--ff-only", "-q"], check=False)
else:
    print(f" Cloning llama.cpp (shallow) into {LLAMA_REPO} ...")
    subprocess.check_call([
        "git", "clone", "--depth=1", "--filter=blob:none",
        "https://github.com/ggml-org/llama.cpp.git",
        str(LLAMA_REPO),
    ])
    print(" Installing llama.cpp gguf-py + convert dependencies ...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                           str(LLAMA_GGUF_PY)])
    reqs = LLAMA_REPO / "requirements" / "requirements-convert_hf_to_gguf.txt"
    if reqs.exists():
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-r", str(reqs)])

# Prepend gguf-py/src to PYTHONPATH so the convert subprocess can import
# `gguf` even if the pip install above was skipped or stale.
_convert_env = os.environ.copy()
_convert_env["PYTHONPATH"] = str(LLAMA_GGUF_PY / "src") + os.pathsep + _convert_env.get("PYTHONPATH", "")

print(f" Convert script: {CONVERT_SCRIPT}")
| |
|
| | |
# Step 3: run convert_hf_to_gguf.py on the merged checkpoint to produce an
# F16 GGUF (skipped when the output file already exists).
print("\n=== Step 3: Convert to F16 GGUF ===")

GGUF_F16 = GGUF_DIR / f"{args.model}-f16.gguf"

if GGUF_F16.exists():
    print(f" Already exists: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)")
else:
    convert_cmd = [
        sys.executable,
        str(CONVERT_SCRIPT),
        str(MERGED_DIR),
        "--outfile", str(GGUF_F16),
        "--outtype", "f16",
    ]
    print(f" Running: {' '.join(convert_cmd)}")
    # Output streams straight to the console; _convert_env carries PYTHONPATH.
    rc = subprocess.run(convert_cmd, env=_convert_env).returncode
    if rc != 0:
        print(f"[ERROR] Conversion failed (exit {rc})")
        sys.exit(1)
    print(f" F16 GGUF: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)")
| |
|
| | |
# CLI quant name → llama.cpp llama_ftype enum value (see llama.h):
#   LLAMA_FTYPE_MOSTLY_F16 = 1, MOSTLY_Q8_0 = 7,
#   MOSTLY_Q4_K_M = 15, MOSTLY_Q5_K_M = 17.
# FIX: "f16" was mapped to 4, which is LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 in
# llama.cpp's enum — the correct F16 value is 1.  (The f16 path currently
# skips quantisation entirely, but the wrong value would silently produce a
# Q4_1 file if that ever changed.)
QUANT_TYPE_MAP = {
    "f16": 1,
    "q8_0": 7,
    "q4_k_m": 15,
    "q5_k_m": 17,
}
| |
|
# Step 4: quantise the F16 GGUF in-process via llama-cpp-python's binding of
# llama_model_quantize (no llama-quantize.exe needed on Windows).
if args.skip_quant or args.quant == "f16":
    GGUF_FINAL = GGUF_F16
    print("\n=== Step 4: Skipping quantisation (using F16) ===")
else:
    # FIX: restored "→", which had been mojibake'd to "β" in this message
    # (and in the "Quantising" message below).
    print(f"\n=== Step 4: Quantise → {args.quant.upper()} ===")
    GGUF_FINAL = GGUF_DIR / f"{args.model}-{args.quant}.gguf"

    if GGUF_FINAL.exists():
        print(f" Already exists: {GGUF_FINAL} ({GGUF_FINAL.stat().st_size / 1e6:.0f} MB)")
    else:
        # Imported lazily: llama_cpp loads a native DLL, only needed here.
        import ctypes
        import llama_cpp

        ftype = QUANT_TYPE_MAP[args.quant]
        params = llama_cpp.llama_model_quantize_default_params()
        params.ftype = ftype
        params.nthread = 4               # worker threads for quantisation
        params.allow_requantize = False  # refuse to re-quantise a quantised file

        print(f" Quantising {GGUF_F16.name} → {GGUF_FINAL.name} ...")
        # The C API takes UTF-8 encoded paths; returns 0 on success.
        ret = llama_cpp.llama_model_quantize(
            str(GGUF_F16).encode(),
            str(GGUF_FINAL).encode(),
            ctypes.byref(params),
        )
        if ret != 0:
            print(f"[ERROR] Quantisation failed (return code {ret})")
            sys.exit(1)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f" {args.quant.upper()} GGUF: {GGUF_FINAL} ({mb:.0f} MB)")
| |
|
| | |
# Step 5: repoint the Ollama Modelfile's FROM line at the freshly exported
# GGUF.  Existing FROM lines are commented out (kept for reference) and the
# new FROM is inserted where the first one was.
print("\n=== Step 5: Update Modelfile ===")

modelfile_path = Path("Modelfile")
if modelfile_path.exists():
    content = modelfile_path.read_text(encoding="utf-8")

    gguf_rel = GGUF_FINAL.as_posix()  # forward slashes even on Windows
    new_from = f"FROM ./{gguf_rel}"

    updated = []
    inserted = False
    for line in content.splitlines():
        stripped = line.strip()
        # FIX: dropped the redundant `and not stripped.startswith("#")` —
        # a string that starts with "FROM " cannot also start with "#".
        if stripped.startswith("FROM "):
            updated.append(f"#{line}")  # preserve the old FROM, commented out
            if not inserted:
                updated.append(new_from)
                inserted = True
        else:
            updated.append(line)
    if not inserted:
        # No active FROM line found — prepend the new one.
        updated.insert(0, new_from)

    modelfile_path.write_text("\n".join(updated) + "\n", encoding="utf-8")
    # FIX: restored "→" / "—", which had been mojibake'd to "β" in these
    # two messages.
    print(f" Modelfile updated: FROM → ./{gguf_rel}")
else:
    print(" [WARN] Modelfile not found — skipping update")
| |
|
| | |
# Step 6 (optional, --push): upload the final GGUF to the Hugging Face Hub.
if args.push:
    print(f"\n=== Step 6: Push GGUF to {HF_REPO} ===")
    try:
        # Preferred path: python-dotenv populates os.environ from .env.
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass  # best-effort — the manual .env scan below is the fallback
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_TOKEN:
        # Fallback: read HF_TOKEN=... straight out of .env, stripping
        # surrounding double quotes if present.
        env_file = Path(".env")
        if env_file.exists():
            for line in env_file.read_text().splitlines():
                if line.startswith("HF_TOKEN="):
                    HF_TOKEN = line.split("=", 1)[1].strip().strip('"')

    if not HF_TOKEN:
        # FIX: restored "—", which had been mojibake'd to "β" here.
        print(" [WARN] No HF_TOKEN — skipping push. Set HF_TOKEN in .env or env var.")
    else:
        from huggingface_hub import login, HfApi
        login(token=HF_TOKEN, add_to_git_credential=False)
        api = HfApi()
        # exist_ok=True makes repeated pushes idempotent.
        api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f" Uploading {GGUF_FINAL.name} ({mb:.0f} MB)...")
        api.upload_file(
            path_or_fileobj=str(GGUF_FINAL),
            path_in_repo=GGUF_FINAL.name,
            repo_id=HF_REPO,
            repo_type="model",
        )
        print(f" Done: https://huggingface.co/{HF_REPO}/blob/main/{GGUF_FINAL.name}")
| |
|
| | |
# Final summary: where the GGUF landed, its size, and the Ollama commands to
# load it (the `set ...` lines are Windows cmd.exe syntax).
print(f"""
=== Export complete ===
GGUF : {GGUF_FINAL}
Size : {GGUF_FINAL.stat().st_size / 1e6:.0f} MB

To use with Ollama:
  ollama create soci-agent -f Modelfile
  ollama run soci-agent

Or for {args.model}:
  ollama create soci-agent-{args.model} -f Modelfile
  set OLLAMA_MODEL=soci-agent-{args.model}
  set SOCI_PROVIDER=ollama
""")
| |
|