# soci2 / scripts / export_gguf_windows.py
# Commit: "Fix GGUF export: clone llama.cpp for bundled gguf-py; update Modelfile" (97d584e)
"""
export_gguf_windows.py β€” Merge LoRA adapters and export to GGUF on Windows.
Pipeline:
1. Load base model + LoRA adapters via Unsloth
2. Merge LoRA into weights, save 16-bit safetensors (HF format)
3. Clone llama.cpp (shallow) so convert_hf_to_gguf.py runs with its bundled gguf-py (cached)
4. Convert merged model β†’ F16 GGUF
5. Quantize F16 GGUF β†’ Q4_K_M via llama_cpp.llama_model_quantize
6. Update Modelfile to point at the Q4_K_M GGUF
Usage (from project root):
"C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py
"C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 7b
"C:/Users/xabon/.conda/envs/ml-env/python.exe" scripts/export_gguf_windows.py --model 0.5b --push
"""
from __future__ import annotations
import sys
import io
import os
# Windows consoles default to a legacy code page; rewrap the std streams as
# UTF-8 so non-ASCII output (em dashes, arrows) never raises UnicodeEncodeError.
if sys.platform == "win32":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")
# Set before torch is pulled in (via unsloth below) so compile/inductor stay off.
os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1")
os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
# Unsloth must be first
import unsloth  # noqa: F401
# Replace transformers' list_repo_templates with a no-op returning [] —
# presumably to skip Hub template lookups during the local merge; confirm.
import transformers.utils.hub
import transformers.tokenization_utils_base
_noop = lambda *a, **kw: []
transformers.tokenization_utils_base.list_repo_templates = _noop
transformers.utils.hub.list_repo_templates = _noop
import argparse
import subprocess
import urllib.request  # NOTE(review): unused since the switch to git clone — candidate for removal
from pathlib import Path
# ── Args ───────────────────────────────────────────────────────────────────────
# Command-line interface; the resulting `args` drives every step below.
parser = argparse.ArgumentParser(description="Merge LoRA + export GGUF on Windows")
parser.add_argument(
    "--model",
    default="7b",
    choices=["0.5b", "1.5b", "3b", "7b", "8b"],
    help="Which fine-tuned model to export (default: 7b)",
)
parser.add_argument(
    "--quant",
    default="q4_k_m",
    choices=["f16", "q4_k_m", "q5_k_m", "q8_0"],
    help="Output quantisation (default: q4_k_m)",
)
# Boolean switches share the same shape, so declare them in one pass.
for _flag, _help in (
    ("--push", "Push GGUF to HF Hub after export"),
    ("--skip-merge", "Skip merge if merged/ dir already exists"),
    ("--skip-quant", "Skip quantisation, keep F16 GGUF only"),
):
    parser.add_argument(_flag, action="store_true", help=_help)
args = parser.parse_args()
# ── Model profile lookup ──────────────────────────────────────────────────────
# (size key, base checkpoint id, target HF repo for the GGUF, max seq length)
_PROFILE_ROWS = (
    ("0.5b", "unsloth/Qwen2.5-0.5B-Instruct-unsloth-bnb-4bit", "RayMelius/soci-agent-q4", 2048),
    ("1.5b", "unsloth/Qwen2.5-1.5B-Instruct-bnb-4bit", "RayMelius/soci-agent-1b5", 2048),
    ("3b", "unsloth/Qwen2.5-3B-Instruct-bnb-4bit", "RayMelius/soci-agent-3b", 2048),
    ("7b", "unsloth/Qwen2.5-7B-Instruct-bnb-4bit", "RayMelius/soci-agent-7b", 512),
    ("8b", "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", "RayMelius/soci-agent-8b", 512),
)
_PROFILES = {
    key: dict(base_id=base_id, hf_repo=hf_repo, seq_len=seq_len)
    for key, base_id, hf_repo, seq_len in _PROFILE_ROWS
}
PROFILE = _PROFILES[args.model]
HF_REPO = PROFILE["hf_repo"]
SEQ_LEN = PROFILE["seq_len"]

# Per-model working directories under data/training/<size>/
TRAIN_DIR = Path("data/training")
MODEL_DIR = TRAIN_DIR / args.model  # e.g. data/training/7b/
LORA_DIR = MODEL_DIR / "lora_adapters"
MERGED_DIR = MODEL_DIR / "merged"
GGUF_DIR = MODEL_DIR / "gguf"
CONVERT_CACHE = TRAIN_DIR / "_llama_convert"  # shared cache for the convert script
for _out_dir in (GGUF_DIR, CONVERT_CACHE):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Refuse to run without adapters — everything downstream depends on them.
if not (LORA_DIR.exists() and any(LORA_DIR.iterdir())):
    print(f"[ERROR] No LoRA adapters found at {LORA_DIR}")
    print(f" Run: python scripts/finetune_local.py --base-model {args.model}")
    sys.exit(1)
# ── Step 1: Merge LoRA → 16-bit safetensors ──────────────────────────────────
# Loads base + adapters through Unsloth, folds the LoRA deltas into the
# weights, and writes a plain HF-format 16-bit checkpoint for conversion.
print(f"\n=== Step 1: Merge LoRA adapters ({args.model}) ===")
_already_merged = MERGED_DIR.exists() and any(MERGED_DIR.glob("*.safetensors"))
if args.skip_merge and _already_merged:
    print(f" Skipping merge — {MERGED_DIR} already exists.")
else:
    from unsloth import FastLanguageModel

    print(f" Loading {LORA_DIR} ...")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name=str(LORA_DIR),
        max_seq_length=SEQ_LEN,
        dtype=None,  # let Unsloth pick the dtype
        load_in_4bit=True,
    )
    print(f" Merging LoRA and saving 16-bit weights to {MERGED_DIR} ...")
    merged_model.save_pretrained_merged(str(MERGED_DIR), merged_tokenizer, save_method="merged_16bit")
    print(" Merged model saved.")
# ── Step 2: Clone/update llama.cpp repo (shallow) ────────────────────────────
# The full repo is cloned so the convert script uses its own bundled gguf-py,
# which is always in sync with the script (PyPI gguf lags behind llama.cpp master).
print("\n=== Step 2: Prepare llama.cpp convert script ===")
LLAMA_REPO = CONVERT_CACHE / "llama.cpp"
CONVERT_SCRIPT = LLAMA_REPO / "convert_hf_to_gguf.py"
LLAMA_GGUF_PY = LLAMA_REPO / "gguf-py"

if LLAMA_REPO.exists() and CONVERT_SCRIPT.exists():
    # Best-effort update: a failed pull leaves the cached checkout usable.
    print(f" Repo cached at {LLAMA_REPO} — pulling latest ...")
    subprocess.run(["git", "-C", str(LLAMA_REPO), "pull", "--ff-only", "-q"], check=False)
else:
    print(f" Cloning llama.cpp (shallow) into {LLAMA_REPO} ...")
    clone_cmd = [
        "git", "clone", "--depth=1", "--filter=blob:none",
        "https://github.com/ggml-org/llama.cpp.git",
        str(LLAMA_REPO),
    ]
    subprocess.check_call(clone_cmd)

print(" Installing llama.cpp gguf-py + convert dependencies ...")
_pip = [sys.executable, "-m", "pip", "install", "-q"]
subprocess.check_call(_pip + [str(LLAMA_GGUF_PY)])
reqs = LLAMA_REPO / "requirements" / "requirements-convert_hf_to_gguf.txt"
if reqs.exists():
    subprocess.check_call(_pip + ["-r", str(reqs)])

# Build PYTHONPATH so convert script picks up llama.cpp's gguf-py over PyPI's
_convert_env = os.environ.copy()
_convert_env["PYTHONPATH"] = str(LLAMA_GGUF_PY / "src") + os.pathsep + _convert_env.get("PYTHONPATH", "")
print(f" Convert script: {CONVERT_SCRIPT}")
# ── Step 3: Convert merged model → F16 GGUF ──────────────────────────────────
# Runs llama.cpp's converter as a subprocess against the merged HF checkpoint.
print("\n=== Step 3: Convert to F16 GGUF ===")
GGUF_F16 = GGUF_DIR / f"{args.model}-f16.gguf"
if GGUF_F16.exists():
    print(f" Already exists: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)")
else:
    convert_cmd = [
        sys.executable,
        str(CONVERT_SCRIPT),
        str(MERGED_DIR),
        "--outfile", str(GGUF_F16),
        "--outtype", "f16",
    ]
    print(f" Running: {' '.join(convert_cmd)}")
    # Output streams straight to the console; _convert_env points the script
    # at the bundled gguf-py prepared in Step 2.
    proc = subprocess.run(convert_cmd, capture_output=False, env=_convert_env)
    if proc.returncode != 0:
        print(f"[ERROR] Conversion failed (exit {proc.returncode})")
        sys.exit(1)
    print(f" F16 GGUF: {GGUF_F16} ({GGUF_F16.stat().st_size / 1e9:.2f} GB)")
# ── Step 4: Quantise F16 → Q4_K_M (or other) ─────────────────────────────────
# Values are `enum llama_ftype` codes from llama.cpp's llama.h, consumed by
# llama_model_quantize below; they must match the llama_cpp build in use.
QUANT_TYPE_MAP = {
    "f16": 1,      # LLAMA_FTYPE_MOSTLY_F16 (was 4, which is LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16)
    "q8_0": 7,     # LLAMA_FTYPE_MOSTLY_Q8_0
    "q4_k_m": 15,  # LLAMA_FTYPE_MOSTLY_Q4_K_M
    "q5_k_m": 17,  # LLAMA_FTYPE_MOSTLY_Q5_K_M
}
# Quantise in-process via the llama_cpp bindings; GGUF_FINAL is the file every
# later step (Modelfile update, HF push, summary) points at.
if args.skip_quant or args.quant == "f16":
    GGUF_FINAL = GGUF_F16
    print(f"\n=== Step 4: Skipping quantisation (using F16) ===")
else:
    print(f"\n=== Step 4: Quantise → {args.quant.upper()} ===")
    GGUF_FINAL = GGUF_DIR / f"{args.model}-{args.quant}.gguf"
    if GGUF_FINAL.exists():
        print(f" Already exists: {GGUF_FINAL} ({GGUF_FINAL.stat().st_size / 1e6:.0f} MB)")
    else:
        # Imported lazily — only needed when we actually quantise.
        import ctypes
        import llama_cpp
        ftype = QUANT_TYPE_MAP[args.quant]
        # Start from library defaults, then override target type and threads.
        params = llama_cpp.llama_model_quantize_default_params()
        params.ftype = ftype
        params.nthread = 4
        params.allow_requantize = False  # refuse to re-quantise already-quantised tensors
        print(f" Quantising {GGUF_F16.name} → {GGUF_FINAL.name} ...")
        # C API: llama_model_quantize(in_path, out_path, *params) → 0 on success.
        ret = llama_cpp.llama_model_quantize(
            str(GGUF_F16).encode(),
            str(GGUF_FINAL).encode(),
            ctypes.byref(params),
        )
        if ret != 0:
            print(f"[ERROR] Quantisation failed (return code {ret})")
            sys.exit(1)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f" {args.quant.upper()} GGUF: {GGUF_FINAL} ({mb:.0f} MB)")
# ── Step 5: Update Modelfile ──────────────────────────────────────────────────
# Point the Ollama Modelfile's FROM directive at the freshly exported GGUF:
# every existing FROM line is commented out (kept for history) and the new
# FROM is inserted where the first one was (or at the top if none exist).
print(f"\n=== Step 5: Update Modelfile ===")
modelfile_path = Path("Modelfile")
if modelfile_path.exists():
    content = modelfile_path.read_text(encoding="utf-8")
    gguf_rel = GGUF_FINAL.as_posix()  # forward slashes work in Modelfile on Windows
    new_from = f"FROM ./{gguf_rel}"
    updated = []
    inserted = False
    for line in content.splitlines():
        # A stripped line starting with "FROM " can never also start with "#",
        # so the previous extra `not startswith("#")` guard was dead code.
        if line.strip().startswith("FROM "):
            updated.append(f"#{line}")  # comment out old FROM
            if not inserted:
                updated.append(new_from)
                inserted = True
        else:
            updated.append(line)
    if not inserted:
        # No FROM directive at all — place ours first.
        updated.insert(0, new_from)
    modelfile_path.write_text("\n".join(updated) + "\n", encoding="utf-8")
    print(f" Modelfile updated: FROM → ./{gguf_rel}")
else:
    print(f" [WARN] Modelfile not found — skipping update")
# ── Step 6: Push GGUF to HF Hub ──────────────────────────────────────────────
# Uploads the final GGUF to the model repo for this size. Token resolution:
# dotenv (if installed) → HF_TOKEN env var → manual .env scan.
if args.push:
    print(f"\n=== Step 6: Push GGUF to {HF_REPO} ===")
    try:
        from dotenv import load_dotenv; load_dotenv()
    except ImportError:
        pass  # python-dotenv is optional; the manual fallback below covers it
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_TOKEN:
        env_file = Path(".env")
        if env_file.exists():
            # Last assignment wins, matching dotenv override semantics.
            # FIX: read as UTF-8 explicitly (Windows locale default could
            # mangle the file) and strip single as well as double quotes.
            for line in env_file.read_text(encoding="utf-8").splitlines():
                if line.startswith("HF_TOKEN="):
                    HF_TOKEN = line.split("=", 1)[1].strip().strip('"\'')
    if not HF_TOKEN:
        print(" [WARN] No HF_TOKEN — skipping push. Set HF_TOKEN in .env or env var.")
    else:
        from huggingface_hub import login, HfApi
        login(token=HF_TOKEN, add_to_git_credential=False)
        api = HfApi()
        # Idempotent: succeeds whether or not the repo already exists.
        api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True)
        mb = GGUF_FINAL.stat().st_size / 1e6
        print(f" Uploading {GGUF_FINAL.name} ({mb:.0f} MB)...")
        api.upload_file(
            path_or_fileobj = str(GGUF_FINAL),
            path_in_repo = GGUF_FINAL.name,
            repo_id = HF_REPO,
            repo_type = "model",
        )
        print(f" Done: https://huggingface.co/{HF_REPO}/blob/main/{GGUF_FINAL.name}")
# ── Done ──────────────────────────────────────────────────────────────────────
# Final summary: where the GGUF landed and the Ollama commands to serve it.
print(f"""
=== Export complete ===
GGUF : {GGUF_FINAL}
Size : {GGUF_FINAL.stat().st_size / 1e6:.0f} MB
To use with Ollama:
ollama create soci-agent -f Modelfile
ollama run soci-agent
Or for {args.model}:
ollama create soci-agent-{args.model} -f Modelfile
set OLLAMA_MODEL=soci-agent-{args.model}
set SOCI_PROVIDER=ollama
""")