AaryanK committed on
Commit
e6aab9e
·
verified ·
1 Parent(s): e438405

Upload export_gguf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. export_gguf.py +81 -0
export_gguf.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export stock and no-CoT fine-tuned models to GGUF Q8_0.
3
+ Does each model one at a time to avoid OOM on 8GB VRAM."""
4
+
5
+ import torch
6
+ import subprocess
7
+ import shutil
8
+ import sys
9
+ from pathlib import Path
10
+
11
# Model/LoRA locations and where the GGUF conversion tooling lives.
BASE_MODEL = "katanemo/Arch-Router-1.5B"
LORA_PATH = "finetuning/modelgate_arch_router_nocot_lora"
LLAMA_CPP_DIR = Path("finetuning/llama.cpp")

# Which export step to run: "stock", "nocot", or "all" (default).
step = sys.argv[1] if len(sys.argv) > 1 else "all"
# Fail fast on a typo instead of silently doing nothing below.
if step not in ("stock", "nocot", "all"):
    sys.exit(f"Unknown step {step!r}: expected 'stock', 'nocot', or 'all'")

# Shallow-clone llama.cpp just for its HF->GGUF converter script.
if not LLAMA_CPP_DIR.exists():
    print("Cloning llama.cpp for GGUF converter...")
    subprocess.run(
        ["git", "clone", "--depth=1", "https://github.com/ggerganov/llama.cpp", str(LLAMA_CPP_DIR)],
        check=True,
    )

CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert_hf_to_gguf.py"
26
+
27
if step in ("stock", "all"):
    # Temp HF-format dump of the base model; deleted after conversion.
    stock_dir = Path("finetuning/stock_hf_temp")
    stock_gguf = Path("finetuning/stock_arch_router.Q8_0.gguf")

    if not stock_gguf.exists():
        print("=== Step 1: Save stock model to HF format ===")
        # Lazy import so the heavy transformers stack only loads when this
        # step actually runs.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        # Load in float16 on CPU to avoid GPU OOM (8GB VRAM target).
        model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
        model.save_pretrained(str(stock_dir))
        tokenizer.save_pretrained(str(stock_dir))
        # Drop references so the weights can be freed before the converter
        # subprocess needs memory.
        del model, tokenizer
        print("Stock HF model saved\n")

        print("=== Step 2: Convert stock to GGUF Q8_0 ===")
        # sys.executable (not a bare "python3") keeps the converter in the
        # same interpreter/venv as this script and works on Windows too.
        subprocess.run(
            [sys.executable, str(CONVERT_SCRIPT), str(stock_dir),
             "--outfile", str(stock_gguf), "--outtype", "q8_0"],
            check=True,
        )
        print(f"Stock GGUF saved: {stock_gguf} ({stock_gguf.stat().st_size / 1e6:.0f} MB)\n")
        # Best-effort cleanup of the temp HF dump.
        shutil.rmtree(str(stock_dir), ignore_errors=True)
    else:
        print(f"Stock GGUF already exists: {stock_gguf}")
52
+
53
if step in ("nocot", "all"):
    # Temp HF-format dump of the merged (base + LoRA) model; deleted after
    # conversion.
    nocot_dir = Path("finetuning/nocot_hf_temp")
    nocot_gguf = Path("finetuning/nocot_arch_router.Q8_0.gguf")

    if not nocot_gguf.exists():
        print("=== Step 3: Merge LoRA and save no-CoT model ===")
        # Lazy imports: only pay for transformers/peft when this step runs.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from peft import PeftModel
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        # float16 on CPU to avoid GPU OOM; merge the LoRA adapter into the
        # base weights so the converter sees a plain HF checkpoint.
        model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
        model = PeftModel.from_pretrained(model, LORA_PATH, device_map="cpu")
        model = model.merge_and_unload()
        model.save_pretrained(str(nocot_dir))
        tokenizer.save_pretrained(str(nocot_dir))
        # Release the merged weights before spawning the converter.
        del model, tokenizer
        print("No-CoT merged HF model saved\n")

        print("=== Step 4: Convert no-CoT to GGUF Q8_0 ===")
        # sys.executable (not a bare "python3") keeps the converter in the
        # same interpreter/venv as this script and works on Windows too.
        subprocess.run(
            [sys.executable, str(CONVERT_SCRIPT), str(nocot_dir),
             "--outfile", str(nocot_gguf), "--outtype", "q8_0"],
            check=True,
        )
        print(f"No-CoT GGUF saved: {nocot_gguf} ({nocot_gguf.stat().st_size / 1e6:.0f} MB)\n")
        # Best-effort cleanup of the temp HF dump.
        shutil.rmtree(str(nocot_dir), ignore_errors=True)
    else:
        print(f"No-CoT GGUF already exists: {nocot_gguf}")

print("Done!")