"""Export stock and no-CoT fine-tuned models to GGUF Q8_0.

Does each model one at a time to avoid OOM on 8GB VRAM.

Usage:
    python <this script> [stock|nocot|all]

With no argument, ``all`` is assumed. A model whose GGUF file already
exists is skipped, so the script can be re-run after a failure.
"""

import shutil
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Sequence

import torch

BASE_MODEL = "katanemo/Arch-Router-1.5B"
LORA_PATH = "finetuning/modelgate_arch_router_nocot_lora"
LLAMA_CPP_DIR = Path("finetuning/llama.cpp")
CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert_hf_to_gguf.py"

# Accepted values for the optional command-line argument.
VALID_STEPS = ("stock", "nocot", "all")


def resolve_step(argv: Sequence[str]) -> str:
    """Return the export step requested on the command line.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first).

    Returns:
        ``argv[1]`` when present, otherwise ``"all"``.

    Raises:
        SystemExit: If the requested step is not one of ``VALID_STEPS``.
            Previously a typo matched no branch and the script reported
            "Done!" without exporting anything.
    """
    step = argv[1] if len(argv) > 1 else "all"
    if step not in VALID_STEPS:
        raise SystemExit(
            f"Unknown step {step!r}; expected one of: {', '.join(VALID_STEPS)}"
        )
    return step


def build_convert_command(hf_dir: Path, gguf_path: Path) -> List[str]:
    """Build the argv that converts an HF checkpoint directory to Q8_0 GGUF.

    The converter is launched with ``sys.executable`` rather than a bare
    ``python3`` so it runs under the same interpreter (and virtualenv) as
    this script.
    """
    return [
        sys.executable, str(CONVERT_SCRIPT), str(hf_dir),
        "--outfile", str(gguf_path), "--outtype", "q8_0",
    ]


def ensure_llama_cpp() -> None:
    """Clone llama.cpp if its directory is absent and check the converter exists.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` fails.
        FileNotFoundError: If the converter script is missing, e.g. because
            ``LLAMA_CPP_DIR`` exists but holds an incomplete checkout.
    """
    if not LLAMA_CPP_DIR.exists():
        print("Cloning llama.cpp for GGUF converter...")
        subprocess.run(
            ["git", "clone", "--depth=1", "https://github.com/ggerganov/llama.cpp", str(LLAMA_CPP_DIR)],
            check=True,
        )
    if not CONVERT_SCRIPT.is_file():
        raise FileNotFoundError(
            f"GGUF converter not found at {CONVERT_SCRIPT}; "
            f"remove {LLAMA_CPP_DIR} and re-run to clone it again"
        )


def save_hf_model(hf_dir: Path, lora_path: Optional[str] = None) -> None:
    """Save the base model and tokenizer to ``hf_dir`` in HF format.

    The model is loaded on CPU in float16. When ``lora_path`` is given, the
    adapter at that path is applied and merged into the base weights before
    saving.

    The model is a local of this function, so the reference is dropped on
    return, before the converter subprocess is started.
    """
    # Imported lazily: these are only needed when a model must be exported.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
    if lora_path is not None:
        from peft import PeftModel

        model = PeftModel.from_pretrained(model, lora_path, device_map="cpu")
        model = model.merge_and_unload()
    model.save_pretrained(str(hf_dir))
    tokenizer.save_pretrained(str(hf_dir))


def run_converter(hf_dir: Path, gguf_path: Path) -> None:
    """Run the llama.cpp converter, removing a partial output file on failure.

    A truncated ``gguf_path`` left behind by a failed or interrupted run
    would make the next invocation report the export as already done.

    Raises:
        subprocess.CalledProcessError: If the converter exits non-zero.
    """
    try:
        subprocess.run(build_convert_command(hf_dir, gguf_path), check=True)
    except BaseException:
        # Also covers KeyboardInterrupt; the exception is always re-raised.
        try:
            gguf_path.unlink()
        except FileNotFoundError:
            pass
        raise


def export_model(
    gguf_path: Path,
    hf_dir: Path,
    label: str,
    save_banner: str,
    saved_message: str,
    convert_banner: str,
    lora_path: Optional[str] = None,
) -> None:
    """Export one model to ``gguf_path`` unless that file already exists.

    Args:
        gguf_path: Destination GGUF file.
        hf_dir: Temporary directory for the intermediate HF checkpoint; it
            is removed afterwards whether or not the export succeeded.
        label: Prefix used in the "saved" / "already exists" messages.
        save_banner: Message printed before the HF checkpoint is written.
        saved_message: Message printed after the HF checkpoint is written.
        convert_banner: Message printed before the GGUF conversion.
        lora_path: Optional LoRA adapter to merge into the base model.
    """
    if gguf_path.exists():
        print(f"{label} GGUF already exists: {gguf_path}")
        return

    try:
        print(save_banner)
        save_hf_model(hf_dir, lora_path=lora_path)
        print(saved_message)

        print(convert_banner)
        run_converter(hf_dir, gguf_path)
    finally:
        # The intermediate checkpoint is never reused, so drop it even when
        # saving or converting failed.
        shutil.rmtree(str(hf_dir), ignore_errors=True)

    print(f"{label} GGUF saved: {gguf_path} ({gguf_path.stat().st_size / 1e6:.0f} MB)\n")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the requested export step(s).

    Args:
        argv: Argument vector in ``sys.argv`` layout; defaults to ``sys.argv``.
    """
    step = resolve_step(sys.argv if argv is None else argv)

    ensure_llama_cpp()

    if step in ("stock", "all"):
        export_model(
            gguf_path=Path("finetuning/stock_arch_router.Q8_0.gguf"),
            hf_dir=Path("finetuning/stock_hf_temp"),
            label="Stock",
            save_banner="=== Step 1: Save stock model to HF format ===",
            saved_message="Stock HF model saved\n",
            convert_banner="=== Step 2: Convert stock to GGUF Q8_0 ===",
        )

    if step in ("nocot", "all"):
        export_model(
            gguf_path=Path("finetuning/nocot_arch_router.Q8_0.gguf"),
            hf_dir=Path("finetuning/nocot_hf_temp"),
            label="No-CoT",
            save_banner="=== Step 3: Merge LoRA and save no-CoT model ===",
            saved_message="No-CoT merged HF model saved\n",
            convert_banner="=== Step 4: Convert no-CoT to GGUF Q8_0 ===",
            lora_path=LORA_PATH,
        )

    print("Done!")


if __name__ == "__main__":
    main()