AaryanK committed on
Commit
e6aab9e
·
verified ·
1 Parent(s): e438405

Upload export_gguf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. export_gguf.py +81 -0
export_gguf.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export stock and no-CoT fine-tuned models to GGUF Q8_0.
3
+ Does each model one at a time to avoid OOM on 8GB VRAM."""
4
+
5
+ import torch
6
+ import subprocess
7
+ import shutil
8
+ import sys
9
+ from pathlib import Path
10
+
11
# Model/LoRA locations and where the GGUF conversion tooling lives.
BASE_MODEL = "katanemo/Arch-Router-1.5B"
LORA_PATH = "finetuning/modelgate_arch_router_nocot_lora"
LLAMA_CPP_DIR = Path("finetuning/llama.cpp")

# Which export step to run: "stock", "nocot", or "all" (default).
step = sys.argv[1] if len(sys.argv) > 1 else "all"
# Fail fast on a typo instead of silently doing nothing below.
if step not in ("stock", "nocot", "all"):
    sys.exit(f"Unknown step {step!r}: expected 'stock', 'nocot', or 'all'")

# Shallow-clone llama.cpp just for its HF->GGUF converter script.
if not LLAMA_CPP_DIR.exists():
    print("Cloning llama.cpp for GGUF converter...")
    subprocess.run(
        ["git", "clone", "--depth=1", "https://github.com/ggerganov/llama.cpp", str(LLAMA_CPP_DIR)],
        check=True,
    )

CONVERT_SCRIPT = LLAMA_CPP_DIR / "convert_hf_to_gguf.py"
26
+
27
if step in ("stock", "all"):
    # Temp HF-format dump of the base model; deleted after conversion.
    stock_dir = Path("finetuning/stock_hf_temp")
    stock_gguf = Path("finetuning/stock_arch_router.Q8_0.gguf")

    if not stock_gguf.exists():
        print("=== Step 1: Save stock model to HF format ===")
        # Lazy import so the heavy transformers stack only loads when this
        # step actually runs.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        # Load in float16 on CPU to avoid GPU OOM (8GB VRAM target).
        model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
        model.save_pretrained(str(stock_dir))
        tokenizer.save_pretrained(str(stock_dir))
        # Drop references so the weights can be freed before the converter
        # subprocess needs memory.
        del model, tokenizer
        print("Stock HF model saved\n")

        print("=== Step 2: Convert stock to GGUF Q8_0 ===")
        # sys.executable (not a bare "python3") keeps the converter in the
        # same interpreter/venv as this script and works on Windows too.
        subprocess.run(
            [sys.executable, str(CONVERT_SCRIPT), str(stock_dir),
             "--outfile", str(stock_gguf), "--outtype", "q8_0"],
            check=True,
        )
        print(f"Stock GGUF saved: {stock_gguf} ({stock_gguf.stat().st_size / 1e6:.0f} MB)\n")
        # Best-effort cleanup of the temp HF dump.
        shutil.rmtree(str(stock_dir), ignore_errors=True)
    else:
        print(f"Stock GGUF already exists: {stock_gguf}")
52
+
53
if step in ("nocot", "all"):
    # Temp HF-format dump of the merged (base + LoRA) model; deleted after
    # conversion.
    nocot_dir = Path("finetuning/nocot_hf_temp")
    nocot_gguf = Path("finetuning/nocot_arch_router.Q8_0.gguf")

    if not nocot_gguf.exists():
        print("=== Step 3: Merge LoRA and save no-CoT model ===")
        # Lazy imports: only pay for transformers/peft when this step runs.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from peft import PeftModel
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        # float16 on CPU to avoid GPU OOM; merge the LoRA adapter into the
        # base weights so the converter sees a plain HF checkpoint.
        model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="cpu")
        model = PeftModel.from_pretrained(model, LORA_PATH, device_map="cpu")
        model = model.merge_and_unload()
        model.save_pretrained(str(nocot_dir))
        tokenizer.save_pretrained(str(nocot_dir))
        # Release the merged weights before spawning the converter.
        del model, tokenizer
        print("No-CoT merged HF model saved\n")

        print("=== Step 4: Convert no-CoT to GGUF Q8_0 ===")
        # sys.executable (not a bare "python3") keeps the converter in the
        # same interpreter/venv as this script and works on Windows too.
        subprocess.run(
            [sys.executable, str(CONVERT_SCRIPT), str(nocot_dir),
             "--outfile", str(nocot_gguf), "--outtype", "q8_0"],
            check=True,
        )
        print(f"No-CoT GGUF saved: {nocot_gguf} ({nocot_gguf.stat().st_size / 1e6:.0f} MB)\n")
        # Best-effort cleanup of the temp HF dump.
        shutil.rmtree(str(nocot_dir), ignore_errors=True)
    else:
        print(f"No-CoT GGUF already exists: {nocot_gguf}")

print("Done!")