v4: FP16 loading + strip quantization_config for clean GGUF conversion
Browse files
- convert-gguf-hf-jobs.py +19 -4
convert-gguf-hf-jobs.py
CHANGED
|
@@ -36,6 +36,7 @@ import os
|
|
| 36 |
import sys
|
| 37 |
import subprocess
|
| 38 |
import logging
|
|
|
|
| 39 |
|
| 40 |
logging.basicConfig(
|
| 41 |
level=logging.INFO,
|
|
@@ -82,7 +83,7 @@ from unsloth import FastVisionModel
|
|
| 82 |
|
| 83 |
model, tokenizer = FastVisionModel.from_pretrained(
|
| 84 |
BASE_MODEL,
|
| 85 |
-
load_in_4bit=True,
|
| 86 |
max_seq_length=4096,
|
| 87 |
)
|
| 88 |
|
|
@@ -104,6 +105,21 @@ logger.info("Merge complete. Saving merged model as FP16 to: %s", MERGED_DIR)
|
|
| 104 |
model.save_pretrained(MERGED_DIR, safe_serialization=True)
|
| 105 |
tokenizer.save_pretrained(MERGED_DIR)
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
# Also copy over the vision processor configs
|
| 108 |
from huggingface_hub import hf_hub_download
|
| 109 |
for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
|
|
@@ -188,9 +204,8 @@ if result.returncode != 0:
|
|
| 188 |
# Last resort: try Unsloth's built-in GGUF export
|
| 189 |
logger.info("Trying Unsloth save_pretrained_gguf as last resort...")
|
| 190 |
try:
|
| 191 |
-
# Reload the merged model for GGUF export
|
| 192 |
-
|
| 193 |
-
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
|
| 194 |
MERGED_DIR,
|
| 195 |
load_in_4bit=False,
|
| 196 |
max_seq_length=4096,
|
|
|
|
| 36 |
import sys
|
| 37 |
import subprocess
|
| 38 |
import logging
|
| 39 |
+
import json
|
| 40 |
|
| 41 |
logging.basicConfig(
|
| 42 |
level=logging.INFO,
|
|
|
|
| 83 |
|
| 84 |
model, tokenizer = FastVisionModel.from_pretrained(
|
| 85 |
BASE_MODEL,
|
| 86 |
+
load_in_4bit=False, # FP16 — clean weights for GGUF (A10G has 24GB, model ~16GB)
|
| 87 |
max_seq_length=4096,
|
| 88 |
)
|
| 89 |
|
|
|
|
| 105 |
model.save_pretrained(MERGED_DIR, safe_serialization=True)
|
| 106 |
tokenizer.save_pretrained(MERGED_DIR)
|
| 107 |
|
| 108 |
+
# CRITICAL: Remove quantization_config from config.json
|
| 109 |
+
# Even with FP16 loading, Unsloth may write quantization metadata.
|
| 110 |
+
# llama.cpp cannot handle bitsandbytes quant method and will crash.
|
| 111 |
+
config_path = os.path.join(MERGED_DIR, "config.json")
|
| 112 |
+
if os.path.exists(config_path):
|
| 113 |
+
with open(config_path) as f:
|
| 114 |
+
config = json.load(f)
|
| 115 |
+
if "quantization_config" in config:
|
| 116 |
+
logger.info("Removing quantization_config from config.json (was: %s)",
|
| 117 |
+
config["quantization_config"].get("quant_method", "unknown"))
|
| 118 |
+
del config["quantization_config"]
|
| 119 |
+
with open(config_path, "w") as f:
|
| 120 |
+
json.dump(config, f, indent=2)
|
| 121 |
+
logger.info("config.json cleaned — llama.cpp will treat as FP16")
|
| 122 |
+
|
| 123 |
# Also copy over the vision processor configs
|
| 124 |
from huggingface_hub import hf_hub_download
|
| 125 |
for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
|
|
|
|
| 204 |
# Last resort: try Unsloth's built-in GGUF export
|
| 205 |
logger.info("Trying Unsloth save_pretrained_gguf as last resort...")
|
| 206 |
try:
|
| 207 |
+
# Reload the merged model for GGUF export (use FastVisionModel for VLM)
|
| 208 |
+
merged_model, merged_tokenizer = FastVisionModel.from_pretrained(
|
|
|
|
| 209 |
MERGED_DIR,
|
| 210 |
load_in_4bit=False,
|
| 211 |
max_seq_length=4096,
|