Qrverse committed on
Commit
69ccab5
·
verified ·
1 Parent(s): ea5d0cc

v4: FP16 loading + strip quantization_config for clean GGUF conversion

Browse files
Files changed (1) hide show
  1. convert-gguf-hf-jobs.py +19 -4
convert-gguf-hf-jobs.py CHANGED
@@ -36,6 +36,7 @@ import os
36
  import sys
37
  import subprocess
38
  import logging
 
39
 
40
  logging.basicConfig(
41
  level=logging.INFO,
@@ -82,7 +83,7 @@ from unsloth import FastVisionModel
82
 
83
  model, tokenizer = FastVisionModel.from_pretrained(
84
  BASE_MODEL,
85
- load_in_4bit=True,
86
  max_seq_length=4096,
87
  )
88
 
@@ -104,6 +105,21 @@ logger.info("Merge complete. Saving merged model as FP16 to: %s", MERGED_DIR)
104
  model.save_pretrained(MERGED_DIR, safe_serialization=True)
105
  tokenizer.save_pretrained(MERGED_DIR)
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Also copy over the vision processor configs
108
  from huggingface_hub import hf_hub_download
109
  for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
@@ -188,9 +204,8 @@ if result.returncode != 0:
188
  # Last resort: try Unsloth's built-in GGUF export
189
  logger.info("Trying Unsloth save_pretrained_gguf as last resort...")
190
  try:
191
- # Reload the merged model for GGUF export
192
- from unsloth import FastLanguageModel
193
- merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
194
  MERGED_DIR,
195
  load_in_4bit=False,
196
  max_seq_length=4096,
 
36
  import sys
37
  import subprocess
38
  import logging
39
+ import json
40
 
41
  logging.basicConfig(
42
  level=logging.INFO,
 
83
 
84
  model, tokenizer = FastVisionModel.from_pretrained(
85
  BASE_MODEL,
86
+ load_in_4bit=False, # FP16 — clean weights for GGUF (A10G has 24GB, model ~16GB)
87
  max_seq_length=4096,
88
  )
89
 
 
105
  model.save_pretrained(MERGED_DIR, safe_serialization=True)
106
  tokenizer.save_pretrained(MERGED_DIR)
107
 
108
+ # CRITICAL: Remove quantization_config from config.json
109
+ # Even with FP16 loading, Unsloth may write quantization metadata.
110
+ # llama.cpp cannot handle bitsandbytes quant method and will crash.
111
+ config_path = os.path.join(MERGED_DIR, "config.json")
112
+ if os.path.exists(config_path):
113
+ with open(config_path) as f:
114
+ config = json.load(f)
115
+ if "quantization_config" in config:
116
+ logger.info("Removing quantization_config from config.json (was: %s)",
117
+ config["quantization_config"].get("quant_method", "unknown"))
118
+ del config["quantization_config"]
119
+ with open(config_path, "w") as f:
120
+ json.dump(config, f, indent=2)
121
+ logger.info("config.json cleaned — llama.cpp will treat as FP16")
122
+
123
  # Also copy over the vision processor configs
124
  from huggingface_hub import hf_hub_download
125
  for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
 
204
  # Last resort: try Unsloth's built-in GGUF export
205
  logger.info("Trying Unsloth save_pretrained_gguf as last resort...")
206
  try:
207
+ # Reload the merged model for GGUF export (use FastVisionModel for VLM)
208
+ merged_model, merged_tokenizer = FastVisionModel.from_pretrained(
 
209
  MERGED_DIR,
210
  load_in_4bit=False,
211
  max_seq_length=4096,