Qrverse
/

qr-verse-ai-lora

@@ -36,6 +36,7 @@ import os
 import sys
 import subprocess
 import logging
 logging.basicConfig(
     level=logging.INFO,
@@ -82,7 +83,7 @@ from unsloth import FastVisionModel
 model, tokenizer = FastVisionModel.from_pretrained(
     BASE_MODEL,
-    load_in_4bit=True,
     max_seq_length=4096,
 )
@@ -104,6 +105,21 @@ logger.info("Merge complete. Saving merged model as FP16 to: %s", MERGED_DIR)
 model.save_pretrained(MERGED_DIR, safe_serialization=True)
 tokenizer.save_pretrained(MERGED_DIR)
 # Also copy over the vision processor configs
 from huggingface_hub import hf_hub_download
 for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
@@ -188,9 +204,8 @@ if result.returncode != 0:
         # Last resort: try Unsloth's built-in GGUF export
         logger.info("Trying Unsloth save_pretrained_gguf as last resort...")
         try:
-            # Reload the merged model for GGUF export
-            from unsloth import FastLanguageModel
-            merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
                 MERGED_DIR,
                 load_in_4bit=False,
                 max_seq_length=4096,

 import sys
 import subprocess
 import logging
+import json
 logging.basicConfig(
     level=logging.INFO,
 model, tokenizer = FastVisionModel.from_pretrained(
     BASE_MODEL,
+    load_in_4bit=False,   # FP16 — clean weights for GGUF (A10G has 24GB, model ~16GB)
     max_seq_length=4096,
 )
 model.save_pretrained(MERGED_DIR, safe_serialization=True)
 tokenizer.save_pretrained(MERGED_DIR)
+# CRITICAL: Remove quantization_config from the merged model's config.json.
+# Even with FP16 loading, Unsloth may write quantization metadata into the config.
+# llama.cpp cannot handle the bitsandbytes quant method and will crash on it.
+config_path = os.path.join(MERGED_DIR, "config.json")
+if os.path.exists(config_path):  # a missing config.json is skipped without error
+    with open(config_path) as f:
+        config = json.load(f)
+    if "quantization_config" in config:  # the file is rewritten only when the key is present
+        logger.info("Removing quantization_config from config.json (was: %s)",
+                     config["quantization_config"].get("quant_method", "unknown"))
+        del config["quantization_config"]
+        with open(config_path, "w") as f:  # overwrite in place with the cleaned config
+            json.dump(config, f, indent=2)
+        logger.info("config.json cleaned — llama.cpp will treat as FP16")
 # Also copy over the vision processor configs
 from huggingface_hub import hf_hub_download
 for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
         # Last resort: try Unsloth's built-in GGUF export
         logger.info("Trying Unsloth save_pretrained_gguf as last resort...")
         try:
+            # Reload the merged model for GGUF export (use FastVisionModel for VLM)
+            merged_model, merged_tokenizer = FastVisionModel.from_pretrained(
                 MERGED_DIR,
                 load_in_4bit=False,
                 max_seq_length=4096,