Qrverse committed on
Commit 22f4524 · verified · 1 Parent(s): 2cf91ad

Combined GGUF convert + quantize script (Round 3)

Files changed (1)
  1. convert-and-quantize-gguf.py +317 -0
convert-and-quantize-gguf.py ADDED
@@ -0,0 +1,317 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "unsloth",
#     "huggingface_hub>=0.25",
#     "torch>=2.0",
#     "safetensors",
#     "numpy",
#     "sentencepiece",
#     "transformers>=4.50",
#     "gguf>=0.6",
#     "peft>=0.13",
#     "cmake",
# ]
# ///
"""
QR-Verse AI — Combined LoRA → F16 GGUF → Q4_K_M (Single HF Job)
=================================================================

Merges the Round 3 LoRA adapter into the base Qwen3-VL-8B model, converts the
result to F16 GGUF, then quantizes to Q4_K_M. Everything runs in a single job
to avoid spinning the GPU up twice.

Steps:
  1. Clone llama.cpp and build llama-quantize (in parallel with the model download)
  2. Load the base Qwen3-VL-8B model + LoRA adapter via Unsloth (FP16)
  3. Merge the LoRA weights into the base model
  4. Save the merged model as FP16 safetensors
  5. Convert to F16 GGUF via convert_hf_to_gguf.py
  6. Quantize F16 → Q4_K_M via llama-quantize
  7. Upload the Q4_K_M GGUF + a Modelfile to the Hugging Face Hub

Usage:
    hf jobs uv run --flavor a100-large --timeout 3h \
        --secrets HF_TOKEN \
        https://huggingface.co/Qrverse/qr-verse-ai-lora/resolve/main/convert-and-quantize-gguf.py
"""

import os
import sys
import subprocess
import logging
import json
import threading
import time

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

ADAPTER_REPO = "Qrverse/qr-verse-ai-lora"
BASE_MODEL = "unsloth/Qwen3-VL-8B-Instruct"
F16_FILENAME = "qr-verse-ai-r3-F16.gguf"
Q4_FILENAME = "qr-verse-ai-r3-Q4_K_M.gguf"
MERGED_DIR = "./merged-model"
OUTPUT_DIR = "./output"

SYSTEM_PROMPT = (
    "You are QR-Verse AI, a helpful assistant for the QR-Verse platform. "
    "You help users create, customize, and manage QR codes. You can generate "
    "QR codes for URLs, WiFi networks, vCards, email, SMS, and 20+ other types. "
    "You also support AI-powered QR code art generation with 130+ style presets. "
    "You can check website health, SEO, SSL certificates, and broken links. "
    "You speak 7 languages: English, Spanish, Dutch, French, Portuguese, German, Italian. "
    "Always be concise, accurate, and helpful."
)

os.makedirs(OUTPUT_DIR, exist_ok=True)

start_time = time.time()


# ---------------------------------------------------------------------------
# 1. Clone llama.cpp + build llama-quantize (in a background thread)
# ---------------------------------------------------------------------------

build_error = None


def build_llama_cpp():
    """Build llama-quantize in the background while the model loads."""
    global build_error
    try:
        logger.info("[BUILD] Cloning llama.cpp...")
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/ggml-org/llama.cpp.git"],
            check=True, capture_output=True,
        )

        logger.info("[BUILD] Building llama-quantize with cmake...")
        os.makedirs("llama.cpp/build", exist_ok=True)
        subprocess.run(
            ["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp",
             "-DGGML_CUDA=OFF", "-DCMAKE_BUILD_TYPE=Release"],
            check=True, capture_output=True,
        )
        subprocess.run(
            ["cmake", "--build", "llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
            check=True, capture_output=True,
        )
        logger.info("[BUILD] llama-quantize built successfully")
    except Exception as e:
        build_error = e
        logger.error("[BUILD] Failed: %s", e)


# Start the build in a background thread
build_thread = threading.Thread(target=build_llama_cpp)
build_thread.start()


# ---------------------------------------------------------------------------
# 2. Load base model + LoRA adapter via Unsloth
# ---------------------------------------------------------------------------

logger.info("[MODEL] Loading base model: %s (FP16, no quantization)", BASE_MODEL)

from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
    BASE_MODEL,
    load_in_4bit=False,  # FP16 — clean weights for GGUF
    max_seq_length=4096,
)

logger.info("[MODEL] Base model loaded (%.1fs). Applying LoRA adapter: %s",
            time.time() - start_time, ADAPTER_REPO)

from peft import PeftModel

model = PeftModel.from_pretrained(model, ADAPTER_REPO)
logger.info("[MODEL] LoRA adapter applied (%.1fs)", time.time() - start_time)


# ---------------------------------------------------------------------------
# 3. Merge LoRA into base model and save as FP16
# ---------------------------------------------------------------------------

logger.info("[MERGE] Merging LoRA weights into base model...")
model = model.merge_and_unload()
logger.info("[MERGE] Merge complete (%.1fs). Saving to: %s",
            time.time() - start_time, MERGED_DIR)

model.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)

# Free GPU memory — no longer needed. Drop the Python references and collect
# first, then release the CUDA cache.
del model
del tokenizer
import gc
gc.collect()
import torch
torch.cuda.empty_cache()
logger.info("[MERGE] Model saved, GPU memory freed (%.1fs)", time.time() - start_time)

# CRITICAL: Remove quantization_config from config.json
config_path = os.path.join(MERGED_DIR, "config.json")
if os.path.exists(config_path):
    with open(config_path) as f:
        config = json.load(f)
    if "quantization_config" in config:
        logger.info("[MERGE] Removing quantization_config from config.json")
        del config["quantization_config"]
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

# Copy vision processor configs from the adapter repo
from huggingface_hub import hf_hub_download
import shutil

for config_file in ["preprocessor_config.json", "video_preprocessor_config.json", "chat_template.jinja"]:
    try:
        src = hf_hub_download(ADAPTER_REPO, config_file)
        shutil.copy2(src, os.path.join(MERGED_DIR, config_file))
        logger.info("[MERGE] Copied %s", config_file)
    except Exception:
        pass  # optional file not present in the adapter repo; skip it

# Log the merged model size
total_size = 0
for f in sorted(os.listdir(MERGED_DIR)):
    fpath = os.path.join(MERGED_DIR, f)
    if os.path.isfile(fpath):
        total_size += os.path.getsize(fpath) / 1024 / 1024
logger.info("[MERGE] Total merged model: %.1f MB", total_size)


# ---------------------------------------------------------------------------
# 4. Convert merged model to F16 GGUF
# ---------------------------------------------------------------------------

logger.info("[GGUF] Converting merged model to F16 GGUF...")

convert_script = "llama.cpp/convert_hf_to_gguf.py"
f16_path = os.path.join(OUTPUT_DIR, F16_FILENAME)

# The background thread clones llama.cpp; the clone normally finishes long
# before the merge does, but wait for it if the converter isn't there yet.
if not os.path.exists(convert_script):
    logger.info("[GGUF] Waiting for llama.cpp clone to finish...")
    build_thread.join()

result = subprocess.run(
    [sys.executable, convert_script, MERGED_DIR,
     "--outfile", f16_path, "--outtype", "f16"],
    capture_output=True, text=True,
)

if result.stdout:
    for line in result.stdout.strip().split("\n")[-10:]:
        logger.info("  convert: %s", line)
if result.stderr:
    for line in result.stderr.strip().split("\n")[-10:]:
        logger.info("  convert (stderr): %s", line)

if result.returncode != 0:
    logger.error("[GGUF] F16 conversion failed (exit %d)", result.returncode)
    logger.error("STDERR: %s", result.stderr[-3000:] if result.stderr else "(empty)")
    sys.exit(1)

f16_size_gb = os.path.getsize(f16_path) / 1024**3
logger.info("[GGUF] F16 GGUF created: %s (%.1f GB) (%.1fs)",
            F16_FILENAME, f16_size_gb, time.time() - start_time)
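
# Editor's addition (not in the original commit): fail fast if the converter
# wrote something that is not a GGUF file. Every GGUF file begins with the
# 4-byte magic b"GGUF", so a header check is cheap insurance before the
# quantize step.
with open(f16_path, "rb") as fh:
    if fh.read(4) != b"GGUF":
        logger.error("[CHECK] %s does not start with the GGUF magic bytes", F16_FILENAME)
        sys.exit(1)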


# ---------------------------------------------------------------------------
# 5. Quantize F16 → Q4_K_M
# ---------------------------------------------------------------------------

# Wait for the llama-quantize build to finish
build_thread.join()
if build_error:
    logger.error("[QUANTIZE] llama-quantize build failed, cannot quantize: %s", build_error)
    sys.exit(1)

# Find the quantize binary (its location varies between llama.cpp build layouts)
quantize_bin = "llama.cpp/build/bin/llama-quantize"
if not os.path.exists(quantize_bin):
    for candidate in ["llama.cpp/build/llama-quantize", "llama.cpp/build/bin/quantize"]:
        if os.path.exists(candidate):
            quantize_bin = candidate
            break
    else:
        logger.error("[QUANTIZE] llama-quantize binary not found under llama.cpp/build")
        sys.exit(1)

logger.info("[QUANTIZE] Quantizing F16 → Q4_K_M...")
q4_path = os.path.join(OUTPUT_DIR, Q4_FILENAME)

result = subprocess.run(
    [quantize_bin, f16_path, q4_path, "Q4_K_M"],
    capture_output=True, text=True,
)

if result.stdout:
    for line in result.stdout.strip().split("\n")[-10:]:
        logger.info("  quantize: %s", line)

if result.returncode != 0:
    logger.error("[QUANTIZE] Q4_K_M quantization failed (exit %d)", result.returncode)
    logger.error("STDERR: %s", result.stderr[-2000:] if result.stderr else "(empty)")
    sys.exit(1)

q4_size_gb = os.path.getsize(q4_path) / 1024**3
logger.info("[QUANTIZE] Q4_K_M created: %s (%.2f GB) (%.1fs)",
            Q4_FILENAME, q4_size_gb, time.time() - start_time)

# Clean up the F16 file to free disk space (we only upload the Q4_K_M)
os.remove(f16_path)
logger.info("[QUANTIZE] F16 GGUF removed to free disk space")


# ---------------------------------------------------------------------------
# 6. Upload Q4_K_M GGUF + Modelfile to Hub
# ---------------------------------------------------------------------------

from huggingface_hub import HfApi

api = HfApi()

logger.info("[UPLOAD] Uploading Q4_K_M GGUF to Hub...")
api.upload_file(
    path_or_fileobj=q4_path,
    path_in_repo=Q4_FILENAME,
    repo_id=ADAPTER_REPO,
    commit_message=f"Round 3 GGUF Q4_K_M: {Q4_FILENAME} ({q4_size_gb:.1f} GB) — LoRA r64, 3766 examples, loss 0.6704",
)
logger.info("[UPLOAD] Q4_K_M GGUF uploaded!")

# Generate a Modelfile for Ollama
modelfile_content = f"""# Ollama Modelfile for QR-Verse AI (Round 3)
# Usage:
#   ollama create qr-verse-ai -f Modelfile
#   ollama run qr-verse-ai

FROM ./{Q4_FILENAME}

SYSTEM \"\"\"{SYSTEM_PROMPT}\"\"\"

PARAMETER temperature 0.7
PARAMETER num_ctx 4096
"""

modelfile_path = os.path.join(OUTPUT_DIR, "Modelfile")
with open(modelfile_path, "w") as f:
    f.write(modelfile_content)

api.upload_file(
    path_or_fileobj=modelfile_path,
    path_in_repo="Modelfile",
    repo_id=ADAPTER_REPO,
    commit_message="Ollama Modelfile for QR-Verse AI Round 3 (Q4_K_M)",
)
logger.info("[UPLOAD] Modelfile uploaded!")

elapsed = time.time() - start_time

print("\n" + "=" * 60)
print("GGUF CONVERSION + QUANTIZATION COMPLETE")
print("=" * 60)
print(f"  Q4_K_M: {Q4_FILENAME} ({q4_size_gb:.1f} GB)")
print(f"  Hub:    https://huggingface.co/{ADAPTER_REPO}")
print(f"  Time:   {elapsed / 60:.1f} minutes")
print()
print("Deploy on Ubuntu RTX 3080:")
print(f"  1. hf download {ADAPTER_REPO} {Q4_FILENAME} Modelfile")
print("  2. ollama create qr-verse-ai -f Modelfile")
print("  3. ollama run qr-verse-ai")
print("=" * 60)