ssdavid committed on
Commit
b30b5b3
·
verified ·
1 Parent(s): ef69abc

Upload convert_to_gguf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. convert_to_gguf.py +424 -0
convert_to_gguf.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "transformers>=4.36.0",
6
+ # "peft>=0.7.0",
7
+ # "torch>=2.0.0",
8
+ # "accelerate>=0.24.0",
9
+ # "huggingface_hub>=0.20.0",
10
+ # "sentencepiece>=0.1.99",
11
+ # "protobuf>=3.20.0",
12
+ # "numpy",
13
+ # "gguf",
14
+ # ]
15
+ # ///
16
+
17
+ """
18
+ GGUF Conversion Script - Production Ready
19
+
20
+ This script converts a LoRA fine-tuned model to GGUF format for use with:
21
+ - llama.cpp
22
+ - Ollama
23
+ - LM Studio
24
+ - Other GGUF-compatible tools
25
+
26
+ PREREQUISITES (install these FIRST):
27
+ - Ubuntu/Debian: sudo apt-get update && sudo apt-get install -y build-essential cmake
28
+ - RHEL/CentOS: sudo yum groupinstall -y "Development Tools" && sudo yum install -y cmake
29
+ - macOS: xcode-select --install && brew install cmake
30
+
31
+ Usage:
32
+ Set environment variables:
33
+ - ADAPTER_MODEL: Your fine-tuned model (e.g., "username/my-finetuned-model")
34
+ - BASE_MODEL: Base model used for fine-tuning (e.g., "Qwen/Qwen2.5-0.5B")
35
+ - OUTPUT_REPO: Where to upload GGUF files (e.g., "username/my-model-gguf")
36
+ - HF_USERNAME: Your Hugging Face username (optional, for README)
37
+
38
+ Dependencies: All required packages are declared in PEP 723 header above.
39
+ """
40
+
41
import os
import shutil
import subprocess
import sys

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
48
+
49
+
50
def check_system_dependencies():
    """Check that git and at least one build tool (make or cmake) are on PATH.

    These are required later to clone and build llama.cpp.

    Returns:
        bool: True when all required tools are available; False otherwise
        (after printing per-platform install instructions).
    """
    print("🔍 Checking system dependencies...")

    # Check for git.
    # shutil.which is portable and avoids spawning a subprocess per lookup,
    # unlike the original `subprocess.run(["which", ...])` probes (`which`
    # is not guaranteed to exist on every platform).
    if shutil.which("git") is None:
        print(" ❌ git is not installed. Please install it:")
        print(" Ubuntu/Debian: sudo apt-get install git")
        print(" RHEL/CentOS: sudo yum install git")
        print(" macOS: brew install git")
        return False

    # Check for make or cmake — either suffices to build llama.cpp.
    has_make = shutil.which("make") is not None
    has_cmake = shutil.which("cmake") is not None

    if not has_make and not has_cmake:
        print(" ❌ Neither make nor cmake found. Please install build tools:")
        print(" Ubuntu/Debian: sudo apt-get install build-essential cmake")
        print(" RHEL/CentOS: sudo yum groupinstall 'Development Tools' && sudo yum install cmake")
        print(" macOS: xcode-select --install && brew install cmake")
        return False

    print(" ✅ System dependencies found")
    return True
75
+
76
+
77
def run_command(cmd, description):
    """Execute *cmd* (argv list), printing progress and any captured output.

    Returns:
        bool: True when the command exits with status 0, False when it
        fails or the executable cannot be found.
    """
    print(f" {description}...")
    try:
        completed = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True
        )
    except FileNotFoundError:
        # The executable itself is missing from PATH.
        print(f" ❌ Command not found: {cmd[0]}")
        return False
    except subprocess.CalledProcessError as err:
        # Non-zero exit: surface truncated stdout/stderr for debugging.
        print(f" ❌ Command failed: {' '.join(cmd)}")
        if err.stdout:
            print(f" STDOUT: {err.stdout[:500]}")
        if err.stderr:
            print(f" STDERR: {err.stderr[:500]}")
        return False

    if completed.stdout:
        print(f" {completed.stdout[:200]}")  # show at most the first 200 chars
    return True
100
+
101
+
102
print("🔄 GGUF Conversion Script")
print("=" * 60)

# Check system dependencies first — git plus make/cmake are needed later
# to clone and build llama.cpp; fail fast if they are missing.
if not check_system_dependencies():
    print("\n❌ Please install the missing system dependencies and try again.")
    sys.exit(1)

# Configuration from environment variables.
# Defaults point at the author's demo repos; override via env vars for
# your own models (see the module docstring).
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "evalstate/qwen-capybara-medium")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "evalstate/qwen-capybara-medium-gguf")
# Fall back to the adapter repo's namespace when HF_USERNAME is unset.
username = os.environ.get("HF_USERNAME", ADAPTER_MODEL.split('/')[0])

print(f"\n📦 Configuration:")
print(f" Base model: {BASE_MODEL}")
print(f" Adapter model: {ADAPTER_MODEL}")
print(f" Output repo: {OUTPUT_REPO}")
120
+
121
+ # Step 1: Load base model and adapter
122
# Step 1: Load base model and adapter
print("\n🔧 Step 1: Loading base model and LoRA adapter...")
print(" (This may take a few minutes)")

try:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        # NOTE(review): older transformers releases spell this `torch_dtype=`;
        # confirm the installed version (>=4.36 pinned in the PEP 723 header)
        # accepts `dtype=`.
        dtype=torch.float16,
        device_map="auto",  # let accelerate place weights on available devices
        trust_remote_code=True,
    )
    print(" ✅ Base model loaded")
except Exception as e:
    print(f" ❌ Failed to load base model: {e}")
    sys.exit(1)

try:
    # Load and merge adapter
    print(" Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
    print(" ✅ Adapter loaded")

    print(" Merging adapter with base model...")
    # merge_and_unload folds the LoRA deltas into the base weights and
    # returns a plain transformers model, which the GGUF converter needs.
    merged_model = model.merge_and_unload()
    print(" ✅ Models merged!")
except Exception as e:
    print(f" ❌ Failed to merge models: {e}")
    sys.exit(1)

try:
    # Load tokenizer from the ADAPTER repo, not the base model.
    # NOTE(review): assumes the adapter repo ships tokenizer files — confirm.
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
    print(" ✅ Tokenizer loaded")
except Exception as e:
    print(f" ❌ Failed to load tokenizer: {e}")
    sys.exit(1)

# Step 2: Save merged model temporarily
print("\n💾 Step 2: Saving merged model...")
merged_dir = "/tmp/merged_model"
try:
    # safe_serialization=True writes .safetensors weight files.
    merged_model.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)
    print(f" ✅ Merged model saved to {merged_dir}")
except Exception as e:
    print(f" ❌ Failed to save merged model: {e}")
    sys.exit(1)
168
+
169
+ # Step 3: Install llama.cpp for conversion
170
# Step 3: Install llama.cpp for conversion
print("\n📥 Step 3: Setting up llama.cpp for GGUF conversion...")

# Clone llama.cpp repository (full clone first, shallow clone as fallback).
if not run_command(
    ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
    "Cloning llama.cpp repository"
):
    print(" Trying alternative clone method...")
    # Try shallow clone
    if not run_command(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
        "Cloning llama.cpp (shallow)"
    ):
        sys.exit(1)

# Install Python dependencies.
# Fix: invoke pip as `sys.executable -m pip` so packages are installed into
# the interpreter actually running this script, not whatever `pip` happens
# to be first on PATH (they can differ under venvs / uv / multiple Pythons).
print(" Installing Python dependencies...")
if not run_command(
    [sys.executable, "-m", "pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"],
    "Installing llama.cpp requirements"
):
    # Non-fatal: the conversion script may still work with already-present deps.
    print(" ⚠️ Some requirements may already be installed")

if not run_command(
    [sys.executable, "-m", "pip", "install", "sentencepiece", "protobuf"],
    "Installing tokenizer dependencies"
):
    print(" ⚠️ Tokenizer dependencies may already be installed")
198
+
199
+ # Step 4: Convert to GGUF (FP16)
200
# Step 4: Convert the merged HF checkpoint to GGUF (FP16)
print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)

# convert_hf_to_gguf.py ships with llama.cpp (cloned in Step 3).
convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
model_name = ADAPTER_MODEL.split('/')[-1]
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"

# Fix: dropped the `f` prefix from strings with no placeholders (ruff F541).
print(" Running conversion...")
if not run_command(
    [
        sys.executable, convert_script,
        merged_dir,
        "--outfile", gguf_file,
        "--outtype", "f16"
    ],
    "Converting to FP16"
):
    print(" ❌ Conversion failed!")
    sys.exit(1)

print(f" ✅ FP16 GGUF created: {gguf_file}")
222
+
223
+ # Step 5: Quantize to different formats
224
+ print("\n⚙️ Step 5: Creating quantized versions...")
225
+
226
+ # Build quantize tool using CMake (more reliable than make)
227
+ print(" Building quantize tool with CMake...")
228
+ os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
229
+
230
+ # Configure with CMake
231
+ if not run_command(
232
+ ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp",
233
+ "-DGGML_CUDA=OFF"],
234
+ "Configuring with CMake"
235
+ ):
236
+ print(" ❌ CMake configuration failed")
237
+ sys.exit(1)
238
+
239
+ # Build just the quantize tool
240
+ if not run_command(
241
+ ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
242
+ "Building llama-quantize"
243
+ ):
244
+ print(" ❌ Build failed!")
245
+ sys.exit(1)
246
+
247
+ print(" ✅ Quantize tool built")
248
+
249
+ # Use the CMake build output path
250
+ quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
251
+
252
+ # Common quantization formats
253
+ quant_formats = [
254
+ ("Q4_K_M", "4-bit, medium quality (recommended)"),
255
+ ("Q5_K_M", "5-bit, higher quality"),
256
+ ("Q8_0", "8-bit, very high quality"),
257
+ ]
258
+
259
+ quantized_files = []
260
+ for quant_type, description in quant_formats:
261
+ print(f" Creating {quant_type} quantization ({description})...")
262
+ quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
263
+
264
+ if not run_command(
265
+ [quantize_bin, gguf_file, quant_file, quant_type],
266
+ f"Quantizing to {quant_type}"
267
+ ):
268
+ print(f" ⚠️ Skipping {quant_type} due to error")
269
+ continue
270
+
271
+ quantized_files.append((quant_file, quant_type))
272
+
273
+ # Get file size
274
+ size_mb = os.path.getsize(quant_file) / (1024 * 1024)
275
+ print(f" ✅ {quant_type}: {size_mb:.1f} MB")
276
+
277
+ if not quantized_files:
278
+ print(" ❌ No quantized versions were created successfully")
279
+ sys.exit(1)
280
+
281
+ # Step 6: Upload to Hub
282
# Step 6: Upload to Hub
print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
api = HfApi()

# Create repo — idempotent thanks to exist_ok=True.
print(f" Creating repository: {OUTPUT_REPO}")
try:
    api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
    print(" ✅ Repository ready")
except Exception as e:
    print(f" ℹ️ Repository may already exist: {e}")

# Upload FP16 version — mandatory, so a failure here aborts the script.
print(" Uploading FP16 GGUF...")
try:
    api.upload_file(
        path_or_fileobj=gguf_file,
        path_in_repo=f"{model_name}-f16.gguf",
        repo_id=OUTPUT_REPO,
    )
    print(" ✅ FP16 uploaded")
except Exception as e:
    print(f" ❌ Upload failed: {e}")
    sys.exit(1)

# Upload quantized versions — best-effort; one failure does not stop the rest.
for quant_file, quant_type in quantized_files:
    print(f" Uploading {quant_type}...")
    try:
        api.upload_file(
            path_or_fileobj=quant_file,
            path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
            repo_id=OUTPUT_REPO,
        )
        # Fix: original message printed mojibake (two U+FFFD replacement
        # characters) where the ✅ check mark belongs.
        print(f" ✅ {quant_type} uploaded")
    except Exception as e:
        print(f" ❌ Upload failed for {quant_type}: {e}")
        continue
319
+
320
+ # Create README
321
# Create README — a model card with YAML front matter, uploaded as README.md.
print("\n📝 Creating README...")
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- trl
- sft
---

# {OUTPUT_REPO.split('/')[-1]}

This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), which is a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).

## Model Details

- **Base Model:** {BASE_MODEL}
- **Fine-tuned Model:** {ADAPTER_MODEL}
- **Training:** Supervised Fine-Tuning (SFT) with TRL
- **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)

## Available Quantizations

| File | Quant | Size | Description | Use Case |
|------|-------|------|-------------|----------|
| {model_name}-f16.gguf | F16 | ~1GB | Full precision | Best quality, slower |
| {model_name}-q8_0.gguf | Q8_0 | ~500MB | 8-bit | High quality |
| {model_name}-q5_k_m.gguf | Q5_K_M | ~350MB | 5-bit medium | Good quality, smaller |
| {model_name}-q4_k_m.gguf | Q4_K_M | ~300MB | 4-bit medium | Recommended - good balance |

## Usage

### With llama.cpp

```bash
# Download model
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf

# Run with llama.cpp
./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt here"
```

### With Ollama

1. Create a `Modelfile`:
```
FROM ./{model_name}-q4_k_m.gguf
```

2. Create the model:
```bash
ollama create my-model -f Modelfile
ollama run my-model
```

### With LM Studio

1. Download the `.gguf` file
2. Import into LM Studio
3. Start chatting!

## License

Inherits the license from the base model: {BASE_MODEL}

## Citation

```bibtex
@misc{{{OUTPUT_REPO.split('/')[-1].replace('-', '_')},
author = {{{username}}},
title = {{{OUTPUT_REPO.split('/')[-1]}}},
year = {{2025}},
publisher = {{Hugging Face}},
url = {{https://huggingface.co/{OUTPUT_REPO}}}
}}
```

---

*Converted to GGUF format using llama.cpp*
"""

try:
    # upload_file accepts raw bytes directly, so no temp file is needed.
    api.upload_file(
        path_or_fileobj=readme_content.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
    )
    print(" ✅ README uploaded")
except Exception as e:
    # README failure is non-fatal: the model files are already uploaded.
    print(f" ❌ README upload failed: {e}")

# Final summary with copy-pasteable follow-up commands.
print("\n" + "=" * 60)
print("✅ GGUF Conversion Complete!")
print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
print(f"\n📥 Download with:")
print(f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf")
print(f"\n🚀 Use with Ollama:")
print(" 1. Download the GGUF file")
print(f" 2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf")
print(" 3. ollama create my-model -f Modelfile")
print(" 4. ollama run my-model")
print("=" * 60)