erik1988 committed on
Commit
988573c
·
verified ·
1 Parent(s): fc470fe

Upload convert_to_gguf.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. convert_to_gguf.py +74 -398
convert_to_gguf.py CHANGED
@@ -14,411 +14,87 @@
14
  # ]
15
  # ///
16
 
17
- """
18
- GGUF Conversion Script - Production Ready
19
-
20
- This script converts a LoRA fine-tuned model to GGUF format for use with:
21
- - llama.cpp
22
- - Ollama
23
- - LM Studio
24
- - Other GGUF-compatible tools
25
-
26
- PREREQUISITES (install these FIRST):
27
- - Ubuntu/Debian: sudo apt-get update && sudo apt-get install -y build-essential cmake
28
- - RHEL/CentOS: sudo yum groupinstall -y "Development Tools" && sudo yum install -y cmake
29
- - macOS: xcode-select --install && brew install cmake
30
-
31
- Usage:
32
- Set environment variables:
33
- - ADAPTER_MODEL: Your fine-tuned model (e.g., "username/my-finetuned-model")
34
- - BASE_MODEL: Base model used for fine-tuning (e.g., "Qwen/Qwen2.5-0.5B")
35
- - OUTPUT_REPO: Where to upload GGUF files (e.g., "username/my-model-gguf")
36
- - HF_USERNAME: Your Hugging Face username (optional, for README)
37
-
38
- Dependencies: All required packages are declared in PEP 723 header above.
39
- """
40
-
41
- import os
42
- import sys
43
- import torch
44
  from transformers import AutoModelForCausalLM, AutoTokenizer
45
  from peft import PeftModel
46
  from huggingface_hub import HfApi
47
- import subprocess
48
-
49
-
50
- def check_system_dependencies():
51
- """Check if required system packages are available."""
52
- print("🔍 Checking system dependencies...")
53
-
54
- # Check for git
55
- if subprocess.run(["which", "git"], capture_output=True).returncode != 0:
56
- print(" git is not installed. Please install it:")
57
- print(" Ubuntu/Debian: sudo apt-get install git")
58
- print(" RHEL/CentOS: sudo yum install git")
59
- print(" macOS: brew install git")
60
- return False
61
-
62
- # Check for make or cmake
63
- has_make = subprocess.run(["which", "make"], capture_output=True).returncode == 0
64
- has_cmake = subprocess.run(["which", "cmake"], capture_output=True).returncode == 0
65
-
66
- if not has_make and not has_cmake:
67
- print(" ❌ Neither make nor cmake found. Please install build tools:")
68
- print(" Ubuntu/Debian: sudo apt-get install build-essential cmake")
69
- print(" RHEL/CentOS: sudo yum groupinstall 'Development Tools' && sudo yum install cmake")
70
- print(" macOS: xcode-select --install && brew install cmake")
71
- return False
72
-
73
- print(" ✅ System dependencies found")
74
- return True
75
-
76
-
77
- def run_command(cmd, description):
78
- """Run a command with error handling."""
79
- print(f" {description}...")
80
- try:
81
- result = subprocess.run(
82
- cmd,
83
- check=True,
84
- capture_output=True,
85
- text=True
86
- )
87
- if result.stdout:
88
- print(f" {result.stdout[:200]}") # Show first 200 chars
89
- return True
90
- except subprocess.CalledProcessError as e:
91
- print(f" ❌ Command failed: {' '.join(cmd)}")
92
- if e.stdout:
93
- print(f" STDOUT: {e.stdout[:500]}")
94
- if e.stderr:
95
- print(f" STDERR: {e.stderr[:500]}")
96
- return False
97
- except FileNotFoundError:
98
- print(f" ❌ Command not found: {cmd[0]}")
99
- return False
100
-
101
-
102
- print("🔄 GGUF Conversion Script")
103
- print("=" * 60)
104
-
105
- # Check system dependencies first
106
- if not check_system_dependencies():
107
- print("\n❌ Please install the missing system dependencies and try again.")
108
- sys.exit(1)
109
-
110
- # Configuration from environment variables
111
- ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "evalstate/qwen-capybara-medium")
112
- BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
113
- OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "evalstate/qwen-capybara-medium-gguf")
114
- username = os.environ.get("HF_USERNAME", ADAPTER_MODEL.split('/')[0])
115
-
116
- print(f"\n📦 Configuration:")
117
- print(f" Base model: {BASE_MODEL}")
118
- print(f" Adapter model: {ADAPTER_MODEL}")
119
- print(f" Output repo: {OUTPUT_REPO}")
120
-
121
- # Step 1: Load base model and adapter
122
- print("\n🔧 Step 1: Loading base model and LoRA adapter...")
123
- print(" (This may take a few minutes)")
124
-
125
- try:
126
- base_model = AutoModelForCausalLM.from_pretrained(
127
- BASE_MODEL,
128
- dtype=torch.float16,
129
- device_map="auto",
130
- trust_remote_code=True,
131
- )
132
- print(" ✅ Base model loaded")
133
- except Exception as e:
134
- print(f" ❌ Failed to load base model: {e}")
135
- sys.exit(1)
136
 
137
- try:
138
- # Load and merge adapter
139
- print(" Loading LoRA adapter...")
140
- model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
141
- print(" ✅ Adapter loaded")
142
-
143
- print(" Merging adapter with base model...")
144
- merged_model = model.merge_and_unload()
145
- print(" ✅ Models merged!")
146
- except Exception as e:
147
- print(f" ❌ Failed to merge models: {e}")
148
- sys.exit(1)
149
-
150
- try:
151
- # Load tokenizer
152
- tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
153
- print(" ✅ Tokenizer loaded")
154
- except Exception as e:
155
- print(f" ❌ Failed to load tokenizer: {e}")
156
- sys.exit(1)
157
-
158
- # Step 2: Save merged model temporarily
159
- print("\n💾 Step 2: Saving merged model...")
160
  merged_dir = "/tmp/merged_model"
161
- try:
162
- merged_model.save_pretrained(merged_dir, safe_serialization=True)
163
- tokenizer.save_pretrained(merged_dir)
164
- print(f" ✅ Merged model saved to {merged_dir}")
165
- except Exception as e:
166
- print(f" Failed to save merged model: {e}")
167
- sys.exit(1)
168
-
169
- # Step 3: Install llama.cpp for conversion
170
- print("\n📥 Step 3: Setting up llama.cpp for GGUF conversion...")
171
-
172
- # Clone llama.cpp repository
173
- if not run_command(
174
- ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
175
- "Cloning llama.cpp repository"
176
- ):
177
- print(" Trying alternative clone method...")
178
- # Try shallow clone
179
- if not run_command(
180
- ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
181
- "Cloning llama.cpp (shallow)"
182
- ):
183
- sys.exit(1)
184
-
185
- # Install Python dependencies
186
- print(" Installing Python dependencies...")
187
- if not run_command(
188
- ["pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"],
189
- "Installing llama.cpp requirements"
190
- ):
191
- print(" ⚠️ Some requirements may already be installed")
192
-
193
- if not run_command(
194
- ["pip", "install", "sentencepiece", "protobuf"],
195
- "Installing tokenizer dependencies"
196
- ):
197
- print(" ⚠️ Tokenizer dependencies may already be installed")
198
-
199
- # Step 4: Convert to GGUF (FP16)
200
- print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
201
- gguf_output_dir = "/tmp/gguf_output"
202
- os.makedirs(gguf_output_dir, exist_ok=True)
203
-
204
- convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
205
- model_name = ADAPTER_MODEL.split('/')[-1]
206
- gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
207
-
208
- print(f" Running conversion...")
209
- if not run_command(
210
- [
211
- sys.executable, convert_script,
212
- merged_dir,
213
- "--outfile", gguf_file,
214
- "--outtype", "f16"
215
- ],
216
- f"Converting to FP16"
217
- ):
218
- print(" ❌ Conversion failed!")
219
- sys.exit(1)
220
-
221
- print(f" ✅ FP16 GGUF created: {gguf_file}")
222
-
223
- # Step 5: Quantize to different formats
224
- print("\n⚙️ Step 5: Creating quantized versions...")
225
-
226
- # Build quantize tool using CMake (more reliable than make)
227
- print(" Building quantize tool with CMake...")
228
  os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
229
-
230
- # Configure with CMake
231
- if not run_command(
232
- ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp",
233
- "-DGGML_CUDA=OFF"],
234
- "Configuring with CMake"
235
- ):
236
- print(" ❌ CMake configuration failed")
237
- sys.exit(1)
238
-
239
- # Build just the quantize tool
240
- if not run_command(
241
- ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
242
- "Building llama-quantize"
243
- ):
244
- print(" ❌ Build failed!")
245
- sys.exit(1)
246
-
247
- print(" ✅ Quantize tool built")
248
-
249
- # Use the CMake build output path
250
- quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
251
-
252
- # Common quantization formats
253
- quant_formats = [
254
- ("Q4_K_M", "4-bit, medium quality (recommended)"),
255
- ("Q5_K_M", "5-bit, higher quality"),
256
- ("Q8_0", "8-bit, very high quality"),
257
- ]
258
-
259
- quantized_files = []
260
- for quant_type, description in quant_formats:
261
- print(f" Creating {quant_type} quantization ({description})...")
262
- quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
263
-
264
- if not run_command(
265
- [quantize_bin, gguf_file, quant_file, quant_type],
266
- f"Quantizing to {quant_type}"
267
- ):
268
- print(f" ⚠️ Skipping {quant_type} due to error")
269
- continue
270
-
271
- quantized_files.append((quant_file, quant_type))
272
-
273
- # Get file size
274
- size_mb = os.path.getsize(quant_file) / (1024 * 1024)
275
- print(f" ✅ {quant_type}: {size_mb:.1f} MB")
276
-
277
- if not quantized_files:
278
- print(" ❌ No quantized versions were created successfully")
279
- sys.exit(1)
280
-
281
- # Step 6: Upload to Hub
282
- print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
283
  api = HfApi()
 
284
 
285
- # Create repo
286
- print(f" Creating repository: {OUTPUT_REPO}")
287
- try:
288
- api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
289
- print(" ✅ Repository ready")
290
- except Exception as e:
291
- print(f" ℹ️ Repository may already exist: {e}")
292
-
293
- # Upload FP16 version
294
- print(" Uploading FP16 GGUF...")
295
- try:
296
- api.upload_file(
297
- path_or_fileobj=gguf_file,
298
- path_in_repo=f"{model_name}-f16.gguf",
299
- repo_id=OUTPUT_REPO,
300
- )
301
- print(" ✅ FP16 uploaded")
302
- except Exception as e:
303
- print(f" ❌ Upload failed: {e}")
304
- sys.exit(1)
305
-
306
- # Upload quantized versions
307
- for quant_file, quant_type in quantized_files:
308
- print(f" Uploading {quant_type}...")
309
- try:
310
- api.upload_file(
311
- path_or_fileobj=quant_file,
312
- path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
313
- repo_id=OUTPUT_REPO,
314
- )
315
- print(f" ✅ {quant_type} uploaded")
316
- except Exception as e:
317
- print(f" ❌ Upload failed for {quant_type}: {e}")
318
- continue
319
-
320
- # Create README
321
- print("\n📝 Creating README...")
322
- readme_content = f"""---
323
- base_model: {BASE_MODEL}
324
- tags:
325
- - gguf
326
- - llama.cpp
327
- - quantized
328
- - trl
329
- - sft
330
- ---
331
-
332
- # {OUTPUT_REPO.split('/')[-1]}
333
-
334
- This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), which is a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).
335
-
336
- ## Model Details
337
-
338
- - **Base Model:** {BASE_MODEL}
339
- - **Fine-tuned Model:** {ADAPTER_MODEL}
340
- - **Training:** Supervised Fine-Tuning (SFT) with TRL
341
- - **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)
342
-
343
- ## Available Quantizations
344
-
345
- | File | Quant | Size | Description | Use Case |
346
- |------|-------|------|-------------|----------|
347
- | {model_name}-f16.gguf | F16 | ~1GB | Full precision | Best quality, slower |
348
- | {model_name}-q8_0.gguf | Q8_0 | ~500MB | 8-bit | High quality |
349
- | {model_name}-q5_k_m.gguf | Q5_K_M | ~350MB | 5-bit medium | Good quality, smaller |
350
- | {model_name}-q4_k_m.gguf | Q4_K_M | ~300MB | 4-bit medium | Recommended - good balance |
351
-
352
- ## Usage
353
-
354
- ### With llama.cpp
355
-
356
- ```bash
357
- # Download model
358
- huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf
359
-
360
- # Run with llama.cpp
361
- ./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt here"
362
- ```
363
-
364
- ### With Ollama
365
-
366
- 1. Create a `Modelfile`:
367
- ```
368
- FROM ./{model_name}-q4_k_m.gguf
369
- ```
370
-
371
- 2. Create the model:
372
- ```bash
373
- ollama create my-model -f Modelfile
374
- ollama run my-model
375
- ```
376
-
377
- ### With LM Studio
378
-
379
- 1. Download the `.gguf` file
380
- 2. Import into LM Studio
381
- 3. Start chatting!
382
-
383
- ## License
384
-
385
- Inherits the license from the base model: {BASE_MODEL}
386
-
387
- ## Citation
388
-
389
- ```bibtex
390
- @misc{{{OUTPUT_REPO.split('/')[-1].replace('-', '_')},
391
- author = {{{username}}},
392
- title = {{{OUTPUT_REPO.split('/')[-1]}}},
393
- year = {{2025}},
394
- publisher = {{Hugging Face}},
395
- url = {{https://huggingface.co/{OUTPUT_REPO}}}
396
- }}
397
- ```
398
-
399
- ---
400
-
401
- *Converted to GGUF format using llama.cpp*
402
- """
403
 
404
- try:
405
- api.upload_file(
406
- path_or_fileobj=readme_content.encode(),
407
- path_in_repo="README.md",
408
- repo_id=OUTPUT_REPO,
409
- )
410
- print(" ✅ README uploaded")
411
- except Exception as e:
412
- print(f" ❌ README upload failed: {e}")
413
 
414
- print("\n" + "=" * 60)
415
- print("✅ GGUF Conversion Complete!")
416
- print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
417
- print(f"\n📥 Download with:")
418
- print(f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf")
419
- print(f"\n🚀 Use with Ollama:")
420
- print(" 1. Download the GGUF file")
421
- print(f" 2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf")
422
- print(" 3. ollama create my-model -f Modelfile")
423
- print(" 4. ollama run my-model")
424
- print("=" * 60)
 
14
  # ]
15
  # ///
16
 
17
+ import os, sys, subprocess, torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  from transformers import AutoModelForCausalLM, AutoTokenizer
19
  from peft import PeftModel
20
  from huggingface_hub import HfApi
21
+ import huggingface_hub
22
+
23
+ # Login
24
+ token = os.environ.get("HF_TOKEN")
25
+ if token:
26
+ huggingface_hub.login(token=token)
27
+ print("Logged in")
28
+
29
+ # Install build tools
30
+ print("Installing build tools...")
31
+ subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
32
+ subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake"], capture_output=True, check=True)
33
+ print("Build tools installed")
34
+
35
+ ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "erik1988/elias-memory-agent-v1")
36
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
37
+ OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "erik1988/elias-memory-agent-v1-gguf")
38
+
39
+ print(f"Base: {BASE_MODEL}, Adapter: {ADAPTER_MODEL}, Output: {OUTPUT_REPO}")
40
+
41
+ # Load and merge
42
+ print("Loading base model...")
43
+ base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, dtype=torch.float16, device_map="auto", trust_remote_code=True)
44
+ print("Loading adapter...")
45
+ model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
46
+ print("Merging...")
47
+ merged = model.merge_and_unload()
48
+ tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  merged_dir = "/tmp/merged_model"
51
+ merged.save_pretrained(merged_dir, safe_serialization=True)
52
+ tokenizer.save_pretrained(merged_dir)
53
+ print("Merged model saved")
54
+
55
+ # Clone llama.cpp
56
+ print("Cloning llama.cpp...")
57
+ subprocess.run(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"], check=True, capture_output=True)
58
+ subprocess.run(["pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
59
+
60
+ # Convert to F16 GGUF
61
+ gguf_dir = "/tmp/gguf_output"
62
+ os.makedirs(gguf_dir, exist_ok=True)
63
+ model_name = ADAPTER_MODEL.split("/")[-1]
64
+ f16_file = f"{gguf_dir}/{model_name}-f16.gguf"
65
+
66
+ print("Converting to F16 GGUF...")
67
+ subprocess.run([sys.executable, "/tmp/llama.cpp/convert_hf_to_gguf.py", merged_dir, "--outfile", f16_file, "--outtype", "f16"], check=True)
68
+ print(f"F16 GGUF: {os.path.getsize(f16_file) / 1024 / 1024:.0f} MB")
69
+
70
+ # Build quantize tool
71
+ print("Building quantize tool...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
73
+ subprocess.run(["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"], check=True, capture_output=True)
74
+ subprocess.run(["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"], check=True, capture_output=True)
75
+
76
+ quantize = "/tmp/llama.cpp/build/bin/llama-quantize"
77
+ quant_files = []
78
+
79
+ for qt, desc in [("Q4_K_M", "4-bit"), ("Q5_K_M", "5-bit"), ("Q8_0", "8-bit")]:
80
+ qf = f"{gguf_dir}/{model_name}-{qt.lower()}.gguf"
81
+ print(f"Quantizing {qt}...")
82
+ r = subprocess.run([quantize, f16_file, qf, qt], capture_output=True)
83
+ if r.returncode == 0:
84
+ size = os.path.getsize(qf) / 1024 / 1024
85
+ print(f" {qt}: {size:.0f} MB")
86
+ quant_files.append((qf, qt))
87
+
88
+ # Upload
89
+ print("Uploading to Hub...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  api = HfApi()
91
+ api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
92
 
93
+ api.upload_file(path_or_fileobj=f16_file, path_in_repo=f"{model_name}-f16.gguf", repo_id=OUTPUT_REPO)
94
+ print("F16 uploaded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ for qf, qt in quant_files:
97
+ api.upload_file(path_or_fileobj=qf, path_in_repo=f"{model_name}-{qt.lower()}.gguf", repo_id=OUTPUT_REPO)
98
+ print(f"{qt} uploaded")
 
 
 
 
 
 
99
 
100
+ print(f"\nDone! https://huggingface.co/{OUTPUT_REPO}")