lokegud committed
Commit e3a0ad5 · verified · Parent: 3365f48

Upload convert_to_gguf.py with huggingface_hub

Files changed (1)
  1. convert_to_gguf.py +350 -0
convert_to_gguf.py ADDED
@@ -0,0 +1,350 @@
+ #!/usr/bin/env python3
+ # /// script
+ # dependencies = [
+ #     "transformers>=4.36.0",
+ #     "peft>=0.7.0",
+ #     "torch>=2.0.0",
+ #     "accelerate>=0.24.0",
+ #     "huggingface_hub>=0.20.0",
+ #     "sentencepiece>=0.1.99",
+ #     "protobuf>=3.20.0",
+ #     "numpy",
+ #     "gguf",
+ # ]
+ # ///
+
+ """
+ GGUF Conversion Script - Production Ready
+
+ This script converts a LoRA fine-tuned model to GGUF format for use with:
+ - llama.cpp
+ - Ollama
+ - LM Studio
+ - Other GGUF-compatible tools
+
+ Usage:
+     Set environment variables:
+     - ADAPTER_MODEL: Your fine-tuned model (e.g., "username/my-finetuned-model")
+     - BASE_MODEL: Base model used for fine-tuning (e.g., "Qwen/Qwen2.5-0.5B")
+     - OUTPUT_REPO: Where to upload GGUF files (e.g., "username/my-model-gguf")
+     - HF_USERNAME: Your Hugging Face username (optional, used in the README)
+
+ Dependencies: all required Python packages are declared in the PEP 723 header above.
+ Build tools (gcc, cmake) are installed automatically by this script.
+ """
+
+ import os
+ import subprocess
+
+ import torch
+ from huggingface_hub import HfApi
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ print("🔄 GGUF Conversion Script")
+ print("=" * 60)
+
+ # Configuration from environment variables
+ ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "evalstate/qwen-capybara-medium")
+ BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
+ OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "evalstate/qwen-capybara-medium-gguf")
+ username = os.environ.get("HF_USERNAME", ADAPTER_MODEL.split('/')[0])
+
+ print("\n📦 Configuration:")
+ print(f"   Base model: {BASE_MODEL}")
+ print(f"   Adapter model: {ADAPTER_MODEL}")
+ print(f"   Output repo: {OUTPUT_REPO}")
+
+ # Step 1: Load base model and adapter
+ print("\n🔧 Step 1: Loading base model and LoRA adapter...")
+ print("   (This may take a few minutes)")
+
+ base_model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL,
+     torch_dtype=torch.float16,  # keyword accepted across the pinned transformers range
+     device_map="auto",
+     trust_remote_code=True,
+ )
+ print("   ✅ Base model loaded")
+
+ # Load and merge adapter
+ print("   Loading LoRA adapter...")
+ model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
+ print("   ✅ Adapter loaded")
+
+ print("   Merging adapter with base model...")
+ merged_model = model.merge_and_unload()
+ print("   ✅ Models merged!")
+
+ # Load tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
+ print("   ✅ Tokenizer loaded")
+
+ # Step 2: Save merged model temporarily
+ print("\n💾 Step 2: Saving merged model...")
+ merged_dir = "/tmp/merged_model"
+ merged_model.save_pretrained(merged_dir, safe_serialization=True)
+ tokenizer.save_pretrained(merged_dir)
+ print(f"   ✅ Merged model saved to {merged_dir}")
+
+ # Step 3: Install llama.cpp for conversion
+ print("\n📥 Step 3: Setting up llama.cpp for GGUF conversion...")
+
+ # CRITICAL: Install build tools FIRST (before cloning llama.cpp)
+ print("   Installing build tools...")
+ subprocess.run(
+     ["apt-get", "update", "-qq"],
+     check=True,
+     capture_output=True
+ )
+ subprocess.run(
+     ["apt-get", "install", "-y", "-qq", "build-essential", "cmake"],
+     check=True,
+     capture_output=True
+ )
+ print("   ✅ Build tools installed")
+
+ print("   Cloning llama.cpp repository...")
+ subprocess.run(
+     ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
+     check=True,
+     capture_output=True
+ )
+ print("   ✅ llama.cpp cloned")
+
+ print("   Installing Python dependencies...")
+ subprocess.run(
+     ["pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"],
+     check=True,
+     capture_output=True
+ )
+ # sentencepiece and protobuf are needed for tokenizer conversion
+ subprocess.run(
+     ["pip", "install", "sentencepiece", "protobuf"],
+     check=True,
+     capture_output=True
+ )
+ print("   ✅ Dependencies installed")
+
+ # Step 4: Convert to GGUF (FP16)
+ print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
+ gguf_output_dir = "/tmp/gguf_output"
+ os.makedirs(gguf_output_dir, exist_ok=True)
+
+ convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
+ model_name = ADAPTER_MODEL.split('/')[-1]
+ gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
+
+ print(f"   Running: python {convert_script} {merged_dir}")
+ try:
+     result = subprocess.run(
+         [
+             "python", convert_script,
+             merged_dir,
+             "--outfile", gguf_file,
+             "--outtype", "f16"
+         ],
+         check=True,
+         capture_output=True,
+         text=True
+     )
+     print(result.stdout)
+     if result.stderr:
+         print("Warnings:", result.stderr)
+ except subprocess.CalledProcessError as e:
+     print("❌ Conversion failed!")
+     print("STDOUT:", e.stdout)
+     print("STDERR:", e.stderr)
+     raise
+ print(f"   ✅ FP16 GGUF created: {gguf_file}")
+
+ # Step 5: Quantize to different formats
+ print("\n⚙️ Step 5: Creating quantized versions...")
+
+ # Build quantize tool using CMake (more reliable than make)
+ print("   Building quantize tool with CMake...")
+ try:
+     # Create build directory
+     os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
+
+     # Configure with CMake
+     subprocess.run(
+         ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp",
+          "-DGGML_CUDA=OFF"],  # Disable CUDA for a faster build
+         check=True,
+         capture_output=True,
+         text=True
+     )
+
+     # Build just the quantize tool
+     subprocess.run(
+         ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
+         check=True,
+         capture_output=True,
+         text=True
+     )
+     print("   ✅ Quantize tool built")
+ except subprocess.CalledProcessError as e:
+     print("   ❌ Build failed!")
+     print("STDOUT:", e.stdout)
+     print("STDERR:", e.stderr)
+     raise
+
+ # Use the CMake build output path
+ quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
+
+ # Common quantization formats
+ quant_formats = [
+     ("Q4_K_M", "4-bit, medium quality (recommended)"),
+     ("Q5_K_M", "5-bit, higher quality"),
+     ("Q8_0", "8-bit, very high quality"),
+ ]
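+ # Naming convention: K-quants (Q4_K_M, Q5_K_M) use llama.cpp's k-quant scheme
+ # with blockwise scales; the S/M/L suffix selects a small/medium/large
+ # quality-vs-size mix. Q8_0 is a simpler legacy 8-bit format.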
+
+ quantized_files = []
+ for quant_type, description in quant_formats:
+     print(f"   Creating {quant_type} quantization ({description})...")
+     quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
+
+     subprocess.run(
+         [quantize_bin, gguf_file, quant_file, quant_type],
+         check=True,
+         capture_output=True
+     )
+     quantized_files.append((quant_file, quant_type))
+
+     # Get file size
+     size_mb = os.path.getsize(quant_file) / (1024 * 1024)
+     print(f"   ✅ {quant_type}: {size_mb:.1f} MB")
+
+ # Step 6: Upload to Hub
+ print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
+ api = HfApi()
+
+ # Create repo
+ print(f"   Creating repository: {OUTPUT_REPO}")
+ try:
+     api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
+     print("   ✅ Repository created")
+ except Exception as e:
+     print(f"   ℹ️ Repository may already exist: {e}")
+
+ # Upload FP16 version
+ print("   Uploading FP16 GGUF...")
+ api.upload_file(
+     path_or_fileobj=gguf_file,
+     path_in_repo=f"{model_name}-f16.gguf",
+     repo_id=OUTPUT_REPO,
+ )
+ print("   ✅ FP16 uploaded")
+
+ # Upload quantized versions
+ for quant_file, quant_type in quantized_files:
+     print(f"   Uploading {quant_type}...")
+     api.upload_file(
+         path_or_fileobj=quant_file,
+         path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
+         repo_id=OUTPUT_REPO,
+     )
+     print(f"   ✅ {quant_type} uploaded")
+
+ # Create README
+ print("\n📝 Creating README...")
+ readme_content = f"""---
+ base_model: {BASE_MODEL}
+ tags:
+ - gguf
+ - llama.cpp
+ - quantized
+ - trl
+ - sft
+ ---
+
+ # {OUTPUT_REPO.split('/')[-1]}
+
+ This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).
+
+ ## Model Details
+
+ - **Base Model:** {BASE_MODEL}
+ - **Fine-tuned Model:** {ADAPTER_MODEL}
+ - **Training:** Supervised Fine-Tuning (SFT) with TRL
+ - **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)
+
+ ## Available Quantizations
+
+ | File | Quant | Size | Description | Use Case |
+ |------|-------|------|-------------|----------|
+ | {model_name}-f16.gguf | F16 | ~1GB | Full precision | Best quality, slower |
+ | {model_name}-q8_0.gguf | Q8_0 | ~500MB | 8-bit | High quality |
+ | {model_name}-q5_k_m.gguf | Q5_K_M | ~350MB | 5-bit medium | Good quality, smaller |
+ | {model_name}-q4_k_m.gguf | Q4_K_M | ~300MB | 4-bit medium | Recommended: good balance |
+
+ ## Usage
+
+ ### With llama.cpp
+
+ ```bash
+ # Download model
+ huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf
+
+ # Run with llama.cpp
+ ./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt here"
+ ```
+
+ ### With Ollama
+
+ 1. Create a `Modelfile`:
+ ```
+ FROM ./{model_name}-q4_k_m.gguf
+ ```
+
+ 2. Create the model:
+ ```bash
+ ollama create my-model -f Modelfile
+ ollama run my-model
+ ```
+
+ ### With LM Studio
+
+ 1. Download the `.gguf` file
+ 2. Import it into LM Studio
+ 3. Start chatting!
+
+ ## License
+
+ Inherits the license of the base model: {BASE_MODEL}
+
+ ## Citation
+
+ ```bibtex
+ @misc{{{OUTPUT_REPO.split('/')[-1].replace('-', '_')},
+   author = {{{username}}},
+   title = {{{OUTPUT_REPO.split('/')[-1]}}},
+   year = {{2025}},
+   publisher = {{Hugging Face}},
+   url = {{https://huggingface.co/{OUTPUT_REPO}}}
+ }}
+ ```
+
+ ---
+
+ *Converted to GGUF format using llama.cpp*
+ """
+
+ api.upload_file(
+     path_or_fileobj=readme_content.encode(),
+     path_in_repo="README.md",
+     repo_id=OUTPUT_REPO,
+ )
+ print("   ✅ README uploaded")
+
+ print("\n" + "=" * 60)
+ print("✅ GGUF Conversion Complete!")
+ print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
+ print("\n📥 Download with:")
+ print(f"   huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf")
+ print("\n🚀 Use with Ollama:")
+ print("   1. Download the GGUF file")
+ print(f"   2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf")
+ print("   3. ollama create my-model -f Modelfile")
+ print("   4. ollama run my-model")
+ print("=" * 60)