tobil committed on
Commit
65268ed
Β·
verified Β·
1 Parent(s): e7fc932

Add 4B GGUF conversion script

Browse files
Files changed (1) hide show
  1. convert_4B_gguf.py +282 -0
convert_4B_gguf.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "transformers>=4.36.0",
6
+ # "peft>=0.7.0",
7
+ # "torch>=2.0.0",
8
+ # "accelerate>=0.24.0",
9
+ # "huggingface_hub>=0.20.0",
10
+ # "sentencepiece>=0.1.99",
11
+ # "protobuf>=3.20.0",
12
+ # "numpy",
13
+ # "gguf",
14
+ # ]
15
+ # ///
16
+ """
17
+ GGUF Conversion for QMD Query Expansion 4B Model
18
+
19
+ Loads base model, applies SFT adapter, then GRPO adapter, merges all,
20
+ and converts to GGUF format for use with Ollama/llama.cpp/LM Studio.
21
+ """
22
+
23
+ import os
24
+ import sys
25
+ import subprocess
26
+
27
+ import torch
28
+ from transformers import AutoModelForCausalLM, AutoTokenizer
29
+ from peft import PeftModel
30
+ from huggingface_hub import HfApi, login
31
+
32
+ # Configuration
33
+ BASE_MODEL = "Qwen/Qwen3-4B"
34
+ SFT_MODEL = "tobil/qmd-query-expansion-4B-sft"
35
+ GRPO_MODEL = "tobil/qmd-query-expansion-4B-grpo"
36
+ OUTPUT_REPO = "tobil/qmd-query-expansion-4B-gguf"
37
+
38
+ def run_command(cmd, description):
39
+ """Run a command with error handling."""
40
+ print(f" {description}...")
41
+ try:
42
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True)
43
+ return True
44
+ except subprocess.CalledProcessError as e:
45
+ print(f" ❌ Command failed: {' '.join(cmd)}")
46
+ if e.stderr:
47
+ print(f" STDERR: {e.stderr[:500]}")
48
+ return False
49
+ except FileNotFoundError:
50
+ print(f" ❌ Command not found: {cmd[0]}")
51
+ return False
52
+
53
+
54
+ print("πŸ”„ QMD Query Expansion 4B GGUF Conversion")
55
+ print("=" * 60)
56
+
57
+ # Install build tools
58
+ print("\nπŸ“¦ Installing build dependencies...")
59
+ subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
60
+ subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True)
61
+ print(" βœ… Build tools ready")
62
+
63
+ # Login to HuggingFace
64
+ hf_token = os.environ.get("HF_TOKEN")
65
+ if hf_token:
66
+ print("\nπŸ” Logging in to HuggingFace...")
67
+ login(token=hf_token)
68
+ print(" βœ… Logged in")
69
+
70
+ # Step 1: Load base model
71
+ print(f"\nπŸ”§ Step 1: Loading base model {BASE_MODEL}...")
72
+ base_model = AutoModelForCausalLM.from_pretrained(
73
+ BASE_MODEL,
74
+ torch_dtype=torch.bfloat16,
75
+ device_map="auto",
76
+ trust_remote_code=True,
77
+ )
78
+ print(" βœ… Base model loaded")
79
+
80
+ # Step 2: Load and merge SFT adapter
81
+ print(f"\nπŸ”§ Step 2: Loading SFT adapter {SFT_MODEL}...")
82
+ model = PeftModel.from_pretrained(base_model, SFT_MODEL)
83
+ print(" Merging SFT adapter...")
84
+ model = model.merge_and_unload()
85
+ print(" βœ… SFT merged")
86
+
87
+ # Step 3: Load and merge GRPO adapter
88
+ print(f"\nπŸ”§ Step 3: Loading GRPO adapter {GRPO_MODEL}...")
89
+ model = PeftModel.from_pretrained(model, GRPO_MODEL)
90
+ print(" Merging GRPO adapter...")
91
+ merged_model = model.merge_and_unload()
92
+ print(" βœ… GRPO merged - final model ready")
93
+
94
+ # Load tokenizer
95
+ print("\nπŸ“ Loading tokenizer...")
96
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
97
+ print(" βœ… Tokenizer loaded")
98
+
99
+ # Step 4: Save merged model
100
+ print("\nπŸ’Ύ Step 4: Saving merged model to disk...")
101
+ merged_dir = "/tmp/merged_model"
102
+ merged_model.save_pretrained(merged_dir, safe_serialization=True)
103
+ tokenizer.save_pretrained(merged_dir)
104
+ print(f" βœ… Saved to {merged_dir}")
105
+
106
+ # Step 5: Setup llama.cpp
107
+ print("\nπŸ“₯ Step 5: Setting up llama.cpp...")
108
+ if not os.path.exists("/tmp/llama.cpp"):
109
+ run_command(
110
+ ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
111
+ "Cloning llama.cpp"
112
+ )
113
+
114
+ # Install Python deps
115
+ subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
116
+ subprocess.run([sys.executable, "-m", "pip", "install", "-q", "sentencepiece", "protobuf"], capture_output=True)
117
+ print(" βœ… llama.cpp ready")
118
+
119
+ # Step 6: Convert to GGUF (FP16)
120
+ print("\nπŸ”„ Step 6: Converting to GGUF format (FP16)...")
121
+ gguf_output_dir = "/tmp/gguf_output"
122
+ os.makedirs(gguf_output_dir, exist_ok=True)
123
+
124
+ model_name = "qmd-query-expansion-4B"
125
+ gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
126
+
127
+ convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
128
+ if not run_command(
129
+ [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
130
+ "Converting to FP16 GGUF"
131
+ ):
132
+ print(" ❌ Conversion failed!")
133
+ sys.exit(1)
134
+
135
+ size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
136
+ print(f" βœ… FP16 GGUF created: {size_mb:.1f} MB")
137
+
138
+ # Step 7: Build quantize tool
139
+ print("\nβš™οΈ Step 7: Building quantize tool...")
140
+ os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
141
+
142
+ run_command(
143
+ ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
144
+ "Configuring with CMake"
145
+ )
146
+ run_command(
147
+ ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
148
+ "Building llama-quantize"
149
+ )
150
+
151
+ quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
152
+ print(" βœ… Quantize tool built")
153
+
154
+ # Step 8: Create quantized versions
155
+ print("\nβš™οΈ Step 8: Creating quantized versions...")
156
+ quant_formats = [
157
+ ("Q4_K_M", "4-bit medium (recommended)"),
158
+ ("Q5_K_M", "5-bit medium"),
159
+ ("Q8_0", "8-bit"),
160
+ ]
161
+
162
+ quantized_files = []
163
+ for quant_type, description in quant_formats:
164
+ print(f" Creating {quant_type} ({description})...")
165
+ quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
166
+
167
+ if run_command([quantize_bin, gguf_file, quant_file, quant_type], f"Quantizing to {quant_type}"):
168
+ size_mb = os.path.getsize(quant_file) / (1024 * 1024)
169
+ print(f" βœ… {quant_type}: {size_mb:.1f} MB")
170
+ quantized_files.append((quant_file, quant_type))
171
+ else:
172
+ print(f" ⚠️ Skipping {quant_type}")
173
+
174
+ # Step 9: Upload to Hub
175
+ print("\n☁️ Step 9: Uploading to Hugging Face Hub...")
176
+ api = HfApi()
177
+
178
+ print(f" Creating repository: {OUTPUT_REPO}")
179
+ api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
180
+
181
+ # Upload F16
182
+ print(" Uploading FP16...")
183
+ api.upload_file(
184
+ path_or_fileobj=gguf_file,
185
+ path_in_repo=f"{model_name}-f16.gguf",
186
+ repo_id=OUTPUT_REPO,
187
+ )
188
+ print(" βœ… FP16 uploaded")
189
+
190
+ # Upload quantized versions
191
+ for quant_file, quant_type in quantized_files:
192
+ print(f" Uploading {quant_type}...")
193
+ api.upload_file(
194
+ path_or_fileobj=quant_file,
195
+ path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
196
+ repo_id=OUTPUT_REPO,
197
+ )
198
+ print(f" βœ… {quant_type} uploaded")
199
+
200
+ # Create README
201
+ print("\nπŸ“ Creating README...")
202
+ readme_content = f"""---
203
+ base_model: {BASE_MODEL}
204
+ tags:
205
+ - gguf
206
+ - llama.cpp
207
+ - quantized
208
+ - query-expansion
209
+ - qmd
210
+ ---
211
+
212
+ # QMD Query Expansion 4B (GGUF)
213
+
214
+ GGUF conversion of the QMD Query Expansion model for use with Ollama, llama.cpp, and LM Studio.
215
+
216
+ ## Model Details
217
+
218
+ - **Base Model:** {BASE_MODEL}
219
+ - **SFT Adapter:** {SFT_MODEL}
220
+ - **GRPO Adapter:** {GRPO_MODEL}
221
+ - **Task:** Query expansion for hybrid search (lex/vec/hyde format)
222
+
223
+ ## Available Quantizations
224
+
225
+ | File | Quant | Description |
226
+ |------|-------|-------------|
227
+ | {model_name}-f16.gguf | F16 | Full precision |
228
+ | {model_name}-q8_0.gguf | Q8_0 | 8-bit |
229
+ | {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit medium |
230
+ | {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit medium (recommended) |
231
+
232
+ ## Usage
233
+
234
+ ### With Ollama
235
+
236
+ ```bash
237
+ # Download
238
+ huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .
239
+
240
+ # Create Modelfile
241
+ echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile
242
+
243
+ # Create and run
244
+ ollama create qmd-expand-4b -f Modelfile
245
+ ollama run qmd-expand-4b
246
+ ```
247
+
248
+ ### Prompt Format
249
+
250
+ Use Qwen3 chat format with `/no_think`:
251
+
252
+ ```
253
+ <|im_start|>user
254
+ /no_think Expand this search query: your query here<|im_end|>
255
+ <|im_start|>assistant
256
+ ```
257
+
258
+ ### Expected Output
259
+
260
+ ```
261
+ lex: keyword variation 1
262
+ lex: keyword variation 2
263
+ vec: natural language reformulation
264
+ hyde: Hypothetical document passage answering the query.
265
+ ```
266
+
267
+ ## License
268
+
269
+ Apache 2.0 (inherited from Qwen3)
270
+ """
271
+
272
+ api.upload_file(
273
+ path_or_fileobj=readme_content.encode(),
274
+ path_in_repo="README.md",
275
+ repo_id=OUTPUT_REPO,
276
+ )
277
+ print(" βœ… README uploaded")
278
+
279
+ print("\n" + "=" * 60)
280
+ print("βœ… GGUF Conversion Complete!")
281
+ print(f"πŸ“¦ Repository: https://huggingface.co/{OUTPUT_REPO}")
282
+ print("=" * 60)