#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "transformers>=4.36.0",
# "peft>=0.7.0",
# "torch>=2.0.0",
# "accelerate>=0.24.0",
# "huggingface_hub>=0.20.0",
# "sentencepiece>=0.1.99",
# "protobuf>=3.20.0",
# "numpy",
# "gguf",
# ]
# ///
"""
GGUF Conversion for QMD Query Expansion 1.7B Model
Loads base model, applies SFT adapter, then GRPO adapter, merges all,
and converts to GGUF format for use with Ollama/llama.cpp/LM Studio.
"""
import os
import sys
import subprocess
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi, login
# Configuration
BASE_MODEL = "Qwen/Qwen3-1.7B"
SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
GRPO_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"
OUTPUT_REPO = "tobil/qmd-query-expansion-1.7B-gguf"
def run_command(cmd, description):
"""Run a command with error handling."""
print(f" {description}...")
try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
except subprocess.CalledProcessError as e:
print(f" ❌ Command failed: {' '.join(cmd)}")
if e.stderr:
print(f" STDERR: {e.stderr[:500]}")
return False
except FileNotFoundError:
print(f" ❌ Command not found: {cmd[0]}")
return False
print("πŸ”„ QMD Query Expansion 1.7B GGUF Conversion")
print("=" * 60)
# Install build tools (needed to compile llama.cpp's llama-quantize in Step 7)
print("\nπŸ“¦ Installing build dependencies...")
subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
result = subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True)
print(" βœ… Build tools ready" if result.returncode == 0 else " ⚠️ apt-get install failed - assuming build tools are already present")
# Login to HuggingFace
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("\nπŸ” Logging in to HuggingFace...")
    login(token=hf_token)
    print(" βœ… Logged in")
else:
    print("\n⚠️ HF_TOKEN not set - the upload in Step 9 will fail without cached credentials")
# Step 1: Load base model
print(f"\nπŸ”§ Step 1: Loading base model {BASE_MODEL}...")
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True,
)
print(" βœ… Base model loaded")
# Step 2: Load and merge SFT adapter
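# Merge order matters: the GRPO adapter was presumably trained on top of the
# SFT-merged weights (hence the "SFT, then GRPO" order in the docstring), so
# the SFT adapter is folded into the base model first.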
print(f"\nπŸ”§ Step 2: Loading SFT adapter {SFT_MODEL}...")
model = PeftModel.from_pretrained(base_model, SFT_MODEL)
print(" Merging SFT adapter...")
model = model.merge_and_unload()
print(" βœ… SFT merged")
# Step 3: Load and merge GRPO adapter
print(f"\nπŸ”§ Step 3: Loading GRPO adapter {GRPO_MODEL}...")
model = PeftModel.from_pretrained(model, GRPO_MODEL)
print(" Merging GRPO adapter...")
merged_model = model.merge_and_unload()
print(" βœ… GRPO merged - final model ready")
# Load tokenizer
print("\nπŸ“ Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print(" βœ… Tokenizer loaded")
# Step 4: Save merged model
print("\nπŸ’Ύ Step 4: Saving merged model to disk...")
merged_dir = "/tmp/merged_model"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print(f" βœ… Saved to {merged_dir}")
# Step 5: Setup llama.cpp
print("\nπŸ“₯ Step 5: Setting up llama.cpp...")
if not os.path.exists("/tmp/llama.cpp"):
run_command(
["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
"Cloning llama.cpp"
)
# Install Python deps
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "sentencepiece", "protobuf"], capture_output=True)
print(" βœ… llama.cpp ready")
# Step 6: Convert to GGUF (FP16)
print("\nπŸ”„ Step 6: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)
model_name = "qmd-query-expansion-1.7B"
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
if not run_command(
[sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
"Converting to FP16 GGUF"
):
print(" ❌ Conversion failed!")
sys.exit(1)
size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
print(f" βœ… FP16 GGUF created: {size_mb:.1f} MB")
# Step 7: Build quantize tool
print("\nβš™οΈ Step 7: Building quantize tool...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
build_ok = run_command(
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    "Configuring with CMake"
)
build_ok = build_ok and run_command(
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
    "Building llama-quantize"
)
quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
if build_ok and os.path.exists(quantize_bin):
    print(" βœ… Quantize tool built")
else:
    print(" ⚠️ Quantize tool unavailable - quantized variants will be skipped")
# Step 8: Create quantized versions
print("\nβš™οΈ Step 8: Creating quantized versions...")
quant_formats = [
("Q4_K_M", "4-bit medium (recommended)"),
("Q5_K_M", "5-bit medium"),
("Q8_0", "8-bit"),
]
quantized_files = []
for quant_type, description in quant_formats:
print(f" Creating {quant_type} ({description})...")
quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
if run_command([quantize_bin, gguf_file, quant_file, quant_type], f"Quantizing to {quant_type}"):
size_mb = os.path.getsize(quant_file) / (1024 * 1024)
print(f" βœ… {quant_type}: {size_mb:.1f} MB")
quantized_files.append((quant_file, quant_type))
else:
print(f" ⚠️ Skipping {quant_type}")
# Step 9: Upload to Hub
print("\n☁️ Step 9: Uploading to Hugging Face Hub...")
api = HfApi()
print(f" Creating repository: {OUTPUT_REPO}")
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
# Upload F16
print(" Uploading FP16...")
api.upload_file(
path_or_fileobj=gguf_file,
path_in_repo=f"{model_name}-f16.gguf",
repo_id=OUTPUT_REPO,
)
print(" βœ… FP16 uploaded")
# Upload quantized versions
for quant_file, quant_type in quantized_files:
print(f" Uploading {quant_type}...")
api.upload_file(
path_or_fileobj=quant_file,
path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
repo_id=OUTPUT_REPO,
)
print(f" βœ… {quant_type} uploaded")
# Create README
print("\nπŸ“ Creating README...")
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- query-expansion
- qmd
---
# QMD Query Expansion 1.7B (GGUF)
GGUF conversion of the QMD Query Expansion model for use with Ollama, llama.cpp, and LM Studio.
## Model Details
- **Base Model:** {BASE_MODEL}
- **SFT Adapter:** {SFT_MODEL}
- **GRPO Adapter:** {GRPO_MODEL}
- **Task:** Query expansion for hybrid search (lex/vec/hyde format)
## Available Quantizations
| File | Quant | Description |
|------|-------|-------------|
| {model_name}-f16.gguf | F16 | Full precision |
| {model_name}-q8_0.gguf | Q8_0 | 8-bit |
| {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit medium |
| {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit medium (recommended) |
## Usage
### With Ollama
```bash
# Download
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .
# Create Modelfile
echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile
# Create and run
ollama create qmd-expand -f Modelfile
ollama run qmd-expand
```
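### With llama.cpp
A minimal sketch (flag behavior varies across llama.cpp releases; this assumes a recent build where `llama-cli` applies the model's chat template in interactive mode):
```bash
llama-cli -m ./{model_name}-q4_k_m.gguf
```
Then enter `/no_think Expand this search query: your query here` at the prompt.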
### Prompt Format
Use Qwen3 chat format with `/no_think`:
```
<|im_start|>user
/no_think Expand this search query: your query here<|im_end|>
<|im_start|>assistant
```
### Expected Output
```
lex: keyword variation 1
lex: keyword variation 2
vec: natural language reformulation
hyde: Hypothetical document passage answering the query.
```
## License
Apache 2.0 (inherited from Qwen3)
"""
api.upload_file(
path_or_fileobj=readme_content.encode(),
path_in_repo="README.md",
repo_id=OUTPUT_REPO,
)
print(" βœ… README uploaded")
print("\n" + "=" * 60)
print("βœ… GGUF Conversion Complete!")
print(f"πŸ“¦ Repository: https://huggingface.co/{OUTPUT_REPO}")
print("=" * 60)