# qwen-wordpress-training / convert_gguf.py
# Uploaded by mattPearce via huggingface_hub (commit 469a051, verified)
#!/usr/bin/env python3
# /// script
# dependencies = [
# "transformers>=4.36.0",
# "peft>=0.7.0",
# "torch>=2.0.0",
# "accelerate>=0.24.0",
# "huggingface_hub>=0.20.0",
# "sentencepiece>=0.1.99",
# "protobuf>=3.20.0",
# "numpy",
# "gguf",
# ]
# ///
"""GGUF Conversion for WordPress Coder Model"""
import os
import subprocess
import sys

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Banner + configuration. Every setting can be overridden through the
# environment, so the same script works against forks of these repos.
print("πŸ”„ GGUF Conversion Script")
print("=" * 60)
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "mattPearce/qwen-wordpress-coder")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-14B-Instruct")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "mattPearce/qwen-wordpress-coder-gguf")
username = os.environ.get("HF_USERNAME", "mattPearce")
print("\nπŸ“¦ Configuration:")
for label, value in (
    ("Base model", BASE_MODEL),
    ("Adapter model", ADAPTER_MODEL),
    ("Output repo", OUTPUT_REPO),
):
    print(f" {label}: {value}")
# Step 1: Load base model and adapter, then fold the LoRA weights into a
# standalone checkpoint that llama.cpp's converter can consume.
print("\nπŸ”§ Step 1: Loading base model and LoRA adapter...")
# device_map="auto" lets accelerate place shards on available devices.
# NOTE(review): the `dtype=` kwarg is only accepted by recent transformers
# releases (older ones spell it `torch_dtype=`) — confirm the resolved
# version from the `transformers>=4.36.0` pin actually supports it.
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
dtype=torch.float16,
device_map="auto",
trust_remote_code=True,
)
print(" βœ… Base model loaded")
# Wrap the frozen base model with the fine-tuned LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
print(" βœ… Adapter loaded")
print(" Merging adapter with base model...")
# Fold adapter deltas into the base weights and drop the PEFT wrappers,
# leaving a plain CausalLM that can be serialized normally.
merged_model = model.merge_and_unload()
print(" βœ… Models merged!")
# Tokenizer comes from the adapter repo — presumably so any tokenizer
# changes made during fine-tuning carry over; verify against training setup.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
print(" βœ… Tokenizer loaded")
# Step 2: Save the merged model to a scratch directory for the converter.
print("\nπŸ’Ύ Step 2: Saving merged model...")
merged_dir = "/tmp/merged_model"
# safe_serialization=True writes .safetensors shards, which
# convert_hf_to_gguf.py reads directly.
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print(f" βœ… Merged model saved to {merged_dir}")
# Step 3: Install build tooling and fetch llama.cpp (conversion script +
# sources for the quantizer built in Step 5).
print("\nπŸ“₯ Step 3: Setting up llama.cpp...")
print(" Installing build tools...")
subprocess.run(["apt-get", "update", "-qq"], check=True, capture_output=True)
subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake"], check=True, capture_output=True)
print(" βœ… Build tools installed")
subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"], check=True, capture_output=True)
# Install into THIS interpreter's environment: a bare "pip" on PATH can
# belong to a different Python (likely, given the uv inline-metadata env
# this script runs under), which would leave the converter's dependencies
# missing in Step 4. `sys.executable -m pip` pins the right environment.
subprocess.run([sys.executable, "-m", "pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"], check=True, capture_output=True)
subprocess.run([sys.executable, "-m", "pip", "install", "sentencepiece", "protobuf"], check=True, capture_output=True)
print(" βœ… llama.cpp setup complete")
# Step 4: Convert the merged HF checkpoint into a single FP16 GGUF file.
print("\nπŸ”„ Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)
model_name = "qwen-wordpress-coder"
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
# Run the converter with the current interpreter — a bare "python" on PATH
# may not be the environment that received llama.cpp's requirements in
# Step 3. The previously unused `result` binding is dropped: check=True
# already raises CalledProcessError on failure.
subprocess.run(
    [sys.executable, "/tmp/llama.cpp/convert_hf_to_gguf.py", merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
    check=True, capture_output=True, text=True,
)
print(f" βœ… FP16 GGUF created: {gguf_file}")
# Step 5: build llama.cpp's quantizer and emit the quantized variants.
print("\nβš™οΈ Step 5: Creating quantized versions...")
build_dir = "/tmp/llama.cpp/build"
os.makedirs(build_dir, exist_ok=True)
# CPU-only configure; only the llama-quantize target is needed here.
subprocess.run(["cmake", "-B", build_dir, "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"], check=True, capture_output=True, text=True)
subprocess.run(["cmake", "--build", build_dir, "--target", "llama-quantize", "-j", "4"], check=True, capture_output=True, text=True)
print(" βœ… Quantize tool built")
quantize_bin = f"{build_dir}/bin/llama-quantize"
# (level, human-readable description) pairs, ordered smallest-first.
quant_formats = [
    ("Q4_K_M", "4-bit recommended"),
    ("Q5_K_M", "5-bit higher quality"),
    ("Q8_0", "8-bit very high quality"),
]
quantized_files = []
for fmt, desc in quant_formats:
    print(f" Creating {fmt} ({desc})...")
    out_path = f"{gguf_output_dir}/{model_name}-{fmt.lower()}.gguf"
    # Each level re-quantizes from the FP16 master produced in Step 4.
    subprocess.run([quantize_bin, gguf_file, out_path, fmt], check=True, capture_output=True)
    quantized_files.append((out_path, fmt))
    size_mb = os.path.getsize(out_path) / (1024 * 1024)
    print(f" βœ… {fmt}: {size_mb:.1f} MB")
# Step 6: push every GGUF artifact to the Hugging Face Hub.
print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
api = HfApi()
# exist_ok=True keeps re-runs idempotent if the repo already exists.
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
print(" βœ… Repository created")
print(" Uploading FP16 GGUF...")
api.upload_file(repo_id=OUTPUT_REPO, path_or_fileobj=gguf_file, path_in_repo=f"{model_name}-f16.gguf")
print(" βœ… FP16 uploaded")
# Then each quantized variant produced in Step 5.
for local_path, level in quantized_files:
    print(f" Uploading {level}...")
    api.upload_file(repo_id=OUTPUT_REPO, path_or_fileobj=local_path, path_in_repo=f"{model_name}-{level.lower()}.gguf")
    print(f" βœ… {level} uploaded")
# Build the model-card README in memory (YAML front matter consumed by the
# Hub, followed by usage docs); it is uploaded as a file object afterwards.
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- wordpress
- qwen
---
# Qwen WordPress Coder - GGUF
GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), a fine-tuned {BASE_MODEL} for generating WordPress plugins and Gutenberg blocks.
## Model Details
- **Base Model:** Qwen2.5-Coder-14B-Instruct
- **Fine-tuned On:** 419 WordPress plugin/block examples from Automattic repos
- **Training:** Supervised Fine-Tuning with LoRA
- **Format:** GGUF (for llama.cpp, Ollama, LM Studio)
## Available Quantizations
| File | Quant | Size | Description |
|------|-------|------|-------------|
| qwen-wordpress-coder-f16.gguf | F16 | ~28GB | Full precision |
| qwen-wordpress-coder-q8_0.gguf | Q8_0 | ~15GB | 8-bit, very high quality |
| qwen-wordpress-coder-q5_k_m.gguf | Q5_K_M | ~10GB | 5-bit, good quality |
| qwen-wordpress-coder-q4_k_m.gguf | Q4_K_M | ~8GB | 4-bit, recommended |
## Usage
### With LM Studio
1. Download `qwen-wordpress-coder-q4_k_m.gguf`
2. Import into LM Studio
3. Prompt: "Create a Gutenberg block for..."
### With Ollama
```bash
# Create Modelfile
cat > Modelfile << 'EOF'
FROM ./qwen-wordpress-coder-q4_k_m.gguf
SYSTEM You are an expert WordPress developer specializing in creating high-quality plugins and Gutenberg blocks. You write clean, well-documented code following WordPress coding standards.
EOF
# Create and run
ollama create wordpress-coder -f Modelfile
ollama run wordpress-coder "Create a block for displaying testimonials"
```
### With llama.cpp
```bash
./llama-cli -m qwen-wordpress-coder-q4_k_m.gguf -ngl 32 -p "Create a WordPress plugin for..."
```
## Example Prompts
- "Create a Gutenberg block for displaying product reviews with star ratings"
- "Build a WordPress plugin for custom post type management"
- "Generate a block that displays recent posts in a grid layout"
"""
# Upload the generated model card, then print a completion summary with
# the recommended download command.
api.upload_file(path_or_fileobj=readme_content.encode(), path_in_repo="README.md", repo_id=OUTPUT_REPO)
print(" βœ… README uploaded")
banner = "=" * 60
print("\n" + banner)
print("βœ… GGUF Conversion Complete!")
print(f"πŸ“¦ Repository: https://huggingface.co/{OUTPUT_REPO}")
print("\nπŸ“₯ Recommended download:")
print(f" huggingface-cli download {OUTPUT_REPO} qwen-wordpress-coder-q4_k_m.gguf")
print(banner)