# qwen-wordpress-training / convert_gguf.py
# Uploaded by mattPearce via huggingface_hub (commit 469a051, verified)
#!/usr/bin/env python3
# /// script
# dependencies = [
# "transformers>=4.36.0",
# "peft>=0.7.0",
# "torch>=2.0.0",
# "accelerate>=0.24.0",
# "huggingface_hub>=0.20.0",
# "sentencepiece>=0.1.99",
# "protobuf>=3.20.0",
# "numpy",
# "gguf",
# ]
# ///
"""GGUF Conversion for WordPress Coder Model"""
import os
import subprocess
import sys

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
# Banner + configuration. Every setting can be overridden through the
# environment, so the same script works against forks of these repos.
print("πŸ”„ GGUF Conversion Script")
print("=" * 60)
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "mattPearce/qwen-wordpress-coder")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-14B-Instruct")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "mattPearce/qwen-wordpress-coder-gguf")
username = os.environ.get("HF_USERNAME", "mattPearce")
print("\nπŸ“¦ Configuration:")
for label, value in (
    ("Base model", BASE_MODEL),
    ("Adapter model", ADAPTER_MODEL),
    ("Output repo", OUTPUT_REPO),
):
    print(f" {label}: {value}")
# Step 1: Load base model and adapter, then fold the LoRA weights into a
# standalone checkpoint that llama.cpp's converter can consume.
print("\nπŸ”§ Step 1: Loading base model and LoRA adapter...")
# device_map="auto" lets accelerate place shards on available devices.
# NOTE(review): the `dtype=` kwarg is only accepted by recent transformers
# releases (older ones spell it `torch_dtype=`) — confirm the resolved
# version from the `transformers>=4.36.0` pin actually supports it.
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
dtype=torch.float16,
device_map="auto",
trust_remote_code=True,
)
print(" βœ… Base model loaded")
# Wrap the frozen base model with the fine-tuned LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
print(" βœ… Adapter loaded")
print(" Merging adapter with base model...")
# Fold adapter deltas into the base weights and drop the PEFT wrappers,
# leaving a plain CausalLM that can be serialized normally.
merged_model = model.merge_and_unload()
print(" βœ… Models merged!")
# Tokenizer comes from the adapter repo — presumably so any tokenizer
# changes made during fine-tuning carry over; verify against training setup.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
print(" βœ… Tokenizer loaded")
# Step 2: Save the merged model to a scratch directory for the converter.
print("\nπŸ’Ύ Step 2: Saving merged model...")
merged_dir = "/tmp/merged_model"
# safe_serialization=True writes .safetensors shards, which
# convert_hf_to_gguf.py reads directly.
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print(f" βœ… Merged model saved to {merged_dir}")
# Step 3: Install build tooling and fetch llama.cpp (conversion script +
# sources for the quantizer built in Step 5).
print("\nπŸ“₯ Step 3: Setting up llama.cpp...")
print(" Installing build tools...")
subprocess.run(["apt-get", "update", "-qq"], check=True, capture_output=True)
subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake"], check=True, capture_output=True)
print(" βœ… Build tools installed")
subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"], check=True, capture_output=True)
# Install into THIS interpreter's environment: a bare "pip" on PATH can
# belong to a different Python (likely, given the uv inline-metadata env
# this script runs under), which would leave the converter's dependencies
# missing in Step 4. `sys.executable -m pip` pins the right environment.
subprocess.run([sys.executable, "-m", "pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"], check=True, capture_output=True)
subprocess.run([sys.executable, "-m", "pip", "install", "sentencepiece", "protobuf"], check=True, capture_output=True)
print(" βœ… llama.cpp setup complete")
# Step 4: Convert the merged HF checkpoint into a single FP16 GGUF file.
print("\nπŸ”„ Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)
model_name = "qwen-wordpress-coder"
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
# Run the converter with the current interpreter — a bare "python" on PATH
# may not be the environment that received llama.cpp's requirements in
# Step 3. The previously unused `result` binding is dropped: check=True
# already raises CalledProcessError on failure.
subprocess.run(
    [sys.executable, "/tmp/llama.cpp/convert_hf_to_gguf.py", merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
    check=True, capture_output=True, text=True,
)
print(f" βœ… FP16 GGUF created: {gguf_file}")
# Step 5: build llama.cpp's quantizer and emit the quantized variants.
print("\nβš™οΈ Step 5: Creating quantized versions...")
build_dir = "/tmp/llama.cpp/build"
os.makedirs(build_dir, exist_ok=True)
# CPU-only configure; only the llama-quantize target is needed here.
subprocess.run(["cmake", "-B", build_dir, "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"], check=True, capture_output=True, text=True)
subprocess.run(["cmake", "--build", build_dir, "--target", "llama-quantize", "-j", "4"], check=True, capture_output=True, text=True)
print(" βœ… Quantize tool built")
quantize_bin = f"{build_dir}/bin/llama-quantize"
# (level, human-readable description) pairs, ordered smallest-first.
quant_formats = [
    ("Q4_K_M", "4-bit recommended"),
    ("Q5_K_M", "5-bit higher quality"),
    ("Q8_0", "8-bit very high quality"),
]
quantized_files = []
for fmt, desc in quant_formats:
    print(f" Creating {fmt} ({desc})...")
    out_path = f"{gguf_output_dir}/{model_name}-{fmt.lower()}.gguf"
    # Each level re-quantizes from the FP16 master produced in Step 4.
    subprocess.run([quantize_bin, gguf_file, out_path, fmt], check=True, capture_output=True)
    quantized_files.append((out_path, fmt))
    size_mb = os.path.getsize(out_path) / (1024 * 1024)
    print(f" βœ… {fmt}: {size_mb:.1f} MB")
# Step 6: push every GGUF artifact to the Hugging Face Hub.
print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
api = HfApi()
# exist_ok=True keeps re-runs idempotent if the repo already exists.
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
print(" βœ… Repository created")
print(" Uploading FP16 GGUF...")
api.upload_file(repo_id=OUTPUT_REPO, path_or_fileobj=gguf_file, path_in_repo=f"{model_name}-f16.gguf")
print(" βœ… FP16 uploaded")
# Then each quantized variant produced in Step 5.
for local_path, level in quantized_files:
    print(f" Uploading {level}...")
    api.upload_file(repo_id=OUTPUT_REPO, path_or_fileobj=local_path, path_in_repo=f"{model_name}-{level.lower()}.gguf")
    print(f" βœ… {level} uploaded")
# Build the model-card README in memory (YAML front matter consumed by the
# Hub, followed by usage docs); it is uploaded as a file object afterwards.
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- wordpress
- qwen
---
# Qwen WordPress Coder - GGUF
GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), a fine-tuned {BASE_MODEL} for generating WordPress plugins and Gutenberg blocks.
## Model Details
- **Base Model:** Qwen2.5-Coder-14B-Instruct
- **Fine-tuned On:** 419 WordPress plugin/block examples from Automattic repos
- **Training:** Supervised Fine-Tuning with LoRA
- **Format:** GGUF (for llama.cpp, Ollama, LM Studio)
## Available Quantizations
| File | Quant | Size | Description |
|------|-------|------|-------------|
| qwen-wordpress-coder-f16.gguf | F16 | ~28GB | Full precision |
| qwen-wordpress-coder-q8_0.gguf | Q8_0 | ~15GB | 8-bit, very high quality |
| qwen-wordpress-coder-q5_k_m.gguf | Q5_K_M | ~10GB | 5-bit, good quality |
| qwen-wordpress-coder-q4_k_m.gguf | Q4_K_M | ~8GB | 4-bit, recommended |
## Usage
### With LM Studio
1. Download `qwen-wordpress-coder-q4_k_m.gguf`
2. Import into LM Studio
3. Prompt: "Create a Gutenberg block for..."
### With Ollama
```bash
# Create Modelfile
cat > Modelfile << 'EOF'
FROM ./qwen-wordpress-coder-q4_k_m.gguf
SYSTEM You are an expert WordPress developer specializing in creating high-quality plugins and Gutenberg blocks. You write clean, well-documented code following WordPress coding standards.
EOF
# Create and run
ollama create wordpress-coder -f Modelfile
ollama run wordpress-coder "Create a block for displaying testimonials"
```
### With llama.cpp
```bash
./llama-cli -m qwen-wordpress-coder-q4_k_m.gguf -ngl 32 -p "Create a WordPress plugin for..."
```
## Example Prompts
- "Create a Gutenberg block for displaying product reviews with star ratings"
- "Build a WordPress plugin for custom post type management"
- "Generate a block that displays recent posts in a grid layout"
"""
# Upload the generated model card, then print a completion summary with
# the recommended download command.
api.upload_file(path_or_fileobj=readme_content.encode(), path_in_repo="README.md", repo_id=OUTPUT_REPO)
print(" βœ… README uploaded")
banner = "=" * 60
print("\n" + banner)
print("βœ… GGUF Conversion Complete!")
print(f"πŸ“¦ Repository: https://huggingface.co/{OUTPUT_REPO}")
print("\nπŸ“₯ Recommended download:")
print(f" huggingface-cli download {OUTPUT_REPO} qwen-wordpress-coder-q4_k_m.gguf")
print(banner)