Upload convert_to_gguf.py with huggingface_hub

253f8fb verified about 1 month ago

8.6 kB

	# /// script
	# requires-python = ">=3.10"
	# dependencies = [
	# "transformers>=4.36.0",
	# "peft>=0.7.0",
	# "torch>=2.0.0",
	# "accelerate>=0.24.0",
	# "huggingface_hub>=0.20.0",
	# "sentencepiece>=0.1.99",
	# "protobuf>=3.20.0",
	# "numpy",
	# "gguf",
	# ]
	# system_dependencies = ["build-essential", "cmake", "git"]
	# ///

	"""
	GGUF Conversion - Q4_K_M Only

	Converts fine-tuned model to GGUF with Q4_K_M quantization.
	"""

	import os
	import sys
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from peft import PeftModel
	from huggingface_hub import HfApi
	import subprocess


	def install_build_tools():
	    """Install the packages needed to build llama.cpp via apt-get.

	    Runs ``apt-get update`` followed by ``apt-get install`` for
	    ``build-essential``, ``cmake`` and ``git``, capturing their output.

	    Returns:
	        bool: True when both apt-get invocations exit with status 0,
	        False when either fails or cannot be started. This function never
	        raises; every failure is reported on stdout instead.
	    """
	    print(" Installing build tools...")
	    try:
	        # Refresh the package index, then install the toolchain.
	        subprocess.run(["apt-get", "update", "-qq"], check=True, capture_output=True)
	        subprocess.run(
	            ["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"],
	            check=True,
	            capture_output=True,
	        )
	        print(" ✅ Build tools installed")
	        return True
	    except subprocess.CalledProcessError as e:
	        print(f" ❌ Failed to install build tools: {e}")
	        # The output was captured, so str(e) only carries the exit status.
	        # Show the tail of stderr, where apt reports the actual problem.
	        if e.stderr:
	            print(f" STDERR: {e.stderr.decode(errors='replace')[-500:]}")
	        return False
	    except Exception as e:
	        # E.g. apt-get is not present on this system.
	        print(f" ❌ Failed to install build tools: {e}")
	        return False


	def run_command(cmd, description):
	    """Run an external command and report whether it succeeded.

	    Args:
	        cmd: Argument list handed to ``subprocess.run`` (no shell is used).
	        description: Label printed before the command starts.

	    Returns:
	        bool: True when the command exits with status 0. False when it
	        exits non-zero or cannot be started at all (missing executable,
	        permission problem, ...). Failures are printed, never raised.
	    """
	    print(f" {description}...")
	    try:
	        result = subprocess.run(
	            cmd,
	            check=True,
	            capture_output=True,
	            text=True,
	            # Tool output is not guaranteed to be valid UTF-8; do not let a
	            # stray byte turn a finished command into a crash.
	            errors="replace",
	        )
	    except subprocess.CalledProcessError as e:
	        print(f" ❌ Command failed: {' '.join(map(str, cmd))}")
	        # The decisive error is normally at the END of the output, so show
	        # the tail. Fall back to stdout when stderr is empty.
	        if e.stderr:
	            print(f" STDERR: {e.stderr[-500:]}")
	        elif e.stdout:
	            print(f" STDOUT: {e.stdout[-500:]}")
	        return False
	    except FileNotFoundError:
	        print(f" ❌ Command not found: {cmd[0]}")
	        return False
	    except OSError as e:
	        # Any other launch failure, e.g. the target is not executable.
	        print(f" ❌ Could not run {cmd[0]}: {e}")
	        return False

	    if result.stdout:
	        print(f" {result.stdout[:200]}")
	    return True


	# Banner for the run.
	print("🔄 GGUF Conversion - Q4_K_M")
	print("=" * 60)

	# Build tools are installed before anything touches llama.cpp. The boolean
	# result is not checked, so a failed install does not stop the script here.
	install_build_tools()

	# Settings; each one can be overridden by the environment variable of the
	# same name. HF_USERNAME is read but not referenced again in this script.
	ADAPTER_MODEL = os.getenv("ADAPTER_MODEL", "albertlieadrian/qwen3-0.6b-codeforces-sft")
	BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen3-0.6B")
	OUTPUT_REPO = os.getenv("OUTPUT_REPO", "albertlieadrian/qwen3-0.6b-codeforces-sft-gguf")
	HF_USERNAME = os.getenv("HF_USERNAME", "albertlieadrian")

	# Echo the effective configuration.
	print("\n📦 Configuration:")
	for _label, _value in (
	    ("Base model", BASE_MODEL),
	    ("Adapter model", ADAPTER_MODEL),
	    ("Output repo", OUTPUT_REPO),
	):
	    print(f" {_label}: {_value}")

	# Step 1: Load the base model, apply the LoRA adapter, merge, load tokenizer.
	# Any failure in this step prints a message and exits with status 1.
	print("\n🔧 Step 1: Loading base model and LoRA adapter...")

	# NOTE(review): `dtype=` is the keyword passed here; older transformers
	# releases spell it `torch_dtype` - confirm against the pinned minimum.
	# NOTE(review): trust_remote_code=True is combined with model ids taken from
	# the environment - confirm those ids are always trusted.
	try:
	    base_model = AutoModelForCausalLM.from_pretrained(
	        BASE_MODEL,
	        dtype=torch.float16,
	        device_map="auto",
	        trust_remote_code=True,
	    )
	    print(" ✅ Base model loaded")
	except Exception as load_error:
	    print(f" ❌ Failed to load base model: {load_error}")
	    raise SystemExit(1)

	try:
	    # Wrap the base model with the adapter weights ...
	    print(" Loading LoRA adapter...")
	    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
	    print(" ✅ Adapter loaded")

	    # ... then merge the adapter in and drop the PEFT wrapper.
	    print(" Merging adapter with base model...")
	    merged_model = model.merge_and_unload()
	    print(" ✅ Models merged!")
	except Exception as merge_error:
	    print(f" ❌ Failed to merge models: {merge_error}")
	    raise SystemExit(1)

	try:
	    # The tokenizer is read from the adapter repository, not the base model.
	    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
	    print(" ✅ Tokenizer loaded")
	except Exception as tokenizer_error:
	    print(f" ❌ Failed to load tokenizer: {tokenizer_error}")
	    raise SystemExit(1)

	# Step 2: Write the merged weights and the tokenizer to one directory,
	# which is later handed to the llama.cpp converter.
	print("\n💾 Step 2: Saving merged model...")
	merged_dir = "/tmp/merged_model"
	try:
	    # Model first (as safetensors), then the tokenizer files next to it.
	    for _artifact, _save_options in (
	        (merged_model, {"safe_serialization": True}),
	        (tokenizer, {}),
	    ):
	        _artifact.save_pretrained(merged_dir, **_save_options)
	    print(f" ✅ Merged model saved to {merged_dir}")
	except Exception as save_error:
	    print(f" ❌ Failed to save merged model: {save_error}")
	    raise SystemExit(1)

	# Step 3: Setup llama.cpp
	print("\n📥 Step 3: Setting up llama.cpp...")

	# Local import: only this step needs it.
	import importlib.util

	if os.path.isdir("/tmp/llama.cpp/.git"):
	    # `git clone` refuses to write into an existing non-empty directory, so
	    # a second run would otherwise abort here. Reuse the checkout instead.
	    print(" ✅ Reusing existing llama.cpp checkout at /tmp/llama.cpp")
	elif not run_command(
	    ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
	    "Cloning llama.cpp",
	):
	    sys.exit(1)

	print(" Installing Python dependencies...")
	# The converter is later launched with sys.executable, so the packages must
	# land in THAT interpreter's environment. A bare `pip` found on PATH may
	# belong to a different Python; use it only when this interpreter has no
	# pip module of its own.
	if importlib.util.find_spec("pip") is not None:
	    pip_cmd = [sys.executable, "-m", "pip"]
	else:
	    pip_cmd = ["pip"]

	# Best effort: failures are printed by run_command but do not stop the run.
	run_command(pip_cmd + ["install", "-r", "/tmp/llama.cpp/requirements.txt"], "Installing requirements")
	run_command(pip_cmd + ["install", "sentencepiece", "protobuf"], "Installing tokenizer deps")

	# Step 4: Convert to GGUF (FP16)
	print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
	gguf_output_dir = "/tmp/gguf_output"
	os.makedirs(gguf_output_dir, exist_ok=True)

	convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
	# Use the last path component of the adapter id as the file stem. Trailing
	# slashes are stripped first; otherwise "some/dir/" would give an empty name
	# and files called "-f16.gguf" / "-q4_k_m.gguf".
	model_name = ADAPTER_MODEL.rstrip("/").split("/")[-1]
	gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"

	# Run the converter with the current interpreter on the merged model
	# directory, writing an unquantized f16 GGUF file.
	if not run_command(
	    [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
	    "Converting to FP16",
	):
	    print(" ❌ Conversion failed!")
	    sys.exit(1)

	print(f" ✅ FP16 GGUF created: {gguf_file}")

	# Step 5: Build llama-quantize from the cloned sources, then produce the
	# Q4_K_M file from the FP16 GGUF.
	print("\n⚙️ Step 5: Quantizing to Q4_K_M...")

	print(" Building quantize tool with CMake...")
	os.makedirs("/tmp/llama.cpp/build", exist_ok=True)

	# Configure (CUDA switched off), then build only the llama-quantize target.
	# Either command failing ends the script with status 1.
	for _cmake_args, _cmake_label in (
	    (
	        ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
	        "Configuring with CMake",
	    ),
	    (
	        ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
	        "Building llama-quantize",
	    ),
	):
	    if not run_command(_cmake_args, _cmake_label):
	        sys.exit(1)

	print(" ✅ Quantize tool built")

	quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
	quant_file = f"{gguf_output_dir}/{model_name}-q4_k_m.gguf"

	print(" Creating Q4_K_M quantization...")
	_quantized_ok = run_command(
	    [quantize_bin, gguf_file, quant_file, "Q4_K_M"],
	    "Quantizing to Q4_K_M",
	)
	if not _quantized_ok:
	    print(" ❌ Quantization failed!")
	    sys.exit(1)

	# Size of the quantized file in MiB; reused later in the README table.
	size_mb = os.path.getsize(quant_file) / (1024 * 1024)
	print(f" ✅ Q4_K_M: {size_mb:.1f} MB")

	# Step 6: Create the target repository (if needed) and upload the GGUF file.
	print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
	api = HfApi()

	print(f" Creating repository: {OUTPUT_REPO}")
	try:
	    api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
	    print(" ✅ Repository ready")
	except Exception as repo_error:
	    # Not fatal: the script carries on and attempts the upload regardless.
	    print(f" ℹ️ Repository may already exist: {repo_error}")

	# Upload the quantized file under its own name at the repository root.
	# A failed upload ends the script with status 1.
	print(" Uploading Q4_K_M GGUF...")
	_gguf_upload = {
	    "path_or_fileobj": quant_file,
	    "path_in_repo": f"{model_name}-q4_k_m.gguf",
	    "repo_id": OUTPUT_REPO,
	}
	try:
	    api.upload_file(**_gguf_upload)
	    print(" ✅ Q4_K_M uploaded")
	except Exception as upload_error:
	    print(f" ❌ Upload failed: {upload_error}")
	    raise SystemExit(1)

	# Create README
	print("\n📝 Creating README...")
	# The model card is assembled from a list of lines rather than one
	# triple-quoted literal, for two reasons:
	#  * table pipes are written as plain "|"; the former "\|" is an invalid
	#    escape sequence in a normal string and put literal backslashes into
	#    the Markdown, so the table did not render;
	#  * the text no longer depends on how this source file is indented, which
	#    matters because the YAML front matter must start at column 0.
	readme_lines = [
	    "---",
	    f"base_model: {BASE_MODEL}",
	    "tags:",
	    "- gguf",
	    "- llama.cpp",
	    "- quantized",
	    "- trl",
	    "- sft",
	    "---",
	    "",
	    f"# {model_name}-gguf",
	    "",
	    f"This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), "
	    f"which is a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).",
	    "",
	    "## Model Details",
	    "",
	    f"- Base Model: {BASE_MODEL}",
	    f"- Fine-tuned Model: {ADAPTER_MODEL}",
	    "- Training: Supervised Fine-Tuning (SFT) with TRL",
	    "- Format: GGUF (for llama.cpp, Ollama, LM Studio, etc.)",
	    "",
	    "## Quantization",
	    "",
	    "| File | Quant | Size | Description |",
	    "|------|-------|------|-------------|",
	    f"| {model_name}-q4_k_m.gguf | Q4_K_M | ~{size_mb:.0f}MB | 4-bit medium (recommended) |",
	    "",
	    "## Usage",
	    "",
	    "### With llama.cpp",
	    "",
	    "```bash",
	    f"huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf",
	    f'./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt"',
	    "```",
	    "",
	    "### With Ollama",
	    "",
	    "1. Create a `Modelfile`:",
	    "```",
	    f"FROM ./{model_name}-q4_k_m.gguf",
	    "```",
	    "",
	    "2. Create and run:",
	    "```bash",
	    "ollama create my-model -f Modelfile",
	    "ollama run my-model",
	    "```",
	    "",
	    "### With LM Studio",
	    "",
	    "1. Download the `.gguf` file",
	    "2. Import into LM Studio",
	    "3. Start chatting!",
	    "",
	    "## License",
	    "",
	    f"Inherits the license from the base model: {BASE_MODEL}",
	    "",
	    "---",
	    "",
	    "Converted to GGUF format using llama.cpp",
	]
	readme_content = "\n".join(readme_lines) + "\n"

	# The README is uploaded from memory as bytes. A failure is reported but
	# does not stop the script: the GGUF file itself is already uploaded.
	try:
	    api.upload_file(
	        path_or_fileobj=readme_content.encode(),
	        path_in_repo="README.md",
	        repo_id=OUTPUT_REPO,
	    )
	    print(" ✅ README uploaded")
	except Exception as e:
	    print(f" ❌ README upload failed: {e}")

	# Closing summary: where the result lives and how to fetch and use it.
	_rule = "=" * 60
	_summary_lines = [
	    "\n" + _rule,
	    "✅ GGUF Conversion Complete!",
	    f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}",
	    "\n📥 Download with:",
	    f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf",
	    "\n🚀 Use with Ollama:",
	    " 1. Download the GGUF file",
	    f" 2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf",
	    " 3. ollama create my-model -f Modelfile",
	    " 4. ollama run my-model",
	    _rule,
	]
	# One line per entry, exactly as separate print() calls would emit them.
	print(*_summary_lines, sep="\n")