erik1988
/

way2agi-training-scripts

Model card Files Files and versions

xet

Community

erik1988 commited on Mar 10

Commit

988573c

verified ·

1 Parent(s): fc470fe

Upload convert_to_gguf.py with huggingface_hub

Browse files

Files changed (1) hide show

convert_to_gguf.py +74 -398

convert_to_gguf.py CHANGED Viewed

@@ -14,411 +14,87 @@
 # ]
 # ///
-"""
-GGUF Conversion Script - Production Ready
-This script converts a LoRA fine-tuned model to GGUF format for use with:
-- llama.cpp
-- Ollama
-- LM Studio
-- Other GGUF-compatible tools
-PREREQUISITES (install these FIRST):
-- Ubuntu/Debian: sudo apt-get update && sudo apt-get install -y build-essential cmake
-- RHEL/CentOS: sudo yum groupinstall -y "Development Tools" && sudo yum install -y cmake
-- macOS: xcode-select --install && brew install cmake
-Usage:
-    Set environment variables:
-    - ADAPTER_MODEL: Your fine-tuned model (e.g., "username/my-finetuned-model")
-    - BASE_MODEL: Base model used for fine-tuning (e.g., "Qwen/Qwen2.5-0.5B")
-    - OUTPUT_REPO: Where to upload GGUF files (e.g., "username/my-model-gguf")
-    - HF_USERNAME: Your Hugging Face username (optional, for README)
-Dependencies: All required packages are declared in PEP 723 header above.
-"""
-import os
-import sys
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 from huggingface_hub import HfApi
-import subprocess
-def check_system_dependencies():
-    """Check if required system packages are available."""
-    print("🔍 Checking system dependencies...")
-    # Check for git
-    if subprocess.run(["which", "git"], capture_output=True).returncode != 0:
-        print("  ❌ git is not installed. Please install it:")
-        print("     Ubuntu/Debian: sudo apt-get install git")
-        print("     RHEL/CentOS: sudo yum install git")
-        print("     macOS: brew install git")
-        return False
-    # Check for make or cmake
-    has_make = subprocess.run(["which", "make"], capture_output=True).returncode == 0
-    has_cmake = subprocess.run(["which", "cmake"], capture_output=True).returncode == 0
-    if not has_make and not has_cmake:
-        print("  ❌ Neither make nor cmake found. Please install build tools:")
-        print("     Ubuntu/Debian: sudo apt-get install build-essential cmake")
-        print("     RHEL/CentOS: sudo yum groupinstall 'Development Tools' && sudo yum install cmake")
-        print("     macOS: xcode-select --install && brew install cmake")
-        return False
-    print("  ✅ System dependencies found")
-    return True
-def run_command(cmd, description):
-    """Run a command with error handling."""
-    print(f"   {description}...")
-    try:
-        result = subprocess.run(
-            cmd,
-            check=True,
-            capture_output=True,
-            text=True
-        )
-        if result.stdout:
-            print(f"   {result.stdout[:200]}")  # Show first 200 chars
-        return True
-    except subprocess.CalledProcessError as e:
-        print(f"   ❌ Command failed: {' '.join(cmd)}")
-        if e.stdout:
-            print(f"   STDOUT: {e.stdout[:500]}")
-        if e.stderr:
-            print(f"   STDERR: {e.stderr[:500]}")
-        return False
-    except FileNotFoundError:
-        print(f"   ❌ Command not found: {cmd[0]}")
-        return False
-print("🔄 GGUF Conversion Script")
-print("=" * 60)
-# Check system dependencies first
-if not check_system_dependencies():
-    print("\n❌ Please install the missing system dependencies and try again.")
-    sys.exit(1)
-# Configuration from environment variables
-ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "evalstate/qwen-capybara-medium")
-BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
-OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "evalstate/qwen-capybara-medium-gguf")
-username = os.environ.get("HF_USERNAME", ADAPTER_MODEL.split('/')[0])
-print(f"\n📦 Configuration:")
-print(f"   Base model: {BASE_MODEL}")
-print(f"   Adapter model: {ADAPTER_MODEL}")
-print(f"   Output repo: {OUTPUT_REPO}")
-# Step 1: Load base model and adapter
-print("\n🔧 Step 1: Loading base model and LoRA adapter...")
-print("   (This may take a few minutes)")
-try:
-    base_model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL,
-        dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True,
-    )
-    print("   ✅ Base model loaded")
-except Exception as e:
-    print(f"   ❌ Failed to load base model: {e}")
-    sys.exit(1)
-try:
-    # Load and merge adapter
-    print("   Loading LoRA adapter...")
-    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
-    print("   ✅ Adapter loaded")
-    print("   Merging adapter with base model...")
-    merged_model = model.merge_and_unload()
-    print("   ✅ Models merged!")
-except Exception as e:
-    print(f"   ❌ Failed to merge models: {e}")
-    sys.exit(1)
-try:
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
-    print("   ✅ Tokenizer loaded")
-except Exception as e:
-    print(f"   ❌ Failed to load tokenizer: {e}")
-    sys.exit(1)
-# Step 2: Save merged model temporarily
-print("\n💾 Step 2: Saving merged model...")
 merged_dir = "/tmp/merged_model"
-try:
-    merged_model.save_pretrained(merged_dir, safe_serialization=True)
-    tokenizer.save_pretrained(merged_dir)
-    print(f"   ✅ Merged model saved to {merged_dir}")
-except Exception as e:
-    print(f"   ❌ Failed to save merged model: {e}")
-    sys.exit(1)
-# Step 3: Install llama.cpp for conversion
-print("\n📥 Step 3: Setting up llama.cpp for GGUF conversion...")
-# Clone llama.cpp repository
-if not run_command(
-    ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
-    "Cloning llama.cpp repository"
-):
-    print("   Trying alternative clone method...")
-    # Try shallow clone
-    if not run_command(
-        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
-        "Cloning llama.cpp (shallow)"
-    ):
-        sys.exit(1)
-# Install Python dependencies
-print("   Installing Python dependencies...")
-if not run_command(
-    ["pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"],
-    "Installing llama.cpp requirements"
-):
-    print("   ⚠️  Some requirements may already be installed")
-if not run_command(
-    ["pip", "install", "sentencepiece", "protobuf"],
-    "Installing tokenizer dependencies"
-):
-    print("   ⚠️  Tokenizer dependencies may already be installed")
-# Step 4: Convert to GGUF (FP16)
-print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
-gguf_output_dir = "/tmp/gguf_output"
-os.makedirs(gguf_output_dir, exist_ok=True)
-convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
-model_name = ADAPTER_MODEL.split('/')[-1]
-gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
-print(f"   Running conversion...")
-if not run_command(
-    [
-        sys.executable, convert_script,
-        merged_dir,
-        "--outfile", gguf_file,
-        "--outtype", "f16"
-    ],
-    f"Converting to FP16"
-):
-    print("   ❌ Conversion failed!")
-    sys.exit(1)
-print(f"   ✅ FP16 GGUF created: {gguf_file}")
-# Step 5: Quantize to different formats
-print("\n⚙️  Step 5: Creating quantized versions...")
-# Build quantize tool using CMake (more reliable than make)
-print("   Building quantize tool with CMake...")
 os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
-# Configure with CMake
-if not run_command(
-    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp",
-     "-DGGML_CUDA=OFF"],
-    "Configuring with CMake"
-):
-    print("   ❌ CMake configuration failed")
-    sys.exit(1)
-# Build just the quantize tool
-if not run_command(
-    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
-    "Building llama-quantize"
-):
-    print("   ❌ Build failed!")
-    sys.exit(1)
-print("   ✅ Quantize tool built")
-# Use the CMake build output path
-quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
-# Common quantization formats
-quant_formats = [
-    ("Q4_K_M", "4-bit, medium quality (recommended)"),
-    ("Q5_K_M", "5-bit, higher quality"),
-    ("Q8_0", "8-bit, very high quality"),
-]
-quantized_files = []
-for quant_type, description in quant_formats:
-    print(f"   Creating {quant_type} quantization ({description})...")
-    quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
-    if not run_command(
-        [quantize_bin, gguf_file, quant_file, quant_type],
-        f"Quantizing to {quant_type}"
-    ):
-        print(f"   ⚠️  Skipping {quant_type} due to error")
-        continue
-    quantized_files.append((quant_file, quant_type))
-    # Get file size
-    size_mb = os.path.getsize(quant_file) / (1024 * 1024)
-    print(f"   ✅ {quant_type}: {size_mb:.1f} MB")
-if not quantized_files:
-    print("   ❌ No quantized versions were created successfully")
-    sys.exit(1)
-# Step 6: Upload to Hub
-print("\n☁️  Step 6: Uploading to Hugging Face Hub...")
 api = HfApi()
-# Create repo
-print(f"   Creating repository: {OUTPUT_REPO}")
-try:
-    api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
-    print("   ✅ Repository ready")
-except Exception as e:
-    print(f"   ℹ️  Repository may already exist: {e}")
-# Upload FP16 version
-print("   Uploading FP16 GGUF...")
-try:
-    api.upload_file(
-        path_or_fileobj=gguf_file,
-        path_in_repo=f"{model_name}-f16.gguf",
-        repo_id=OUTPUT_REPO,
-    )
-    print("   ✅ FP16 uploaded")
-except Exception as e:
-    print(f"   ❌ Upload failed: {e}")
-    sys.exit(1)
-# Upload quantized versions
-for quant_file, quant_type in quantized_files:
-    print(f"   Uploading {quant_type}...")
-    try:
-        api.upload_file(
-            path_or_fileobj=quant_file,
-            path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
-            repo_id=OUTPUT_REPO,
-        )
-        print(f"   ✅ {quant_type} uploaded")
-    except Exception as e:
-        print(f"   ❌ Upload failed for {quant_type}: {e}")
-        continue
-# Create README
-print("\n📝 Creating README...")
-readme_content = f"""---
-base_model: {BASE_MODEL}
-tags:
-- gguf
-- llama.cpp
-- quantized
-- trl
-- sft
----
-# {OUTPUT_REPO.split('/')[-1]}
-This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), which is a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).
-## Model Details
-- **Base Model:** {BASE_MODEL}
-- **Fine-tuned Model:** {ADAPTER_MODEL}
-- **Training:** Supervised Fine-Tuning (SFT) with TRL
-- **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)
-## Available Quantizations
-| File | Quant | Size | Description | Use Case |
-|------|-------|------|-------------|----------|
-| {model_name}-f16.gguf | F16 | ~1GB | Full precision | Best quality, slower |
-| {model_name}-q8_0.gguf | Q8_0 | ~500MB | 8-bit | High quality |
-| {model_name}-q5_k_m.gguf | Q5_K_M | ~350MB | 5-bit medium | Good quality, smaller |
-| {model_name}-q4_k_m.gguf | Q4_K_M | ~300MB | 4-bit medium | Recommended - good balance |
-## Usage
-### With llama.cpp
-```bash
-# Download model
-huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf
-# Run with llama.cpp
-./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt here"
-```
-### With Ollama
-1. Create a `Modelfile`:
-```
-FROM ./{model_name}-q4_k_m.gguf
-```
-2. Create the model:
-```bash
-ollama create my-model -f Modelfile
-ollama run my-model
-```
-### With LM Studio
-1. Download the `.gguf` file
-2. Import into LM Studio
-3. Start chatting!
-## License
-Inherits the license from the base model: {BASE_MODEL}
-## Citation
-```bibtex
-@misc{{{OUTPUT_REPO.split('/')[-1].replace('-', '_')},
-  author = {{{username}}},
-  title = {{{OUTPUT_REPO.split('/')[-1]}}},
-  year = {{2025}},
-  publisher = {{Hugging Face}},
-  url = {{https://huggingface.co/{OUTPUT_REPO}}}
-}}
-```
----
-*Converted to GGUF format using llama.cpp*
-"""
-try:
-    api.upload_file(
-        path_or_fileobj=readme_content.encode(),
-        path_in_repo="README.md",
-        repo_id=OUTPUT_REPO,
-    )
-    print("   ✅ README uploaded")
-except Exception as e:
-    print(f"   ❌ README upload failed: {e}")
-print("\n" + "=" * 60)
-print("✅ GGUF Conversion Complete!")
-print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
-print(f"\n📥 Download with:")
-print(f"   huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf")
-print(f"\n🚀 Use with Ollama:")
-print("   1. Download the GGUF file")
-print(f"   2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf")
-print("   3. ollama create my-model -f Modelfile")
-print("   4. ollama run my-model")
-print("=" * 60)

 # ]
 # ///
+import os, sys, subprocess, torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 from huggingface_hub import HfApi
+import huggingface_hub
+# Login
+token = os.environ.get("HF_TOKEN")
+if token:
+    huggingface_hub.login(token=token)
+    print("Logged in")
+# Install build tools
+print("Installing build tools...")
+subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
+subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake"], capture_output=True, check=True)
+print("Build tools installed")
+ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "erik1988/elias-memory-agent-v1")
+BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
+OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "erik1988/elias-memory-agent-v1-gguf")
+print(f"Base: {BASE_MODEL}, Adapter: {ADAPTER_MODEL}, Output: {OUTPUT_REPO}")
+# Load and merge
+print("Loading base model...")
+base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, dtype=torch.float16, device_map="auto", trust_remote_code=True)
+print("Loading adapter...")
+model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
+print("Merging...")
+merged = model.merge_and_unload()
+tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
 merged_dir = "/tmp/merged_model"
+merged.save_pretrained(merged_dir, safe_serialization=True)
+tokenizer.save_pretrained(merged_dir)
+print("Merged model saved")
+# Clone llama.cpp
+print("Cloning llama.cpp...")
+subprocess.run(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"], check=True, capture_output=True)
+subprocess.run(["pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
+# Convert to F16 GGUF
+gguf_dir = "/tmp/gguf_output"
+os.makedirs(gguf_dir, exist_ok=True)
+model_name = ADAPTER_MODEL.split("/")[-1]
+f16_file = f"{gguf_dir}/{model_name}-f16.gguf"
+print("Converting to F16 GGUF...")
+subprocess.run([sys.executable, "/tmp/llama.cpp/convert_hf_to_gguf.py", merged_dir, "--outfile", f16_file, "--outtype", "f16"], check=True)
+print(f"F16 GGUF: {os.path.getsize(f16_file) / 1024 / 1024:.0f} MB")
+# Build quantize tool
+print("Building quantize tool...")
 os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
+subprocess.run(["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"], check=True, capture_output=True)
+subprocess.run(["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"], check=True, capture_output=True)
+quantize = "/tmp/llama.cpp/build/bin/llama-quantize"
+quant_files = []
+for qt, desc in [("Q4_K_M", "4-bit"), ("Q5_K_M", "5-bit"), ("Q8_0", "8-bit")]:
+    qf = f"{gguf_dir}/{model_name}-{qt.lower()}.gguf"
+    print(f"Quantizing {qt}...")
+    r = subprocess.run([quantize, f16_file, qf, qt], capture_output=True)
+    if r.returncode == 0:
+        size = os.path.getsize(qf) / 1024 / 1024
+        print(f"  {qt}: {size:.0f} MB")
+        quant_files.append((qf, qt))
+# Upload
+print("Uploading to Hub...")
 api = HfApi()
+api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
+api.upload_file(path_or_fileobj=f16_file, path_in_repo=f"{model_name}-f16.gguf", repo_id=OUTPUT_REPO)
+print("F16 uploaded")
+for qf, qt in quant_files:
+    api.upload_file(path_or_fileobj=qf, path_in_repo=f"{model_name}-{qt.lower()}.gguf", repo_id=OUTPUT_REPO)
+    print(f"{qt} uploaded")
+print(f"\nDone! https://huggingface.co/{OUTPUT_REPO}")