# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.36.0",
#     "peft>=0.7.0",
#     "torch>=2.0.0",
#     "accelerate>=0.24.0",
#     "huggingface_hub>=0.20.0",
#     "sentencepiece>=0.1.99",
#     "protobuf>=3.20.0",
#     "numpy",
#     "gguf",
# ]
# system_dependencies = ["build-essential", "cmake", "git"]
# ///
"""
GGUF Conversion - Q4_K_M Only

Converts fine-tuned model to GGUF with Q4_K_M quantization.

Pipeline (see ``main``):
    1. Load the base model and merge the LoRA adapter into it.
    2. Save the merged model and tokenizer to a scratch directory.
    3. Clone llama.cpp and install its Python requirements.
    4. Convert the merged model to an FP16 GGUF file.
    5. Build ``llama-quantize`` with CMake and quantize to Q4_K_M.
    6. Upload the quantized file and a generated README to the Hub.

Configuration is read from the environment variables ``ADAPTER_MODEL``,
``BASE_MODEL``, ``OUTPUT_REPO`` and ``HF_USERNAME``.
"""

import os
import subprocess
import sys

# Configuration from environment variables
ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "albertlieadrian/qwen3-0.6b-codeforces-sft")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen3-0.6B")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "albertlieadrian/qwen3-0.6b-codeforces-sft-gguf")
# Read for completeness; not referenced elsewhere in this script.
HF_USERNAME = os.environ.get("HF_USERNAME", "albertlieadrian")

# Scratch locations used by the pipeline.
MERGED_DIR = "/tmp/merged_model"
LLAMA_CPP_DIR = "/tmp/llama.cpp"
GGUF_OUTPUT_DIR = "/tmp/gguf_output"

# Model card uploaded next to the GGUF file; filled in by _build_readme().
_README_TEMPLATE = """---
base_model: {base_model}
tags:
- gguf
- llama.cpp
- quantized
- trl
- sft
---

# {model_name}-gguf

This is a GGUF conversion of [{adapter_model}](https://huggingface.co/{adapter_model}), which is a LoRA fine-tuned version of [{base_model}](https://huggingface.co/{base_model}).

## Model Details

- **Base Model:** {base_model}
- **Fine-tuned Model:** {adapter_model}
- **Training:** Supervised Fine-Tuning (SFT) with TRL
- **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)

## Quantization

| File | Quant | Size | Description |
|------|-------|------|-------------|
| {model_name}-q4_k_m.gguf | Q4_K_M | ~{size_mb:.0f}MB | 4-bit medium (recommended) |

## Usage

### With llama.cpp

```bash
huggingface-cli download {output_repo} {model_name}-q4_k_m.gguf
./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt"
```

### With Ollama

1. Create a `Modelfile`:
```
FROM ./{model_name}-q4_k_m.gguf
```

2. Create and run:
```bash
ollama create my-model -f Modelfile
ollama run my-model
```

### With LM Studio

1. Download the `.gguf` file
2. Import into LM Studio
3. Start chatting!

## License

Inherits the license from the base model: {base_model}

---

*Converted to GGUF format using llama.cpp*
"""


def install_build_tools():
    """Install build tools required for llama.cpp.

    Runs ``apt-get update`` and then ``apt-get install`` for
    ``build-essential``, ``cmake`` and ``git`` with their output captured.

    Returns:
        bool: True when both commands succeed; False when either exits
        non-zero or cannot be started (the reason is printed).
    """
    print(" Installing build tools...")
    try:
        # Update and install build tools
        subprocess.run(["apt-get", "update", "-qq"], check=True, capture_output=True)
        subprocess.run(
            ["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"],
            check=True,
            capture_output=True,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers a missing apt-get binary and permission problems.
        print(f" ❌ Failed to install build tools: {e}")
        return False
    print(" ✅ Build tools installed")
    return True


def run_command(cmd, description):
    """Run a command with error handling.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell involved).
        description: Short label printed before the command starts.

    Returns:
        bool: True when the command exits with status 0. False when it exits
        non-zero or when the executable cannot be found; in both cases a
        diagnostic is printed instead of raising.
    """
    print(f" {description}...")
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f" ❌ Command failed: {' '.join(cmd)}")
        if e.stderr:
            # Show the END of stderr: the failing message comes last, after
            # whatever progress/log output the tool wrote before it.
            print(f" STDERR: {e.stderr[-500:]}")
        return False
    except FileNotFoundError:
        print(f" ❌ Command not found: {cmd[0]}")
        return False
    if result.stdout:
        print(f" {result.stdout[:200]}")
    return True


def _build_readme(model_name, size_mb):
    """Return the README (model card) text for the uploaded GGUF file."""
    return _README_TEMPLATE.format(
        base_model=BASE_MODEL,
        adapter_model=ADAPTER_MODEL,
        output_repo=OUTPUT_REPO,
        model_name=model_name,
        size_mb=size_mb,
    )


def main():
    """Run the full merge -> GGUF -> Q4_K_M -> upload pipeline.

    Exits the process with status 1 when loading, merging, saving, cloning,
    converting, building, quantizing or uploading the GGUF file fails.
    A failed build-tool install, pip install, repository creation or README
    upload is reported but does not stop the run.
    """
    # Third-party imports are deferred to this function so that importing the
    # module (for example to reuse run_command) does not require the ML stack.
    import torch
    from huggingface_hub import HfApi
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print("🔄 GGUF Conversion - Q4_K_M")
    print("=" * 60)

    # Install build tools FIRST (before cloning llama.cpp).
    # Best-effort: the result is not checked here; a missing git/cmake shows
    # up later as a failed run_command, which aborts the run.
    install_build_tools()

    print("\n📦 Configuration:")
    print(f" Base model: {BASE_MODEL}")
    print(f" Adapter model: {ADAPTER_MODEL}")
    print(f" Output repo: {OUTPUT_REPO}")

    # Step 1: Load base model and adapter
    print("\n🔧 Step 1: Loading base model and LoRA adapter...")
    try:
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        print(" ✅ Base model loaded")
    except Exception as e:
        print(f" ❌ Failed to load base model: {e}")
        sys.exit(1)

    try:
        print(" Loading LoRA adapter...")
        model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
        print(" ✅ Adapter loaded")
        print(" Merging adapter with base model...")
        merged_model = model.merge_and_unload()
        print(" ✅ Models merged!")
    except Exception as e:
        print(f" ❌ Failed to merge models: {e}")
        sys.exit(1)

    try:
        tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
        print(" ✅ Tokenizer loaded")
    except Exception as e:
        print(f" ❌ Failed to load tokenizer: {e}")
        sys.exit(1)

    # Step 2: Save merged model
    print("\n💾 Step 2: Saving merged model...")
    try:
        merged_model.save_pretrained(MERGED_DIR, safe_serialization=True)
        tokenizer.save_pretrained(MERGED_DIR)
        print(f" ✅ Merged model saved to {MERGED_DIR}")
    except Exception as e:
        print(f" ❌ Failed to save merged model: {e}")
        sys.exit(1)

    # Step 3: Setup llama.cpp
    print("\n📥 Step 3: Setting up llama.cpp...")
    convert_script = os.path.join(LLAMA_CPP_DIR, "convert_hf_to_gguf.py")
    if os.path.isfile(convert_script):
        # A previous run already cloned the repo; cloning again into the
        # same directory would fail, so reuse the existing checkout.
        print(f" Reusing existing llama.cpp checkout at {LLAMA_CPP_DIR}")
    elif not run_command(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", LLAMA_CPP_DIR],
        "Cloning llama.cpp",
    ):
        sys.exit(1)

    print(" Installing Python dependencies...")
    # Invoke pip through the running interpreter so the packages land in the
    # same environment that executes the converter script below.
    # Best-effort: failures are printed by run_command and not fatal.
    run_command(
        [sys.executable, "-m", "pip", "install", "-r", os.path.join(LLAMA_CPP_DIR, "requirements.txt")],
        "Installing requirements",
    )
    run_command(
        [sys.executable, "-m", "pip", "install", "sentencepiece", "protobuf"],
        "Installing tokenizer deps",
    )

    # Step 4: Convert to GGUF (FP16)
    print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
    os.makedirs(GGUF_OUTPUT_DIR, exist_ok=True)
    # Output files are named after the last path component of the adapter id.
    model_name = ADAPTER_MODEL.split("/")[-1]
    gguf_file = f"{GGUF_OUTPUT_DIR}/{model_name}-f16.gguf"

    if not run_command(
        [sys.executable, convert_script, MERGED_DIR, "--outfile", gguf_file, "--outtype", "f16"],
        "Converting to FP16",
    ):
        print(" ❌ Conversion failed!")
        sys.exit(1)
    print(f" ✅ FP16 GGUF created: {gguf_file}")

    # Step 5: Quantize to Q4_K_M
    print("\n⚙️ Step 5: Quantizing to Q4_K_M...")

    # Build quantize tool with CMake
    print(" Building quantize tool with CMake...")
    build_dir = os.path.join(LLAMA_CPP_DIR, "build")
    os.makedirs(build_dir, exist_ok=True)

    if not run_command(
        ["cmake", "-B", build_dir, "-S", LLAMA_CPP_DIR, "-DGGML_CUDA=OFF"],
        "Configuring with CMake",
    ):
        sys.exit(1)

    if not run_command(
        ["cmake", "--build", build_dir, "--target", "llama-quantize", "-j", "4"],
        "Building llama-quantize",
    ):
        sys.exit(1)
    print(" ✅ Quantize tool built")

    quantize_bin = os.path.join(build_dir, "bin", "llama-quantize")
    quant_file = f"{GGUF_OUTPUT_DIR}/{model_name}-q4_k_m.gguf"

    print(" Creating Q4_K_M quantization...")
    if not run_command([quantize_bin, gguf_file, quant_file, "Q4_K_M"], "Quantizing to Q4_K_M"):
        print(" ❌ Quantization failed!")
        sys.exit(1)

    size_mb = os.path.getsize(quant_file) / (1024 * 1024)
    print(f" ✅ Q4_K_M: {size_mb:.1f} MB")

    # Step 6: Upload to Hub
    print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
    api = HfApi()

    print(f" Creating repository: {OUTPUT_REPO}")
    try:
        api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
        print(" ✅ Repository ready")
    except Exception as e:
        # Not fatal on its own: the upload below is what decides success.
        print(f" ℹ️ Repository may already exist: {e}")

    # Upload Q4_K_M
    print(" Uploading Q4_K_M GGUF...")
    try:
        api.upload_file(
            path_or_fileobj=quant_file,
            path_in_repo=f"{model_name}-q4_k_m.gguf",
            repo_id=OUTPUT_REPO,
        )
        print(" ✅ Q4_K_M uploaded")
    except Exception as e:
        print(f" ❌ Upload failed: {e}")
        sys.exit(1)

    # Create README
    print("\n📝 Creating README...")
    readme_content = _build_readme(model_name, size_mb)

    try:
        api.upload_file(
            path_or_fileobj=readme_content.encode(),
            path_in_repo="README.md",
            repo_id=OUTPUT_REPO,
        )
        print(" ✅ README uploaded")
    except Exception as e:
        # The model file is already uploaded; a missing README is reported
        # but does not fail the run.
        print(f" ❌ README upload failed: {e}")

    print("\n" + "=" * 60)
    print("✅ GGUF Conversion Complete!")
    print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
    print("\n📥 Download with:")
    print(f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf")
    print("\n🚀 Use with Ollama:")
    print(" 1. Download the GGUF file")
    print(f" 2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf")
    print(" 3. ollama create my-model -f Modelfile")
    print(" 4. ollama run my-model")
    print("=" * 60)


if __name__ == "__main__":
    main()