# /// script
# dependencies = ["transformers", "peft", "huggingface_hub", "torch"]
# ///
"""Convert a fine-tuned LoRA model to GGUF format with quantization.

Merges the adapter with its base model, converts the merged model to GGUF
via llama.cpp, quantizes it, and uploads the artifacts to the Hugging Face
Hub. Intended to run as a standalone script (e.g. under `uv run`).
"""
import os
import subprocess

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration from environment variables or defaults
ADAPTER_MODEL = os.getenv("ADAPTER_MODEL", "nathens/qwen-codeforces-sft")
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
OUTPUT_REPO = os.getenv("OUTPUT_REPO", "nathens/my-model-gguf")
QUANTIZATION = os.getenv("QUANTIZATION", "Q4_K_M")

# Local working paths shared across the pipeline steps.
MERGED_DIR = "./merged_model"
F16_GGUF = "./model-f16.gguf"


def merge_adapter_into_base() -> None:
    """Load the base model, merge the LoRA adapter into it, and save the result."""
    print("\nšŸ“¦ Loading base model and tokenizer...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

    print(f"šŸ”€ Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
    print("āš™ļø Merging adapter weights into base model...")
    merged_model = model.merge_and_unload()

    print("šŸ’¾ Saving merged model...")
    merged_model.save_pretrained(MERGED_DIR)
    tokenizer.save_pretrained(MERGED_DIR)
    print(f"āœ… Merged model saved to {MERGED_DIR}")


def build_llama_cpp() -> None:
    """Install build tooling, clone llama.cpp, and build it with CMake."""
    print("\nšŸ“„ Installing llama.cpp for GGUF conversion...")
    subprocess.run(["apt-get", "update", "-qq"], check=True)
    subprocess.run(
        ["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"],
        check=True,
    )
    subprocess.run(
        ["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True
    )

    # Get number of processors so the build can use every core.
    nproc_result = subprocess.run(
        ["nproc"], capture_output=True, text=True, check=True
    )
    nproc = nproc_result.stdout.strip()
    print(f"Building llama.cpp with {nproc} cores using CMake...")

    # Use CMake to build (CPU only - CUDA not needed for conversion/quantization)
    os.makedirs("llama.cpp/build", exist_ok=True)
    subprocess.run(["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp"], check=True)
    subprocess.run(
        ["cmake", "--build", "llama.cpp/build", "--config", "Release", "-j", nproc],
        check=True,
    )


def convert_and_quantize() -> str:
    """Convert the merged model to FP16 GGUF, then quantize it.

    Returns:
        Path of the quantized GGUF file.
    """
    print("\nšŸ”„ Converting to GGUF format...")
    subprocess.run(
        [
            "python3", "llama.cpp/convert_hf_to_gguf.py", MERGED_DIR,
            "--outfile", F16_GGUF,
            "--outtype", "f16",
        ],
        check=True,
    )
    print("āœ… Converted to FP16 GGUF")

    print(f"\n⚔ Quantizing to {QUANTIZATION}...")
    quantized_path = f"./model-{QUANTIZATION}.gguf"
    subprocess.run(
        [
            "./llama.cpp/build/bin/llama-quantize",
            F16_GGUF,
            quantized_path,
            QUANTIZATION,
        ],
        check=True,
    )
    print(f"āœ… Quantized to {QUANTIZATION}")
    return quantized_path


def upload_artifacts(quantized_path: str) -> None:
    """Upload the quantized and FP16 GGUF files plus tokenizer files to the Hub."""
    print(f"\nšŸ“¤ Uploading to {OUTPUT_REPO}...")
    api = HfApi()

    # Create repo if it doesn't exist
    try:
        api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
    except Exception as e:
        print(f"Note: {e}")

    # Upload the quantized GGUF file, then the original FP16 version.
    for local_path in (quantized_path, F16_GGUF):
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=os.path.basename(local_path),
            repo_id=OUTPUT_REPO,
            repo_type="model",
        )

    # Upload tokenizer files. Depending on the tokenizer type some of these
    # won't exist, so skip missing files explicitly instead of swallowing
    # every upload error (which would hide real auth/network failures).
    tokenizer_files = [
        "tokenizer.json",
        "tokenizer_config.json",
        "vocab.json",
        "merges.txt",
        "special_tokens_map.json",
    ]
    for file in tokenizer_files:
        local = os.path.join(MERGED_DIR, file)
        if not os.path.exists(local):
            continue
        api.upload_file(
            path_or_fileobj=local,
            path_in_repo=file,
            repo_id=OUTPUT_REPO,
            repo_type="model",
        )


def main() -> None:
    """Run the full merge -> build -> convert -> quantize -> upload pipeline."""
    print("šŸ”§ Converting model to GGUF")
    print(f"   Base model: {BASE_MODEL}")
    print(f"   Adapter: {ADAPTER_MODEL}")
    print(f"   Output: {OUTPUT_REPO}")
    print(f"   Quantization: {QUANTIZATION}")

    merge_adapter_into_base()
    build_llama_cpp()
    quantized_path = convert_and_quantize()
    upload_artifacts(quantized_path)

    print("\nāœ… Conversion complete!")
    print(f"šŸ“ GGUF model available at: https://huggingface.co/{OUTPUT_REPO}")
    print("\nšŸ’” To use with Ollama:")
    print(f"   1. Download: huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf")
    print("   2. Create Modelfile with the downloaded GGUF")
    print("   3. Run: ollama create my-model -f Modelfile")
    print("   4. Use: ollama run my-model")


if __name__ == "__main__":
    main()