# /// script
# dependencies = ["transformers", "peft", "huggingface_hub", "torch", "sentencepiece", "protobuf"]
# ///
"""
Convert fine-tuned LoRA model to GGUF format with Q4_K_M quantization.

Steps, in order:
  1. Load the base model and tokenizer.
  2. Load the LoRA adapter and merge it into the base weights.
  3. Save the merged model to ``MERGED_DIR``.
  4. Install build tools with apt-get, clone llama.cpp and build it with CMake.
  5. Convert the merged model to an FP16 GGUF file.
  6. Quantize the FP16 GGUF file to ``QUANTIZATION``.
  7. Upload both GGUF files and the tokenizer files to ``OUTPUT_REPO``.

All paths are relative to the current working directory. Every external
command runs with ``check=True``, so a failing step aborts the script with
``subprocess.CalledProcessError``.
"""

import os
import subprocess
import sys

import torch
from huggingface_hub import HfApi
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hardcoded configuration
ADAPTER_MODEL = "nathens/qwen-codeforces-sft"
BASE_MODEL = "Qwen/Qwen2.5-0.5B"
OUTPUT_REPO = "nathens/my-model-gguf"
QUANTIZATION = "Q4_K_M"

# Working paths shared by the steps below.
MERGED_DIR = "./merged_model"
LLAMA_CPP_DIR = "llama.cpp"
F16_GGUF = "./model-f16.gguf"
QUANTIZED_GGUF = f"./model-{QUANTIZATION}.gguf"


def merge_adapter():
    """Merge the LoRA adapter into the base model and save it to MERGED_DIR."""
    # Step 1: Load base model and tokenizer
    print("\nšŸ“¦ Loading base model and tokenizer...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

    # Step 2: Load and merge LoRA adapter
    print(f"šŸ”€ Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
    print("āš™ļø Merging adapter weights into base model...")
    merged_model = model.merge_and_unload()

    # Step 3: Save merged model (the tokenizer goes in the same directory so
    # the converter finds both).
    print("šŸ’¾ Saving merged model...")
    merged_model.save_pretrained(MERGED_DIR)
    tokenizer.save_pretrained(MERGED_DIR)
    print(f"āœ… Merged model saved to {MERGED_DIR}")


def build_llama_cpp():
    """Install build tools, fetch llama.cpp and build it with CMake."""
    # Step 4: Install llama.cpp for conversion
    print("\nšŸ“„ Installing llama.cpp for GGUF conversion...")
    subprocess.run(["apt-get", "update", "-qq"], check=True)
    subprocess.run(
        ["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"],
        check=True,
    )

    # git refuses to clone into an existing non-empty directory, so reuse a
    # checkout left behind by a previous run instead of aborting.
    if os.path.isdir(os.path.join(LLAMA_CPP_DIR, ".git")):
        print(f"Reusing existing llama.cpp checkout at {LLAMA_CPP_DIR}")
    else:
        subprocess.run(
            ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", LLAMA_CPP_DIR],
            check=True,
        )

    # Build llama.cpp with CMake
    nproc_result = subprocess.run(["nproc"], capture_output=True, text=True, check=True)
    nproc = nproc_result.stdout.strip()
    print(f"Building llama.cpp with {nproc} cores using CMake...")
    build_dir = f"{LLAMA_CPP_DIR}/build"
    os.makedirs(build_dir, exist_ok=True)
    subprocess.run(["cmake", "-B", build_dir, "-S", LLAMA_CPP_DIR], check=True)
    subprocess.run(
        ["cmake", "--build", build_dir, "--config", "Release", "-j", nproc],
        check=True,
    )


def convert_and_quantize():
    """Convert the merged model to FP16 GGUF, then quantize it."""
    # Step 5: Convert to GGUF format
    print("\nšŸ”„ Converting to GGUF format...")
    subprocess.run(
        [
            # Run the converter with the interpreter executing this script so
            # it sees the same installed packages; a bare "python3" from PATH
            # may be a different environment.
            sys.executable,
            f"{LLAMA_CPP_DIR}/convert_hf_to_gguf.py",
            MERGED_DIR,
            "--outfile", F16_GGUF,
            "--outtype", "f16",
        ],
        check=True,
    )
    print("āœ… Converted to FP16 GGUF")

    # Step 6: Quantize to Q4_K_M
    print(f"\n⚔ Quantizing to {QUANTIZATION}...")
    subprocess.run(
        [
            f"./{LLAMA_CPP_DIR}/build/bin/llama-quantize",
            F16_GGUF,
            QUANTIZED_GGUF,
            QUANTIZATION,
        ],
        check=True,
    )
    print(f"āœ… Quantized to {QUANTIZATION}")


def upload_to_hub():
    """Upload the GGUF files and tokenizer files to OUTPUT_REPO."""
    # Step 7: Upload to Hub
    print(f"\nšŸ“¤ Uploading to {OUTPUT_REPO}...")
    api = HfApi()

    # Create repo. A failure here is only reported; the uploads below will
    # raise if the repository is really unusable.
    try:
        api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
    except Exception as e:
        print(f"Note: {e}")

    # Upload GGUF files (quantized first, then FP16). These are the actual
    # deliverables, so errors are allowed to propagate.
    for gguf_path in (QUANTIZED_GGUF, F16_GGUF):
        api.upload_file(
            path_or_fileobj=gguf_path,
            path_in_repo=os.path.basename(gguf_path),
            repo_id=OUTPUT_REPO,
            repo_type="model",
        )

    # Upload tokenizer files. This stays best-effort, but a failure is
    # reported instead of being silently discarded.
    for file in ["tokenizer.json", "tokenizer_config.json"]:
        try:
            api.upload_file(
                path_or_fileobj=f"{MERGED_DIR}/{file}",
                path_in_repo=file,
                repo_id=OUTPUT_REPO,
                repo_type="model",
            )
        except Exception as exc:
            print(f"Warning: could not upload {file}: {exc}")


def main():
    """Run the full merge -> build -> convert -> quantize -> upload pipeline."""
    print("šŸ”§ Converting model to GGUF")
    print(f" Base model: {BASE_MODEL}")
    print(f" Adapter: {ADAPTER_MODEL}")
    print(f" Output: {OUTPUT_REPO}")
    print(f" Quantization: {QUANTIZATION}")

    merge_adapter()
    build_llama_cpp()
    convert_and_quantize()
    upload_to_hub()

    print("\nāœ… Conversion complete!")
    print(f"šŸ“ GGUF model available at: https://huggingface.co/{OUTPUT_REPO}")
    print("\nšŸ’” To use with Ollama:")
    print(f" huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf")


if __name__ == "__main__":
    main()