"""
GGUF Conversion - Q4_K_M Only

Converts a fine-tuned model to GGUF with Q4_K_M quantization.
"""

import os
import subprocess
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi


def install_build_tools():
    """Install build tools required for llama.cpp."""
    print(" Installing build tools...")
    try:
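        # Note: apt-get assumes a Debian-based environment with root access (e.g. a
        # Colab or container runtime); elsewhere, install build-essential, cmake, and
        # git through your own package manager before running this script.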
        subprocess.run(["apt-get", "update", "-qq"], check=True, capture_output=True)
        subprocess.run([
            "apt-get", "install", "-y", "-qq",
            "build-essential", "cmake", "git"
        ], check=True, capture_output=True)
        print(" ✅ Build tools installed")
        return True
    except Exception as e:
        print(f" ❌ Failed to install build tools: {e}")
        return False


def run_command(cmd, description):
    """Run a command with error handling."""
    print(f" {description}...")
    try:
        result = subprocess.run(
            cmd,
            check=True,
            capture_output=True,
            text=True
        )
        if result.stdout:
            print(f" {result.stdout[:200]}")
        return True
    except subprocess.CalledProcessError as e:
        print(f" ❌ Command failed: {' '.join(cmd)}")
        if e.stderr:
            print(f" STDERR: {e.stderr[:500]}")
        return False
    except FileNotFoundError:
        print(f" ❌ Command not found: {cmd[0]}")
        return False


print("🚀 GGUF Conversion - Q4_K_M")
print("=" * 60)

install_build_tools()

ADAPTER_MODEL = os.environ.get("ADAPTER_MODEL", "albertlieadrian/qwen3-0.6b-codeforces-sft")
BASE_MODEL = os.environ.get("BASE_MODEL", "Qwen/Qwen3-0.6B")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "albertlieadrian/qwen3-0.6b-codeforces-sft-gguf")
HF_USERNAME = os.environ.get("HF_USERNAME", "albertlieadrian")
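# Each value above can be overridden through the environment variable of the same name
# (ADAPTER_MODEL, BASE_MODEL, OUTPUT_REPO, HF_USERNAME); the defaults point at the
# author's Hugging Face repositories.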

print("\n📦 Configuration:")
print(f" Base model: {BASE_MODEL}")
print(f" Adapter model: {ADAPTER_MODEL}")
print(f" Output repo: {OUTPUT_REPO}")

print("\n🔧 Step 1: Loading base model and LoRA adapter...")

try:
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    print(" ✅ Base model loaded")
except Exception as e:
    print(f" ❌ Failed to load base model: {e}")
    sys.exit(1)

try:
    print(" Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
    print(" ✅ Adapter loaded")

| print(" Merging adapter with base model...") |
| merged_model = model.merge_and_unload() |
| print(" β
Models merged!") |
| except Exception as e: |
| print(f" β Failed to merge models: {e}") |
| sys.exit(1) |
|
|
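# The tokenizer is taken from the adapter repo so that any chat template or added
# tokens saved during fine-tuning carry over into the GGUF export.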
try:
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_MODEL, trust_remote_code=True)
    print(" ✅ Tokenizer loaded")
except Exception as e:
    print(f" ❌ Failed to load tokenizer: {e}")
    sys.exit(1)

print("\n💾 Step 2: Saving merged model...")
merged_dir = "/tmp/merged_model"
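# safe_serialization=True writes .safetensors files, which llama.cpp's
# convert_hf_to_gguf.py reads directly.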
try:
    merged_model.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)
    print(f" ✅ Merged model saved to {merged_dir}")
except Exception as e:
    print(f" ❌ Failed to save merged model: {e}")
    sys.exit(1)

print("\n📥 Step 3: Setting up llama.cpp...")

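# Note: this clones the tip of llama.cpp's default branch; the converter and quantizer
# interfaces change occasionally, so pin a release tag if builds must be reproducible.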
if not run_command(
    ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
    "Cloning llama.cpp"
):
    sys.exit(1)

print(" Installing Python dependencies...")
run_command([sys.executable, "-m", "pip", "install", "-r", "/tmp/llama.cpp/requirements.txt"], "Installing requirements")
run_command([sys.executable, "-m", "pip", "install", "sentencepiece", "protobuf"], "Installing tokenizer deps")

print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)

convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"
model_name = ADAPTER_MODEL.split('/')[-1]
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"

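# The FP16 GGUF produced below is an intermediate artifact; Step 5 quantizes it to
# Q4_K_M, and only the quantized file is uploaded.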
if not run_command(
    [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
    "Converting to FP16"
):
    print(" ❌ Conversion failed!")
    sys.exit(1)

print(f" ✅ FP16 GGUF created: {gguf_file}")

print("\n⚙️ Step 5: Quantizing to Q4_K_M...")
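# Q4_K_M is llama.cpp's 4-bit "k-quant" (medium) scheme, a common balance of file size
# and output quality for small models.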

print(" Building quantize tool with CMake...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)

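# CPU-only build: GGML_CUDA=OFF keeps the build lightweight, and quantization itself
# does not require a GPU.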
if not run_command(
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    "Configuring with CMake"
):
    sys.exit(1)

if not run_command(
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
    "Building llama-quantize"
):
    sys.exit(1)

print(" ✅ Quantize tool built")

quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
quant_file = f"{gguf_output_dir}/{model_name}-q4_k_m.gguf"

print(" Creating Q4_K_M quantization...")
if not run_command([quantize_bin, gguf_file, quant_file, "Q4_K_M"], "Quantizing to Q4_K_M"):
    print(" ❌ Quantization failed!")
    sys.exit(1)

size_mb = os.path.getsize(quant_file) / (1024 * 1024)
print(f" ✅ Q4_K_M: {size_mb:.1f} MB")

print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
api = HfApi()
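# HfApi() resolves credentials from a cached `huggingface-cli login` or the HF_TOKEN
# environment variable; the uploads below need write access to OUTPUT_REPO.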

print(f" Creating repository: {OUTPUT_REPO}")
try:
    api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
    print(" ✅ Repository ready")
except Exception as e:
    print(f" ℹ️ Repository may already exist: {e}")

print(" Uploading Q4_K_M GGUF...")
try:
    api.upload_file(
        path_or_fileobj=quant_file,
        path_in_repo=f"{model_name}-q4_k_m.gguf",
        repo_id=OUTPUT_REPO,
    )
    print(" ✅ Q4_K_M uploaded")
except Exception as e:
    print(f" ❌ Upload failed: {e}")
    sys.exit(1)

print("\n📝 Creating README...")
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- trl
- sft
---

# {model_name}-gguf

This is a GGUF conversion of [{ADAPTER_MODEL}](https://huggingface.co/{ADAPTER_MODEL}), which is a LoRA fine-tuned version of [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}).

## Model Details

- **Base Model:** {BASE_MODEL}
- **Fine-tuned Model:** {ADAPTER_MODEL}
- **Training:** Supervised Fine-Tuning (SFT) with TRL
- **Format:** GGUF (for llama.cpp, Ollama, LM Studio, etc.)

## Quantization

| File | Quant | Size | Description |
|------|-------|------|-------------|
| {model_name}-q4_k_m.gguf | Q4_K_M | ~{size_mb:.0f} MB | 4-bit medium (recommended) |

## Usage

### With llama.cpp

```bash
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf
./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt"
```

### With Ollama

1. Create a `Modelfile`:
   ```
   FROM ./{model_name}-q4_k_m.gguf
   ```

2. Create and run:
   ```bash
   ollama create my-model -f Modelfile
   ollama run my-model
   ```

### With LM Studio

1. Download the `.gguf` file
2. Import it into LM Studio
3. Start chatting!

## License

Inherits the license from the base model: {BASE_MODEL}

---

*Converted to GGUF format using llama.cpp*
"""

try:
    api.upload_file(
        path_or_fileobj=readme_content.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_REPO,
    )
    print(" ✅ README uploaded")
except Exception as e:
    print(f" ❌ README upload failed: {e}")

print("\n" + "=" * 60)
print("✅ GGUF Conversion Complete!")
print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
print("\n📥 Download with:")
print(f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf")
print("\n🚀 Use with Ollama:")
print(" 1. Download the GGUF file")
print(f" 2. Create Modelfile: FROM ./{model_name}-q4_k_m.gguf")
print(" 3. ollama create my-model -f Modelfile")
print(" 4. ollama run my-model")
print("=" * 60)
|