"""GGUF Conversion for Full Model (not LoRA adapter).

Pipeline, executed top to bottom when the script runs:

1. Download the source model from the Hugging Face Hub.
2. Install native build tools with apt-get.
3. Clone llama.cpp and install its Python requirements.
4. Convert the model to an FP16 GGUF file.
5. Build ``llama-quantize`` and produce Q4_K_M / Q5_K_M / Q8_0 files.
6. Upload every GGUF file plus a generated README to the output repo.

Configuration is read from environment variables:

* ``MODEL_ID``    - source model repo id.
* ``OUTPUT_REPO`` - destination repo id for the GGUF files.
* ``HF_USERNAME`` - defaults to the owner part of ``MODEL_ID``.
"""

import os
import subprocess

from huggingface_hub import HfApi, snapshot_download


def _run(cmd):
    """Run *cmd* with its output captured; on failure show it and re-raise.

    ``check=True`` alone raises ``CalledProcessError`` whose message only
    contains the command and exit status, so the captured stderr explaining
    why apt-get / git / pip / cmake / llama-quantize failed would be lost.
    """
    try:
        # errors="replace" keeps odd bytes in tool output from raising
        # UnicodeDecodeError while decoding the captured streams.
        return subprocess.run(
            cmd, check=True, capture_output=True, text=True, errors="replace"
        )
    except subprocess.CalledProcessError as exc:
        print(f"❌ Command failed (exit {exc.returncode}): {' '.join(cmd)}")
        print((exc.stderr or exc.stdout or "")[-2000:])
        raise


print("🔄 GGUF Conversion Script")
print("=" * 60)

MODEL_ID = os.environ.get("MODEL_ID", "chaddy81/qwen3-0.6b-multicode-grpo")
OUTPUT_REPO = os.environ.get("OUTPUT_REPO", "chaddy81/qwen3-0.6b-multicode-grpo-gguf")
# NOTE: not referenced anywhere below; kept so the module-level name survives.
username = os.environ.get("HF_USERNAME", MODEL_ID.split('/')[0])

print(f"\n📦 Configuration:")
print(f" Model: {MODEL_ID}")
print(f" Output repo: {OUTPUT_REPO}")

# Step 1: fetch the full model snapshot to local disk.
print("\n📥 Step 1: Downloading model...")
model_dir = snapshot_download(repo_id=MODEL_ID, local_dir="/tmp/model")
print(f" ✅ Model downloaded to {model_dir}")

# Step 2: compiler toolchain needed to build llama-quantize in step 5.
print("\n🔧 Step 2: Installing build tools...")
_run(["apt-get", "update", "-qq"])
_run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake"])
print(" ✅ Build tools installed")

# Step 3: llama.cpp checkout plus the Python deps of its converter script.
print("\n📥 Step 3: Setting up llama.cpp...")
# `git clone` refuses to write into an existing directory, so reuse a checkout
# left behind by a previous (possibly failed) run instead of crashing.
if not os.path.isdir("/tmp/llama.cpp/.git"):
    _run(["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"])
_run(["pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"])
_run(["pip", "install", "-q", "sentencepiece", "protobuf"])
print(" ✅ llama.cpp ready")

# Step 4: HF checkpoint -> single FP16 GGUF file.
print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)

model_name = MODEL_ID.split('/')[-1]
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"

try:
    result = subprocess.run(
        ["python", "/tmp/llama.cpp/convert_hf_to_gguf.py", model_dir, "--outfile", gguf_file, "--outtype", "f16"],
        check=True, capture_output=True, text=True
    )
    # Only the tail of the converter log is shown; slicing already returns the
    # whole string when it is shorter than 2000 characters.
    print(result.stdout[-2000:])
except subprocess.CalledProcessError as e:
    print(f"❌ Conversion failed! STDERR: {e.stderr}")
    raise
print(f" ✅ FP16 GGUF created: {gguf_file}")

# Step 5: CPU-only build of llama-quantize, then one output file per format.
print("\n⚙️ Step 5: Building quantize tool and creating quantized versions...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
_run(["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"])
_run(["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"])
print(" ✅ Quantize tool built")

quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
quant_formats = [("Q4_K_M", "4-bit"), ("Q5_K_M", "5-bit"), ("Q8_0", "8-bit")]
quantized_files = []

for quant_type, desc in quant_formats:
    print(f" Creating {quant_type} ({desc})...")
    quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
    _run([quantize_bin, gguf_file, quant_file, quant_type])
    quantized_files.append((quant_file, quant_type))
    size_mb = os.path.getsize(quant_file) / (1024 * 1024)
    print(f" ✅ {quant_type}: {size_mb:.1f} MB")

# Step 6: publish all artifacts to the output repository.
print("\n☁️ Step 6: Uploading to Hugging Face Hub...")
api = HfApi()

api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
print(f" ✅ Repository {OUTPUT_REPO} ready")

print(" Uploading FP16 GGUF...")
api.upload_file(path_or_fileobj=gguf_file, path_in_repo=f"{model_name}-f16.gguf", repo_id=OUTPUT_REPO)

for quant_file, quant_type in quantized_files:
    print(f" Uploading {quant_type}...")
    api.upload_file(path_or_fileobj=quant_file, path_in_repo=f"{model_name}-{quant_type.lower()}.gguf", repo_id=OUTPUT_REPO)

# Model card: the file names listed here must match the uploads above
# (f16 plus the lower-cased entries of quant_formats).
readme = f"""---
base_model: {MODEL_ID}
tags:
- gguf
- llama.cpp
- quantized
- trl
- grpo
---

# {OUTPUT_REPO.split('/')[-1]}

GGUF conversion of [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), trained using GRPO (Group Relative Policy Optimization).

## Available Quantizations

| File | Quant | Description |
|------|-------|-------------|
| {model_name}-f16.gguf | F16 | Full precision |
| {model_name}-q8_0.gguf | Q8_0 | 8-bit, high quality |
| {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit, good quality |
| {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit, recommended |

## Usage

### With Ollama
```bash
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf
echo "FROM ./{model_name}-q4_k_m.gguf" > Modelfile
ollama create {model_name} -f Modelfile
ollama run {model_name}
```

### With llama.cpp
```bash
./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt"
```
"""
api.upload_file(path_or_fileobj=readme.encode(), path_in_repo="README.md", repo_id=OUTPUT_REPO)
print(" ✅ README uploaded")

print("\n" + "=" * 60)
print("✅ GGUF Conversion Complete!")
print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
print("=" * 60)