# /// script
# dependencies = [
#     "transformers>=4.36.0",
#     "torch>=2.0.0",
#     "accelerate>=0.24.0",
#     "huggingface_hub>=0.20.0",
#     "sentencepiece>=0.1.99",
#     "protobuf>=3.20.0",
#     "numpy",
#     "gguf",
# ]
# ///

"""GGUF Conversion for Full Model (not LoRA adapter)"""

import os
import subprocess

# Banner, then resolve the run configuration from environment variables.
print("🔄 GGUF Conversion Script")
print("=" * 60)

# Source model repo and destination GGUF repo; each falls back to a default
# when the corresponding environment variable is unset.
MODEL_ID = os.environ.get("MODEL_ID", "chaddy81/qwen3-0.6b-multicode-grpo")
OUTPUT_REPO = os.environ.get(
    "OUTPUT_REPO",
    "chaddy81/qwen3-0.6b-multicode-grpo-gguf",
)
# Defaults to the text before the first "/" in MODEL_ID.
username = os.environ.get("HF_USERNAME", MODEL_ID.partition("/")[0])

print("\n📦 Configuration:")
for label, value in (("Model", MODEL_ID), ("Output repo", OUTPUT_REPO)):
    print(f"   {label}: {value}")

# Step 1: Download model
# Download the repo named by MODEL_ID into /tmp/model; model_dir holds the
# path returned by snapshot_download and is passed to the converter later.
print("\n📥 Step 1: Downloading model...")
from huggingface_hub import snapshot_download

model_dir = snapshot_download(
    local_dir="/tmp/model",
    repo_id=MODEL_ID,
)
print(f"   ✅ Model downloaded to {model_dir}")

# Step 2: Install build tools
# Installs build-essential and cmake via apt-get; Step 5 invokes cmake.
print("\n🔧 Step 2: Installing build tools...")
for apt_cmd in (
    ["apt-get", "update", "-qq"],
    ["apt-get", "install", "-y", "-qq", "build-essential", "cmake"],
):
    try:
        subprocess.run(apt_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # Output is captured to keep the log quiet, so print it on failure;
        # otherwise only the exit status would be visible in the traceback.
        print(f"❌ Command failed: {' '.join(apt_cmd)}\nSTDERR: {e.stderr}")
        raise
print("   ✅ Build tools installed")

# Step 3: Setup llama.cpp
# Clones llama.cpp (shallow) into /tmp/llama.cpp and installs its Python
# requirements plus sentencepiece/protobuf.
print("\n📥 Step 3: Setting up llama.cpp...")
llama_cpp_dir = "/tmp/llama.cpp"
setup_cmds = []
# git refuses to clone into an existing non-empty directory, so reuse a
# checkout left behind by a previous run instead of failing on re-run.
if not os.path.isdir(os.path.join(llama_cpp_dir, ".git")):
    setup_cmds.append(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", llama_cpp_dir]
    )
# NOTE(review): `pip` is resolved from PATH; confirm it targets the same
# environment as the interpreter that later runs the converter.
setup_cmds.append(["pip", "install", "-q", "-r", f"{llama_cpp_dir}/requirements.txt"])
setup_cmds.append(["pip", "install", "-q", "sentencepiece", "protobuf"])

for setup_cmd in setup_cmds:
    try:
        subprocess.run(setup_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # Captured output would otherwise be lost; show it before re-raising.
        print(f"❌ Command failed: {' '.join(setup_cmd)}\nSTDERR: {e.stderr}")
        raise
print("   ✅ llama.cpp ready")

# Step 4: Convert to GGUF
# Runs llama.cpp's convert_hf_to_gguf.py on the downloaded model and writes
# an f16 GGUF file to /tmp/gguf_output.
import sys

print("\n🔄 Step 4: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)

# Output files are named after the last path segment of MODEL_ID.
model_name = MODEL_ID.split('/')[-1]
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"

try:
    # Use the interpreter running this script rather than whatever `python`
    # is first on PATH, so the converter sees this script's dependencies.
    result = subprocess.run(
        [sys.executable, "/tmp/llama.cpp/convert_hf_to_gguf.py", model_dir, "--outfile", gguf_file, "--outtype", "f16"],
        check=True, capture_output=True, text=True,
    )
except subprocess.CalledProcessError as e:
    print(f"❌ Conversion failed! STDERR: {e.stderr}")
    raise
# Show at most the last 2000 characters; slicing handles shorter output.
print(result.stdout[-2000:])
print(f"   ✅ FP16 GGUF created: {gguf_file}")

# Step 5: Build quantize tool and quantize
# Configures and builds the llama-quantize target with CUDA disabled, then
# produces one quantized GGUF per entry in quant_formats from the f16 file.
print("\n⚙️  Step 5: Building quantize tool and creating quantized versions...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
for build_cmd in (
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
):
    try:
        subprocess.run(build_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # Build output is captured, so print the tail of both streams on
        # failure; otherwise the cause of the error is invisible.
        print(f"❌ Build failed: {' '.join(build_cmd)}")
        print(f"STDOUT (tail): {e.stdout[-2000:]}")
        print(f"STDERR (tail): {e.stderr[-2000:]}")
        raise
print("   ✅ Quantize tool built")

quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
# (quantization type passed to llama-quantize, label used in log output)
quant_formats = [("Q4_K_M", "4-bit"), ("Q5_K_M", "5-bit"), ("Q8_0", "8-bit")]
# Collected as (local path, quantization type) for the upload step.
quantized_files = []

for quant_type, desc in quant_formats:
    print(f"   Creating {quant_type} ({desc})...")
    quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
    try:
        subprocess.run(
            [quantize_bin, gguf_file, quant_file, quant_type],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"❌ Quantization to {quant_type} failed! STDERR: {e.stderr}")
        raise
    quantized_files.append((quant_file, quant_type))
    size_mb = os.path.getsize(quant_file) / (1024 * 1024)
    print(f"   ✅ {quant_type}: {size_mb:.1f} MB")

# Step 6: Upload to Hub
# Ensures the output repo exists, then uploads the FP16 file followed by each
# quantized file under its "<model_name>-<type>.gguf" name.
print("\n☁️  Step 6: Uploading to Hugging Face Hub...")
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)
print(f"   ✅ Repository {OUTPUT_REPO} ready")

# (local path, label shown in the log, file name inside the repo)
uploads = [(gguf_file, "FP16 GGUF", f"{model_name}-f16.gguf")]
uploads += [
    (path, qtype, f"{model_name}-{qtype.lower()}.gguf")
    for path, qtype in quantized_files
]

for local_path, label, remote_name in uploads:
    print(f"   Uploading {label}...")
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=remote_name,
        repo_id=OUTPUT_REPO,
    )

# Create README
# Model card for the output repo: YAML front matter, a table of the uploaded
# files, and usage snippets. Uploaded as README.md from an in-memory string.
# The download command uses `--local-dir .` so the file lands in the current
# directory, which is where the Modelfile's `FROM ./...` line looks for it.
readme = f"""---
base_model: {MODEL_ID}
tags:
- gguf
- llama.cpp
- quantized
- trl
- grpo
---

# {OUTPUT_REPO.split('/')[-1]}

GGUF conversion of [{MODEL_ID}](https://huggingface.co/{MODEL_ID}), trained using GRPO (Group Relative Policy Optimization).

## Available Quantizations

| File | Quant | Description |
|------|-------|-------------|
| {model_name}-f16.gguf | F16 | 16-bit, unquantized |
| {model_name}-q8_0.gguf | Q8_0 | 8-bit, high quality |
| {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit, good quality |
| {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit, recommended |

## Usage

### With Ollama
```bash
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .
echo "FROM ./{model_name}-q4_k_m.gguf" > Modelfile
ollama create {model_name} -f Modelfile
ollama run {model_name}
```

### With llama.cpp
```bash
./llama-cli -m {model_name}-q4_k_m.gguf -p "Your prompt"
```
"""
api.upload_file(path_or_fileobj=readme.encode(), path_in_repo="README.md", repo_id=OUTPUT_REPO)
print("   ✅ README uploaded")

# Closing banner with a link to the output repository.
rule = "=" * 60
print("\n" + rule)
print("✅ GGUF Conversion Complete!")
print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
print(rule)