# File size: 17,931 Bytes

# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "runpod>=1.6.0",
#     "requests>=2.28.0",
#     "python-dotenv>=1.0.0",
# ]
# ///
"""
RunPod setup script for Bamboo-1 training.

Usage:
    # Set your RunPod API key
    export RUNPOD_API_KEY="your-api-key"

    # Create a network volume for data
    uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10

    # List volumes
    uv run scripts/runpod_setup.py volume-list

    # Launch training pod with volume
    uv run scripts/runpod_setup.py launch --volume <volume-id>

    # Check pod status
    uv run scripts/runpod_setup.py status

    # Stop pod (the pod ID is shown by `status`)
    uv run scripts/runpod_setup.py stop <pod-id>
"""

import os
from pathlib import Path

import click
import runpod
import requests
from dotenv import load_dotenv

# Load .env file from project root (the parent of the directory that contains
# this script). This runs at import time, before any command reads variables
# such as RUNPOD_API_KEY from the environment.
load_dotenv(Path(__file__).parent.parent / ".env")


@click.group()
def cli():
    """RunPod management for Bamboo-1 training."""
    # The group callback configures the RunPod SDK with the key taken from the
    # environment; without a key it aborts with a hint on where to get one.
    key = os.environ.get("RUNPOD_API_KEY")
    if key:
        runpod.api_key = key
        return
    raise click.ClickException(
        "RUNPOD_API_KEY environment variable not set.\n"
        "Get your API key from https://runpod.io/console/user/settings"
    )


def get_ssh_public_key() -> str:
    """Get the user's SSH public key."""
    from pathlib import Path
    for key_file in ["~/.ssh/id_rsa.pub", "~/.ssh/id_ed25519.pub"]:
        path = Path(key_file).expanduser()
        if path.exists():
            return path.read_text().strip()
    return None


# Default images
# DEFAULT_IMAGE is the default value of the --image option on the launch commands.
DEFAULT_IMAGE = "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
# BAMBOO1_IMAGE replaces the selected image when `launch --prebuilt` is used.
BAMBOO1_IMAGE = "undertheseanlp/bamboo-1:latest"  # Pre-built image with dependencies


@cli.command()
@click.option("--gpu", default="NVIDIA RTX A4000", help="GPU type")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--prebuilt", is_flag=True, help="Use pre-built bamboo-1 image (faster startup)")
@click.option("--disk", default=20, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-training", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
def launch(gpu, image, prebuilt, disk, name, volume, wandb_key, sample, epochs):
    """Launch a RunPod instance for training."""

    # --prebuilt wins over whatever --image was given.
    if prebuilt:
        image = BAMBOO1_IMAGE

    for summary_line in (
        "Launching RunPod instance...",
        f"  GPU: {gpu}",
        f"  Image: {image}",
        f"  Disk: {disk}GB",
    ):
        click.echo(summary_line)

    # Assemble the training invocation from its parts; it is only printed for
    # the user to run over SSH, never executed by this script.
    cmd_parts = ["uv run scripts/train.py"]
    if sample > 0:
        cmd_parts.append(f"--sample {sample}")
    cmd_parts.append(f"--epochs {epochs}")
    if wandb_key:
        cmd_parts.append("--wandb --wandb-project bamboo-1")
    train_cmd = " ".join(cmd_parts)

    # Environment handed to the pod: optional W&B key and the local SSH public key.
    pod_env = {}
    if wandb_key:
        pod_env["WANDB_API_KEY"] = wandb_key

    public_key = get_ssh_public_key()
    if public_key:
        pod_env["PUBLIC_KEY"] = public_key
        click.echo("  SSH key: configured")

    if volume:
        click.echo(f"  Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=pod_env or None,  # pass None rather than an empty mapping
        ports="22/tcp",  # expose SSH
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f"  ID: {pod['id']}")
    click.echo(f"  Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Print a copy-pasteable command for the user to run inside the pod.
    banner = "=" * 60
    click.echo("\n" + banner)
    click.echo("SSH into the pod and run this command:")
    click.echo(banner)

    if prebuilt:
        # Pre-built image: the command only changes directory and trains.
        one_liner = f"cd /workspace/bamboo-1 && {train_cmd}"
    else:
        # Standard image: install uv, clone the repo, sync deps, then train.
        bootstrap_steps = (
            "curl -LsSf https://astral.sh/uv/install.sh | sh",
            "source $HOME/.local/bin/env",
            "git clone https://huggingface.co/undertheseanlp/bamboo-1",
            "cd bamboo-1",
            "uv sync",
            train_cmd,
        )
        one_liner = " && ".join(bootstrap_steps)

    click.echo(one_liner)
    click.echo(banner)


@cli.command()
def status():
    """Check status of all pods."""
    all_pods = runpod.get_pods()
    if not all_pods:
        click.echo("No active pods.")
        return

    click.echo("Active pods:")
    for entry in all_pods:
        click.echo(f"\n  {entry['name']} ({entry['id']}): {entry.get('desiredStatus', 'UNKNOWN')}")

        # 'runtime' and 'ports' may be missing or None; treat both as "no ports".
        port_mappings = (entry.get('runtime') or {}).get('ports') or []
        for mapping in port_mappings:
            if mapping.get('privatePort') != 22:
                continue
            # Only the mapping for container port 22 yields an SSH command.
            click.echo(f"    SSH: ssh root@{mapping.get('ip')} -p {mapping.get('publicPort')}")


@cli.command()
@click.argument("pod_id")
def stop(pod_id):
    """Stop a pod by ID."""
    # Announce before calling the API so the user sees progress immediately.
    progress_message = f"Stopping pod {pod_id}..."
    click.echo(progress_message)

    runpod.stop_pod(pod_id)

    click.echo("Pod stopped.")


@cli.command()
@click.argument("pod_id")
def terminate(pod_id):
    """Terminate a pod by ID."""
    # Announce before calling the API so the user sees progress immediately.
    progress_message = f"Terminating pod {pod_id}..."
    click.echo(progress_message)

    runpod.terminate_pod(pod_id)

    click.echo("Pod terminated.")


# Tier name -> GPU type string, in the same format the --gpu options accept.
# Nothing in the visible part of this file reads this mapping; it appears to
# serve as a quick reference (the memory/price figures are the author's notes).
GPU_RECOMMENDATIONS = {
    "budget": "NVIDIA RTX A4000",       # 16GB, $0.20/hr - Basic training
    "balanced": "NVIDIA RTX A5000",     # 24GB, $0.30/hr - Good balance (Recommended)
    "fast": "NVIDIA RTX A6000",         # 48GB, $0.50/hr - Larger batches, faster
    "fastest": "NVIDIA A100 80GB PCIe", # 80GB, $1.50/hr - Best for production
}


@cli.command("launch-phobert")
@click.option("--gpu", default="NVIDIA RTX A5000",
              help="GPU type: A4000 (budget), A5000 (balanced), A6000 (fast), A100 (fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB (PhoBERT needs more space)")
@click.option("--name", default="bamboo-1-phobert", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--dataset", type=click.Choice(["udd1", "ud-vtb"]), default="udd1",
              help="Dataset: udd1 or ud-vtb (Trankit benchmark)")
@click.option("--encoder", default="vinai/phobert-base",
              help="Encoder: vinai/phobert-base or vinai/phobert-large")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--batch-size", default=0, type=int, help="Batch size (0=auto based on GPU)")
def launch_phobert(gpu, image, disk, name, volume, wandb_key, dataset, encoder, epochs, sample, batch_size):
    """Launch a RunPod instance for PhoBERT training.

    This launches a pod configured for training the PhoBERT-based dependency parser.
    After the pod starts, SSH in and run the training command printed below.

    GPU Recommendations:
        A4000 (16GB) - Budget option, batch_size=32
        A5000 (24GB) - Recommended balance, batch_size=48-64
        A6000 (48GB) - Fast training, batch_size=64-96
        A100 (80GB) - Fastest, batch_size=128+

    Example:
        uv run scripts/runpod_setup.py launch-phobert
        uv run scripts/runpod_setup.py launch-phobert --gpu "NVIDIA RTX A6000"  # Faster
        uv run scripts/runpod_setup.py launch-phobert --dataset ud-vtb  # Trankit benchmark
        uv run scripts/runpod_setup.py launch-phobert --encoder vinai/phobert-large --gpu "NVIDIA RTX A6000"
    """
    # Auto-select batch size based on GPU if not specified (0 means "auto").
    # An explicit --batch-size is used verbatim, including for large encoders.
    if batch_size == 0:
        if "A100" in gpu or "H100" in gpu:
            batch_size = 128
        elif "A6000" in gpu:
            batch_size = 64
        elif "A5000" in gpu:
            batch_size = 48
        else:  # A4000 or unknown
            batch_size = 32

        # Reduce batch size for large encoder
        if "large" in encoder:
            batch_size = batch_size // 2

    click.echo("Launching RunPod instance for PhoBERT training...")
    click.echo(f"  GPU: {gpu}")
    click.echo(f"  Image: {image}")
    click.echo(f"  Disk: {disk}GB")
    click.echo(f"  Dataset: {dataset}")
    click.echo(f"  Encoder: {encoder}")
    click.echo(f"  Batch size: {batch_size}")

    # Build the training command. It is only printed for the user to run over
    # SSH; this script never executes it.
    train_cmd = f"uv run scripts/train_phobert.py --encoder {encoder} --dataset {dataset} --epochs {epochs} --batch-size {batch_size} --fp16"
    if sample > 0:
        train_cmd += f" --sample {sample}"
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Output directory name encodes the dataset / encoder-size combination.
    output_suffix = ""
    if dataset == "ud-vtb":
        output_suffix += "-vtb"
    if "large" in encoder:
        output_suffix += "-large"
    train_cmd += f" --output models/bamboo-1-phobert{output_suffix}"

    # Environment handed to the pod: optional W&B key and the local SSH public key.
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo("  SSH key: configured")

    if volume:
        click.echo(f"  Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars if env_vars else None,
        ports="22/tcp",  # expose SSH
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f"  ID: {pod['id']}")
    click.echo(f"  Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Print the setup and training commands for the user to copy.
    click.echo("\n" + "="*70)
    click.echo("After SSH into the pod, run these commands:")
    click.echo("="*70)

    setup_cmd = """curl -LsSf https://astral.sh/uv/install.sh | sh && \\
source $HOME/.local/bin/env && \\
git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
cd /workspace/bamboo-1 && uv sync"""

    click.echo("\n# 1. Setup (run once):")
    click.echo(setup_cmd)

    click.echo("\n# 2. Train:")
    click.echo(f"cd /workspace/bamboo-1 && {train_cmd}")

    click.echo("\n" + "="*70)

    if dataset == "ud-vtb":
        # Reference numbers to compare against when training on UD-VTB.
        # (Heading previously misspelled the toolkit's name as "Transkit".)
        click.echo("\nTrankit benchmark reference:")
        click.echo("  Trankit base:  70.96% UAS / 64.76% LAS")
        click.echo("  Trankit large: 71.07% UAS / 65.37% LAS")
        click.echo("")


# =============================================================================
# Volume Management
# =============================================================================

# Datacenter ID -> human-readable location.
# The keys are the allowed values of `volume-create --datacenter`; the values
# are what `volume-list` and `volume-create` print in place of the raw ID.
DATACENTERS = {
    "EU-RO-1": "Europe (Romania)",
    "EU-CZ-1": "Europe (Czech Republic)",
    "EUR-IS-1": "Europe (Iceland)",
    "US-KS-2": "US (Kansas)",
    "US-CA-2": "US (California)",
}


def _graphql_request(query: str, variables: dict | None = None) -> dict:
    """Make a GraphQL request to RunPod API.

    Args:
        query: GraphQL query or mutation document.
        variables: Values for the document's variables; ``None`` sends ``{}``.

    Returns:
        The decoded JSON response body. GraphQL-level failures are returned
        unchanged so callers can inspect the ``"errors"`` key themselves.

    Raises:
        click.ClickException: If the request cannot be completed (connection
            error, timeout, ...) or the response body is not valid JSON.
    """
    api_key = os.environ.get("RUNPOD_API_KEY")
    try:
        response = requests.post(
            "https://api.runpod.io/graphql",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"query": query, "variables": variables or {}},
            # Without an explicit timeout requests can wait indefinitely,
            # which would hang the CLI on a stalled connection.
            timeout=30,
        )
    except requests.RequestException as exc:
        raise click.ClickException(f"RunPod API request failed: {exc}") from exc

    try:
        return response.json()
    except ValueError as exc:
        # e.g. an HTML error page from a proxy instead of a GraphQL payload.
        raise click.ClickException(
            f"RunPod API returned a non-JSON response (HTTP {response.status_code})."
        ) from exc


@cli.command("launch-fast")
@click.option("--gpu", default="NVIDIA H100 80GB HBM3", help="GPU type (H100 for fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-trankit", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--encoder", default="vinai/phobert-base", help="Encoder model")
def launch_fast(gpu, image, disk, name, volume, wandb_key, encoder):
    """Launch pod for FAST Trankit reproduction (<5 minutes).

    Trains on UD Vietnamese VTB to reproduce Trankit benchmark:
    - Trankit base:  70.96% UAS / 64.76% LAS
    - Trankit large: 71.07% UAS / 65.37% LAS

    Uses H100 with aggressive settings for <5 min training.

    Example:
        uv run scripts/runpod_setup.py launch-fast
        uv run scripts/runpod_setup.py launch-fast --encoder vinai/phobert-large
    """
    # This command is fixed to the UD-VTB dataset.
    dataset = "ud-vtb"
    uses_large_encoder = "large" in encoder

    # (substring of the GPU name, batch size, epochs); first match wins.
    gpu_presets = (
        ("H100", 256, 30),
        ("A100", 128, 40),
    )
    for marker, batch_size, epochs in gpu_presets:
        if marker in gpu:
            break
    else:
        # Any other GPU gets the most conservative preset plus a warning.
        batch_size, epochs = 64, 50
        click.echo("WARNING: For <5 min training, use H100!")

    # The large encoder gets half the batch size.
    if uses_large_encoder:
        batch_size //= 2

    for summary_line in (
        "Launching FAST Trankit reproduction (<5 minutes)...",
        f"  GPU: {gpu}",
        f"  Batch size: {batch_size}",
        f"  Epochs: {epochs}",
        f"  Dataset: {dataset} (UD Vietnamese VTB)",
        f"  Encoder: {encoder}",
        "",
        "  Target: Trankit base 70.96% UAS / 64.76% LAS",
    ):
        click.echo(summary_line)

    output_name = "models/bamboo-1-phobert-vtb" + ("-large" if uses_large_encoder else "")

    # Training command: one argument per line, joined with shell line
    # continuations (backslash + newline) so it stays readable when printed.
    train_args = [
        "uv run scripts/train_phobert.py",
        f"--encoder {encoder}",
        f"--dataset {dataset}",
        f"--output {output_name}",
        f"--epochs {epochs}",
        f"--batch-size {batch_size}",
        "--patience 5",
        "--warmup-steps 50",
        "--num-workers 8",
        "--fp16",
    ]
    train_cmd = " \\\n        ".join(train_args)
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Environment handed to the pod: optional W&B key and the local SSH public key.
    pod_env = {}
    if wandb_key:
        pod_env["WANDB_API_KEY"] = wandb_key

    public_key = get_ssh_public_key()
    if public_key:
        pod_env["PUBLIC_KEY"] = public_key
        click.echo("  SSH key: configured")

    if volume:
        click.echo(f"  Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=pod_env or None,  # pass None rather than an empty mapping
        ports="22/tcp",
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f"  ID: {pod['id']}")
    click.echo(f"  Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Single copy-pasteable command: install uv, clone, sync, train.
    banner = "=" * 70
    click.echo("\n" + banner)
    click.echo("SSH in and run this ONE command for <5 min training:")
    click.echo(banner)

    shell_steps = (
        "curl -LsSf https://astral.sh/uv/install.sh | sh",
        "source $HOME/.local/bin/env",
        "git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1",
        "cd /workspace/bamboo-1 && uv sync",
        train_cmd,
    )
    one_liner = " && \\\n".join(shell_steps)

    click.echo(one_liner)
    click.echo(banner)


@cli.command("volume-list")
def volume_list():
    """List all network volumes."""
    query = """
    query {
        myself {
            networkVolumes {
                id
                name
                size
                dataCenterId
            }
        }
    }
    """
    result = _graphql_request(query)

    # A GraphQL response may carry explicit nulls ("data": null, "myself": null),
    # which dict.get's default does not cover; `or` handles missing and null alike.
    account = (result.get("data") or {}).get("myself") or {}
    volumes = account.get("networkVolumes") or []

    # Report API failures the way volume-create / volume-delete do, instead of
    # misreporting them as "no volumes". Errors that arrive alongside usable
    # data do not hide the listing.
    errors = result.get("errors")
    if errors and not volumes:
        click.echo(f"Error: {errors[0]['message']}")
        return

    if not volumes:
        click.echo("No network volumes found.")
        click.echo("\nCreate one with: uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10")
        return

    click.echo("Network Volumes:")
    for vol in volumes:
        # Show a friendly location name when the datacenter ID is known,
        # otherwise fall back to the raw ID.
        dc = DATACENTERS.get(vol['dataCenterId'], vol['dataCenterId'])
        click.echo(f"  - {vol['name']} ({vol['id']}): {vol['size']}GB @ {dc}")


@cli.command("volume-create")
@click.option("--name", default="bamboo-data", help="Volume name")
@click.option("--size", default=10, type=int, help="Size in GB")
@click.option("--datacenter", default="EUR-IS-1", type=click.Choice(list(DATACENTERS.keys())), help="Datacenter")
def volume_create(name, size, datacenter):
    """Create a network volume for data storage."""
    click.echo("Creating network volume...")
    click.echo(f"  Name: {name}")
    click.echo(f"  Size: {size}GB")
    click.echo(f"  Datacenter: {DATACENTERS[datacenter]}")

    query = """
    mutation createNetworkVolume($input: CreateNetworkVolumeInput!) {
        createNetworkVolume(input: $input) {
            id
            name
            size
            dataCenterId
        }
    }
    """
    variables = {
        "input": {
            "name": name,
            "size": size,
            "dataCenterId": datacenter
        }
    }

    result = _graphql_request(query, variables)

    if "errors" in result:
        click.echo(f"\nError: {result['errors'][0]['message']}")
        return

    # The response may carry explicit nulls ("data": null), which dict.get's
    # default does not cover; `or` handles missing and null alike.
    volume = (result.get("data") or {}).get("createNetworkVolume") or {}
    volume_id = volume.get("id")
    if not volume_id:
        # Without an ID there is nothing useful to print; say so instead of
        # failing with a KeyError.
        click.echo("\nError: RunPod API response did not include the new volume's ID.")
        return

    click.echo("\nVolume created!")
    click.echo(f"  ID: {volume_id}")
    click.echo(f"\nUse with: uv run scripts/runpod_setup.py launch --volume {volume_id}")


@cli.command("volume-delete")
@click.argument("volume_id")
@click.confirmation_option(prompt="Are you sure you want to delete this volume?")
def volume_delete(volume_id):
    """Delete a network volume."""
    mutation = """
    mutation deleteNetworkVolume($input: DeleteNetworkVolumeInput!) {
        deleteNetworkVolume(input: $input)
    }
    """
    response = _graphql_request(mutation, {"input": {"id": volume_id}})

    # Success path first; otherwise surface the first error the API reported.
    if "errors" not in response:
        click.echo(f"Volume {volume_id} deleted.")
        return

    first_error = response['errors'][0]
    click.echo(f"Error: {first_error['message']}")


# Invoke the click command group only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()