# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "runpod>=1.6.0",
#     "requests>=2.28.0",
#     "python-dotenv>=1.0.0",
# ]
# ///
"""RunPod setup script for Bamboo-1 training.

Usage:
    # Set your RunPod API key
    export RUNPOD_API_KEY="your-api-key"

    # Create a network volume for data
    uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10

    # List volumes
    uv run scripts/runpod_setup.py volume-list

    # Launch training pod with volume
    uv run scripts/runpod_setup.py launch --volume

    # Check pod status
    uv run scripts/runpod_setup.py status

    # Stop pod
    uv run scripts/runpod_setup.py stop
"""
import os
from pathlib import Path

import click
import runpod
import requests
from dotenv import load_dotenv

# Load .env file from project root
load_dotenv(Path(__file__).parent.parent / ".env")


@click.group()
def cli():
    """RunPod management for Bamboo-1 training."""
    api_key = os.environ.get("RUNPOD_API_KEY")
    if not api_key:
        raise click.ClickException(
            "RUNPOD_API_KEY environment variable not set.\n"
            "Get your API key from https://runpod.io/console/user/settings"
        )
    runpod.api_key = api_key


def get_ssh_public_key() -> str | None:
    """Return the user's SSH public key, or None if no key file is found.

    Checks the conventional key locations (RSA first, then ed25519) and
    returns the contents of the first one that exists.
    """
    for key_file in ("~/.ssh/id_rsa.pub", "~/.ssh/id_ed25519.pub"):
        path = Path(key_file).expanduser()
        if path.exists():
            return path.read_text().strip()
    return None


# Default images
DEFAULT_IMAGE = "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
BAMBOO1_IMAGE = "undertheseanlp/bamboo-1:latest"  # Pre-built image with dependencies


@cli.command()
@click.option("--gpu", default="NVIDIA RTX A4000", help="GPU type")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--prebuilt", is_flag=True, help="Use pre-built bamboo-1 image (faster startup)")
@click.option("--disk", default=20, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-training", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
def launch(gpu, image, prebuilt, disk, name, volume, wandb_key, sample, epochs):
    """Launch a RunPod instance for training."""
    # Use pre-built image if requested
    if prebuilt:
        image = BAMBOO1_IMAGE

    click.echo("Launching RunPod instance...")
    click.echo(f" GPU: {gpu}")
    click.echo(f" Image: {image}")
    click.echo(f" Disk: {disk}GB")

    # Build training command
    train_cmd = "uv run scripts/train.py"
    if sample > 0:
        train_cmd += f" --sample {sample}"
    train_cmd += f" --epochs {epochs}"
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1"

    # Set environment variables
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    # Add SSH public key so we can SSH into the pod after it starts
    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo(" SSH key: configured")

    if volume:
        click.echo(f" Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars if env_vars else None,
        ports="22/tcp",  # Expose SSH port
        network_volume_id=volume,  # Attach network volume
    )

    click.echo("\nPod created!")
    click.echo(f" ID: {pod['id']}")
    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Generate one-liner training command
    click.echo("\n" + "="*60)
    click.echo("SSH into the pod and run this command:")
    click.echo("="*60)
    if prebuilt:
        # Pre-built image: dependencies already installed
        one_liner = f"cd /workspace/bamboo-1 && {train_cmd}"
    else:
        # Standard image: need to install everything
        one_liner = f"""curl -LsSf https://astral.sh/uv/install.sh | sh && source $HOME/.local/bin/env && git clone https://huggingface.co/undertheseanlp/bamboo-1 && cd bamboo-1 && uv sync && {train_cmd}"""
    click.echo(one_liner)
    click.echo("="*60)
@cli.command()
def status():
    """Check status of all pods."""
    pods = runpod.get_pods()
    if not pods:
        click.echo("No active pods.")
        return
    click.echo("Active pods:")
    for pod in pods:
        click.echo(f"\n {pod['name']} ({pod['id']}): {pod.get('desiredStatus', 'UNKNOWN')}")
        # runtime/ports may be None while the pod is still provisioning
        runtime = pod.get('runtime') or {}
        ports = runtime.get('ports') or []
        for p in ports:
            if p.get('privatePort') == 22:
                click.echo(f" SSH: ssh root@{p.get('ip')} -p {p.get('publicPort')}")


@cli.command()
@click.argument("pod_id")
def stop(pod_id):
    """Stop a pod by ID."""
    click.echo(f"Stopping pod {pod_id}...")
    runpod.stop_pod(pod_id)
    click.echo("Pod stopped.")


@cli.command()
@click.argument("pod_id")
def terminate(pod_id):
    """Terminate a pod by ID."""
    click.echo(f"Terminating pod {pod_id}...")
    runpod.terminate_pod(pod_id)
    click.echo("Pod terminated.")


GPU_RECOMMENDATIONS = {
    "budget": "NVIDIA RTX A4000",        # 16GB, $0.20/hr - Basic training
    "balanced": "NVIDIA RTX A5000",      # 24GB, $0.30/hr - Good balance (Recommended)
    "fast": "NVIDIA RTX A6000",          # 48GB, $0.50/hr - Larger batches, faster
    "fastest": "NVIDIA A100 80GB PCIe",  # 80GB, $1.50/hr - Best for production
}


@cli.command("launch-phobert")
@click.option("--gpu", default="NVIDIA RTX A5000", help="GPU type: A4000 (budget), A5000 (balanced), A6000 (fast), A100 (fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB (PhoBERT needs more space)")
@click.option("--name", default="bamboo-1-phobert", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--dataset", type=click.Choice(["udd1", "ud-vtb"]), default="udd1", help="Dataset: udd1 or ud-vtb (Trankit benchmark)")
@click.option("--encoder", default="vinai/phobert-base", help="Encoder: vinai/phobert-base or vinai/phobert-large")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--batch-size", default=0, type=int, help="Batch size (0=auto based on GPU)")
def launch_phobert(gpu, image, disk, name, volume, wandb_key, dataset, encoder, epochs, sample, batch_size):
    """Launch a RunPod instance for PhoBERT training.

    This launches a pod configured for training the PhoBERT-based dependency parser.
    After the pod starts, SSH in and run the training command printed below.

    GPU Recommendations:
        A4000 (16GB) - Budget option, batch_size=32
        A5000 (24GB) - Recommended balance, batch_size=48-64
        A6000 (48GB) - Fast training, batch_size=64-96
        A100 (80GB) - Fastest, batch_size=128+

    Example:
        uv run scripts/runpod_setup.py launch-phobert
        uv run scripts/runpod_setup.py launch-phobert --gpu "NVIDIA RTX A6000"  # Faster
        uv run scripts/runpod_setup.py launch-phobert --dataset ud-vtb  # Trankit benchmark
        uv run scripts/runpod_setup.py launch-phobert --encoder vinai/phobert-large --gpu "NVIDIA RTX A6000"
    """
    # Auto-select batch size based on GPU if not specified
    if batch_size == 0:
        if "A100" in gpu or "H100" in gpu:
            batch_size = 128
        elif "A6000" in gpu:
            batch_size = 64
        elif "A5000" in gpu:
            batch_size = 48
        else:  # A4000 or unknown
            batch_size = 32

    # Reduce batch size for large encoder (roughly 3x the parameters of base)
    if "large" in encoder:
        batch_size = batch_size // 2

    click.echo("Launching RunPod instance for PhoBERT training...")
    click.echo(f" GPU: {gpu}")
    click.echo(f" Image: {image}")
    click.echo(f" Disk: {disk}GB")
    click.echo(f" Dataset: {dataset}")
    click.echo(f" Encoder: {encoder}")
    click.echo(f" Batch size: {batch_size}")

    # Build training command with optimizations
    train_cmd = f"uv run scripts/train_phobert.py --encoder {encoder} --dataset {dataset} --epochs {epochs} --batch-size {batch_size} --fp16"
    if sample > 0:
        train_cmd += f" --sample {sample}"
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Output directory based on config
    output_suffix = ""
    if dataset == "ud-vtb":
        output_suffix += "-vtb"
    if "large" in encoder:
        output_suffix += "-large"
    train_cmd += f" --output models/bamboo-1-phobert{output_suffix}"

    # Set environment variables
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    # Add SSH public key so we can SSH into the pod after it starts
    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo(" SSH key: configured")

    if volume:
        click.echo(f" Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars if env_vars else None,
        ports="22/tcp",
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f" ID: {pod['id']}")
    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Generate setup and training commands
    click.echo("\n" + "="*70)
    click.echo("After SSH into the pod, run these commands:")
    click.echo("="*70)

    setup_cmd = """curl -LsSf https://astral.sh/uv/install.sh | sh && \\
source $HOME/.local/bin/env && \\
git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
cd /workspace/bamboo-1 && uv sync"""

    click.echo("\n# 1. Setup (run once):")
    click.echo(setup_cmd)
    click.echo("\n# 2. Train:")
    click.echo(f"cd /workspace/bamboo-1 && {train_cmd}")
    click.echo("\n" + "="*70)

    if dataset == "ud-vtb":
        # Fixed typo: "Transkit" -> "Trankit" (the benchmark tool's name)
        click.echo("\nTrankit benchmark reference:")
        click.echo(" Trankit base: 70.96% UAS / 64.76% LAS")
        click.echo(" Trankit large: 71.07% UAS / 65.37% LAS")
    click.echo("")


# =============================================================================
# Volume Management
# =============================================================================

DATACENTERS = {
    "EU-RO-1": "Europe (Romania)",
    "EU-CZ-1": "Europe (Czech Republic)",
    "EUR-IS-1": "Europe (Iceland)",
    "US-KS-2": "US (Kansas)",
    "US-CA-2": "US (California)",
}


def _graphql_request(query: str, variables: dict | None = None) -> dict:
    """Make a GraphQL request to the RunPod API.

    Args:
        query: GraphQL query or mutation string.
        variables: Optional variables dict for the query.

    Returns:
        The parsed JSON response; on API failure it contains an "errors" key,
        which callers are expected to check.
    """
    api_key = os.environ.get("RUNPOD_API_KEY")
    response = requests.post(
        "https://api.runpod.io/graphql",
        headers={"Authorization": f"Bearer {api_key}"},
        json={"query": query, "variables": variables or {}},
        timeout=30,  # never hang indefinitely on network issues
    )
    return response.json()


@cli.command("launch-fast")
@click.option("--gpu", default="NVIDIA H100 80GB HBM3", help="GPU type (H100 for fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-trankit", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--encoder", default="vinai/phobert-base", help="Encoder model")
def launch_fast(gpu, image, disk, name, volume, wandb_key, encoder):
    """Launch pod for FAST Trankit reproduction (<5 minutes).

    Trains on UD Vietnamese VTB to reproduce Trankit benchmark:
    - Trankit base: 70.96% UAS / 64.76% LAS
    - Trankit large: 71.07% UAS / 65.37% LAS

    Uses H100 with aggressive settings for <5 min training.

    Example:
        uv run scripts/runpod_setup.py launch-fast
        uv run scripts/runpod_setup.py launch-fast --encoder vinai/phobert-large
    """
    dataset = "ud-vtb"  # Always use UD-VTB for Trankit reproduction

    # Set batch size based on GPU
    if "H100" in gpu:
        batch_size = 256
        epochs = 30
    elif "A100" in gpu:
        batch_size = 128
        epochs = 40
    else:
        batch_size = 64
        epochs = 50
        click.echo("WARNING: For <5 min training, use H100!")

    # Reduce batch for large model
    if "large" in encoder:
        batch_size = batch_size // 2

    click.echo("Launching FAST Trankit reproduction (<5 minutes)...")
    click.echo(f" GPU: {gpu}")
    click.echo(f" Batch size: {batch_size}")
    click.echo(f" Epochs: {epochs}")
    click.echo(f" Dataset: {dataset} (UD Vietnamese VTB)")
    click.echo(f" Encoder: {encoder}")
    click.echo("")
    click.echo(" Target: Trankit base 70.96% UAS / 64.76% LAS")

    # Output name
    output_name = "models/bamboo-1-phobert-vtb"
    if "large" in encoder:
        output_name += "-large"

    # Build optimized training command
    train_cmd = f"""uv run scripts/train_phobert.py \\
--encoder {encoder} \\
--dataset {dataset} \\
--output {output_name} \\
--epochs {epochs} \\
--batch-size {batch_size} \\
--patience 5 \\
--warmup-steps 50 \\
--num-workers 8 \\
--fp16"""
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Set environment variables
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo(" SSH key: configured")

    if volume:
        click.echo(f" Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars if env_vars else None,
        ports="22/tcp",
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f" ID: {pod['id']}")
    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # One-liner setup + train
    click.echo("\n" + "="*70)
    click.echo("SSH in and run this ONE command for <5 min training:")
    click.echo("="*70)
    one_liner = f"""curl -LsSf https://astral.sh/uv/install.sh | sh && \\
source $HOME/.local/bin/env && \\
git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
cd /workspace/bamboo-1 && uv sync && \\
{train_cmd}"""
    click.echo(one_liner)
    click.echo("="*70)


@cli.command("volume-list")
def volume_list():
    """List all network volumes."""
    query = """
    query {
        myself {
            networkVolumes {
                id
                name
                size
                dataCenterId
            }
        }
    }
    """
    result = _graphql_request(query)
    volumes = result.get("data", {}).get("myself", {}).get("networkVolumes", [])
    if not volumes:
        click.echo("No network volumes found.")
        click.echo("\nCreate one with: uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10")
        return
    click.echo("Network Volumes:")
    for vol in volumes:
        # Show a human-readable datacenter name when we know it
        dc = DATACENTERS.get(vol['dataCenterId'], vol['dataCenterId'])
        click.echo(f" - {vol['name']} ({vol['id']}): {vol['size']}GB @ {dc}")


@cli.command("volume-create")
@click.option("--name", default="bamboo-data", help="Volume name")
@click.option("--size", default=10, type=int, help="Size in GB")
@click.option("--datacenter", default="EUR-IS-1", type=click.Choice(list(DATACENTERS.keys())), help="Datacenter")
def volume_create(name, size, datacenter):
    """Create a network volume for data storage."""
    click.echo("Creating network volume...")
    click.echo(f" Name: {name}")
    click.echo(f" Size: {size}GB")
    click.echo(f" Datacenter: {DATACENTERS[datacenter]}")

    query = """
    mutation createNetworkVolume($input: CreateNetworkVolumeInput!) {
        createNetworkVolume(input: $input) {
            id
            name
            size
            dataCenterId
        }
    }
    """
    variables = {
        "input": {
            "name": name,
            "size": size,
            "dataCenterId": datacenter
        }
    }
    result = _graphql_request(query, variables)
    if "errors" in result:
        click.echo(f"\nError: {result['errors'][0]['message']}")
        return
    volume = result.get("data", {}).get("createNetworkVolume", {})
    click.echo("\nVolume created!")
    click.echo(f" ID: {volume['id']}")
    click.echo(f"\nUse with: uv run scripts/runpod_setup.py launch --volume {volume['id']}")


@cli.command("volume-delete")
@click.argument("volume_id")
@click.confirmation_option(prompt="Are you sure you want to delete this volume?")
def volume_delete(volume_id):
    """Delete a network volume."""
    query = """
    mutation deleteNetworkVolume($input: DeleteNetworkVolumeInput!) {
        deleteNetworkVolume(input: $input)
    }
    """
    variables = {"input": {"id": volume_id}}
    result = _graphql_request(query, variables)
    if "errors" in result:
        click.echo(f"Error: {result['errors'][0]['message']}")
        return
    click.echo(f"Volume {volume_id} deleted.")


if __name__ == "__main__":
    cli()