|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
RunPod setup script for Bamboo-1 training. |
|
|
|
|
|
Usage: |
|
|
# Set your RunPod API key |
|
|
export RUNPOD_API_KEY="your-api-key" |
|
|
|
|
|
# Create a network volume for data |
|
|
uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10 |
|
|
|
|
|
# List volumes |
|
|
uv run scripts/runpod_setup.py volume-list |
|
|
|
|
|
# Launch training pod with volume |
|
|
uv run scripts/runpod_setup.py launch --volume <volume-id> |
|
|
|
|
|
# Check pod status |
|
|
uv run scripts/runpod_setup.py status |
|
|
|
|
|
# Stop pod |
|
|
uv run scripts/runpod_setup.py stop <pod-id> |
|
|
""" |
|
|
|
|
|
import os |
|
|
from pathlib import Path |
|
|
|
|
|
import click |
|
|
import runpod |
|
|
import requests |
|
|
from dotenv import load_dotenv |
|
|
|
|
|
|
|
|
# Load environment variables from the ".env" file located one directory above
# the directory containing this script (presumably the repository root —
# confirm against the project layout).
load_dotenv(Path(__file__).parent.parent / ".env")
|
|
|
|
|
|
|
|
@click.group()
def cli():
    """RunPod management for Bamboo-1 training."""
    # The group callback reads the API key from the environment and hands it
    # to the runpod SDK; a missing or empty key aborts with a click error.
    key = os.environ.get("RUNPOD_API_KEY")
    if key:
        runpod.api_key = key
        return

    raise click.ClickException(
        "RUNPOD_API_KEY environment variable not set.\n"
        "Get your API key from https://runpod.io/console/user/settings"
    )
|
|
|
|
|
|
|
|
def get_ssh_public_key() -> "str | None":
    """Return the user's SSH public key, or ``None`` if none is usable.

    Candidate files are tried in order: ``~/.ssh/id_rsa.pub`` first, then
    ``~/.ssh/id_ed25519.pub``. The first file that can be read and contains
    non-whitespace text wins.

    Returns:
        The key text with surrounding whitespace stripped, or ``None`` when
        every candidate is missing, unreadable, or empty.
    """
    for key_file in ("~/.ssh/id_rsa.pub", "~/.ssh/id_ed25519.pub"):
        path = Path(key_file).expanduser()
        try:
            key = path.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError):
            # Missing, unreadable, or undecodable file: try the next candidate
            # instead of aborting the whole command.
            continue
        if key:
            return key
    return None
|
|
|
|
|
|
|
|
|
|
|
# Default value of the --image option on the launch commands in this file.
DEFAULT_IMAGE = "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
# Image that `launch --prebuilt` substitutes for the --image value.
BAMBOO1_IMAGE = "undertheseanlp/bamboo-1:latest"
|
|
|
|
|
|
|
|
@cli.command()
@click.option("--gpu", default="NVIDIA RTX A4000", help="GPU type")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--prebuilt", is_flag=True, help="Use pre-built bamboo-1 image (faster startup)")
@click.option("--disk", default=20, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-training", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
def launch(gpu, image, prebuilt, disk, name, volume, wandb_key, sample, epochs):
    """Launch a RunPod instance for training."""
    # --prebuilt wins over whatever --image was given.
    image = BAMBOO1_IMAGE if prebuilt else image

    for line in (
        "Launching RunPod instance...",
        f" GPU: {gpu}",
        f" Image: {image}",
        f" Disk: {disk}GB",
    ):
        click.echo(line)

    # Training command the user is told to run on the pod once it is up.
    cmd_parts = ["uv run scripts/train.py"]
    if sample > 0:
        cmd_parts.append(f"--sample {sample}")
    cmd_parts.append(f"--epochs {epochs}")
    if wandb_key:
        cmd_parts.append("--wandb --wandb-project bamboo-1")
    train_cmd = " ".join(cmd_parts)

    # Environment passed to the pod: W&B credentials and the SSH public key.
    pod_env = {}
    if wandb_key:
        pod_env["WANDB_API_KEY"] = wandb_key
    public_key = get_ssh_public_key()
    if public_key:
        pod_env["PUBLIC_KEY"] = public_key
        click.echo(" SSH key: configured")

    if volume:
        click.echo(f" Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=pod_env or None,
        ports="22/tcp",
        network_volume_id=volume,
    )

    banner = "=" * 60
    click.echo("\nPod created!")
    click.echo(f" ID: {pod['id']}")
    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    click.echo("\n" + banner)
    click.echo("SSH into the pod and run this command:")
    click.echo(banner)

    if prebuilt:
        # The pre-built image only needs a cd into /workspace/bamboo-1.
        one_liner = f"cd /workspace/bamboo-1 && {train_cmd}"
    else:
        # Otherwise install uv, clone the repository and sync dependencies first.
        one_liner = (
            "curl -LsSf https://astral.sh/uv/install.sh | sh"
            " && source $HOME/.local/bin/env"
            " && git clone https://huggingface.co/undertheseanlp/bamboo-1"
            " && cd bamboo-1"
            " && uv sync"
            f" && {train_cmd}"
        )

    click.echo(one_liner)
    click.echo(banner)
|
|
|
|
|
|
|
|
@cli.command()
def status():
    """Check status of all pods."""
    pods = runpod.get_pods()
    if not pods:
        click.echo("No active pods.")
        return

    click.echo("Active pods:")
    for pod in pods:
        state = pod.get('desiredStatus', 'UNKNOWN')
        click.echo(f"\n {pod['name']} ({pod['id']}): {state}")

        # Tolerate a missing or null `runtime` / `ports` entry.
        port_entries = (pod.get('runtime') or {}).get('ports') or []
        for entry in port_entries:
            if entry.get('privatePort') != 22:
                continue
            # Private port 22 is the SSH port requested at launch ("22/tcp").
            click.echo(f" SSH: ssh root@{entry.get('ip')} -p {entry.get('publicPort')}")
|
|
|
|
|
|
|
|
@cli.command()
@click.argument("pod_id")
def stop(pod_id):
    """Stop a pod by ID."""
    # Announce, delegate to the runpod SDK, then confirm.
    announcement = f"Stopping pod {pod_id}..."
    click.echo(announcement)
    runpod.stop_pod(pod_id)
    click.echo("Pod stopped.")
|
|
|
|
|
|
|
|
@cli.command()
@click.argument("pod_id")
def terminate(pod_id):
    """Terminate a pod by ID."""
    # Announce, delegate to the runpod SDK, then confirm.
    announcement = f"Terminating pod {pod_id}..."
    click.echo(announcement)
    runpod.terminate_pod(pod_id)
    click.echo("Pod terminated.")
|
|
|
|
|
|
|
|
# Maps a speed/cost tier name to a GPU type name of the kind accepted by the
# --gpu options. NOTE(review): not referenced anywhere in the visible part of
# this file — confirm whether it is still needed.
GPU_RECOMMENDATIONS = {
    "budget": "NVIDIA RTX A4000",
    "balanced": "NVIDIA RTX A5000",
    "fast": "NVIDIA RTX A6000",
    "fastest": "NVIDIA A100 80GB PCIe",
}
|
|
|
|
|
|
|
|
@cli.command("launch-phobert")
@click.option("--gpu", default="NVIDIA RTX A5000",
              help="GPU type: A4000 (budget), A5000 (balanced), A6000 (fast), A100 (fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB (PhoBERT needs more space)")
@click.option("--name", default="bamboo-1-phobert", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--dataset", type=click.Choice(["udd1", "ud-vtb"]), default="udd1",
              help="Dataset: udd1 or ud-vtb (Trankit benchmark)")
@click.option("--encoder", default="vinai/phobert-base",
              help="Encoder: vinai/phobert-base or vinai/phobert-large")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--batch-size", default=0, type=int, help="Batch size (0=auto based on GPU)")
def launch_phobert(gpu, image, disk, name, volume, wandb_key, dataset, encoder, epochs, sample, batch_size):
    """Launch a RunPod instance for PhoBERT training.

    This launches a pod configured for training the PhoBERT-based dependency parser.
    After the pod starts, SSH in and run the training command printed below.

    \b
    GPU Recommendations:
      A4000 (16GB) - Budget option, batch_size=32
      A5000 (24GB) - Recommended balance, batch_size=48-64
      A6000 (48GB) - Fast training, batch_size=64-96
      A100 (80GB) - Fastest, batch_size=128+

    \b
    Example:
      uv run scripts/runpod_setup.py launch-phobert
      uv run scripts/runpod_setup.py launch-phobert --gpu "NVIDIA RTX A6000" # Faster
      uv run scripts/runpod_setup.py launch-phobert --dataset ud-vtb # Trankit benchmark
      uv run scripts/runpod_setup.py launch-phobert --encoder vinai/phobert-large --gpu "NVIDIA RTX A6000"
    """
    # The docstring above is click's --help text; the "\b" lines keep click
    # from rewrapping the GPU table and the example commands.
    import shlex

    # Choose a batch size from the GPU name only when the user did not supply
    # one; an explicit --batch-size is passed through unchanged.
    if batch_size == 0:
        if "A100" in gpu or "H100" in gpu:
            batch_size = 128
        elif "A6000" in gpu:
            batch_size = 64
        elif "A5000" in gpu:
            batch_size = 48
        else:
            batch_size = 32

        # The large encoder gets half the automatically chosen batch size.
        if "large" in encoder:
            batch_size //= 2

    click.echo("Launching RunPod instance for PhoBERT training...")
    click.echo(f" GPU: {gpu}")
    click.echo(f" Image: {image}")
    click.echo(f" Disk: {disk}GB")
    click.echo(f" Dataset: {dataset}")
    click.echo(f" Encoder: {encoder}")
    click.echo(f" Batch size: {batch_size}")

    # Training command printed for the user to run on the pod. The encoder is
    # free-form input, so it is shell-quoted; typical model ids such as
    # "vinai/phobert-base" are left unchanged by shlex.quote.
    train_cmd = (
        f"uv run scripts/train_phobert.py --encoder {shlex.quote(encoder)}"
        f" --dataset {dataset} --epochs {epochs} --batch-size {batch_size} --fp16"
    )
    if sample > 0:
        train_cmd += f" --sample {sample}"
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Output directory name encodes the dataset and encoder size.
    output_suffix = ""
    if dataset == "ud-vtb":
        output_suffix += "-vtb"
    if "large" in encoder:
        output_suffix += "-large"
    train_cmd += f" --output models/bamboo-1-phobert{output_suffix}"

    # Environment passed to the pod: W&B credentials and the SSH public key.
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo(" SSH key: configured")

    if volume:
        click.echo(f" Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars or None,
        ports="22/tcp",
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f" ID: {pod['id']}")
    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    banner = "=" * 70
    click.echo("\n" + banner)
    click.echo("After SSH into the pod, run these commands:")
    click.echo(banner)

    # One shell command spread over several lines with backslash continuations.
    setup_cmd = " && \\\n".join([
        "curl -LsSf https://astral.sh/uv/install.sh | sh",
        "source $HOME/.local/bin/env",
        "git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1",
        "cd /workspace/bamboo-1 && uv sync",
    ])

    click.echo("\n# 1. Setup (run once):")
    click.echo(setup_cmd)

    click.echo("\n# 2. Train:")
    click.echo(f"cd /workspace/bamboo-1 && {train_cmd}")

    click.echo("\n" + banner)

    if dataset == "ud-vtb":
        click.echo("\nTrankit benchmark reference:")
        click.echo(" Trankit base: 70.96% UAS / 64.76% LAS")
        click.echo(" Trankit large: 71.07% UAS / 65.37% LAS")
        click.echo("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Datacenter ID -> human-readable location. The keys are the allowed values of
# `volume-create --datacenter`; the values are used for display by the volume
# commands.
DATACENTERS = {
    "EU-RO-1": "Europe (Romania)",
    "EU-CZ-1": "Europe (Czech Republic)",
    "EUR-IS-1": "Europe (Iceland)",
    "US-KS-2": "US (Kansas)",
    "US-CA-2": "US (California)",
}
|
|
|
|
|
|
|
|
def _graphql_request(query: str, variables: "dict | None" = None) -> dict:
    """Send a GraphQL request to the RunPod API and return the decoded reply.

    Args:
        query: GraphQL query or mutation document.
        variables: Variables for the document; ``None`` is sent as an empty
            mapping.

    Returns:
        The decoded JSON response body. Callers in this file inspect its
        "errors" and "data" keys.

    Raises:
        click.ClickException: If the request cannot be completed (connection
            failure, timeout, ...) or the response body is not valid JSON.
    """
    api_key = os.environ.get("RUNPOD_API_KEY")
    try:
        response = requests.post(
            "https://api.runpod.io/graphql",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"query": query, "variables": variables or {}},
            # requests waits indefinitely unless a timeout is given; bound the
            # wait so the CLI cannot hang on an unresponsive server.
            timeout=30,
        )
    except requests.RequestException as exc:
        raise click.ClickException(f"RunPod API request failed: {exc}") from exc

    # The HTTP status is deliberately not checked here: an error reply that
    # still carries a JSON body is returned so callers can report its
    # "errors" entry.
    try:
        return response.json()
    except ValueError as exc:
        raise click.ClickException(
            f"RunPod API returned a non-JSON response (HTTP {response.status_code})"
        ) from exc
|
|
|
|
|
|
|
|
@cli.command("launch-fast")
@click.option("--gpu", default="NVIDIA H100 80GB HBM3", help="GPU type (H100 for fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-trankit", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--encoder", default="vinai/phobert-base", help="Encoder model")
def launch_fast(gpu, image, disk, name, volume, wandb_key, encoder):
    """Launch pod for FAST Trankit reproduction (<5 minutes).

    \b
    Trains on UD Vietnamese VTB to reproduce Trankit benchmark:
      - Trankit base: 70.96% UAS / 64.76% LAS
      - Trankit large: 71.07% UAS / 65.37% LAS

    Uses H100 with aggressive settings for <5 min training.

    \b
    Example:
      uv run scripts/runpod_setup.py launch-fast
      uv run scripts/runpod_setup.py launch-fast --encoder vinai/phobert-large
    """
    # The docstring above is click's --help text; the "\b" lines keep click
    # from rewrapping the benchmark list and the example commands.
    import shlex

    dataset = "ud-vtb"
    is_large = "large" in encoder

    # Batch size and epoch count are derived from the GPU name.
    if "H100" in gpu:
        batch_size = 256
        epochs = 30
    elif "A100" in gpu:
        batch_size = 128
        epochs = 40
    else:
        batch_size = 64
        epochs = 50
        click.echo("WARNING: For <5 min training, use H100!")

    # The large encoder gets half the batch size.
    if is_large:
        batch_size //= 2

    click.echo("Launching FAST Trankit reproduction (<5 minutes)...")
    click.echo(f" GPU: {gpu}")
    click.echo(f" Batch size: {batch_size}")
    click.echo(f" Epochs: {epochs}")
    click.echo(f" Dataset: {dataset} (UD Vietnamese VTB)")
    click.echo(f" Encoder: {encoder}")
    click.echo("")
    # Report the reference score that matches the selected encoder size.
    if is_large:
        click.echo(" Target: Trankit large 71.07% UAS / 65.37% LAS")
    else:
        click.echo(" Target: Trankit base 70.96% UAS / 64.76% LAS")

    output_name = "models/bamboo-1-phobert-vtb"
    if is_large:
        output_name += "-large"

    # Multi-line shell command joined with backslash continuations. The
    # encoder is free-form input, so it is shell-quoted; typical model ids
    # such as "vinai/phobert-base" are left unchanged by shlex.quote.
    train_cmd = " \\\n".join([
        "uv run scripts/train_phobert.py",
        f"--encoder {shlex.quote(encoder)}",
        f"--dataset {dataset}",
        f"--output {output_name}",
        f"--epochs {epochs}",
        f"--batch-size {batch_size}",
        "--patience 5",
        "--warmup-steps 50",
        "--num-workers 8",
        "--fp16",
    ])

    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Environment passed to the pod: W&B credentials and the SSH public key.
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo(" SSH key: configured")

    if volume:
        click.echo(f" Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars or None,
        ports="22/tcp",
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f" ID: {pod['id']}")
    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    banner = "=" * 70
    click.echo("\n" + banner)
    click.echo("SSH in and run this ONE command for <5 min training:")
    click.echo(banner)

    # Setup steps and the training command chained into a single shell command.
    one_liner = " && \\\n".join([
        "curl -LsSf https://astral.sh/uv/install.sh | sh",
        "source $HOME/.local/bin/env",
        "git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1",
        "cd /workspace/bamboo-1 && uv sync",
        train_cmd,
    ])

    click.echo(one_liner)
    click.echo(banner)
|
|
|
|
|
|
|
|
@cli.command("volume-list")
def volume_list():
    """List all network volumes."""
    query = """
    query {
        myself {
            networkVolumes {
                id
                name
                size
                dataCenterId
            }
        }
    }
    """
    result = _graphql_request(query)

    # Report API errors the same way the other volume commands do.
    errors = result.get("errors")
    if errors:
        click.echo(f"Error: {errors[0]['message']}")
        return

    # Any level of the reply may be null, so fall back to an empty container
    # at each step instead of calling .get() on None.
    data = result.get("data") or {}
    volumes = (data.get("myself") or {}).get("networkVolumes") or []

    if not volumes:
        click.echo("No network volumes found.")
        click.echo("\nCreate one with: uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10")
        return

    click.echo("Network Volumes:")
    for vol in volumes:
        # Show the friendly location when the datacenter ID is known,
        # otherwise the raw ID.
        dc = DATACENTERS.get(vol['dataCenterId'], vol['dataCenterId'])
        click.echo(f" - {vol['name']} ({vol['id']}): {vol['size']}GB @ {dc}")
|
|
|
|
|
|
|
|
@cli.command("volume-create")
@click.option("--name", default="bamboo-data", help="Volume name")
@click.option("--size", default=10, type=int, help="Size in GB")
@click.option("--datacenter", default="EUR-IS-1", type=click.Choice(list(DATACENTERS)), help="Datacenter")
def volume_create(name, size, datacenter):
    """Create a network volume for data storage."""
    click.echo("Creating network volume...")
    click.echo(f" Name: {name}")
    click.echo(f" Size: {size}GB")
    click.echo(f" Datacenter: {DATACENTERS[datacenter]}")

    query = """
    mutation createNetworkVolume($input: CreateNetworkVolumeInput!) {
        createNetworkVolume(input: $input) {
            id
            name
            size
            dataCenterId
        }
    }
    """
    variables = {
        "input": {
            "name": name,
            "size": size,
            "dataCenterId": datacenter,
        }
    }

    result = _graphql_request(query, variables)

    errors = result.get("errors")
    if errors:
        click.echo(f"\nError: {errors[0]['message']}")
        return

    # "data" or "createNetworkVolume" may be null in the reply; fall back to
    # an empty mapping rather than calling .get() on None.
    volume = (result.get("data") or {}).get("createNetworkVolume") or {}
    volume_id = volume.get("id")
    if not volume_id:
        # Without an ID there is nothing useful to print; fail with a clear
        # message instead of a KeyError traceback.
        raise click.ClickException("RunPod API did not return the new volume's ID.")

    click.echo("\nVolume created!")
    click.echo(f" ID: {volume_id}")
    click.echo(f"\nUse with: uv run scripts/runpod_setup.py launch --volume {volume_id}")
|
|
|
|
|
|
|
|
@cli.command("volume-delete")
@click.argument("volume_id")
@click.confirmation_option(prompt="Are you sure you want to delete this volume?")
def volume_delete(volume_id):
    """Delete a network volume."""
    mutation = """
    mutation deleteNetworkVolume($input: DeleteNetworkVolumeInput!) {
        deleteNetworkVolume(input: $input)
    }
    """
    result = _graphql_request(mutation, {"input": {"id": volume_id}})

    # Success path first; otherwise report the first error the API returned.
    if "errors" not in result:
        click.echo(f"Volume {volume_id} deleted.")
        return

    first_error = result['errors'][0]
    click.echo(f"Error: {first_error['message']}")
|
|
|
|
|
|
|
|
# Run the click command group when this file is executed as a script.
if __name__ == "__main__":
    cli()
|
|
|