bamboo-1 / src /runpod_setup.py

Consolidate project: merge scripts/, bamboo1/ into src/, optimize training

24ec440 about 1 month ago

18.1 kB

	# /// script
	# requires-python = ">=3.10"
	# dependencies = [
	# "runpod>=1.6.0",
	# "requests>=2.28.0",
	# "python-dotenv>=1.0.0",
	# ]
	# ///
	"""
	RunPod setup script for Bamboo-1 training.

	Usage:
	    # Set your RunPod API key
	    export RUNPOD_API_KEY="your-api-key"

	    # Create a network volume for data
	    uv run src/runpod_setup.py volume-create --name bamboo-data --size 10

	    # List volumes
	    uv run src/runpod_setup.py volume-list

	    # Launch training pod with volume
	    uv run src/runpod_setup.py launch --volume <volume-id>

	    # Check pod status
	    uv run src/runpod_setup.py status

	    # Stop a pod (the pod ID is shown by `launch` and `status`)
	    uv run src/runpod_setup.py stop <pod-id>
	"""

	import os
	from pathlib import Path

	import click
	import runpod
	import requests
	from dotenv import load_dotenv

	# Load variables from a `.env` file located two directory levels above this
	# file (the directory containing this script's folder, i.e. the project root).
	# This runs at import time, before any command reads os.environ.
	load_dotenv(Path(__file__).parent.parent / ".env")


	@click.group()
	def cli():
	    """RunPod management for Bamboo-1 training."""
	    # Group callback: runs before every subcommand and hands the API key to
	    # the runpod SDK, aborting with a CLI error when the key is absent/empty.
	    key = os.environ.get("RUNPOD_API_KEY")
	    if key:
	        runpod.api_key = key
	        return
	    raise click.ClickException(
	        "RUNPOD_API_KEY environment variable not set.\n"
	        "Get your API key from https://runpod.io/console/user/settings"
	    )


	def get_ssh_public_key() -> str:
	"""Get the user's SSH public key."""
	from pathlib import Path
	for key_file in ["~/.ssh/id_rsa.pub", "~/.ssh/id_ed25519.pub"]:
	path = Path(key_file).expanduser()
	if path.exists():
	return path.read_text().strip()
	return None


	# Container images used by the launch commands.
	# DEFAULT_IMAGE is the default for every `--image` option; BAMBOO1_IMAGE
	# replaces it when `launch --prebuilt` is given (dependencies pre-installed).
	DEFAULT_IMAGE = "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
	BAMBOO1_IMAGE = "undertheseanlp/bamboo-1:latest"


	@cli.command()
	@click.option("--gpu", default="NVIDIA RTX A4000", help="GPU type")
	@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
	@click.option("--prebuilt", is_flag=True, help="Use pre-built bamboo-1 image (faster startup)")
	@click.option("--disk", default=20, type=int, help="Disk size in GB")
	@click.option("--name", default="bamboo-1-training", help="Pod name")
	@click.option("--volume", default=None, help="Network volume ID to attach")
	@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
	@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
	@click.option("--epochs", default=100, type=int, help="Number of epochs")
	def launch(gpu, image, prebuilt, disk, name, volume, wandb_key, sample, epochs):
	    """Launch a RunPod instance for training."""
	    # The docstring above is the `--help` text, so details live in comments.
	    # This command creates the pod and then only *prints* the shell command
	    # to run over SSH; the training command is not passed to the pod.

	    # --prebuilt overrides whatever --image was given.
	    if prebuilt:
	        image = BAMBOO1_IMAGE

	    click.echo("Launching RunPod instance...")
	    click.echo(f" GPU: {gpu}")
	    click.echo(f" Image: {image}")
	    click.echo(f" Disk: {disk}GB")

	    # Build the training command that is printed at the end.
	    train_cmd = "uv run src/train.py"
	    if sample > 0:
	        train_cmd += f" --sample {sample}"
	    train_cmd += f" --epochs {epochs}"
	    if wandb_key:
	        train_cmd += " --wandb --wandb-project bamboo-1"

	    # Environment variables handed to the pod.
	    env_vars = {}
	    if wandb_key:
	        env_vars["WANDB_API_KEY"] = wandb_key

	    # Pass the local SSH public key to the pod as PUBLIC_KEY when one exists.
	    ssh_key = get_ssh_public_key()
	    if ssh_key:
	        env_vars["PUBLIC_KEY"] = ssh_key
	        click.echo(" SSH key: configured")

	    if volume:
	        click.echo(f" Volume: {volume}")

	    pod = runpod.create_pod(
	        name=name,
	        image_name=image,
	        gpu_type_id=gpu,
	        volume_in_gb=disk,
	        env=env_vars if env_vars else None,
	        ports="22/tcp",  # Expose SSH port
	        network_volume_id=volume,  # Attach network volume
	    )

	    click.echo("\nPod created!")
	    click.echo(f" ID: {pod['id']}")
	    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
	    click.echo("\nMonitor at: https://runpod.io/console/pods")

	    # Print the one-liner the user should run after SSH-ing into the pod.
	    click.echo("\n" + "=" * 60)
	    click.echo("SSH into the pod and run this command:")
	    click.echo("=" * 60)

	    if prebuilt:
	        # Pre-built image: dependencies already installed
	        one_liner = f"cd /workspace/bamboo-1 && {train_cmd}"
	    else:
	        # Standard image: install uv, clone the repo, sync deps, then train.
	        # The pipe must be a bare "|": an escaped "\|" would be printed with
	        # a literal backslash and the pasted command would not pipe to sh.
	        one_liner = (
	            "curl -LsSf https://astral.sh/uv/install.sh | sh"
	            " && source $HOME/.local/bin/env"
	            " && git clone https://huggingface.co/undertheseanlp/bamboo-1"
	            " && cd bamboo-1 && uv sync"
	            f" && {train_cmd}"
	        )

	    click.echo(one_liner)
	    click.echo("=" * 60)


	@cli.command()
	def status():
	    """Check status of all pods."""
	    all_pods = runpod.get_pods()
	    if not all_pods:
	        click.echo("No active pods.")
	        return

	    click.echo("Active pods:")
	    for pod in all_pods:
	        click.echo(f"\n {pod['name']} ({pod['id']}): {pod.get('desiredStatus', 'UNKNOWN')}")
	        # 'runtime' and 'ports' may be missing or None; treat both as empty.
	        port_entries = (pod.get('runtime') or {}).get('ports') or []
	        # Only entries mapping the container's port 22 yield an SSH hint.
	        ssh_entries = (entry for entry in port_entries if entry.get('privatePort') == 22)
	        for entry in ssh_entries:
	            click.echo(f" SSH: ssh root@{entry.get('ip')} -p {entry.get('publicPort')}")


	@cli.command()
	@click.argument("pod_id")
	def stop(pod_id):
	    """Stop a pod by ID."""
	    # Announce, delegate to the runpod SDK, then confirm.
	    announcement = f"Stopping pod {pod_id}..."
	    click.echo(announcement)
	    runpod.stop_pod(pod_id)
	    click.echo("Pod stopped.")


	@cli.command()
	@click.argument("pod_id")
	def terminate(pod_id):
	    """Terminate a pod by ID."""
	    # Announce, delegate to the runpod SDK, then confirm.
	    announcement = f"Terminating pod {pod_id}..."
	    click.echo(announcement)
	    runpod.terminate_pod(pod_id)
	    click.echo("Pod terminated.")


	# Tier name -> RunPod GPU type ID.
	# NOTE(review): the VRAM and hourly-price notes below are carried over from
	# the original author's comments; verify them against current RunPod pricing.
	GPU_RECOMMENDATIONS = {
	    # 16GB, $0.20/hr - basic training
	    "budget": "NVIDIA RTX A4000",
	    # 24GB, $0.30/hr - good balance (recommended)
	    "balanced": "NVIDIA RTX A5000",
	    # 48GB, $0.50/hr - larger batches, faster
	    "fast": "NVIDIA RTX A6000",
	    # 80GB, $1.50/hr - best for production
	    "fastest": "NVIDIA A100 80GB PCIe",
	}


	@cli.command("launch-phobert")
	@click.option("--gpu", default="NVIDIA RTX A5000",
	              help="GPU type: A4000 (budget), A5000 (balanced), A6000 (fast), A100 (fastest)")
	@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
	@click.option("--disk", default=30, type=int, help="Disk size in GB (PhoBERT needs more space)")
	@click.option("--name", default="bamboo-1-phobert", help="Pod name")
	@click.option("--volume", default=None, help="Network volume ID to attach")
	@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
	@click.option("--dataset", type=click.Choice(["udd1", "ud-vtb", "vndt"]), default="udd1",
	              help="Dataset: udd1, ud-vtb (Trankit benchmark), or vndt (VnDT v1.1)")
	@click.option("--encoder", default="vinai/phobert-base",
	              help="Encoder: vinai/phobert-base or vinai/phobert-large")
	@click.option("--epochs", default=100, type=int, help="Number of epochs")
	@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
	@click.option("--batch-size", default=0, type=int, help="Batch size (0=auto based on GPU)")
	def launch_phobert(gpu, image, disk, name, volume, wandb_key, dataset, encoder, epochs, sample, batch_size):
	    """Launch a RunPod instance for PhoBERT training.

	    This launches a pod configured for training the PhoBERT-based dependency parser.
	    After the pod starts, SSH in and run the training command printed below.

	    GPU Recommendations:
	        A4000 (16GB) - Budget option, batch_size=32
	        A5000 (24GB) - Recommended balance, batch_size=48-64
	        A6000 (48GB) - Fast training, batch_size=64-96
	        A100 (80GB) - Fastest, batch_size=128+

	    Example:
	        uv run src/runpod_setup.py launch-phobert
	        uv run src/runpod_setup.py launch-phobert --gpu "NVIDIA RTX A6000" # Faster
	        uv run src/runpod_setup.py launch-phobert --dataset ud-vtb # Trankit benchmark
	        uv run src/runpod_setup.py launch-phobert --encoder vinai/phobert-large --gpu "NVIDIA RTX A6000"
	    """
	    # Auto-select the batch size from the GPU name only when the user did not
	    # pass one (0 means "auto"); an explicit --batch-size is used as given.
	    if batch_size == 0:
	        if "A100" in gpu or "H100" in gpu:
	            batch_size = 128
	        elif "A6000" in gpu:
	            batch_size = 64
	        elif "A5000" in gpu:
	            batch_size = 48
	        else:  # A4000 or unknown
	            batch_size = 32

	        # Halve the auto-selected batch size for the large encoder.
	        if "large" in encoder:
	            batch_size //= 2

	    click.echo("Launching RunPod instance for PhoBERT training...")
	    click.echo(f" GPU: {gpu}")
	    click.echo(f" Image: {image}")
	    click.echo(f" Disk: {disk}GB")
	    click.echo(f" Dataset: {dataset}")
	    click.echo(f" Encoder: {encoder}")
	    click.echo(f" Batch size: {batch_size}")

	    # Build the training command that is printed at the end.
	    train_cmd = (
	        f"uv run src/train.py --method trankit --encoder {encoder}"
	        f" --dataset {dataset} --epochs {epochs} --batch-size {batch_size} --fp16"
	    )
	    if sample > 0:
	        train_cmd += f" --sample {sample}"
	    if wandb_key:
	        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

	    # Output directory name encodes the dataset and encoder size.
	    output_suffix = ""
	    if dataset == "ud-vtb":
	        output_suffix += "-vtb"
	    elif dataset == "vndt":
	        output_suffix += "-vndt"
	    if "large" in encoder:
	        output_suffix += "-large"
	    train_cmd += f" --output models/bamboo-1-phobert{output_suffix}"

	    # Environment variables handed to the pod.
	    env_vars = {}
	    if wandb_key:
	        env_vars["WANDB_API_KEY"] = wandb_key

	    # Pass the local SSH public key to the pod as PUBLIC_KEY when one exists.
	    ssh_key = get_ssh_public_key()
	    if ssh_key:
	        env_vars["PUBLIC_KEY"] = ssh_key
	        click.echo(" SSH key: configured")

	    if volume:
	        click.echo(f" Volume: {volume}")

	    pod = runpod.create_pod(
	        name=name,
	        image_name=image,
	        gpu_type_id=gpu,
	        volume_in_gb=disk,
	        env=env_vars if env_vars else None,
	        ports="22/tcp",
	        network_volume_id=volume,
	    )

	    click.echo("\nPod created!")
	    click.echo(f" ID: {pod['id']}")
	    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
	    click.echo("\nMonitor at: https://runpod.io/console/pods")

	    # Print the setup and training commands to run after SSH-ing in.
	    click.echo("\n" + "=" * 70)
	    click.echo("After SSH into the pod, run these commands:")
	    click.echo("=" * 70)

	    # The pipe must be a bare "|": an escaped "\|" would be printed with a
	    # literal backslash and the pasted command would not pipe to sh.
	    setup_cmd = """curl -LsSf https://astral.sh/uv/install.sh | sh && \\
	source $HOME/.local/bin/env && \\
	git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
	cd /workspace/bamboo-1 && uv sync"""

	    click.echo("\n# 1. Setup (run once):")
	    click.echo(setup_cmd)

	    click.echo("\n# 2. Train:")
	    click.echo(f"cd /workspace/bamboo-1 && {train_cmd}")

	    click.echo("\n" + "=" * 70)

	    # For the benchmark dataset, remind the user of the numbers to beat.
	    if dataset == "ud-vtb":
	        click.echo("\nTrankit benchmark reference:")
	        click.echo(" Trankit base: 70.96% UAS / 64.76% LAS")
	        click.echo(" Trankit large: 71.07% UAS / 65.37% LAS")
	        click.echo("")


	# =============================================================================
	# Volume Management
	# =============================================================================

	# RunPod datacenter ID -> human-readable location. The keys are the allowed
	# values of `volume-create --datacenter`; the values are used for display.
	DATACENTERS = {
	    # Canada
	    "CA-MTL-1": "Canada (Montreal)",
	    # Europe
	    "EU-CZ-1": "Europe (Czech Republic)",
	    "EU-NL-1": "Europe (Netherlands)",
	    "EU-RO-1": "Europe (Romania)",
	    "EUR-IS-1": "Europe (Iceland)",
	    # United States
	    "US-CA-2": "US (California)",
	    "US-KS-2": "US (Kansas)",
	    "US-TX-3": "US (Texas)",
	}


	def _graphql_request(query: str, variables: dict | None = None, timeout: float = 30.0) -> dict:
	    """POST a GraphQL document to the RunPod API and return the decoded JSON.

	    Args:
	        query: GraphQL query or mutation text.
	        variables: Values for the document's variables; ``None`` sends ``{}``.
	        timeout: Seconds to wait for the server before giving up, so a stalled
	            connection cannot hang the CLI indefinitely.

	    Returns:
	        The parsed JSON response body. The HTTP status code is deliberately
	        not checked here: callers in this file inspect the ``"errors"`` key
	        of the returned dict themselves.

	    Raises:
	        requests.exceptions.RequestException: If the request fails or times out.
	    """
	    api_key = os.environ.get("RUNPOD_API_KEY")
	    response = requests.post(
	        "https://api.runpod.io/graphql",
	        headers={"Authorization": f"Bearer {api_key}"},
	        json={"query": query, "variables": variables or {}},
	        timeout=timeout,
	    )
	    return response.json()


	@cli.command("launch-fast")
	@click.option("--gpu", default="NVIDIA H100 80GB HBM3", help="GPU type (H100 for fastest)")
	@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
	@click.option("--disk", default=30, type=int, help="Disk size in GB")
	@click.option("--name", default="bamboo-1-trankit", help="Pod name")
	@click.option("--volume", default=None, help="Network volume ID to attach")
	@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
	@click.option("--encoder", default="vinai/phobert-base", help="Encoder model")
	def launch_fast(gpu, image, disk, name, volume, wandb_key, encoder):
	    """Launch pod for FAST Trankit reproduction (<5 minutes).

	    Trains on UD Vietnamese VTB to reproduce Trankit benchmark:
	    - Trankit base: 70.96% UAS / 64.76% LAS
	    - Trankit large: 71.07% UAS / 65.37% LAS

	    Uses H100 with aggressive settings for <5 min training.

	    Example:
	        uv run src/runpod_setup.py launch-fast
	        uv run src/runpod_setup.py launch-fast --encoder vinai/phobert-large
	    """
	    dataset = "ud-vtb"  # Always use UD-VTB for Trankit reproduction

	    # Pick batch size and epoch count from the GPU name.
	    if "H100" in gpu:
	        batch_size = 256
	        epochs = 30
	    elif "A100" in gpu:
	        batch_size = 128
	        epochs = 40
	    else:
	        batch_size = 64
	        epochs = 50
	        click.echo("WARNING: For <5 min training, use H100!")

	    # Halve the batch size for the large encoder.
	    if "large" in encoder:
	        batch_size //= 2

	    click.echo("Launching FAST Trankit reproduction (<5 minutes)...")
	    click.echo(f" GPU: {gpu}")
	    click.echo(f" Batch size: {batch_size}")
	    click.echo(f" Epochs: {epochs}")
	    click.echo(f" Dataset: {dataset} (UD Vietnamese VTB)")
	    click.echo(f" Encoder: {encoder}")
	    click.echo("")
	    click.echo(" Target: Trankit base 70.96% UAS / 64.76% LAS")

	    # Output directory name encodes the encoder size.
	    output_name = "models/bamboo-1-phobert-vtb"
	    if "large" in encoder:
	        output_name += "-large"

	    # Training command, printed (not executed) as part of the one-liner below.
	    train_cmd = f"""uv run src/train.py --method trankit \\
	--encoder {encoder} \\
	--dataset {dataset} \\
	--output {output_name} \\
	--epochs {epochs} \\
	--batch-size {batch_size} \\
	--patience 5 \\
	--warmup-steps 50 \\
	--fp16"""

	    if wandb_key:
	        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

	    # Environment variables handed to the pod.
	    env_vars = {}
	    if wandb_key:
	        env_vars["WANDB_API_KEY"] = wandb_key

	    # Pass the local SSH public key to the pod as PUBLIC_KEY when one exists.
	    ssh_key = get_ssh_public_key()
	    if ssh_key:
	        env_vars["PUBLIC_KEY"] = ssh_key
	        click.echo(" SSH key: configured")

	    if volume:
	        click.echo(f" Volume: {volume}")

	    pod = runpod.create_pod(
	        name=name,
	        image_name=image,
	        gpu_type_id=gpu,
	        volume_in_gb=disk,
	        env=env_vars if env_vars else None,
	        ports="22/tcp",
	        network_volume_id=volume,
	    )

	    click.echo("\nPod created!")
	    click.echo(f" ID: {pod['id']}")
	    click.echo(f" Status: {pod.get('desiredStatus', 'PENDING')}")
	    click.echo("\nMonitor at: https://runpod.io/console/pods")

	    # One-liner: setup + train.
	    click.echo("\n" + "=" * 70)
	    click.echo("SSH in and run this ONE command for <5 min training:")
	    click.echo("=" * 70)

	    # The pipe must be a bare "|": an escaped "\|" would be printed with a
	    # literal backslash and the pasted command would not pipe to sh.
	    one_liner = f"""curl -LsSf https://astral.sh/uv/install.sh | sh && \\
	source $HOME/.local/bin/env && \\
	git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
	cd /workspace/bamboo-1 && uv sync && \\
	{train_cmd}"""

	    click.echo(one_liner)
	    click.echo("=" * 70)


	@cli.command("volume-list")
	def volume_list():
	    """List all network volumes."""
	    # Prints one line per volume (name, ID, size, datacenter), or a hint on
	    # how to create one when the account has none.
	    query = """
	    query {
	        myself {
	            networkVolumes {
	                id
	                name
	                size
	                dataCenterId
	            }
	        }
	    }
	    """
	    result = _graphql_request(query)

	    # Report API errors the same way volume-create / volume-delete do.
	    if "errors" in result:
	        click.echo(f"Error: {result['errors'][0]['message']}")
	        return

	    # "data" and "myself" can be present but null, in which case the default
	    # passed to .get() is not used; `or` covers both missing and null.
	    account = (result.get("data") or {}).get("myself") or {}
	    volumes = account.get("networkVolumes") or []

	    if not volumes:
	        click.echo("No network volumes found.")
	        click.echo("\nCreate one with: uv run src/runpod_setup.py volume-create --name bamboo-data --size 10")
	        return

	    click.echo("Network Volumes:")
	    for vol in volumes:
	        # Fall back to the raw datacenter ID when it is not in DATACENTERS.
	        dc = DATACENTERS.get(vol['dataCenterId'], vol['dataCenterId'])
	        click.echo(f" - {vol['name']} ({vol['id']}): {vol['size']}GB @ {dc}")


	@cli.command("volume-create")
	@click.option("--name", default="bamboo-data", help="Volume name")
	@click.option("--size", default=10, type=int, help="Size in GB")
	@click.option("--datacenter", default="EUR-IS-1", type=click.Choice(list(DATACENTERS.keys())), help="Datacenter")
	def volume_create(name, size, datacenter):
	    """Create a network volume for data storage."""
	    # Sends a createNetworkVolume mutation and prints the new volume's ID,
	    # or an error line when the API reports a failure.
	    click.echo("Creating network volume...")
	    click.echo(f" Name: {name}")
	    click.echo(f" Size: {size}GB")
	    click.echo(f" Datacenter: {DATACENTERS[datacenter]}")

	    query = """
	    mutation createNetworkVolume($input: CreateNetworkVolumeInput!) {
	        createNetworkVolume(input: $input) {
	            id
	            name
	            size
	            dataCenterId
	        }
	    }
	    """
	    variables = {
	        "input": {
	            "name": name,
	            "size": size,
	            "dataCenterId": datacenter,
	        }
	    }

	    result = _graphql_request(query, variables)

	    if "errors" in result:
	        click.echo(f"\nError: {result['errors'][0]['message']}")
	        return

	    # "data" / "createNetworkVolume" can be missing or null; report that
	    # instead of crashing on the ID lookup.
	    volume = (result.get("data") or {}).get("createNetworkVolume") or {}
	    volume_id = volume.get("id")
	    if not volume_id:
	        click.echo("\nError: API response did not include the new volume's ID.")
	        return

	    click.echo("\nVolume created!")
	    click.echo(f" ID: {volume_id}")
	    click.echo(f"\nUse with: uv run src/runpod_setup.py launch --volume {volume_id}")


	@cli.command("volume-delete")
	@click.argument("volume_id")
	@click.confirmation_option(prompt="Are you sure you want to delete this volume?")
	def volume_delete(volume_id):
	    """Delete a network volume."""
	    # Runs only after the confirmation prompt (or --yes) has been accepted.
	    mutation = """
	mutation deleteNetworkVolume($input: DeleteNetworkVolumeInput!) {
	deleteNetworkVolume(input: $input)
	}
	"""
	    response = _graphql_request(mutation, {"input": {"id": volume_id}})

	    # Success path first; otherwise print the first reported error.
	    if "errors" not in response:
	        click.echo(f"Volume {volume_id} deleted.")
	        return

	    click.echo(f"Error: {response['errors'][0]['message']}")


	# Script entry point: hand control to the click command group.
	if __name__ == "__main__":
	    cli()