#!/usr/bin/env python3
"""
Lambda Labs GPU Instance Launcher for AQuA-RAT Training

This script automates launching an 8x H100 GPU instance on Lambda Labs
and deploying the nanochatAquaRat training pipeline.

Prerequisites:
1. Lambda Labs API key (set as LAMBDA_API_KEY environment variable)
2. Your SSH public key added to Lambda Labs account
3. W&B API key for logging (set as WANDB_API_KEY environment variable;
   only needed when the training pipeline is actually deployed)

Optional:
- WANDB_ENTITY environment variable, forwarded to the instance's .env file

Usage:
    python launch_lambda.py --instance-type gpu_8x_h100_sxm5 --region us-west-1
    python launch_lambda.py --list-types
    python launch_lambda.py --terminate <instance-id>
"""

import os
import sys
import time
import argparse
import subprocess
import tempfile
from pathlib import Path

try:
    import lambda_cloud_client
    from lambda_cloud_client.rest import ApiException
except ImportError:
    print("Installing lambda-cloud-client...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lambda-cloud-client"])
    import lambda_cloud_client
    from lambda_cloud_client.rest import ApiException


def _lookup(obj, *names):
    """Return the first attribute of *obj* among *names* that exists and is not None."""
    for attr in names:
        value = getattr(obj, attr, None)
        if value is not None:
            return value
    return None


def check_env_vars(require_wandb=True):
    """Check required environment variables are set; exit with status 1 if not.

    Args:
        require_wandb: When True (the default) WANDB_API_KEY is required in
            addition to LAMBDA_API_KEY. Pass False for operations that never
            touch W&B (listing types, terminating, launching without deploy).
    """
    # name -> (human description, placeholder shown in the export hint)
    required_vars = {
        'LAMBDA_API_KEY': ('Lambda Labs API key', 'your-lambda-api-key'),
    }
    if require_wandb:
        required_vars['WANDB_API_KEY'] = ('Weights & Biases API key', 'your-wandb-api-key')

    missing = [var for var in required_vars if not os.getenv(var)]
    if not missing:
        return

    print("ERROR: Missing required environment variables:")
    print("\n".join(f" - {var} ({required_vars[var][0]})" for var in missing))
    print("\nSet them with:")
    for var in missing:
        print(f" export {var}='{required_vars[var][1]}'")
    sys.exit(1)


def get_api_client():
    """Initialize Lambda Cloud API client from the LAMBDA_API_KEY environment variable."""
    api_key = os.getenv('LAMBDA_API_KEY')
    configuration = lambda_cloud_client.Configuration(
        host="https://cloud.lambdalabs.com/api/v1",
        access_token=api_key
    )
    return lambda_cloud_client.ApiClient(configuration)


def list_available_instance_types(api_client):
    """Print instance types that currently have capacity and return the raw data.

    Returns:
        The ``data`` mapping of the instance-types response (type name -> details).
        Exits with status 1 if the API call fails.
    """
    api_instance = lambda_cloud_client.DefaultApi(api_client)

    try:
        response = api_instance.instance_types()
    except ApiException as e:
        print(f"Error fetching instance types: {e}")
        sys.exit(1)

    print("\nAvailable Instance Types:")
    print("-" * 80)

    for type_name, details in response.data.items():
        instance_type = details.instance_type
        specs = _lookup(instance_type, "specs")

        # NOTE(review): per the Lambda Cloud API schema the capacity list is a
        # sibling of `instance_type` and the price lives on `instance_type`
        # itself; the locations the script originally read are kept as
        # fallbacks so either client model shape works. Confirm against the
        # installed lambda-cloud-client version.
        regions = (_lookup(details, "regions_with_capacity_available")
                   or _lookup(instance_type, "regions_with_capacity_available"))
        if not regions:
            continue

        # Regions may be objects with a `name` attribute or plain strings.
        region_names = [str(_lookup(region, "name") or region) for region in regions]

        gpus = _lookup(specs, "gpus")
        memory = _lookup(specs, "memory_gib", "memory_gbs")
        price_cents = _lookup(instance_type, "price_cents_per_hour")
        if price_cents is None:
            price_cents = _lookup(specs, "price_cents_per_hour")

        print(f"\n{type_name}:")
        print(f" GPUs: {gpus if gpus is not None else 'n/a'}")
        # The spec's memory figure is host RAM, not GPU memory.
        print(f" Memory: {memory if memory is not None else 'n/a'} GiB")
        if price_cents is not None:
            print(f" Price: ${price_cents / 100:.2f}/hour")
        else:
            print(" Price: n/a")
        print(f" Available regions: {', '.join(region_names)}")

    return response.data


def launch_instance(api_client, instance_type, region, name="nanochat-aquarat-training",
                    ssh_key_name=None):
    """Launch a Lambda Labs GPU instance and return its instance ID.

    Args:
        api_client: Client returned by ``get_api_client``.
        instance_type: Instance type name, e.g. ``gpu_8x_h100_sxm5``.
        region: Region name to launch in.
        name: Display name for the instance.
        ssh_key_name: Name of the account SSH key to install. Defaults to the
            first key registered on the account.

    Exits with status 1 if no usable SSH key exists or the API call fails.
    """
    api_instance = lambda_cloud_client.DefaultApi(api_client)

    # Get SSH keys
    try:
        ssh_keys_response = api_instance.list_ssh_keys()
    except ApiException as e:
        print(f"Error fetching SSH keys: {e}")
        sys.exit(1)

    if not ssh_keys_response.data:
        print("ERROR: No SSH keys found in your Lambda Labs account.")
        print("Please add an SSH key at: https://cloud.lambdalabs.com/ssh-keys")
        sys.exit(1)

    available_keys = [key.name for key in ssh_keys_response.data]
    if ssh_key_name is None:
        # The launch endpoint documents that exactly one key must be given,
        # so sending every key on the account fails for multi-key accounts.
        ssh_key_name = available_keys[0]
        if len(available_keys) > 1:
            print(f"Multiple SSH keys found ({', '.join(available_keys)}); "
                  f"defaulting to the first. Use --ssh-key to choose another.")
    elif ssh_key_name not in available_keys:
        print(f"ERROR: SSH key '{ssh_key_name}' not found in your Lambda Labs account.")
        print(f"Available keys: {', '.join(available_keys)}")
        sys.exit(1)
    print(f"Using SSH key: {ssh_key_name}")

    # Launch instance
    launch_request = lambda_cloud_client.LaunchInstanceRequest(
        region_name=region,
        instance_type_name=instance_type,
        ssh_key_names=[ssh_key_name],
        name=name,
        quantity=1
    )

    print(f"\nLaunching {instance_type} instance in {region}...")
    try:
        response = api_instance.launch_instance(launch_request)
    except ApiException as e:
        print(f"Error launching instance: {e}")
        sys.exit(1)

    if not (response.data and response.data.instance_ids):
        print("ERROR: Instance launch failed")
        sys.exit(1)

    instance_id = response.data.instance_ids[0]
    print("✓ Instance launched successfully!")
    print(f" Instance ID: {instance_id}")
    return instance_id


def wait_for_instance(api_client, instance_id, timeout=300):
    """Poll every 10 seconds until the instance is "active" and return it.

    Args:
        api_client: Client returned by ``get_api_client``.
        instance_id: ID returned by ``launch_instance``.
        timeout: Maximum number of seconds to wait.

    Exits with status 1 if the instance is not active within *timeout*.
    API errors while polling are reported and retried.
    """
    api_instance = lambda_cloud_client.DefaultApi(api_client)

    print("\nWaiting for instance to be ready...")
    start_time = time.time()

    while time.time() - start_time < timeout:
        try:
            response = api_instance.get_instance(instance_id)
            instance = response.data

            if instance.status == "active":
                print("✓ Instance is ready!")
                print(f" IP Address: {instance.ip}")
                print(f" SSH Command: ssh ubuntu@{instance.ip}")
                return instance

            print(f" Status: {instance.status}... waiting")
            time.sleep(10)
        except ApiException as e:
            print(f"Error checking instance status: {e}")
            time.sleep(10)

    print("ERROR: Timeout waiting for instance to be ready")
    # The launch request already succeeded, so the instance may still be
    # running and accruing charges even though we gave up waiting.
    print(f"Instance {instance_id} may still be running; terminate it if no longer needed.")
    sys.exit(1)


def terminate_instance(api_client, instance_id):
    """Request termination of *instance_id*; exit with status 1 on API error."""
    api_instance = lambda_cloud_client.DefaultApi(api_client)
    request = lambda_cloud_client.TerminateInstanceRequest(instance_ids=[instance_id])

    print(f"\nTerminating instance {instance_id}...")
    try:
        api_instance.terminate_instance(request)
    except ApiException as e:
        print(f"Error terminating instance: {e}")
        sys.exit(1)
    print(f"✓ Termination requested for instance {instance_id}")


def generate_startup_script():
    """Generate the bash startup script to run on the instance.

    The script clones the training repository (if absent), writes a ``.env``
    file with the W&B credentials, and starts training in a detached
    ``screen`` session named ``training``.

    Raises:
        RuntimeError: If WANDB_API_KEY is not set in the local environment.
    """
    wandb_key = os.getenv('WANDB_API_KEY')
    if not wandb_key:
        raise RuntimeError("WANDB_API_KEY is not set")

    # Values are resolved locally: the heredoc below is quoted ('EOF'), so the
    # remote shell performs no expansion on its contents.
    env_lines = [
        f"WANDB_API_KEY={wandb_key}",
        "WANDB_PROJECT=nanochat-aquarat",
    ]
    wandb_entity = os.getenv('WANDB_ENTITY')
    if wandb_entity:
        env_lines.append(f"WANDB_ENTITY={wandb_entity}")
    env_block = "\n".join(env_lines)

    script = f"""#!/bin/bash
set -euo pipefail

# Clone repository if not exists
cd /home/ubuntu
if [ ! -d "nanochatAquaRat" ]; then
    git clone https://github.com/HarleyCoops/nanochatAquaRat.git
fi

cd nanochatAquaRat

# Create .env file with credentials (after the clone, so the directory exists)
cat > /home/ubuntu/nanochatAquaRat/.env << 'EOF'
{env_block}
EOF
chmod 600 /home/ubuntu/nanochatAquaRat/.env

# Make script executable
chmod +x run_aquarat_small.sh

# Run training in screen session
screen -dmS training bash -c './run_aquarat_small.sh 2>&1 | tee training.log'

echo "Training started in screen session 'training'"
echo "To attach: screen -r training"
echo "To detach: Ctrl+A then D"
echo "To view log: tail -f training.log"
"""

    return script


def deploy_and_run(instance_ip):
    """Copy the startup script to the instance over scp and run it over ssh.

    Raises:
        subprocess.CalledProcessError: If scp or the remote script fails.
    """
    print("\nDeploying code and starting training...")

    startup_script = generate_startup_script()
    ssh_options = ["-o", "StrictHostKeyChecking=no"]
    remote_script = "/tmp/startup.sh"

    # The script embeds the W&B API key, so stage it in a private temporary
    # directory that is removed as soon as the copy finishes.
    with tempfile.TemporaryDirectory(prefix="lambda_startup_") as staging_dir:
        script_path = Path(staging_dir) / "lambda_startup.sh"
        script_path.write_text(startup_script, encoding="utf-8")

        # Copy startup script to instance
        print(" Copying startup script...")
        subprocess.run(
            ["scp", *ssh_options, str(script_path), f"ubuntu@{instance_ip}:{remote_script}"],
            check=True,
        )

    # Execute startup script, then delete the remote copy (it contains the
    # key) while still propagating the script's exit status.
    print(" Starting training...")
    remote_command = (
        f"bash {remote_script}; status=$?; rm -f {remote_script}; exit $status"
    )
    subprocess.run(
        ["ssh", *ssh_options, f"ubuntu@{instance_ip}", remote_command],
        check=True,
    )

    print("\n" + "=" * 80)
    print("✓ Training deployment complete!")
    print("=" * 80)
    print("\nTo monitor your training:")
    print(f" 1. SSH: ssh ubuntu@{instance_ip}")
    print(" 2. Attach to screen: screen -r training")
    print(" 3. View log: tail -f ~/nanochatAquaRat/training.log")
    print(" 4. W&B Dashboard: https://wandb.ai")
    print("\nTo detach from screen: Ctrl+A then D")
    print("\nRemember to terminate the instance when done to avoid charges!")


def main():
    """Parse command-line arguments and list, terminate, or launch (and deploy)."""
    parser = argparse.ArgumentParser(description="Launch Lambda Labs instance for AQuA-RAT training")
    parser.add_argument("--instance-type", default="gpu_8x_h100_sxm5",
                        help="Instance type (default: gpu_8x_h100_sxm5)")
    parser.add_argument("--region", default="us-west-1",
                        help="Region to launch in (default: us-west-1)")
    parser.add_argument("--name", default="nanochat-aquarat-training",
                        help="Instance name (default: nanochat-aquarat-training)")
    parser.add_argument("--ssh-key", default=None,
                        help="Name of the account SSH key to use (default: first key on the account)")
    parser.add_argument("--list-types", action="store_true",
                        help="List available instance types and exit")
    parser.add_argument("--no-deploy", action="store_true",
                        help="Launch instance but don't deploy code")
    parser.add_argument("--terminate", metavar="INSTANCE_ID", default=None,
                        help="Terminate the given instance and exit")

    args = parser.parse_args()

    print("=" * 80)
    print("Lambda Labs GPU Instance Launcher for AQuA-RAT Training")
    print("=" * 80)

    # Check environment variables; the W&B key only matters when deploying.
    will_deploy = not (args.list_types or args.terminate or args.no_deploy)
    check_env_vars(require_wandb=will_deploy)

    # Initialize API client
    api_client = get_api_client()

    # List available types if requested
    if args.list_types:
        list_available_instance_types(api_client)
        return

    # Terminate an existing instance if requested
    if args.terminate:
        terminate_instance(api_client, args.terminate)
        return

    # Launch instance
    instance_id = launch_instance(api_client, args.instance_type, args.region, args.name,
                                  ssh_key_name=args.ssh_key)

    # Wait for instance to be ready
    instance = wait_for_instance(api_client, instance_id)

    # Deploy and run training
    if not args.no_deploy:
        time.sleep(5)  # Give SSH a moment to be fully ready
        try:
            deploy_and_run(instance.ip)
        except subprocess.CalledProcessError as e:
            print(f"\nWarning: Deployment encountered an error: {e}")
            print("You can manually SSH to the instance and run the training:")
            print(f" ssh ubuntu@{instance.ip}")
            print(" cd nanochatAquaRat && bash run_aquarat_small.sh")

    print("\n" + "=" * 80)
    print("Instance Information")
    print("=" * 80)
    print(f"Instance ID: {instance_id}")
    print(f"IP Address: {instance.ip}")
    print(f"Status: {instance.status}")
    print("\nTo terminate this instance:")
    print(f" python launch_lambda.py --terminate {instance_id}")


if __name__ == "__main__":
    main()