#!/bin/bash
# Script to launch and configure AWS g5.xlarge instance for Seriguela training
# FIXED VERSION - Includes Wandb validation and proper setup
# Usage: ./launch_instance_fixed.sh [--hf-token TOKEN] [--wandb-key KEY]

set -e

# Colors (ANSI escape sequences, interpreted by `echo -e`)
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m'

# Logging helpers: print a coloured severity tag followed by the message in $1.
print_status() { echo -e "${GREEN}[INFO]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[WARN]${NC} $1"; }
print_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Default configuration
INSTANCE_TYPE="g5.xlarge"
AMI_ID=""          # Will be auto-detected
KEY_NAME=""        # Will be auto-detected
SECURITY_GROUP=""  # Will be auto-detected or created
# Region from the AWS CLI configuration; falls back to us-east-1 when unset.
REGION=$(aws configure get region 2>/dev/null || echo "us-east-1")
VOLUME_SIZE=100
INSTANCE_NAME="seriguela-training"
HF_TOKEN=""
WANDB_KEY=""

# Abort with a clear message when an option that takes a value is the last
# argument. Without this check `shift 2` fails and `set -e` terminates the
# script without printing anything.
#   $1 - option name (for the message)
#   $2 - number of arguments still to be parsed, including the option itself
require_value() {
  if (( $2 < 2 )); then
    print_error "Option $1 requires a value"
    exit 1
  fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --hf-token)
      require_value "$1" "$#"
      HF_TOKEN="$2"
      shift 2
      ;;
    --wandb-key)
      require_value "$1" "$#"
      WANDB_KEY="$2"
      shift 2
      ;;
    --instance-type)
      require_value "$1" "$#"
      INSTANCE_TYPE="$2"
      shift 2
      ;;
    --key-name)
      require_value "$1" "$#"
      KEY_NAME="$2"
      shift 2
      ;;
    --help)
      echo "Usage: $0 [OPTIONS]"
      echo "Options:"
      echo " --hf-token TOKEN HuggingFace token (required for push to hub)"
      echo " --wandb-key KEY Wandb API key (required for logging)"
      echo " --instance-type TYPE Instance type (default: g5.xlarge)"
      echo " --key-name NAME SSH key pair name"
      echo ""
      echo "Example:"
      echo " $0 --hf-token hf_xxx --wandb-key wandb_v1_xxx"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
done

# Validate required tokens: the Wandb key is mandatory, the HF token optional.
if [ -z "$WANDB_KEY" ]; then
  print_error "Wandb API key is required! Use --wandb-key"
  print_warning "Get your key from: https://wandb.ai/authorize"
  exit 1
fi

if [ -z "$HF_TOKEN" ]; then
  print_warning "HuggingFace token not provided. Model won't be pushed to Hub."
  print_warning "Get your token from: https://huggingface.co/settings/tokens"
fi

print_status "Launching Seriguela training instance with validated setup..."

# Find Deep Learning AMI
print_status "Finding Deep Learning AMI..."
# Pick the most recently created Amazon-owned AMI whose name matches the
# Deep Learning Base GPU image for Ubuntu 22.04.
AMI_ID=$(aws ec2 describe-images \
  --owners amazon \
  --filters "Name=name,Values=*Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" \
  --query "Images | sort_by(@, &CreationDate) | [-1].ImageId" \
  --output text)

if [ -z "$AMI_ID" ] || [ "$AMI_ID" == "None" ]; then
  print_error "Could not find Deep Learning AMI"
  exit 1
fi
print_status "Using AMI: $AMI_ID"

# Find or select key pair
if [ -z "$KEY_NAME" ]; then
  # `|| true`: stderr is hidden here, so a failing call must not trip `set -e`
  # silently; the emptiness check below reports the problem instead.
  KEY_NAME=$(aws ec2 describe-key-pairs --query "KeyPairs[0].KeyName" --output text 2>/dev/null || true)
fi

if [ -z "$KEY_NAME" ] || [ "$KEY_NAME" == "None" ]; then
  print_error "No SSH key pair found. Create one first or specify with --key-name"
  exit 1
fi
print_status "Using key pair: $KEY_NAME"

# Determine the caller's public IPv4 address once; it is needed for the SSH
# ingress rule whether the security group already exists or is created below.
# The result is validated because it is passed straight into a CIDR.
MY_IP=$(curl -s -4 --max-time 10 ifconfig.me || true)
ipv4_re='^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'
if [[ ! "$MY_IP" =~ $ipv4_re ]]; then
  print_error "Could not determine public IPv4 address (got: '${MY_IP}')"
  exit 1
fi

# Find or create security group
# `|| true`: same reasoning as for the key-pair lookup; a failed lookup falls
# through to the create path, which reports the real AWS error.
SECURITY_GROUP=$(aws ec2 describe-security-groups \
  --filters "Name=group-name,Values=seriguela-sg" \
  --query "SecurityGroups[0].GroupId" \
  --output text 2>/dev/null || true)

if [ -z "$SECURITY_GROUP" ] || [ "$SECURITY_GROUP" == "None" ]; then
  print_status "Creating security group..."
  SECURITY_GROUP=$(aws ec2 create-security-group \
    --group-name seriguela-sg \
    --description "Security group for Seriguela training" \
    --query "GroupId" --output text)

  # Add SSH rule for the current IP
  aws ec2 authorize-security-group-ingress \
    --group-id "$SECURITY_GROUP" \
    --protocol tcp --port 22 \
    --cidr "${MY_IP}/32"

  print_status "Created security group with SSH access from $MY_IP"
else
  # Update security group with current IP. Best effort on purpose: the call
  # fails when the rule already exists, which is not an error for us.
  aws ec2 authorize-security-group-ingress \
    --group-id "$SECURITY_GROUP" \
    --protocol tcp --port 22 \
    --cidr "${MY_IP}/32" 2>/dev/null || true
fi
print_status "Using security group: $SECURITY_GROUP"

# Create user-data script for automatic setup with validation.
#
# The USERDATA heredoc delimiter is unquoted, so $HF_TOKEN and $WANDB_KEY are
# expanded by THIS shell while the text is built; anything that must reach the
# instance as a literal dollar sign is written as \$ (e.g. \$(date), \$?).
# No separate placeholder-replacement pass is needed afterwards.
USER_DATA=$(cat << USERDATA
#!/bin/bash
exec > /var/log/user-data.log 2>&1
set -x

echo "=========================================="
echo "Seriguela Instance Setup - VALIDATED"
echo "Started: \$(date)"
echo "=========================================="

# NOTE: do not call "cloud-init status --wait" here. This script is itself run
# by cloud-init in its final stage, so waiting for cloud-init to finish from
# inside it would block on its own completion.

# Setup as ubuntu user
sudo -u ubuntu bash << 'UBUNTUSETUP'
cd /home/ubuntu

echo "[1/8] Installing system dependencies..."
sudo apt-get update -qq
sudo apt-get install -y -qq python3-venv python3-pip git dos2unix

echo "[2/8] Cloning repository..."
git clone https://github.com/augustocsc/seriguela.git || { echo "❌ git clone failed"; exit 1; }
cd seriguela || { echo "❌ Repository directory not found"; exit 1; }

echo "[3/8] Creating virtual environment..."
python3 -m venv venv
source venv/bin/activate

echo "[4/8] Upgrading pip..."
pip install --upgrade pip -q

echo "[5/8] Installing requirements..."
pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 -q

echo "[6/8] Upgrading Wandb to latest version..."
pip install --upgrade 'wandb>=0.24.1' -q

echo "[7/8] Configuring environment..."
# Create .env file (holds secrets, so restrict it to the owner)
cat > .env << 'ENVFILE'
HF_TOKEN=$HF_TOKEN
WANDB_API_KEY=$WANDB_KEY
ENVFILE
chmod 600 .env

echo "[8/8] Validating setup..."

# Validate Python packages
python3 << 'PYCHECK'
import sys
print("Testing imports...")
try:
    import transformers
    print(f"✅ transformers {transformers.__version__}")
    import torch
    print(f"✅ torch {torch.__version__}")
    import wandb
    print(f"✅ wandb {wandb.__version__}")
    import peft
    print(f"✅ peft {peft.__version__}")
except ImportError as e:
    print(f"❌ Import failed: {e}")
    sys.exit(1)
PYCHECK

if [ \$? -ne 0 ]; then
    echo "❌ Package validation failed"
    exit 1
fi

# Validate GPU
echo "Checking GPU..."
if nvidia-smi &> /dev/null; then
    echo "✅ GPU detected:"
    nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
else
    echo "❌ No GPU detected"
    exit 1
fi

# Validate Wandb authentication
if [ -n "$WANDB_KEY" ]; then
    echo "Validating Wandb authentication..."
    python3 << PYVALIDATE
import wandb

try:
    result = wandb.login(key='$WANDB_KEY')
    if result:
        print("✅ Wandb authentication successful")
        # Get user info
        import requests
        response = requests.get('https://api.wandb.ai/graphql',
                                headers={'Authorization': f'Bearer $WANDB_KEY'},
                                json={'query': '{viewer{entity}}'})
        if response.status_code == 200:
            print(f" Logged in to Wandb")
    else:
        print("❌ Wandb authentication failed")
        exit(1)
except Exception as e:
    print(f"❌ Wandb validation error: {e}")
    exit(1)
PYVALIDATE

    if [ \$? -ne 0 ]; then
        echo "❌ Wandb authentication failed"
        exit 1
    fi
else
    echo "⚠️ No Wandb key provided - skipping validation"
fi

# Validate HuggingFace token
if [ -n "$HF_TOKEN" ]; then
    echo "Validating HuggingFace authentication..."
    python3 << PYVALIDATE
from huggingface_hub import HfApi

try:
    api = HfApi(token='$HF_TOKEN')
    user = api.whoami()
    print(f"✅ HuggingFace authentication successful")
    print(f" Logged in as: {user.get('name', 'unknown')}")
except Exception as e:
    print(f"❌ HuggingFace validation error: {e}")
    exit(1)
PYVALIDATE

    if [ \$? -ne 0 ]; then
        echo "❌ HuggingFace authentication failed"
        exit 1
    fi
else
    echo "⚠️ No HuggingFace token provided - model won't be pushed to Hub"
fi

# All validations passed
echo ""
echo "=========================================="
echo "✅ Setup Complete and Validated!"
echo "Finished: \$(date)"
echo "=========================================="

# Create completion markers
touch /home/ubuntu/.setup_complete
touch /home/ubuntu/.setup_validated

# Create info file
cat > /home/ubuntu/setup_info.txt << 'INFOFILE'
Setup completed successfully!

Validated:
- Python packages installed
- GPU detected
- Wandb authenticated
- HuggingFace authenticated (if token provided)

Ready to train!

Quick commands:
 cd ~/seriguela
 source venv/bin/activate
 python scripts/train.py --help

Monitor scripts:
 bash scripts/aws/monitor_training_auto.sh
INFOFILE

echo "Setup info saved to ~/setup_info.txt"

UBUNTUSETUP

# End of setup
echo "User-data script completed"
USERDATA
)

# Launch instance
print_status "Launching instance..."
INSTANCE_ID=$(aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-group-ids "$SECURITY_GROUP" \
  --block-device-mappings "[{\"DeviceName\":\"/dev/sda1\",\"Ebs\":{\"VolumeSize\":$VOLUME_SIZE,\"VolumeType\":\"gp3\"}}]" \
  --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME},{Key=Project,Value=seriguela},{Key=AutoSetup,Value=validated}]" \
  --user-data "$USER_DATA" \
  --query "Instances[0].InstanceId" \
  --output text)

print_status "Instance launched: $INSTANCE_ID"

# Wait for instance to be running
print_status "Waiting for instance to start..."
# Block until EC2 reports the freshly launched instance as running.
aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"

# Look up the public IP address assigned to the instance.
PUBLIC_IP=$(
  aws ec2 describe-instances \
    --instance-ids "$INSTANCE_ID" \
    --query "Reservations[0].Instances[0].PublicIpAddress" \
    --output text
)

# Small output helpers for the summary below.
summary_rule="=========================================="
ssh_target="ubuntu@${PUBLIC_IP}"
blue_heading() { echo -e "${BLUE}$1${NC}"; }

# --- Connection summary -----------------------------------------------------
echo ""
echo "$summary_rule"
echo -e "${GREEN}Instance Ready!${NC}"
echo "$summary_rule"
echo "Instance ID: $INSTANCE_ID"
echo "Public IP: $PUBLIC_IP"
echo "Key Pair: $KEY_NAME"
echo ""

blue_heading "Connect with:"
echo " ssh -i ~/.ssh/${KEY_NAME}.pem ${ssh_target}"
echo ""

blue_heading "Check setup progress:"
echo " ssh ${ssh_target} 'tail -f /var/log/user-data.log'"
echo ""

blue_heading "Wait for VALIDATED setup to complete:"
echo " ssh ${ssh_target} 'while [ ! -f ~/.setup_validated ]; do sleep 10; echo \"Setup in progress...\"; done; echo \"✅ Setup validated!\"; cat ~/setup_info.txt'"
echo ""

blue_heading "Then run training:"
echo " ssh ${ssh_target} 'cd seriguela && source venv/bin/activate && bash scripts/aws/run_all_training.sh'"
echo ""

echo -e "${YELLOW}Setup includes:${NC}"
echo " ✅ Wandb 0.24.1+ with authentication test"
echo " ✅ HuggingFace authentication test"
echo " ✅ GPU validation"
echo " ✅ All packages validated"
echo ""

# --- Persist details of this launch under ~/.seriguela -----------------------
INFO_DIR="${HOME}/.seriguela"
mkdir -p "$INFO_DIR"

# Write value $2 to $INFO_DIR/last_<$1>.txt, overwriting any previous launch.
save_last() { echo "$2" > "$INFO_DIR/last_$1.txt"; }

save_last instance_id "$INSTANCE_ID"
save_last instance_ip "$PUBLIC_IP"
save_last key_name "$KEY_NAME"

# Human-readable summary of the same launch.
cat > "$INFO_DIR/last_instance_info.txt" << INFOEND
Instance ID: $INSTANCE_ID
Public IP: $PUBLIC_IP
Key Name: $KEY_NAME
Instance Type: $INSTANCE_TYPE
Region: $REGION
Launched: $(date)
Setup: Validated (Wandb + HF + GPU)
INFOEND

print_status "Instance info saved to: $INFO_DIR/"
echo ""