File size: 11,147 Bytes
3742716 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
#!/bin/bash
# Script to launch and configure AWS g5.xlarge instance for Seriguela training
# FIXED VERSION - Includes Wandb validation and proper setup
# Usage: ./launch_instance_fixed.sh --wandb-key KEY [--hf-token TOKEN] [--instance-type TYPE] [--key-name NAME]
# Stop at the first command that exits with a non-zero status.
set -o errexit

# Terminal colours. The values are kept as literal backslash sequences; they
# only turn into escape codes when a message is printed with `echo -e`.
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
RED="\033[0;31m"
BLUE="\033[0;34m"
NC="\033[0m" # resets the colour
# Print an informational message on stdout, prefixed with a green [INFO] tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
#                 because the line is emitted with `echo -e`
print_status() {
  local message=$1
  echo -e "${GREEN}[INFO]${NC} ${message}"
}
# Print a warning, prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
#                 because the line is emitted with `echo -e`
# Outputs:   writes to stderr, so diagnostics stay out of stdout
print_warning() {
  echo -e "${YELLOW}[WARN]${NC} $1" >&2
}
# Print an error message, prefixed with a red [ERROR] tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text; backslash escapes in it are interpreted
#                 because the line is emitted with `echo -e`
# Outputs:   writes to stderr, so errors stay visible when stdout is
#            redirected or captured
print_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Default configuration

# Filled in later by auto-detection (the security group may also be created).
AMI_ID=''
KEY_NAME=''
SECURITY_GROUP=''

# Credentials; they stay empty unless supplied on the command line.
HF_TOKEN=''
WANDB_KEY=''

# Fixed launch parameters.
INSTANCE_TYPE='g5.xlarge'
INSTANCE_NAME='seriguela-training'
VOLUME_SIZE=100

# Region reported by the AWS CLI configuration; "us-east-1" is used when
# `aws configure get region` exits non-zero.
REGION="$(aws configure get region 2>/dev/null || echo 'us-east-1')"
# Parse arguments

#######################################
# Parse the command-line options into the configuration globals.
# Globals:   HF_TOKEN, WANDB_KEY, INSTANCE_TYPE, KEY_NAME (written)
# Arguments: the script's positional parameters
# Outputs:   usage text on stdout for --help; error text on stderr
# Exits:     0 after --help; 1 on an unknown option or on an option that
#            is missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --hf-token | --wandb-key | --instance-type | --key-name)
        # Without this guard `shift 2` fails when the value is missing and
        # `set -e` ends the script without printing anything.
        if [[ $# -lt 2 ]]; then
          echo "Option $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --hf-token) HF_TOKEN="$2" ;;
          --wandb-key) WANDB_KEY="$2" ;;
          --instance-type) INSTANCE_TYPE="$2" ;;
          --key-name) KEY_NAME="$2" ;;
        esac
        shift 2
        ;;
      --help)
        echo "Usage: $0 [OPTIONS]"
        echo "Options:"
        echo " --hf-token TOKEN HuggingFace token (required for push to hub)"
        echo " --wandb-key KEY Wandb API key (required for logging)"
        echo " --instance-type TYPE Instance type (default: g5.xlarge)"
        echo " --key-name NAME SSH key pair name"
        echo ""
        echo "Example:"
        echo " $0 --hf-token hf_xxx --wandb-key wandb_v1_xxx"
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Validate required tokens

# The Wandb key is mandatory: without it the launch is refused.
if [[ -z "${WANDB_KEY}" ]]; then
  print_error "Wandb API key is required! Use --wandb-key"
  print_warning "Get your key from: https://wandb.ai/authorize"
  exit 1
fi

# The HuggingFace token is optional: its absence only produces warnings.
if [[ -z "${HF_TOKEN}" ]]; then
  print_warning "HuggingFace token not provided. Model won't be pushed to Hub."
  print_warning "Get your token from: https://huggingface.co/settings/tokens"
fi

print_status "Launching Seriguela training instance with validated setup..."
# Find Deep Learning AMI
print_status "Finding Deep Learning AMI..."

# Name pattern of the wanted image, and a JMESPath expression that sorts the
# matches by creation date and keeps the ImageId of the last (newest) one.
ami_name_filter="Name=name,Values=*Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"
ami_newest_query="Images | sort_by(@, &CreationDate) | [-1].ImageId"

AMI_ID="$(aws ec2 describe-images \
  --owners amazon \
  --filters "${ami_name_filter}" \
  --query "${ami_newest_query}" \
  --output text)"

# With --output text an empty result is reported as the word "None".
case "${AMI_ID}" in
  '' | None)
    print_error "Could not find Deep Learning AMI"
    exit 1
    ;;
esac

print_status "Using AMI: $AMI_ID"
# Find or select key pair
#
# When --key-name was not given, fall back to the first key pair the account
# reports. The lookup's stderr is silenced, so a failing call is mapped to
# "nothing found"; otherwise `set -e` would end the script at the assignment
# without printing any explanation.
if [[ -z "$KEY_NAME" ]]; then
  KEY_NAME=$(aws ec2 describe-key-pairs \
    --query "KeyPairs[0].KeyName" \
    --output text 2>/dev/null) || KEY_NAME=""
fi

# With --output text an empty result is reported as the word "None".
if [[ -z "$KEY_NAME" || "$KEY_NAME" == "None" ]]; then
  print_error "No SSH key pair found. Create one first or specify with --key-name"
  exit 1
fi

print_status "Using key pair: $KEY_NAME"
# Find or create security group
#
# The lookup's stderr is silenced, so a failing call is mapped to "not found"
# instead of letting `set -e` stop the script without a message; a genuine
# problem then shows up loudly in create-security-group below.
SECURITY_GROUP=$(aws ec2 describe-security-groups \
  --filters "Name=group-name,Values=seriguela-sg" \
  --query "SecurityGroups[0].GroupId" \
  --output text 2>/dev/null) || SECURITY_GROUP=""

# Resolve the caller's public address once, before anything is created, so a
# failed lookup cannot leave behind a new group that has no SSH rule.
# -4 forces IPv4 because the rule below is an IPv4 /32; -f turns HTTP errors
# into a failure instead of returning an error page as the "address".
MY_IP=$(curl -4 -fsS --max-time 10 ifconfig.me 2>/dev/null) || MY_IP=""
if [[ ! "$MY_IP" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}$ ]]; then
  print_error "Could not determine public IPv4 address (got: '${MY_IP:0:40}')"
  exit 1
fi

if [[ -z "$SECURITY_GROUP" || "$SECURITY_GROUP" == "None" ]]; then
  print_status "Creating security group..."
  SECURITY_GROUP=$(aws ec2 create-security-group \
    --group-name seriguela-sg \
    --description "Security group for Seriguela training" \
    --query "GroupId" --output text)

  # Allow SSH from the current IP only
  aws ec2 authorize-security-group-ingress \
    --group-id "$SECURITY_GROUP" \
    --protocol tcp --port 22 \
    --cidr "${MY_IP}/32"
  print_status "Created security group with SSH access from $MY_IP"
else
  # Update the existing group with the current IP. This is deliberately
  # best-effort: the call fails when the rule already exists, which is the
  # expected case on repeated launches from the same address.
  aws ec2 authorize-security-group-ingress \
    --group-id "$SECURITY_GROUP" \
    --protocol tcp --port 22 \
    --cidr "${MY_IP}/32" 2>/dev/null || true
fi

print_status "Using security group: $SECURITY_GROUP"
# Create user-data script for automatic setup with validation
#
# The outer heredoc delimiter is unquoted on purpose: HF_TOKEN and WANDB_KEY
# are expanded into the text here, on the launching machine, while everything
# that has to be evaluated on the instance (command substitutions, exit-status
# checks) is protected with a backslash.
# The nested heredocs are plain text at this point. Their bodies and their
# terminators must stay at column 0, and the embedded Python needs its real
# indentation to be parseable.
#
# NOTE(review): both tokens are embedded in the instance user-data in clear
# text, which is retrievable from the instance metadata service - confirm
# this is acceptable for these credentials.
# NOTE(review): the generated script calls `cloud-init status --wait`, yet
# user-data is itself executed by cloud-init - confirm on a real instance
# that this wait returns.
USER_DATA=$(cat << USERDATA
#!/bin/bash
exec > /var/log/user-data.log 2>&1
set -x
echo "=========================================="
echo "Seriguela Instance Setup - VALIDATED"
echo "Started: \$(date)"
echo "=========================================="
# Wait for cloud-init to complete
cloud-init status --wait
# Setup as ubuntu user
sudo -u ubuntu bash << 'UBUNTUSETUP'
cd /home/ubuntu
echo "[1/8] Installing system dependencies..."
sudo apt-get update -qq
sudo apt-get install -y -qq python3-venv python3-pip git dos2unix
echo "[2/8] Cloning repository..."
git clone https://github.com/augustocsc/seriguela.git
cd seriguela
echo "[3/8] Creating virtual environment..."
python3 -m venv venv
source venv/bin/activate
echo "[4/8] Upgrading pip..."
pip install --upgrade pip -q
echo "[5/8] Installing requirements..."
pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 -q
echo "[6/8] Upgrading Wandb to latest version..."
pip install --upgrade 'wandb>=0.24.1' -q
echo "[7/8] Configuring environment..."
# Create .env file
cat > .env << 'ENVFILE'
HF_TOKEN=$HF_TOKEN
WANDB_API_KEY=$WANDB_KEY
ENVFILE
# The file holds secrets: restrict it to its owner
chmod 600 .env
echo "[8/8] Validating setup..."
# Validate Python packages
python3 << 'PYCHECK'
import sys
print("Testing imports...")
try:
    import transformers
    print(f"✅ transformers {transformers.__version__}")
    import torch
    print(f"✅ torch {torch.__version__}")
    import wandb
    print(f"✅ wandb {wandb.__version__}")
    import peft
    print(f"✅ peft {peft.__version__}")
except ImportError as e:
    print(f"❌ Import failed: {e}")
    sys.exit(1)
PYCHECK
if [ \$? -ne 0 ]; then
  echo "❌ Package validation failed"
  exit 1
fi
# Validate GPU
echo "Checking GPU..."
if nvidia-smi &> /dev/null; then
  echo "✅ GPU detected:"
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
else
  echo "❌ No GPU detected"
  exit 1
fi
# Validate Wandb authentication
if [ -n "$WANDB_KEY" ]; then
  echo "Validating Wandb authentication..."
python3 << PYVALIDATE
import wandb
import os
try:
    result = wandb.login(key='$WANDB_KEY')
    if result:
        print("✅ Wandb authentication successful")
        # Get user info
        import requests
        response = requests.get('https://api.wandb.ai/graphql',
                                headers={'Authorization': f'Bearer $WANDB_KEY'},
                                json={'query': '{viewer{entity}}'})
        if response.status_code == 200:
            print(f" Logged in to Wandb")
    else:
        print("❌ Wandb authentication failed")
        exit(1)
except Exception as e:
    print(f"❌ Wandb validation error: {e}")
    exit(1)
PYVALIDATE
  if [ \$? -ne 0 ]; then
    echo "❌ Wandb authentication failed"
    exit 1
  fi
else
  echo "⚠️ No Wandb key provided - skipping validation"
fi
# Validate HuggingFace token
if [ -n "$HF_TOKEN" ]; then
  echo "Validating HuggingFace authentication..."
python3 << PYVALIDATE
from huggingface_hub import HfApi
try:
    api = HfApi(token='$HF_TOKEN')
    user = api.whoami()
    print(f"✅ HuggingFace authentication successful")
    print(f" Logged in as: {user.get('name', 'unknown')}")
except Exception as e:
    print(f"❌ HuggingFace validation error: {e}")
    exit(1)
PYVALIDATE
  if [ \$? -ne 0 ]; then
    echo "❌ HuggingFace authentication failed"
    exit 1
  fi
else
  echo "⚠️ No HuggingFace token provided - model won't be pushed to Hub"
fi
# All validations passed
echo ""
echo "=========================================="
echo "✅ Setup Complete and Validated!"
echo "Finished: \$(date)"
echo "=========================================="
# Create completion markers
touch /home/ubuntu/.setup_complete
touch /home/ubuntu/.setup_validated
# Create info file
cat > /home/ubuntu/setup_info.txt << 'INFOFILE'
Setup completed successfully!
Validated:
- Python packages installed
- GPU detected
- Wandb authenticated
- HuggingFace authenticated (if token provided)
Ready to train!
Quick commands:
cd ~/seriguela
source venv/bin/activate
python scripts/train.py --help
Monitor scripts:
bash scripts/aws/monitor_training_auto.sh
INFOFILE
echo "Setup info saved to ~/setup_info.txt"
UBUNTUSETUP
# End of setup
echo "User-data script completed"
USERDATA
)
# Replace placeholder tokens in user-data
# Because the heredoc above is unquoted, both tokens have already been
# expanded; these substitutions only act if a literal placeholder is still
# present in the text.
USER_DATA="${USER_DATA//\$HF_TOKEN/$HF_TOKEN}"
USER_DATA="${USER_DATA//\$WANDB_KEY/$WANDB_KEY}"
# Launch instance
print_status "Launching instance..."

# Root volume definition and instance tags, built separately so the
# run-instances call below stays readable.
root_volume_mapping="[{\"DeviceName\":\"/dev/sda1\",\"Ebs\":{\"VolumeSize\":$VOLUME_SIZE,\"VolumeType\":\"gp3\"}}]"
instance_tag_spec="ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME},{Key=Project,Value=seriguela},{Key=AutoSetup,Value=validated}]"

run_instances_args=(
  --image-id "$AMI_ID"
  --instance-type "$INSTANCE_TYPE"
  --key-name "$KEY_NAME"
  --security-group-ids "$SECURITY_GROUP"
  --block-device-mappings "$root_volume_mapping"
  --tag-specifications "$instance_tag_spec"
  --user-data "$USER_DATA"
  --query "Instances[0].InstanceId"
  --output text
)

# The query keeps only the id of the single instance that was started.
INSTANCE_ID="$(aws ec2 run-instances "${run_instances_args[@]}")"

print_status "Instance launched: $INSTANCE_ID"
# Wait for instance to be running
print_status "Waiting for instance to start..."
aws ec2 wait instance-running --instance-ids "${INSTANCE_ID}"

# Get public IP
# JMESPath expression selecting the address of the one instance asked for.
public_ip_query="Reservations[0].Instances[0].PublicIpAddress"
PUBLIC_IP="$(aws ec2 describe-instances \
  --instance-ids "${INSTANCE_ID}" \
  --query "${public_ip_query}" \
  --output text)"
#######################################
# Print the post-launch summary: instance details and the ssh commands for
# connecting, following the setup log, waiting for validation and training.
# Globals:   INSTANCE_ID, PUBLIC_IP, KEY_NAME, GREEN, BLUE, YELLOW, NC (read)
# Outputs:   writes the summary to stdout
#######################################
print_instance_summary() {
  echo ""
  echo "=========================================="
  echo -e "${GREEN}Instance Ready!${NC}"
  echo "=========================================="
  echo "Instance ID: $INSTANCE_ID"
  echo "Public IP: $PUBLIC_IP"
  echo "Key Pair: $KEY_NAME"
  echo ""
  echo -e "${BLUE}Connect with:${NC}"
  echo " ssh -i ~/.ssh/${KEY_NAME}.pem ubuntu@${PUBLIC_IP}"
  echo ""
  echo -e "${BLUE}Check setup progress:${NC}"
  echo " ssh ubuntu@${PUBLIC_IP} 'tail -f /var/log/user-data.log'"
  echo ""
  echo -e "${BLUE}Wait for VALIDATED setup to complete:${NC}"
  # The remote loop polls for the marker file that the setup script creates
  # once every validation step has passed.
  echo " ssh ubuntu@${PUBLIC_IP} 'while [ ! -f ~/.setup_validated ]; do sleep 10; echo \"Setup in progress...\"; done; echo \"✅ Setup validated!\"; cat ~/setup_info.txt'"
  echo ""
  echo -e "${BLUE}Then run training:${NC}"
  echo " ssh ubuntu@${PUBLIC_IP} 'cd seriguela && source venv/bin/activate && bash scripts/aws/run_all_training.sh'"
  echo ""
  echo -e "${YELLOW}Setup includes:${NC}"
  echo " ✅ Wandb 0.24.1+ with authentication test"
  echo " ✅ HuggingFace authentication test"
  echo " ✅ GPU validation"
  echo " ✅ All packages validated"
  echo ""
}

print_instance_summary
# Save instance info
# Record the launch under ~/.seriguela: one single-value file each for the
# instance id, the public IP and the key name, plus a readable summary.
INFO_DIR="${HOME}/.seriguela"
mkdir -p "${INFO_DIR}"

echo "${INSTANCE_ID}" > "${INFO_DIR}/last_instance_id.txt"
echo "${PUBLIC_IP}" > "${INFO_DIR}/last_instance_ip.txt"
echo "${KEY_NAME}" > "${INFO_DIR}/last_key_name.txt"

{
  echo "Instance ID: ${INSTANCE_ID}"
  echo "Public IP: ${PUBLIC_IP}"
  echo "Key Name: ${KEY_NAME}"
  echo "Instance Type: ${INSTANCE_TYPE}"
  echo "Region: ${REGION}"
  echo "Launched: $(date)"
  echo "Setup: Validated (Wandb + HF + GPU)"
} > "${INFO_DIR}/last_instance_info.txt"

print_status "Instance info saved to: $INFO_DIR/"
echo ""
|