#!/bin/bash
#
# Launch an EC2 GPU instance for Seriguela training and bootstrap it through
# a user-data script that installs the project and validates the setup
# (Python packages, GPU, Wandb and optional HuggingFace authentication).
#
# Usage:
#   <this script> --wandb-key KEY [--hf-token TOKEN]
#                 [--instance-type TYPE] [--key-name NAME]
#
# Requires the AWS CLI (configured with credentials) and curl.

# Strict mode: abort on command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ANSI colour escape sequences for terminal output. They are stored as
# literal backslash sequences and only become real escape characters when
# printed through `echo -e` or `printf '%b'`.
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # "no colour": resets the terminal attributes

# Print an informational message with a green "[INFO]" tag to stdout.
# Arguments: $1 - message text (printed verbatim)
print_status() {
  # %b expands the colour escapes; %s keeps the message literal so that
  # backslashes in it (paths, regexes) are not interpreted as escapes.
  printf '%b[INFO]%b %s\n' "${GREEN-}" "${NC-}" "${1-}"
}
# Print a warning with a yellow "[WARN]" tag to stderr.
# Arguments: $1 - message text (printed verbatim)
print_warning() {
  # Diagnostics go to stderr so they never mix with captured stdout.
  printf '%b[WARN]%b %s\n' "${YELLOW-}" "${NC-}" "${1-}" >&2
}
# Print an error with a red "[ERROR]" tag to stderr.
# Arguments: $1 - message text (printed verbatim)
print_error() {
  # Diagnostics go to stderr so they never mix with captured stdout.
  printf '%b[ERROR]%b %s\n' "${RED-}" "${NC-}" "${1-}" >&2
}

# --- Default configuration (some values can be overridden by CLI flags) ---
INSTANCE_TYPE="g5.xlarge"
AMI_ID=""          # resolved later from the newest matching AMI
KEY_NAME=""        # resolved later, or set with --key-name
SECURITY_GROUP=""  # resolved or created later

# Region resolution: environment variables first, then the CLI profile
# configuration, then a us-east-1 fallback. The result is exported so every
# aws call below runs in the same region that is reported to the user;
# otherwise the fallback would be printed but never applied.
REGION="${AWS_REGION:-${AWS_DEFAULT_REGION:-}}"
if [[ -z "$REGION" ]]; then
  REGION=$(aws configure get region 2>/dev/null || true)
fi
REGION="${REGION:-us-east-1}"
export AWS_DEFAULT_REGION="$REGION"

VOLUME_SIZE=100    # root volume size passed to run-instances
INSTANCE_NAME="augusto-seriguela-training"
HF_TOKEN=""        # set with --hf-token (optional)
WANDB_KEY=""       # set with --wandb-key (required)

# Print the command-line help text to stdout.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo " --hf-token TOKEN HuggingFace token (required for push to hub)"
  echo " --wandb-key KEY Wandb API key (required for logging)"
  echo " --instance-type TYPE Instance type (default: g5.xlarge)"
  echo " --key-name NAME SSH key pair name"
  echo ""
  echo "Example:"
  echo " $0 --hf-token hf_xxx --wandb-key wandb_v1_xxx"
}

# Abort with a clear message when a value-taking option is the last
# argument. Without this check `shift 2` fails and `set -e` terminates the
# script without printing anything.
# Arguments: $1 - option name, $2 - number of arguments still unparsed
_require_option_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Option $1 requires a value" >&2
    exit 1
  fi
}

# Parse command-line options into the HF_TOKEN, WANDB_KEY, INSTANCE_TYPE
# and KEY_NAME globals. Exits 0 after printing help, 1 on an unknown option
# or a missing option value.
# Arguments: the script's positional parameters ("$@")
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --hf-token)
        _require_option_value "$1" "$#"
        HF_TOKEN="$2"
        shift 2
        ;;
      --wandb-key)
        _require_option_value "$1" "$#"
        WANDB_KEY="$2"
        shift 2
        ;;
      --instance-type)
        _require_option_value "$1" "$#"
        INSTANCE_TYPE="$2"
        shift 2
        ;;
      --key-name)
        _require_option_value "$1" "$#"
        KEY_NAME="$2"
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# --- Validate credentials ---
# The Wandb key is mandatory: without it the script refuses to launch.
if [[ -z "$WANDB_KEY" ]]; then
  print_error "Wandb API key is required! Use --wandb-key"
  print_warning "Get your key from: https://wandb.ai/authorize"
  exit 1
fi

# The HuggingFace token is optional; without it the run still proceeds but
# the model will not be pushed to the Hub.
if [[ -z "$HF_TOKEN" ]]; then
  print_warning "HuggingFace token not provided. Model won't be pushed to Hub."
  print_warning "Get your token from: https://huggingface.co/settings/tokens"
fi

print_status "Launching Seriguela training instance with validated setup..."

# --- Locate the most recent Deep Learning AMI ---
print_status "Finding Deep Learning AMI..."
# Images are sorted by creation date and the newest one is taken. Only
# images in the "available" state are considered, so an image that is still
# pending or has failed can never be selected.
AMI_ID=$(aws ec2 describe-images \
  --owners amazon \
  --filters \
    "Name=name,Values=*Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" \
    "Name=state,Values=available" \
  --query "Images | sort_by(@, &CreationDate) | [-1].ImageId" \
  --output text) || {
  print_error "Failed to query AMIs (check AWS credentials and region)"
  exit 1
}

# With --output text a null query result is printed as the string "None".
if [[ -z "$AMI_ID" || "$AMI_ID" == "None" ]]; then
  print_error "Could not find Deep Learning AMI"
  exit 1
fi
print_status "Using AMI: $AMI_ID"

# --- Resolve the SSH key pair ---
# When --key-name was not given, fall back to the first key pair returned
# for the account/region.
if [[ -z "$KEY_NAME" ]]; then
  # A failed lookup must not abort the script here: stderr is suppressed,
  # so under `set -e` the user would see no message at all. Treat failure
  # as "no key found" and let the check below report it.
  KEY_NAME=$(aws ec2 describe-key-pairs \
    --query "KeyPairs[0].KeyName" \
    --output text 2>/dev/null) || KEY_NAME=""
fi

# With --output text a null query result is printed as the string "None".
if [[ -z "$KEY_NAME" || "$KEY_NAME" == "None" ]]; then
  print_error "No SSH key pair found. Create one first or specify with --key-name"
  exit 1
fi
print_status "Using key pair: $KEY_NAME"

# --- Find or create the "seriguela-sg" security group ---
# A failed lookup is treated as "group not found" instead of aborting
# silently (stderr is suppressed); the create call below then reports the
# real error.
SECURITY_GROUP=$(aws ec2 describe-security-groups \
  --filters "Name=group-name,Values=seriguela-sg" \
  --query "SecurityGroups[0].GroupId" \
  --output text 2>/dev/null) || SECURITY_GROUP=""

# Detect the caller's public IPv4 address once for both branches.
#   -4          the rule uses a /32 CIDR, which is only valid for IPv4
#   -f          treat HTTP errors as failures instead of returning an
#               error page as if it were the address
#   --max-time  never hang on an unreachable service
MY_IP=$(curl -4 -sf --max-time 10 ifconfig.me) || MY_IP=""
if [[ ! "$MY_IP" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}$ ]]; then
  MY_IP=""
fi

if [[ -z "$SECURITY_GROUP" || "$SECURITY_GROUP" == "None" ]]; then
  # A new group without an SSH rule would be unreachable, so the address
  # is mandatory here.
  if [[ -z "$MY_IP" ]]; then
    print_error "Could not determine your public IPv4 address for the SSH rule"
    exit 1
  fi

  print_status "Creating security group..."
  SECURITY_GROUP=$(aws ec2 create-security-group \
    --group-name seriguela-sg \
    --description "Security group for Seriguela training" \
    --query "GroupId" --output text)

  aws ec2 authorize-security-group-ingress \
    --group-id "$SECURITY_GROUP" \
    --protocol tcp --port 22 \
    --cidr "${MY_IP}/32"
  print_status "Created security group with SSH access from $MY_IP"
elif [[ -n "$MY_IP" ]]; then
  # Best effort on an existing group: the rule may already be present, in
  # which case the call fails and that failure is deliberately ignored.
  aws ec2 authorize-security-group-ingress \
    --group-id "$SECURITY_GROUP" \
    --protocol tcp --port 22 \
    --cidr "${MY_IP}/32" 2>/dev/null || true
else
  print_warning "Could not determine your public IP; SSH rule on existing group left unchanged"
fi
print_status "Using security group: $SECURITY_GROUP"

# Render the EC2 user-data script (run once, as root, at first boot).
#
# The heredoc delimiter is unquoted on purpose: the two credentials are
# expanded here, on the launching machine, while everything that must be
# evaluated on the instance is escaped (\$(date), \$?). No further
# substitution pass is required afterwards.
#
# The generated script deliberately does NOT call `cloud-init status --wait`:
# user-data scripts are executed by cloud-init itself, so waiting for
# cloud-init to finish from inside one would wait on its own completion.
# Instead apt is told to wait for a busy package-manager lock.
#
# Arguments: $1 - HuggingFace token (may be empty)
#            $2 - Wandb API key
# Outputs:   the user-data script on stdout
build_user_data() {
  local hf_token="${1-}"
  local wandb_key="${2-}"

  cat << USERDATA
#!/bin/bash
exec > /var/log/user-data.log 2>&1
set -x

echo "=========================================="
echo "Seriguela Instance Setup - VALIDATED"
echo "Started: \$(date)"
echo "=========================================="

# NOTE: 'cloud-init status --wait' must not be used here; this script is run
# by cloud-init, so that call would wait for this very script to finish.

# Setup as ubuntu user
sudo -u ubuntu bash << 'UBUNTUSETUP'
cd /home/ubuntu

echo "[1/8] Installing system dependencies..."
# Wait up to 5 minutes for the dpkg lock instead of failing immediately.
sudo env DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=300 update -qq
sudo env DEBIAN_FRONTEND=noninteractive apt-get -o DPkg::Lock::Timeout=300 install -y -qq python3-venv python3-pip git dos2unix

echo "[2/8] Cloning repository..."
git clone https://github.com/augustocsc/seriguela.git
cd seriguela

echo "[3/8] Creating virtual environment..."
python3 -m venv venv
source venv/bin/activate

echo "[4/8] Upgrading pip..."
pip install --upgrade pip -q

echo "[5/8] Installing requirements..."
pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 -q

echo "[6/8] Upgrading Wandb to latest version..."
pip install --upgrade 'wandb>=0.24.1' -q

echo "[7/8] Configuring environment..."
# Create .env file
cat > .env << 'ENVFILE'
HF_TOKEN=${hf_token}
WANDB_API_KEY=${wandb_key}
ENVFILE
# The file holds credentials: restrict it to the owner
chmod 600 .env

echo "[8/8] Validating setup..."

# Validate Python packages
python3 << 'PYCHECK'
import sys
print("Testing imports...")
try:
    import transformers
    print(f"✅ transformers {transformers.__version__}")
    import torch
    print(f"✅ torch {torch.__version__}")
    import wandb
    print(f"✅ wandb {wandb.__version__}")
    import peft
    print(f"✅ peft {peft.__version__}")
except ImportError as e:
    print(f"❌ Import failed: {e}")
    sys.exit(1)
PYCHECK

if [ \$? -ne 0 ]; then
  echo "❌ Package validation failed"
  exit 1
fi

# Validate GPU
echo "Checking GPU..."
if nvidia-smi &> /dev/null; then
  echo "✅ GPU detected:"
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
else
  echo "❌ No GPU detected"
  exit 1
fi

# Validate Wandb authentication
if [ -n "${wandb_key}" ]; then
  echo "Validating Wandb authentication..."
  python3 << PYVALIDATE
import wandb
import os
try:
    result = wandb.login(key='${wandb_key}')
    if result:
        print("✅ Wandb authentication successful")
        # Get user info
        import requests
        response = requests.get('https://api.wandb.ai/graphql',
                                headers={'Authorization': f'Bearer ${wandb_key}'},
                                json={'query': '{viewer{entity}}'})
        if response.status_code == 200:
            print(f" Logged in to Wandb")
    else:
        print("❌ Wandb authentication failed")
        exit(1)
except Exception as e:
    print(f"❌ Wandb validation error: {e}")
    exit(1)
PYVALIDATE

  if [ \$? -ne 0 ]; then
    echo "❌ Wandb authentication failed"
    exit 1
  fi
else
  echo "⚠️ No Wandb key provided - skipping validation"
fi

# Validate HuggingFace token
if [ -n "${hf_token}" ]; then
  echo "Validating HuggingFace authentication..."
  python3 << PYVALIDATE
from huggingface_hub import HfApi
try:
    api = HfApi(token='${hf_token}')
    user = api.whoami()
    print(f"✅ HuggingFace authentication successful")
    print(f" Logged in as: {user.get('name', 'unknown')}")
except Exception as e:
    print(f"❌ HuggingFace validation error: {e}")
    exit(1)
PYVALIDATE

  if [ \$? -ne 0 ]; then
    echo "❌ HuggingFace authentication failed"
    exit 1
  fi
else
  echo "⚠️ No HuggingFace token provided - model won't be pushed to Hub"
fi

# All validations passed
echo ""
echo "=========================================="
echo "✅ Setup Complete and Validated!"
echo "Finished: \$(date)"
echo "=========================================="

# Create completion markers
touch /home/ubuntu/.setup_complete
touch /home/ubuntu/.setup_validated

# Create info file
cat > /home/ubuntu/setup_info.txt << 'INFOFILE'
Setup completed successfully!

Validated:
- Python packages installed
- GPU detected
- Wandb authenticated
- HuggingFace authenticated (if token provided)

Ready to train!

Quick commands:
cd ~/seriguela
source venv/bin/activate
python scripts/train.py --help

Monitor scripts:
bash scripts/aws/monitor_training_auto.sh
INFOFILE

echo "Setup info saved to ~/setup_info.txt"
UBUNTUSETUP

# End of setup
echo "User-data script completed"
USERDATA
}

# --- Build the user-data script with the credentials embedded ---
USER_DATA=$(build_user_data "${HF_TOKEN-}" "${WANDB_KEY-}")

# --- Launch the instance ---
print_status "Launching instance..."

# The user-data script embeds API credentials. Hand it to the AWS CLI via a
# private temporary file (file://) rather than as a command-line argument,
# so the secrets are not visible in the process list.
user_data_file=$(mktemp)
trap 'rm -f -- "$user_data_file"' EXIT
printf '%s\n' "$USER_DATA" > "$user_data_file"

block_device_mappings=$(printf \
  '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":%s,"VolumeType":"gp3"}}]' \
  "$VOLUME_SIZE")
tag_specifications="ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME},{Key=Project,Value=seriguela},{Key=AutoSetup,Value=validated}]"

INSTANCE_ID=$(aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-group-ids "$SECURITY_GROUP" \
  --block-device-mappings "$block_device_mappings" \
  --tag-specifications "$tag_specifications" \
  --user-data "file://${user_data_file}" \
  --query "Instances[0].InstanceId" \
  --output text)

# The temporary copy is no longer needed once the request has been sent.
rm -f -- "$user_data_file"
trap - EXIT

# With --output text a null query result is printed as the string "None".
if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
  print_error "Instance launch did not return an instance ID"
  exit 1
fi

print_status "Instance launched: $INSTANCE_ID"

# --- Wait until the instance is running, then look up its public IP ---
print_status "Waiting for instance to start..."
aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"

PUBLIC_IP=$(aws ec2 describe-instances \
  --instance-ids "$INSTANCE_ID" \
  --query "Reservations[0].Instances[0].PublicIpAddress" \
  --output text)

# With --output text a missing address is printed as the string "None";
# the ssh commands shown afterwards would then be unusable.
if [[ -z "$PUBLIC_IP" || "$PUBLIC_IP" == "None" ]]; then
  print_warning "Instance $INSTANCE_ID has no public IP address; the SSH commands below will not work as printed"
fi

# --- Print connection details and next steps ---
# Every suggested command uses the same key file as the first one; without
# `-i` the follow-up commands only work when the key happens to be loaded
# in an ssh agent or is a default identity.
ssh_cmd="ssh -i ~/.ssh/${KEY_NAME}.pem ubuntu@${PUBLIC_IP}"

echo ""
echo "=========================================="
printf '%b\n' "${GREEN}Instance Ready!${NC}"
echo "=========================================="
echo "Instance ID: $INSTANCE_ID"
echo "Public IP: $PUBLIC_IP"
echo "Key Pair: $KEY_NAME"
echo ""
printf '%b\n' "${BLUE}Connect with:${NC}"
echo " ${ssh_cmd}"
echo ""
printf '%b\n' "${BLUE}Check setup progress:${NC}"
echo " ${ssh_cmd} 'tail -f /var/log/user-data.log'"
echo ""
printf '%b\n' "${BLUE}Wait for VALIDATED setup to complete:${NC}"
echo " ${ssh_cmd} 'while [ ! -f ~/.setup_validated ]; do sleep 10; echo \"Setup in progress...\"; done; echo \"✅ Setup validated!\"; cat ~/setup_info.txt'"
echo ""
printf '%b\n' "${BLUE}Then run training:${NC}"
echo " ${ssh_cmd} 'cd seriguela && source venv/bin/activate && bash scripts/aws/run_all_training.sh'"
echo ""
printf '%b\n' "${YELLOW}Setup includes:${NC}"
echo " ✅ Wandb 0.24.1+ with authentication test"
echo " ✅ HuggingFace authentication test"
echo " ✅ GPU validation"
echo " ✅ All packages validated"
echo ""

# --- Persist details of the launched instance under ~/.seriguela ---
INFO_DIR="${HOME}/.seriguela"
mkdir -p "$INFO_DIR"

# One value per file, plus a combined summary file below.
printf '%s\n' "$INSTANCE_ID" > "$INFO_DIR/last_instance_id.txt"
printf '%s\n' "$PUBLIC_IP" > "$INFO_DIR/last_instance_ip.txt"
printf '%s\n' "$KEY_NAME" > "$INFO_DIR/last_key_name.txt"

cat > "$INFO_DIR/last_instance_info.txt" << INFOEND
Instance ID: $INSTANCE_ID
Public IP: $PUBLIC_IP
Key Name: $KEY_NAME
Instance Type: $INSTANCE_TYPE
Region: $REGION
Launched: $(date)
Setup: Validated (Wandb + HF + GPU)
INFOEND

print_status "Instance info saved to: $INFO_DIR/"
echo ""