#!/bin/bash
#
# Launch EC2 instances in parallel and bootstrap each one through user data.
#
# Usage: <script> <HF_TOKEN> <WANDB_KEY>
#
# Each credential may instead be supplied through the environment variable of
# the same name (HF_TOKEN, WANDB_KEY). That keeps it out of the process
# argument list, which other local users can read via `ps`. A positional
# argument, when given, takes precedence over the environment.

# Abort on the first unhandled command failure.
set -e

HF_TOKEN="${1:-${HF_TOKEN:-}}"
WANDB_KEY="${2:-${WANDB_KEY:-}}"

if [ -z "$HF_TOKEN" ] || [ -z "$WANDB_KEY" ]; then
  # A usage error is a diagnostic, so it belongs on stderr.
  echo "Usage: $0 <HF_TOKEN> <WANDB_KEY>" >&2
  exit 1
fi
|
|
# Launch configuration. The key pair and security group are referenced by
# name/ID and must already exist in the region used below.
INSTANCE_TYPE="g5.xlarge"
KEY_NAME="chave-gpu"
SECURITY_GROUP="sg-0deaa73e23482e3f6"

# AMI IDs are only valid in the region they were looked up in. Pin every
# later `aws` call made by this script to the lookup region instead of
# relying on whatever default region the caller happens to have configured.
REGION="us-east-1"
export AWS_DEFAULT_REGION="$REGION"
export AWS_REGION="$REGION"

echo "Auto-detecting Deep Learning AMI..."
# Images are sorted by creation date, newest first, and only the first ID is
# kept. (A previously hard-coded AMI_ID was always overwritten here, so it
# has been dropped.)
AMI_ID=$(aws ec2 describe-images \
  --owners amazon \
  --filters "Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" \
            "Name=state,Values=available" \
  --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \
  --output text \
  --region "$REGION")

# When nothing matches, the CLI succeeds but prints an empty string (or
# "None"); refuse to continue with a value that is not an AMI ID.
if [[ "$AMI_ID" != ami-* ]]; then
  echo "Error: no matching Deep Learning AMI found in $REGION (got: '$AMI_ID')" >&2
  exit 1
fi

echo "Using AMI: $AMI_ID"
|
|
| |
#######################################
# Write the EC2 user-data bootstrap script for one instance.
#
# The script body is a fixed template; the two credential placeholders in it
# are filled in while the template is streamed to disk.
#
# Globals:   HF_TOKEN (read), WANDB_KEY (read)
# Arguments: $1 - instance name; selects aws/temp/userdata_<name>.sh
# Outputs:   creates or overwrites aws/temp/userdata_<name>.sh (mode 600)
# Returns:   non-zero if the directory or file cannot be written
#
# NOTE(review): the credentials end up inside a double-quoted `echo` in the
# generated script, so characters special to the remote shell (", $, `) are
# not escaped. Presumably tokens are plain alphanumerics; confirm.
#######################################
create_userdata() {
  local instance_name=$1
  local out_file="aws/temp/userdata_${instance_name}.sh"
  local hf_escaped wandb_escaped

  # The target directory is not guaranteed to exist on a fresh checkout.
  mkdir -p aws/temp || return 1

  # Escape the characters that are special on the replacement side of sed's
  # s/// command (backslash, the "/" delimiter and "&") so that any token
  # value is inserted literally.
  hf_escaped=$(printf '%s' "$HF_TOKEN" | sed -e 's|[\/&]|\\&|g') || return 1
  wandb_escaped=$(printf '%s' "$WANDB_KEY" | sed -e 's|[\/&]|\\&|g') || return 1

  # The file will contain credentials: create it empty and owner-only first,
  # then fill it (the later redirection keeps the restricted mode).
  : > "$out_file" || return 1
  chmod 600 "$out_file" || return 1

  # Stream the template through sed rather than editing in place afterwards;
  # `sed -i` takes different arguments on GNU and BSD/macOS sed.
  sed \
    -e "s/PLACEHOLDER_HF_TOKEN/${hf_escaped}/g" \
    -e "s/PLACEHOLDER_WANDB_KEY/${wandb_escaped}/g" \
    > "$out_file" << 'USERDATA_EOF'
#!/bin/bash
# The interpreter line above is required: EC2 only executes user data as a
# shell script when it starts with "#!".
set -x
exec > >(tee -a /home/ubuntu/setup.log) 2>&1

echo "===== Starting Setup ====="
date

# System packages.
apt-get update
apt-get install -y git python3-pip python3-venv htop

# Fetch the project; an existing checkout is tolerated and then updated.
cd /home/ubuntu
sudo -u ubuntu git clone https://github.com/augustocsc/seriguela.git || true
# Stop here if the checkout is missing instead of installing into the wrong
# directory and still reporting success.
cd seriguela || exit 1
sudo -u ubuntu git pull

# Python environment, owned by the ubuntu user.
sudo -u ubuntu python3 -m venv venv
sudo -u ubuntu bash -c "source venv/bin/activate && pip install --upgrade pip"
sudo -u ubuntu bash -c "source venv/bin/activate && pip install torch --index-url https://download.pytorch.org/whl/cu121"
sudo -u ubuntu bash -c "source venv/bin/activate && pip install -r requirements.txt"

# Credentials for login shells. Tracing is paused so the values are not
# copied into setup.log by `set -x`.
set +x
echo "export HF_TOKEN=PLACEHOLDER_HF_TOKEN" >> /home/ubuntu/.bashrc
echo "export WANDB_API_KEY=PLACEHOLDER_WANDB_KEY" >> /home/ubuntu/.bashrc
set -x

# Working directories.
sudo -u ubuntu mkdir -p /home/ubuntu/seriguela/output
sudo -u ubuntu mkdir -p /home/ubuntu/seriguela/results

echo "===== Setup Complete ====="
# Marker file signalling that the bootstrap ran to the end.
touch /home/ubuntu/.setup_complete
date
USERDATA_EOF
}
|
|
| |
#######################################
# Launch one EC2 instance and record its ID.
#
# Generates the instance's user-data file, starts the instance with a
# 100 GiB gp3 volume mapped at /dev/sda1 and a Name tag of
# "seriguela-<name>", then appends "<name>=<instance-id>" to
# aws/temp/instance_ids.txt.
#
# Globals:   AMI_ID, INSTANCE_TYPE, KEY_NAME, SECURITY_GROUP (read)
# Arguments: $1 - short instance name
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success; 1 if user-data generation or the launch fails
#            (nothing is appended to the ID file in that case)
#######################################
launch_instance() {
  local name=$1
  # Declared separately so the assignment below reports the CLI's status.
  local instance_id

  echo ""
  echo "Launching instance: $name"

  create_userdata "$name" || return 1

  # Fail explicitly instead of depending on the caller's `set -e`.
  instance_id=$(aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --instance-type "$INSTANCE_TYPE" \
    --key-name "$KEY_NAME" \
    --security-group-ids "$SECURITY_GROUP" \
    --user-data "file://aws/temp/userdata_${name}.sh" \
    --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=seriguela-${name}}]" \
    --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":100,"VolumeType":"gp3"}}]' \
    --query 'Instances[0].InstanceId' \
    --output text) || {
    echo "Error: failed to launch instance: $name" >&2
    return 1
  }

  # With --output text a missing value is printed as an empty string or the
  # literal "None"; never record that as an instance ID.
  if [ -z "$instance_id" ] || [ "$instance_id" = "None" ]; then
    echo "Error: no instance ID returned for: $name" >&2
    return 1
  fi

  echo "Instance launched: $instance_id"
  echo "$name=$instance_id" >> aws/temp/instance_ids.txt
}
|
|
| |
# Start each run with an empty ID list; launch_instance appends to it.
mkdir -p aws/temp
rm -f aws/temp/instance_ids.txt

echo "=========================================="
echo "Launching 3 AWS instances in parallel"
echo "=========================================="

# Run the launches as background jobs and remember each PID. A bare `wait`
# returns 0 even when a job failed, so each job is waited on individually.
launch_pids=()
launch_instance "eval-basic" &
launch_pids+=("$!")
launch_instance "nguyen-1-6" &
launch_pids+=("$!")
launch_instance "nguyen-7-12" &
launch_pids+=("$!")

launch_failed=0
for pid in "${launch_pids[@]}"; do
  # `||` keeps `set -e` from aborting before every job has been reaped.
  wait "$pid" || launch_failed=1
done

if [ "$launch_failed" -ne 0 ]; then
  echo "Error: one or more instance launches failed" >&2
  if [ -s aws/temp/instance_ids.txt ]; then
    # These instances are already running and need manual cleanup.
    echo "Instances that did launch:" >&2
    cat aws/temp/instance_ids.txt >&2
  fi
  exit 1
fi

echo ""
echo "All instances launched!"
echo ""
cat aws/temp/instance_ids.txt
echo ""
printf '%s\n' "Waiting for instances to be running..."

# Take the second '='-separated field of every "<name>=<id>" line and join
# the IDs into one space-separated string.
INSTANCE_IDS=$(cut -d'=' -f2 aws/temp/instance_ids.txt | tr '\n' ' ')

# Left unquoted on purpose: the string must split into one argument per ID.
# shellcheck disable=SC2086
aws ec2 wait instance-running --instance-ids $INSTANCE_IDS

printf '\n%s\n\n' "All instances are running!"
echo "Getting public IPs..."

# Start from an empty file so a rerun does not pile new entries on top of
# the addresses of instances from earlier runs.
: > aws/temp/instance_ips.txt

# Each line of the ID file is "<name>=<instance-id>". It is read on file
# descriptor 3 so that commands inside the loop cannot consume the list
# through stdin.
while IFS='=' read -r name id <&3; do
  # Skip blank or malformed lines that carry no instance ID.
  [ -n "$id" ] || continue
  ip=$(aws ec2 describe-instances \
    --instance-ids "$id" \
    --query 'Reservations[0].Instances[0].PublicIpAddress' \
    --output text)
  echo "$name: $ip (ID: $id)"
  echo "$name=$ip" >> aws/temp/instance_ips.txt
done 3< aws/temp/instance_ids.txt
|
|
# Closing banner and follow-up instructions, emitted as one literal block.
cat << 'BANNER_EOF'

==========================================
Instances ready!
==========================================

Next steps:
1. Wait ~3 minutes for setup to complete
2. Upload models to instances
3. Start evaluations

BANNER_EOF
|
|