#!/bin/bash
#
# Provision GPU EC2 instances for the seriguela evaluations.
#
# Usage: <this script> <HF_TOKEN> <WANDB_KEY>
#   HF_TOKEN   - first positional argument
#   WANDB_KEY  - second positional argument

# Stop at the first command that exits non-zero.
set -e

# Positional arguments; an omitted argument becomes the empty string.
HF_TOKEN=${1:-}
WANDB_KEY=${2:-}

# Guard clause: both values are mandatory, otherwise print usage and quit.
[[ -n $HF_TOKEN && -n $WANDB_KEY ]] || {
  printf 'Usage: %s <HF_TOKEN> <WANDB_KEY>\n' "$0"
  exit 1
}

# ---------------------------------------------------------------------------
# Launch configuration shared by every instance.
# ---------------------------------------------------------------------------
INSTANCE_TYPE="g5.xlarge"
KEY_NAME="chave-gpu"
SECURITY_GROUP="sg-0deaa73e23482e3f6"

# AMI IDs are region-scoped. The lookup below is done in us-east-1, so every
# later aws call must target the same region or the AMI ID will not resolve.
# Both variables are exported because the AWS CLI consults either one.
REGION="us-east-1"
export AWS_DEFAULT_REGION="$REGION"
export AWS_REGION="$REGION"

# ---------------------------------------------------------------------------
# Resolve the newest available Deep Learning Base AMI (Ubuntu 22.04).
# AMI_ID is always taken from this lookup; no hard-coded ID is kept because
# it would be overwritten unconditionally.
# ---------------------------------------------------------------------------
echo "Auto-detecting Deep Learning AMI..."
AMI_ID=$(aws ec2 describe-images \
  --owners amazon \
  --filters "Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" \
            "Name=state,Values=available" \
  --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \
  --output text \
  --region "$REGION") || {
  echo "ERROR: aws ec2 describe-images failed; cannot determine AMI" >&2
  exit 1
}

# An empty result set yields an empty string or "None"; refuse to go on with
# something that is not shaped like an AMI ID.
ami_id_pattern='^ami-[0-9a-f]+$'
if [[ ! "$AMI_ID" =~ $ami_id_pattern ]]; then
  echo "ERROR: no matching Deep Learning AMI found in $REGION (got: '${AMI_ID}')" >&2
  exit 1
fi

echo "Using AMI: $AMI_ID"

#######################################
# Write the EC2 user-data (first boot) script for one instance.
#
# The generated script installs system packages, clones/updates the
# seriguela repository, builds a virtualenv, appends the two credentials to
# /home/ubuntu/.bashrc and finally touches /home/ubuntu/.setup_complete.
#
# Globals:
#   HF_TOKEN   (read) value exported as HF_TOKEN on the instance
#   WANDB_KEY  (read) value exported as WANDB_API_KEY on the instance
# Arguments:
#   $1 - instance name; used in the output file name
# Outputs:
#   Creates aws/temp/userdata_<name>.sh (mode 600, it contains secrets)
# Returns:
#   0 on success, non-zero if the file could not be written
#######################################
create_userdata() {
  local instance_name=$1
  local userdata_file="aws/temp/userdata_${instance_name}.sh"

  # The output directory is not guaranteed to exist on a fresh checkout.
  mkdir -p aws/temp || return 1

  # The file is assembled from literal (quoted) heredocs plus two printf
  # lines for the secrets. Writing the values with printf instead of a
  # sed substitution keeps tokens containing '/', '&' or '\' intact and
  # avoids the non-portable 'sed -i'.
  {
    cat << 'USERDATA_EOF'
#!/bin/bash
# The "#!" first line is what makes cloud-init run this user data as a script.
set -x
exec > >(tee -a /home/ubuntu/setup.log) 2>&1

echo "===== Starting Setup ====="
date

apt-get update
apt-get install -y git python3-pip python3-venv htop

cd /home/ubuntu
# "|| true": the clone fails harmlessly when the directory already exists.
sudo -u ubuntu git clone https://github.com/augustocsc/seriguela.git || true
# Without the repository nothing below can work; stop before the
# .setup_complete marker is written.
cd seriguela || exit 1
sudo -u ubuntu git pull

sudo -u ubuntu python3 -m venv venv
sudo -u ubuntu bash -c "source venv/bin/activate && pip install --upgrade pip"
sudo -u ubuntu bash -c "source venv/bin/activate && pip install torch --index-url https://download.pytorch.org/whl/cu121"
sudo -u ubuntu bash -c "source venv/bin/activate && pip install -r requirements.txt"

# Pause tracing so the credentials are not copied into setup.log.
set +x
USERDATA_EOF
    printf 'echo "export HF_TOKEN=%s" >> /home/ubuntu/.bashrc\n' "$HF_TOKEN"
    printf 'echo "export WANDB_API_KEY=%s" >> /home/ubuntu/.bashrc\n' "$WANDB_KEY"
    cat << 'USERDATA_EOF'
set -x

sudo -u ubuntu mkdir -p /home/ubuntu/seriguela/output
sudo -u ubuntu mkdir -p /home/ubuntu/seriguela/results

echo "===== Setup Complete ====="
touch /home/ubuntu/.setup_complete
date
USERDATA_EOF
  } > "$userdata_file" || return 1

  # The file embeds both credentials; keep it private to the current user.
  chmod 600 "$userdata_file"
}

#######################################
# Launch one EC2 instance and record its ID.
#
# Generates the user-data file through create_userdata, calls
# `aws ec2 run-instances`, and appends "<name>=<instance-id>" to
# aws/temp/instance_ids.txt.
#
# Globals:
#   AMI_ID, INSTANCE_TYPE, KEY_NAME, SECURITY_GROUP (read)
#   INSTANCE_ID (written) ID of the instance that was launched
# Arguments:
#   $1 - short instance name; the instance is tagged Name=seriguela-<name>
# Outputs:
#   Progress on stdout, errors on stderr
# Returns:
#   0 on success; 1 if user-data generation or the launch failed, in which
#   case nothing is appended to instance_ids.txt
#######################################
launch_instance() {
  local name=$1
  local userdata_file="aws/temp/userdata_${name}.sh"

  echo ""
  echo "Launching instance: $name"

  create_userdata "$name" || {
    echo "ERROR: could not create user data for $name" >&2
    return 1
  }

  # Explicit status check: this function may run where 'set -e' is not in
  # effect, and a failed launch must never be recorded as "<name>=".
  INSTANCE_ID=$(aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --instance-type "$INSTANCE_TYPE" \
    --key-name "$KEY_NAME" \
    --security-group-ids "$SECURITY_GROUP" \
    --user-data "file://${userdata_file}" \
    --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=seriguela-${name}}]" \
    --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":100,"VolumeType":"gp3"}}]' \
    --query 'Instances[0].InstanceId' \
    --output text) || {
    echo "ERROR: aws ec2 run-instances failed for $name" >&2
    return 1
  }

  # '--output text' prints "None" (or nothing) when the query matched nothing.
  if [[ "$INSTANCE_ID" != i-* ]]; then
    echo "ERROR: unexpected instance ID for $name: '${INSTANCE_ID}'" >&2
    return 1
  fi

  echo "Instance launched: $INSTANCE_ID"
  echo "$name=$INSTANCE_ID" >> aws/temp/instance_ids.txt
}

# ---------------------------------------------------------------------------
# Launch all instances in parallel.
# ---------------------------------------------------------------------------

# Make sure the scratch directory exists and start from an empty ID list.
mkdir -p aws/temp
rm -f aws/temp/instance_ids.txt

echo "=========================================="
echo "Launching 3 AWS instances in parallel"
echo "=========================================="

# Each launch runs as a background job. The PIDs are kept because a bare
# 'wait' always reports success and would hide a failed launch.
launch_pids=()
for launch_name in "eval-basic" "nguyen-1-6" "nguyen-7-12"; do
  launch_instance "$launch_name" &
  launch_pids+=("$!")
done

launch_failures=0
for launch_pid in "${launch_pids[@]}"; do
  wait "$launch_pid" || launch_failures=$((launch_failures + 1))
done

if (( launch_failures > 0 )); then
  echo "ERROR: ${launch_failures} instance launch(es) failed" >&2
  # Instances that did start keep running; list them so they can be
  # inspected or terminated by hand.
  if [[ -s aws/temp/instance_ids.txt ]]; then
    echo "Instances that were launched before the failure:" >&2
    cat aws/temp/instance_ids.txt >&2
  fi
  exit 1
fi

# ---------------------------------------------------------------------------
# Report the launched instances and block until all of them are running.
# ---------------------------------------------------------------------------
echo ""
echo "All instances launched!"
echo ""
cat aws/temp/instance_ids.txt
echo ""
echo "Waiting for instances to be running..."

# Each line of instance_ids.txt is "<name>=<instance-id>"; collect the IDs
# into an array so they are passed to aws as separate, intact arguments.
instance_id_list=()
while IFS='=' read -r _ launched_id; do
  if [[ -n "$launched_id" ]]; then
    instance_id_list+=("$launched_id")
  fi
done < aws/temp/instance_ids.txt

# Never call the waiter without explicit IDs.
if (( ${#instance_id_list[@]} == 0 )); then
  echo "ERROR: aws/temp/instance_ids.txt contains no instance IDs" >&2
  exit 1
fi

# Space-separated form of the same list, kept under its original name.
INSTANCE_IDS="${instance_id_list[*]}"

aws ec2 wait instance-running --instance-ids "${instance_id_list[@]}"

# ---------------------------------------------------------------------------
# Look up the public IP of every launched instance.
# ---------------------------------------------------------------------------
echo ""
echo "All instances are running!"
echo ""
echo "Getting public IPs..."

# Start from an empty file; otherwise entries from earlier runs accumulate
# because the loop below only appends.
: > aws/temp/instance_ips.txt

# Read "<name>=<instance-id>" records line by line. File descriptor 3 is
# used so that commands inside the loop cannot consume the list via stdin.
while IFS='=' read -r -u 3 name id; do
  # Skip blank or malformed lines.
  if [[ -z "$name" || -z "$id" ]]; then
    continue
  fi

  ip=$(aws ec2 describe-instances \
    --instance-ids "$id" \
    --query 'Reservations[0].Instances[0].PublicIpAddress' \
    --output text)

  echo "$name: $ip (ID: $id)"
  echo "$name=$ip" >> aws/temp/instance_ips.txt
done 3< aws/temp/instance_ids.txt

# ---------------------------------------------------------------------------
# Closing banner and the manual follow-up steps.
# The heredoc is quoted, so its text is printed verbatim; the blank first
# and last lines of the heredoc are part of the output.
# ---------------------------------------------------------------------------
cat << 'SUMMARY_EOF'

==========================================
Instances ready!
==========================================

Next steps:
1. Wait ~3 minutes for setup to complete
2. Upload models to instances
3. Start evaluations

SUMMARY_EOF