File size: 4,432 Bytes
a1190da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
#!/bin/bash
# Launch AWS instance for model evaluation with GPU
#
# Usage:
#   <this script> --hf-token TOKEN --wandb-key KEY
#
# Both credentials are required. They may also be supplied through the
# environment (HF_TOKEN, and WANDB_KEY or WANDB_API_KEY) so that they do not
# have to appear on the command line, where other local users can read them
# from the process list. A command-line flag always overrides the environment.
set -euo pipefail

# Default values
INSTANCE_TYPE="g5.xlarge"
VOLUME_SIZE=100
INSTANCE_NAME="seriguela-evaluation"
KEY_NAME="chave-gpu-nova"
SECURITY_GROUP="seriguela-sg"
AMI_ID="" # Will be auto-detected

# Print the command-line synopsis to stderr.
usage() {
  {
    echo "Usage: ${0##*/} --hf-token TOKEN --wandb-key KEY"
    echo "  --hf-token TOKEN   Hugging Face token (or set HF_TOKEN)"
    echo "  --wandb-key KEY    Weights & Biases API key (or set WANDB_KEY / WANDB_API_KEY)"
  } >&2
}

# Parse arguments
HF_TOKEN="${HF_TOKEN:-}"
WANDB_KEY="${WANDB_KEY:-${WANDB_API_KEY:-}}"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --hf-token)
      # Without this check a trailing flag makes "shift 2" fail and set -e
      # terminates the script without any explanation.
      if [[ $# -lt 2 ]]; then
        echo "Error: --hf-token requires a value" >&2
        exit 1
      fi
      HF_TOKEN="$2"
      shift 2
      ;;
    --wandb-key)
      if [[ $# -lt 2 ]]; then
        echo "Error: --wandb-key requires a value" >&2
        exit 1
      fi
      WANDB_KEY="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [[ -z "$HF_TOKEN" || -z "$WANDB_KEY" ]]; then
  echo "Error: --hf-token and --wandb-key are required" >&2
  usage
  exit 1
fi
echo "Launching evaluation instance..."
echo "Instance type: $INSTANCE_TYPE"

# Create user data script
#
# The script below is handed to EC2 as instance user data and runs on first
# boot. It is written with placeholders first (quoted heredoc, so nothing is
# expanded locally) and the two tokens are substituted in afterwards.
USERDATA_FILE="aws/temp/userdata_eval.sh"
mkdir -p "${USERDATA_FILE%/*}"
# The file ends up holding both API tokens: create it owner-only before any
# secret is written, and remove it when this script exits.
: > "$USERDATA_FILE"
chmod 600 "$USERDATA_FILE"
trap 'rm -f -- "$USERDATA_FILE"' EXIT
cat > "$USERDATA_FILE" << 'USERDATA_EOF'
#!/bin/bash
set -x
exec > >(tee /home/ubuntu/setup.log) 2>&1
echo "=== Starting evaluation setup ==="
date
# Update system
export DEBIAN_FRONTEND=noninteractive
sudo apt-get update
sudo apt-get install -y git python3-pip python3-venv
# Setup as ubuntu user
cd /home/ubuntu
# Clone repository
if [ ! -d "seriguela" ]; then
  sudo -u ubuntu git clone https://github.com/augustocsc/seriguela.git
  cd seriguela
else
  cd seriguela
  sudo -u ubuntu git pull
fi
# Create virtual environment
sudo -u ubuntu python3 -m venv venv
# Install dependencies as ubuntu (user data runs as root); installing as root
# would leave root-owned files inside a venv that ubuntu is meant to use.
sudo -u ubuntu -H venv/bin/pip install --upgrade pip
sudo -u ubuntu -H venv/bin/pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
sudo -u ubuntu -H venv/bin/pip install -r requirements.txt
# Set tokens for the ubuntu login user. "~" would resolve to root's home here,
# so the target file is spelled out. Tracing is paused so the secrets are not
# copied into setup.log.
set +x
echo "export HF_TOKEN='PLACEHOLDER_HF_TOKEN'" >> /home/ubuntu/.bashrc
echo "export WANDB_API_KEY='PLACEHOLDER_WANDB_KEY'" >> /home/ubuntu/.bashrc
set -x
# Create results directory
sudo -u ubuntu mkdir -p results
# Mark setup complete
sudo -u ubuntu touch /home/ubuntu/.evaluation_ready
echo "=== Evaluation setup complete ==="
date
USERDATA_EOF

# Replace placeholders in userdata
# Done with shell pattern substitution instead of sed so that "/" or "&" in a
# token cannot break or alter the replacement (the quoted replacement is taken
# literally). Tokens containing quote characters are still not supported,
# because the value is embedded inside a quoted string in the remote script.
userdata_body=$(<"$USERDATA_FILE")
userdata_body=${userdata_body//PLACEHOLDER_HF_TOKEN/"$HF_TOKEN"}
userdata_body=${userdata_body//PLACEHOLDER_WANDB_KEY/"$WANDB_KEY"}
printf '%s\n' "$userdata_body" > "$USERDATA_FILE"
# Auto-detect AMI if not set
# Asks EC2 (us-east-1) for Amazon-owned, available images matching the name
# pattern below, orders them by creation date and keeps the most recent ID.
if [[ -z "$AMI_ID" ]]; then
  echo "Auto-detecting Deep Learning AMI..."
  ami_name_pattern="Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"
  newest_image_query='reverse(sort_by(Images, &CreationDate))[:1].ImageId'
  AMI_ID=$(
    aws ec2 describe-images \
      --region us-east-1 \
      --owners amazon \
      --filters "Name=name,Values=${ami_name_pattern}" "Name=state,Values=available" \
      --query "$newest_image_query" \
      --output text
  )
  # With --output text an empty result may come back as "" or the word None.
  case "$AMI_ID" in
    "" | None)
      echo "ERROR: Could not find AMI"
      exit 1
      ;;
  esac
  echo "Using AMI: $AMI_ID"
fi
# Launch instance
#
# Starts one instance from $AMI_ID with the generated user data, waits until
# EC2 reports it as running, then reads its public IP address.
#
# AMI IDs are region-specific and the AMI lookup in this script queries
# us-east-1, so every call here is pinned to the same region instead of
# relying on whatever default region the AWS CLI happens to be configured with.
LAUNCH_REGION="us-east-1"

# NOTE(review): --security-group-ids is documented to take group IDs (sg-...),
# while SECURITY_GROUP holds what looks like a group name. Confirm this
# resolves in the target VPC, or store the group ID instead.
echo "Launching instance..."
INSTANCE_ID=$(aws ec2 run-instances \
  --region "$LAUNCH_REGION" \
  --image-id "$AMI_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-group-ids "$SECURITY_GROUP" \
  --block-device-mappings "[{\"DeviceName\":\"/dev/sda1\",\"Ebs\":{\"VolumeSize\":$VOLUME_SIZE,\"VolumeType\":\"gp3\"}}]" \
  --user-data file://aws/temp/userdata_eval.sh \
  --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME}]" \
  --query 'Instances[0].InstanceId' \
  --output text)

# Stop here rather than waiting on, and recording, a non-existent instance.
if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
  echo "ERROR: run-instances did not return an instance ID" >&2
  exit 1
fi
echo "Instance launched: $INSTANCE_ID"

echo "Waiting for instance to be running..."
aws ec2 wait instance-running \
  --region "$LAUNCH_REGION" \
  --instance-ids "$INSTANCE_ID"

# Get public IP
PUBLIC_IP=$(aws ec2 describe-instances \
  --region "$LAUNCH_REGION" \
  --instance-ids "$INSTANCE_ID" \
  --query 'Reservations[0].Instances[0].PublicIpAddress' \
  --output text)

# The instance is up either way, so only warn: the caller still needs the ID.
if [[ -z "$PUBLIC_IP" || "$PUBLIC_IP" == "None" ]]; then
  echo "WARNING: instance $INSTANCE_ID has no public IP address; the SSH/SCP commands printed below will not work as shown" >&2
fi
# Report the launch result, give the instance a minute to boot, then print
# follow-up commands and record the instance details locally.
cat << SUMMARY_EOF

===================================
Evaluation instance ready!
===================================
Instance ID: ${INSTANCE_ID}
Public IP: ${PUBLIC_IP}

Waiting 60 seconds for system to initialize...
SUMMARY_EOF

sleep 60

# NOTE(review): the key file shown here is chave-gpu.pem while the key pair
# configured at the top of the script is "chave-gpu-nova" - confirm which one
# is current.
# NOTE(review): the "saved to" line is printed before the file is written.
cat << HINTS_EOF

To check setup progress:
 ssh -i ~/.ssh/chave-gpu.pem ubuntu@${PUBLIC_IP} 'tail -f ~/setup.log'

To upload models:
 scp -i ~/.ssh/chave-gpu.pem -r ./output/gpt2_base_700K_json ubuntu@${PUBLIC_IP}:~/seriguela/output/
 scp -i ~/.ssh/chave-gpu.pem -r ./output/gpt2_medium_700K_json ubuntu@${PUBLIC_IP}:~/seriguela/output/
 scp -i ~/.ssh/chave-gpu.pem -r ./output/gpt2_large_700K_json ubuntu@${PUBLIC_IP}:~/seriguela/output/

Instance info saved to: aws/evaluation_instance.txt
HINTS_EOF

# Save instance info
mkdir -p aws
{
  printf 'Instance ID: %s\n' "$INSTANCE_ID"
  printf 'Public IP: %s\n' "$PUBLIC_IP"
  printf 'Instance Type: %s\n' "$INSTANCE_TYPE"
  printf 'Launch Time: %s\n' "$(date)"
} > aws/evaluation_instance.txt

echo "Done!"
|