#!/bin/bash
#
# Launch a GPU EC2 instance for model evaluation.
#
# Usage: <this script> --hf-token TOKEN --wandb-key KEY
#
# Steps:
#   1. Generate a user-data script (aws/temp/userdata_eval.sh) that installs
#      system packages, clones the seriguela repository, creates a Python
#      virtualenv, installs dependencies and stores the API tokens.
#   2. Resolve the AMI (auto-detected when AMI_ID is empty).
#   3. Launch the instance, wait until it is running, fetch its public IP.
#   4. Print connection hints and record instance details in
#      aws/evaluation_instance.txt.

set -euo pipefail

# --- Instance configuration -------------------------------------------------
INSTANCE_TYPE="g5.xlarge"
VOLUME_SIZE=100 # passed as Ebs.VolumeSize for /dev/sda1
INSTANCE_NAME="augusto-seriguela-evaluation"
KEY_NAME="chave-gpu-nova"
SECURITY_GROUP="seriguela-sg"
AMI_ID="" # empty => auto-detect below

# Every AWS call uses the same region. AMI IDs are region-specific, so an
# image looked up in one region cannot be launched in another.
REGION="us-east-1"

USERDATA_FILE="aws/temp/userdata_eval.sh"
INFO_FILE="aws/evaluation_instance.txt"

# --- Credentials (filled in from the command line) --------------------------
HF_TOKEN=""
WANDB_KEY=""

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Write the user-data script to $USERDATA_FILE.
# Globals: USERDATA_FILE, HF_TOKEN, WANDB_KEY (all read)
#
# The static parts are emitted from quoted heredocs (no local expansion).
# The two token lines are emitted with printf %q so that any character in a
# token ends up as a correctly quoted shell word on the instance.
write_userdata() {
  mkdir -p "$(dirname "$USERDATA_FILE")"

  # The file contains secrets: make it owner-only before any content lands.
  : > "$USERDATA_FILE"
  chmod 600 "$USERDATA_FILE"

  {
    cat << 'USERDATA_HEAD'
#!/bin/bash
# Executed by cloud-init as root on first boot; the leading "#!" line is what
# makes cloud-init treat this payload as a script.
set -x
exec > >(tee /home/ubuntu/setup.log) 2>&1

echo "=== Starting evaluation setup ==="
date

# System packages. The variable is passed through `env` because sudo's
# default env_reset would drop an exported DEBIAN_FRONTEND.
sudo env DEBIAN_FRONTEND=noninteractive apt-get update
sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y git python3-pip python3-venv

cd /home/ubuntu

# Clone the repository on first run, otherwise update the existing checkout.
if [ ! -d "seriguela" ]; then
  sudo -u ubuntu git clone https://github.com/augustocsc/seriguela.git
  cd seriguela
else
  cd seriguela
  sudo -u ubuntu git pull
fi

# Python environment and dependencies.
sudo -u ubuntu python3 -m venv venv
source venv/bin/activate

pip install --upgrade pip
pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
pip install -r requirements.txt

# Store the API tokens for the ubuntu login user. This script runs as root,
# so "~" would point at root's home; the path is spelled out instead.
# Tracing is paused so the tokens are not copied into setup.log.
set +x
cat >> /home/ubuntu/.bashrc << 'ENV_EOF'
USERDATA_HEAD

    printf 'export HF_TOKEN=%q\n' "$HF_TOKEN"
    printf 'export WANDB_API_KEY=%q\n' "$WANDB_KEY"

    cat << 'USERDATA_TAIL'
ENV_EOF
set -x

mkdir -p results

# pip and mkdir above ran as root; hand the checkout back to ubuntu so the
# login user can write results and manage the virtualenv.
chown -R ubuntu:ubuntu /home/ubuntu/seriguela

# Marker file signalling that setup finished.
touch /home/ubuntu/.evaluation_ready

echo "=== Evaluation setup complete ==="
date
USERDATA_TAIL
  } > "$USERDATA_FILE"
}

# --- Argument parsing -------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --hf-token)
      [[ $# -ge 2 ]] || die "Error: --hf-token requires a value"
      HF_TOKEN="$2"
      shift 2
      ;;
    --wandb-key)
      [[ $# -ge 2 ]] || die "Error: --wandb-key requires a value"
      WANDB_KEY="$2"
      shift 2
      ;;
    *)
      die "Unknown option: $1"
      ;;
  esac
done

if [[ -z "$HF_TOKEN" || -z "$WANDB_KEY" ]]; then
  die "Error: --hf-token and --wandb-key are required"
fi

echo "Launching evaluation instance..."
echo "Instance type: $INSTANCE_TYPE"

write_userdata

# --- AMI selection ----------------------------------------------------------
if [[ -z "$AMI_ID" ]]; then
  echo "Auto-detecting Deep Learning AMI..."
  # Newest available image whose name matches the filter.
  AMI_ID=$(aws ec2 describe-images \
    --owners amazon \
    --filters \
      "Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" \
      "Name=state,Values=available" \
    --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \
    --output text \
    --region "$REGION")

  # With --output text an empty result is printed as "" or "None".
  if [[ -z "$AMI_ID" || "$AMI_ID" == "None" ]]; then
    die "ERROR: Could not find AMI"
  fi
  echo "Using AMI: $AMI_ID"
fi

# --- Launch -----------------------------------------------------------------
# NOTE(review): --security-group-ids is documented as taking group IDs
# (sg-...), while SECURITY_GROUP holds a group name. Confirm this resolves in
# the target VPC, or switch to the group's ID.
echo "Launching instance..."
INSTANCE_ID=$(aws ec2 run-instances \
  --image-id "$AMI_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-group-ids "$SECURITY_GROUP" \
  --block-device-mappings "[{\"DeviceName\":\"/dev/sda1\",\"Ebs\":{\"VolumeSize\":$VOLUME_SIZE,\"VolumeType\":\"gp3\"}}]" \
  --user-data "file://$USERDATA_FILE" \
  --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME}]" \
  --query 'Instances[0].InstanceId' \
  --output text \
  --region "$REGION")

echo "Instance launched: $INSTANCE_ID"
echo "Waiting for instance to be running..."

aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" --region "$REGION"

PUBLIC_IP=$(aws ec2 describe-instances \
  --instance-ids "$INSTANCE_ID" \
  --query 'Reservations[0].Instances[0].PublicIpAddress' \
  --output text \
  --region "$REGION")

# --- Summary ----------------------------------------------------------------
echo ""
echo "==================================="
echo "Evaluation instance ready!"
echo "==================================="
echo "Instance ID: $INSTANCE_ID"
echo "Public IP: $PUBLIC_IP"
echo ""
echo "Waiting 60 seconds for system to initialize..."
sleep 60
echo ""
# NOTE(review): the hints below reference ~/.ssh/chave-gpu.pem while KEY_NAME
# is "chave-gpu-nova". Confirm which key file matches the launched key pair.
echo "To check setup progress:"
echo " ssh -i ~/.ssh/chave-gpu.pem ubuntu@$PUBLIC_IP 'tail -f ~/setup.log'"
echo ""
echo "To upload models:"
echo " scp -i ~/.ssh/chave-gpu.pem -r ./output/gpt2_base_700K_json ubuntu@$PUBLIC_IP:~/seriguela/output/"
echo " scp -i ~/.ssh/chave-gpu.pem -r ./output/gpt2_medium_700K_json ubuntu@$PUBLIC_IP:~/seriguela/output/"
echo " scp -i ~/.ssh/chave-gpu.pem -r ./output/gpt2_large_700K_json ubuntu@$PUBLIC_IP:~/seriguela/output/"
echo ""

# Record the instance details first, then announce where they were saved.
mkdir -p "$(dirname "$INFO_FILE")"
cat > "$INFO_FILE" << EOF
Instance ID: $INSTANCE_ID
Public IP: $PUBLIC_IP
Instance Type: $INSTANCE_TYPE
Launch Time: $(date)
EOF
echo "Instance info saved to: $INFO_FILE"

echo "Done!"
|
|