File size: 4,432 Bytes
a1190da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/bin/bash
# Launch AWS instance for model evaluation with GPU
#
# Usage:
#   <this-script> --hf-token <token> --wandb-key <key>
#
# Both options are required. The values are embedded in the user-data
# script that the new instance runs on first boot.

# -e: stop at the first failing command
# -u: referencing an unset variable is an error
# pipefail: a pipeline fails when any of its stages fails
set -euo pipefail

# Default values
INSTANCE_TYPE="g5.xlarge"
VOLUME_SIZE=100
INSTANCE_NAME="seriguela-evaluation"
KEY_NAME="chave-gpu-nova"
SECURITY_GROUP="seriguela-sg"
AMI_ID=""  # Will be auto-detected

# Parse arguments
HF_TOKEN=""
WANDB_KEY=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --hf-token)
      # Without this check a trailing "--hf-token" makes `shift 2` fail and
      # `set -e` terminates the script without any explanation.
      if [[ $# -lt 2 ]]; then
        echo "Error: --hf-token requires a value" >&2
        exit 1
      fi
      HF_TOKEN="$2"
      shift 2
      ;;
    --wandb-key)
      if [[ $# -lt 2 ]]; then
        echo "Error: --wandb-key requires a value" >&2
        exit 1
      fi
      WANDB_KEY="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

if [[ -z "$HF_TOKEN" || -z "$WANDB_KEY" ]]; then
  echo "Error: --hf-token and --wandb-key are required" >&2
  exit 1
fi

echo "Launching evaluation instance..."
echo "Instance type: $INSTANCE_TYPE"

# Create user data script
USERDATA_FILE="aws/temp/userdata_eval.sh"

#######################################
# Write the first-boot (EC2 user-data) script for the evaluation instance.
#
# The tokens are emitted with printf %q, so any character they contain
# ('/', '&', quotes, spaces, ...) survives intact; no sed substitution is
# involved.
#
# Arguments:
#   $1 - output path; missing parent directories are created
#   $2 - Hugging Face token
#   $3 - Weights & Biases API key
# Outputs:
#   Writes the script to $1 with owner-only permissions (it holds secrets).
# Returns:
#   0 on success, non-zero if the directory or file cannot be written.
#######################################
write_userdata_script() {
  local out_path=$1 hf_token=$2 wandb_key=$3
  local hf_quoted wandb_quoted

  # Shell-quote the secrets so the generated export lines are always valid.
  printf -v hf_quoted '%q' "$hf_token"
  printf -v wandb_quoted '%q' "$wandb_key"

  mkdir -p -- "$(dirname -- "$out_path")" || return 1

  # Create/truncate and lock down the file before any secret is written.
  : > "$out_path" || return 1
  chmod 600 -- "$out_path" || return 1

  {
    cat << 'USERDATA_EOF'
#!/bin/bash
set -x
exec > >(tee /home/ubuntu/setup.log) 2>&1

echo "=== Starting evaluation setup ==="
date

# Update system
export DEBIAN_FRONTEND=noninteractive
sudo apt-get update
sudo apt-get install -y git python3-pip python3-venv

# Setup as ubuntu user
cd /home/ubuntu

# Clone repository
if [ ! -d "seriguela" ]; then
    sudo -u ubuntu git clone https://github.com/augustocsc/seriguela.git
    cd seriguela
else
    cd seriguela
    sudo -u ubuntu git pull
fi

# Create virtual environment
sudo -u ubuntu python3 -m venv venv
source venv/bin/activate

# Install dependencies
pip install --upgrade pip
pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121
pip install -r requirements.txt

# Set tokens for the ubuntu login user. User data runs as root, so "~" would
# point at root's home rather than /home/ubuntu. Tracing is paused so the
# secrets never reach setup.log. Nothing later in this script reads the
# tokens, so they are not loaded into this shell.
set +x
cat >> /home/ubuntu/.bashrc << 'TOKENS_EOF'
USERDATA_EOF
    printf 'export HF_TOKEN=%s\n' "$hf_quoted"
    printf 'export WANDB_API_KEY=%s\n' "$wandb_quoted"
    cat << 'USERDATA_EOF'
TOKENS_EOF
set -x

# Create results directory
mkdir -p results

# The steps above ran as root; hand the checkout (venv, results, ...) back to
# the ubuntu user so evaluation runs can write into it.
chown -R ubuntu:ubuntu /home/ubuntu/seriguela

# Mark setup complete
sudo -u ubuntu touch /home/ubuntu/.evaluation_ready

echo "=== Evaluation setup complete ==="
date
USERDATA_EOF
  } >> "$out_path"
}

# The generated file contains both secrets in plain text: remove it when this
# script exits, whether it succeeded or failed.
trap 'rm -f -- "$USERDATA_FILE"' EXIT

write_userdata_script "$USERDATA_FILE" "$HF_TOKEN" "$WANDB_KEY"

# AMI IDs are region-specific, so the AMI lookup and every later EC2 call
# must target the same region instead of relying on the CLI default.
REGION="us-east-1"

# Auto-detect AMI if not set
if [[ -z "$AMI_ID" ]]; then
    echo "Auto-detecting Deep Learning AMI..."
    # Newest available image whose name matches the Deep Learning Base AMI.
    AMI_ID=$(aws ec2 describe-images \
      --owners amazon \
      --filters "Name=name,Values=Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" \
      "Name=state,Values=available" \
      --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \
      --output text \
      --region "$REGION")

    if [[ -z "$AMI_ID" || "$AMI_ID" == "None" ]]; then
        echo "ERROR: Could not find AMI" >&2
        exit 1
    fi
    echo "Using AMI: $AMI_ID"
fi

# Launch instance
# NOTE(review): --security-group-ids is documented as taking group IDs
# (sg-...), while SECURITY_GROUP holds a group name here - confirm this is
# accepted for the target VPC, or pass the ID instead.
echo "Launching instance..."
INSTANCE_ID=$(aws ec2 run-instances \
  --region "$REGION" \
  --image-id "$AMI_ID" \
  --instance-type "$INSTANCE_TYPE" \
  --key-name "$KEY_NAME" \
  --security-group-ids "$SECURITY_GROUP" \
  --block-device-mappings "[{\"DeviceName\":\"/dev/sda1\",\"Ebs\":{\"VolumeSize\":$VOLUME_SIZE,\"VolumeType\":\"gp3\"}}]" \
  --user-data file://aws/temp/userdata_eval.sh \
  --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME}]" \
  --query 'Instances[0].InstanceId' \
  --output text)

# Guard against a "successful" call that nevertheless returned no ID, so the
# wait/describe calls below never run with an empty argument.
if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
    echo "ERROR: run-instances did not return an instance ID" >&2
    exit 1
fi

echo "Instance launched: $INSTANCE_ID"
echo "Waiting for instance to be running..."

aws ec2 wait instance-running --region "$REGION" --instance-ids "$INSTANCE_ID"

# Get public IP
PUBLIC_IP=$(aws ec2 describe-instances \
  --region "$REGION" \
  --instance-ids "$INSTANCE_ID" \
  --query 'Reservations[0].Instances[0].PublicIpAddress' \
  --output text)

# With --output text a missing address is rendered as "None".
if [[ -z "$PUBLIC_IP" || "$PUBLIC_IP" == "None" ]]; then
    echo "WARNING: instance $INSTANCE_ID has no public IP address" >&2
fi

# Save instance info first, so the record exists even if the script is
# interrupted during the wait below and before it is announced as saved.
mkdir -p aws
cat > aws/evaluation_instance.txt << EOF
Instance ID: $INSTANCE_ID
Public IP: $PUBLIC_IP
Instance Type: $INSTANCE_TYPE
Launch Time: $(date)
EOF

# Private-key path shown in the hints below; the tilde stays literal because
# it is only printed for the user to copy.
# NOTE(review): assumes the private key is stored under the key pair's name
# (KEY_NAME) - confirm against the local ~/.ssh contents.
# shellcheck disable=SC2088
SSH_KEY_HINT="~/.ssh/${KEY_NAME}.pem"

echo ""
echo "==================================="
echo "Evaluation instance ready!"
echo "==================================="
echo "Instance ID: $INSTANCE_ID"
echo "Public IP: $PUBLIC_IP"
echo ""
echo "Waiting 60 seconds for system to initialize..."
sleep 60
echo ""
echo "To check setup progress:"
echo "  ssh -i $SSH_KEY_HINT ubuntu@$PUBLIC_IP 'tail -f ~/setup.log'"
echo ""
echo "To upload models:"
for model_dir in gpt2_base_700K_json gpt2_medium_700K_json gpt2_large_700K_json; do
  echo "  scp -i $SSH_KEY_HINT -r ./output/$model_dir ubuntu@$PUBLIC_IP:~/seriguela/output/"
done
echo ""
echo "Instance info saved to: aws/evaluation_instance.txt"

echo "Done!"