File size: 7,799 Bytes
a1190da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/bin/bash
# Launch AWS instance to train GPT-2 Medium (355M parameters)
# Usage: ./launch_medium_training.sh --wandb-key KEY [--hf-token TOKEN] [--instance-type TYPE]

# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Colors
# ANSI escape sequences stored as literal backslash strings; they only turn
# into real escape characters when printed by something that interprets
# backslash escapes (e.g. `echo -e`).
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # "no color": resets the terminal attributes

# Print an informational message to stdout, prefixed with a green [INFO] tag.
# Arguments: $1 - message text, printed verbatim (backslashes in the message
#                 are NOT interpreted; only the color codes are).
print_status() { printf '%b[INFO]%b %s\n' "${GREEN:-}" "${NC:-}" "${1:-}"; }
# Print a warning to stderr, prefixed with a yellow [WARN] tag.
# Arguments: $1 - message text, printed verbatim (backslashes in the message
#                 are NOT interpreted; only the color codes are).
print_warning() { printf '%b[WARN]%b %s\n' "${YELLOW:-}" "${NC:-}" "${1:-}" >&2; }
# Print an error to stderr, prefixed with a red [ERROR] tag.
# Does not exit; callers decide whether to abort.
# Arguments: $1 - message text, printed verbatim (backslashes in the message
#                 are NOT interpreted; only the color codes are).
print_error() { printf '%b[ERROR]%b %s\n' "${RED:-}" "${NC:-}" "${1:-}" >&2; }

# Default configuration
# AMI_ID, KEY_NAME and SECURITY_GROUP start empty and are discovered later in
# the script; HF_TOKEN and WANDB_KEY are filled in from the command line.
INSTANCE_TYPE="g5.xlarge"
AMI_ID=""
KEY_NAME=""
SECURITY_GROUP=""
# Region from the AWS CLI configuration, falling back to us-east-1 when the
# lookup fails or prints nothing.
REGION=$(aws configure get region 2>/dev/null || echo "us-east-1")
REGION=${REGION:-us-east-1}
# Make the fallback effective: without this, REGION was never handed to any
# aws command. A region already present in the environment is left untouched.
export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-$REGION}"
VOLUME_SIZE=100 # passed as the EBS VolumeSize when the instance is launched
INSTANCE_NAME="seriguela-medium-training"
HF_TOKEN=""
WANDB_KEY=""

# Parse arguments
# Reads the command-line options into the global configuration variables.
#   --hf-token TOKEN       -> HF_TOKEN
#   --wandb-key KEY        -> WANDB_KEY
#   --instance-type TYPE   -> INSTANCE_TYPE
#   --help                 -> print usage and exit 0
# Any other argument, or a value-taking option given without a value, prints
# a message on stderr and exits 1.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --hf-token | --wandb-key | --instance-type)
                # Without this guard `shift 2` fails on a trailing option and
                # `set -e` terminates the script with no explanation.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                case "$1" in
                    --hf-token) HF_TOKEN="$2" ;;
                    --wandb-key) WANDB_KEY="$2" ;;
                    --instance-type) INSTANCE_TYPE="$2" ;;
                esac
                shift 2
                ;;
            --help)
                echo "Usage: $0 --wandb-key KEY [--hf-token TOKEN] [--instance-type TYPE]"
                echo "Launches AWS instance to train GPT-2 Medium (355M)"
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Validate tokens
# The W&B key is mandatory: without it the launch is aborted.
if [[ -z "$WANDB_KEY" ]]; then
    print_error "Wandb API key is required! Use --wandb-key"
    exit 1
fi

# The HuggingFace token is optional: its absence only produces a warning.
[[ -n "$HF_TOKEN" ]] || print_warning "HuggingFace token not provided. Model won't be pushed to Hub."

print_status "Launching instance for GPT-2 Medium training..."

# Find Deep Learning AMI
# Queries Amazon-owned images whose name matches the Deep Learning Base GPU
# AMI for Ubuntu 22.04, sorts them by creation date and keeps the last
# (most recent) image ID.
print_status "Finding Deep Learning AMI..."
AMI_ID=$(
    aws ec2 describe-images \
        --output text \
        --owners amazon \
        --query "Images | sort_by(@, &CreationDate) | [-1].ImageId" \
        --filters "Name=name,Values=*Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"
)

# An empty answer or the literal text "None" both mean nothing matched.
case "$AMI_ID" in
    "" | "None")
        print_error "Could not find Deep Learning AMI"
        exit 1
        ;;
esac
print_status "Using AMI: $AMI_ID"

# Find key pair
# Uses the first key pair listed in the describe-key-pairs response.
# stderr is deliberately left attached: if the AWS call itself fails, `set -e`
# aborts the script and the CLI's own error message explains why, instead of
# the script exiting silently.
KEY_NAME=$(aws ec2 describe-key-pairs --query "KeyPairs[0].KeyName" --output text)
# With --output text a missing value is rendered as the literal string "None".
if [ -z "$KEY_NAME" ] || [ "$KEY_NAME" == "None" ]; then
    print_error "No SSH key pair found"
    exit 1
fi
print_status "Using key pair: $KEY_NAME"

# Find or create security group
# Looks up the "seriguela-sg" group; when it does not exist, creates it and
# opens TCP port 22 to the caller's current public IPv4 address only.
# stderr of the lookup is left attached so a failing AWS call is reported
# before `set -e` aborts, rather than exiting silently.
SECURITY_GROUP=$(aws ec2 describe-security-groups \
    --filters "Name=group-name,Values=seriguela-sg" \
    --query "SecurityGroups[0].GroupId" \
    --output text)

if [ -z "$SECURITY_GROUP" ] || [ "$SECURITY_GROUP" == "None" ]; then
    # Resolve the public IP BEFORE creating the group. If the lookup failed
    # after creation, the group would be left without an SSH rule, and later
    # runs would reuse it (it now exists) without ever adding the rule.
    MY_IP=$(curl -4 -fsS --max-time 10 https://ifconfig.me) || MY_IP=""
    if [[ ! "$MY_IP" =~ ^[0-9]{1,3}(\.[0-9]{1,3}){3}$ ]]; then
        print_error "Could not determine public IPv4 address (got: '${MY_IP}')"
        exit 1
    fi

    print_status "Creating security group..."
    SECURITY_GROUP=$(aws ec2 create-security-group \
        --group-name seriguela-sg \
        --description "Security group for Seriguela training" \
        --query "GroupId" --output text)
    aws ec2 authorize-security-group-ingress \
        --group-id "$SECURITY_GROUP" \
        --protocol tcp --port 22 \
        --cidr "${MY_IP}/32"
fi
print_status "Using security group: $SECURITY_GROUP"

# Create user-data script for GPT-2 Medium training
#
# build_user_data WANDB_KEY HF_TOKEN
#   Writes to stdout the script handed to the instance as user-data, with the
#   two credentials substituted into its `export` lines.
#   Arguments: $1 - Weights & Biases API key
#              $2 - HuggingFace token (may be empty)
#   Returns:   0 on success; 1 (with a message on stderr) when a credential
#              contains a single quote, which would terminate the
#              single-quoted string it is embedded in.
#
# The generated script redirects its output to /var/log/user-data.log, then
# runs the setup and training as user "ubuntu". The training log is teed to
# /home/ubuntu/training_medium.log. When the run ends (successfully or not)
# /home/ubuntu/.training_complete is created and
# /home/ubuntu/training_results.txt describes the outcome.
build_user_data() {
    local wandb_key=${1-} hf_token=${2-} script

    case "${wandb_key}${hf_token}" in
        *"'"*)
            echo "Tokens must not contain single quotes" >&2
            return 1
            ;;
    esac

    script=$(cat << 'USERDATA'
#!/bin/bash
exec > /var/log/user-data.log 2>&1
set -x

echo "=========================================="
echo "GPT-2 Medium Training Setup"
echo "Started: $(date)"
echo "=========================================="

# Allow system to stabilize (removed cloud-init deadlock)
sleep 5

sudo -u ubuntu bash << 'UBUNTUSETUP'
cd /home/ubuntu

echo "[1/9] Installing system dependencies..."
sudo apt-get update -qq
sudo apt-get install -y -qq python3-venv python3-pip git

echo "[2/9] Cloning repository..."
git clone https://github.com/augustocsc/seriguela.git
cd seriguela

echo "[3/9] Creating virtual environment..."
python3 -m venv venv
source venv/bin/activate

echo "[4/9] Upgrading pip..."
pip install --upgrade pip -q

echo "[5/9] Installing PyTorch with CUDA..."
pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 -q

echo "[6/9] Installing requirements..."
pip install -r requirements.txt -q

echo "[7/9] Upgrading Wandb..."
pip install --upgrade 'wandb>=0.24.1' -q

echo "[8/9] Configuring environment..."
export WANDB_API_KEY='WANDB_KEY_PLACEHOLDER'
export HF_TOKEN='HF_TOKEN_PLACEHOLDER'

echo "[9/9] Validating setup..."
nvidia-smi
python3 -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"

echo ""
echo "=========================================="
echo "Starting GPT-2 Medium Training"
echo "Model: gpt2-medium (355M parameters)"
echo "=========================================="

# Start training
cd /home/ubuntu/seriguela
source venv/bin/activate

python3 scripts/train_with_json.py \
  --model_size gpt2-medium \
  --dataset_repo augustocsc/sintetico_natural \
  --data_dir 700K \
  --output_dir ./output/gpt2_medium_700K_json \
  --num_train_epochs 3 \
  --per_device_train_batch_size 4 \
  --learning_rate 5e-5 \
  --early_stopping_patience 3 \
  2>&1 | tee /home/ubuntu/training_medium.log
# The pipeline status is the status of tee, so read the trainer status from
# PIPESTATUS instead of assuming success.
TRAIN_STATUS=${PIPESTATUS[0]}

if [ "$TRAIN_STATUS" -ne 0 ]; then
    echo ""
    echo "=========================================="
    echo "Training FAILED (exit code $TRAIN_STATUS)"
    echo "Finished: $(date)"
    echo "=========================================="
    echo "GPT-2 Medium training FAILED with exit code $TRAIN_STATUS. See ~/training_medium.log" > /home/ubuntu/training_results.txt
    # Still create the marker so that anyone polling for it stops waiting.
    touch /home/ubuntu/.training_complete
    exit "$TRAIN_STATUS"
fi

echo ""
echo "=========================================="
echo "Training Completed!"
echo "Finished: $(date)"
echo "=========================================="

# Create completion marker
touch /home/ubuntu/.training_complete

# Save results info
cat > /home/ubuntu/training_results.txt << 'RESULTS'
GPT-2 Medium Training Completed!

Model saved to: ~/seriguela/output/gpt2_medium_700K_json

Next steps:
1. Test model with REINFORCE:
   cd ~/seriguela
   source venv/bin/activate
   python scripts/debug_reinforce.py \
     --model_path ./output/gpt2_medium_700K_json \
     --dataset data/benchmarks/nguyen/nguyen_5.csv \
     --epochs 10

2. Compare with base model:
   python scripts/compare_trained_models.py \
     --model_base augustocsc/Se124M_700K_infix_v3_json \
     --model_medium ./output/gpt2_medium_700K_json

3. Download model to local:
   scp -r ubuntu@IP:~/seriguela/output/gpt2_medium_700K_json ./
RESULTS

UBUNTUSETUP
USERDATA
)

    # Replace placeholders
    # The replacement is quoted so that bash >= 5.2 (patsub_replacement) does
    # not expand an "&" inside a token to the matched placeholder text.
    script=${script//WANDB_KEY_PLACEHOLDER/"$wandb_key"}
    script=${script//HF_TOKEN_PLACEHOLDER/"$hf_token"}

    printf '%s\n' "$script"
}

USER_DATA=$(build_user_data "$WANDB_KEY" "$HF_TOKEN")

# Launch instance
# Starts one instance from AMI_ID with a gp3 volume of VOLUME_SIZE GiB mapped
# at /dev/sda1, tagged with INSTANCE_NAME, and stores its ID in INSTANCE_ID.
print_status "Launching instance..."

# USER_DATA embeds the W&B key and the HuggingFace token. Hand it to the CLI
# through a private temp file (file://) rather than as an argument, where it
# would be visible in the local process list.
USER_DATA_FILE=$(mktemp)
chmod 600 "$USER_DATA_FILE"
printf '%s\n' "$USER_DATA" > "$USER_DATA_FILE"

# Capture the status instead of letting `set -e` abort here, so the temp file
# holding the secrets is removed on failure as well.
LAUNCH_STATUS=0
INSTANCE_ID=$(aws ec2 run-instances \
    --image-id "$AMI_ID" \
    --instance-type "$INSTANCE_TYPE" \
    --key-name "$KEY_NAME" \
    --security-group-ids "$SECURITY_GROUP" \
    --block-device-mappings "[{\"DeviceName\":\"/dev/sda1\",\"Ebs\":{\"VolumeSize\":$VOLUME_SIZE,\"VolumeType\":\"gp3\"}}]" \
    --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=$INSTANCE_NAME},{Key=Model,Value=gpt2-medium}]" \
    --user-data "file://$USER_DATA_FILE" \
    --query "Instances[0].InstanceId" \
    --output text) || LAUNCH_STATUS=$?
rm -f -- "$USER_DATA_FILE"

if [ "$LAUNCH_STATUS" -ne 0 ] || [ -z "$INSTANCE_ID" ] || [ "$INSTANCE_ID" == "None" ]; then
    print_error "Failed to launch instance"
    exit 1
fi

print_status "Instance launched: $INSTANCE_ID"

# Wait for instance
# Blocks until the EC2 waiter reports the new instance as running.
print_status "Waiting for instance to start..."
aws ec2 wait instance-running \
    --instance-ids "$INSTANCE_ID"

# Get public IP
# Reads the PublicIpAddress field of the launched instance.
PUBLIC_IP=$(
    aws ec2 describe-instances \
        --output text \
        --query "Reservations[0].Instances[0].PublicIpAddress" \
        --instance-ids "$INSTANCE_ID"
)

# Print the post-launch summary: instance ID, public IP, and copy-pasteable
# commands for monitoring the run and for waiting on its completion marker.
# Globals: INSTANCE_ID, PUBLIC_IP, KEY_NAME, GREEN, BLUE, YELLOW, NC (read)
# Outputs: the summary on stdout
print_launch_summary() {
    echo ""
    echo "=========================================="
    printf '%bGPT-2 Medium Training Instance Ready!%b\n' "$GREEN" "$NC"
    echo "=========================================="
    echo "Instance ID: $INSTANCE_ID"
    echo "Public IP: $PUBLIC_IP"
    echo ""
    printf '%bMonitor training:%b\n' "$BLUE" "$NC"
    echo "  ssh -i ~/.ssh/${KEY_NAME}.pem ubuntu@${PUBLIC_IP}"
    echo "  tail -f /home/ubuntu/training_medium.log"
    echo ""
    printf '%bCheck when complete:%b\n' "$BLUE" "$NC"
    # Uses the same identity file as the monitoring command above; the previous
    # version omitted -i, so the two suggested ssh commands were inconsistent.
    echo "  ssh -i ~/.ssh/${KEY_NAME}.pem ubuntu@${PUBLIC_IP} 'while [ ! -f ~/.training_complete ]; do sleep 60; echo \"Training in progress...\"; done; cat ~/training_results.txt'"
    echo ""
    printf '%bEstimated time:%b ~2-3 hours for 3 epochs\n' "$YELLOW" "$NC"
    echo ""
}

print_launch_summary

# Save info
# Records the launch details in ~/.seriguela/medium_instance_info.txt,
# creating the directory when needed and overwriting any previous file.
INFO_DIR="$HOME/.seriguela"
mkdir -p "$INFO_DIR"
{
    printf 'Instance ID: %s\n' "$INSTANCE_ID"
    printf 'Public IP: %s\n' "$PUBLIC_IP"
    printf 'Key Name: %s\n' "$KEY_NAME"
    printf 'Model: GPT-2 Medium (355M)\n'
    printf 'Launched: %s\n' "$(date)"
} > "$INFO_DIR/medium_instance_info.txt"

print_status "Instance info saved to: $INFO_DIR/medium_instance_info.txt"