#!/bin/bash
#
# Monitor the GPT-2 Medium training run on its EC2 instance.
#
# Flow:
#   1. Read the instance ID, public IP and SSH key name from
#      ~/.seriguela/medium_instance_info.txt (expects "Instance ID:",
#      "Public IP:" and "Key Name:" lines).
#   2. Ask the AWS CLI whether the instance is in the "running" state.
#   3. Check over SSH whether ~/.training_complete exists on the instance.
#      If it does, print ~/training_results.txt and a download hint.
#   4. Otherwise offer an interactive menu: follow the log, show the last
#      50 log lines, or run nvidia-smi.
#
# Requires: aws CLI, ssh, and the private key at ~/.ssh/<Key Name>.pem.
# Exit status: 0 on success / user exit, 1 on any error or invalid choice.

set -euo pipefail

# ANSI colour escapes used for status lines.
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
# shellcheck disable=SC2034  # not referenced below; kept with the palette
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

readonly INFO_FILE="${HOME}/.seriguela/medium_instance_info.txt"

#######################################
# Print a message to stderr and exit with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print the first whitespace-delimited token that follows a label in
# INFO_FILE (e.g. "Instance ID: i-0abc" -> "i-0abc"). Only the first
# line containing the label is used; prints nothing when the label is
# absent or has no value.
# Globals:   INFO_FILE (read)
# Arguments: $1 - label text, including the trailing colon
# Outputs:   the value on stdout
#######################################
info_field() {
  awk -v label="$1" '
    {
      pos = index($0, label)
      if (pos) {
        n = split(substr($0, pos + length(label)), parts, " ")
        if (n > 0) print parts[1]
        exit
      }
    }
  ' "$INFO_FILE"
}

if [[ ! -f "$INFO_FILE" ]]; then
  printf '%s\n' "No medium training instance found!" >&2
  printf '%s\n' "Launch one first with: bash scripts/aws/launch_medium_training.sh" >&2
  exit 1
fi

INSTANCE_ID=$(info_field "Instance ID:")
PUBLIC_IP=$(info_field "Public IP:")
KEY_NAME=$(info_field "Key Name:")

# Refuse to continue with empty values: they would otherwise be passed
# straight to aws/ssh and produce confusing failures.
if [[ -z "$INSTANCE_ID" || -z "$PUBLIC_IP" || -z "$KEY_NAME" ]]; then
  die "Could not read Instance ID, Public IP and Key Name from ${INFO_FILE}"
fi

readonly KEY_FILE="${HOME}/.ssh/${KEY_NAME}.pem"
readonly REMOTE="ubuntu@${PUBLIC_IP}"

echo "=========================================="
echo "Monitoring GPT-2 Medium Training"
echo "=========================================="
echo "Instance ID: $INSTANCE_ID"
echo "Public IP: $PUBLIC_IP"
echo ""

# The CLI's own stderr is suppressed; a failure is reported explicitly
# here instead of letting `set -e` terminate the script without a message.
if ! STATUS=$(aws ec2 describe-instances \
  --instance-ids "$INSTANCE_ID" \
  --query "Reservations[0].Instances[0].State.Name" \
  --output text 2>/dev/null); then
  die "Failed to query the state of instance ${INSTANCE_ID} (check that the AWS CLI is installed and configured)"
fi

if [[ "$STATUS" != "running" ]]; then
  die "Instance is not running (status: $STATUS)"
fi

printf '%b\n' "${GREEN}Instance is running${NC}"
echo ""

echo "Checking training status..."
# The remote command always prints "yes" or "no", so a non-zero status
# here means the SSH connection itself failed. ssh's stderr is silenced
# on this probe, so the failure is reported explicitly below.
if ! COMPLETE=$(ssh -i "$KEY_FILE" -o StrictHostKeyChecking=no "$REMOTE" \
  'test -f ~/.training_complete && echo "yes" || echo "no"' 2>/dev/null); then
  die "Could not connect to ${REMOTE} over SSH using key ${KEY_FILE}"
fi

if [[ "$COMPLETE" == "yes" ]]; then
  printf '%b\n' "${GREEN}Training is COMPLETE!${NC}"
  echo ""
  echo "Results:"
  ssh -i "$KEY_FILE" "$REMOTE" 'cat ~/training_results.txt'
  echo ""
  echo "Download model with:"
  echo " scp -i ~/.ssh/${KEY_NAME}.pem -r ubuntu@${PUBLIC_IP}:~/seriguela/output/gpt2_medium_700K_json ./"
  exit 0
fi

printf '%b\n' "${YELLOW}Training in progress...${NC}"
echo ""
echo "Options:"
echo " 1. Watch live logs (Ctrl+C to exit)"
echo " 2. Check last 50 lines"
echo " 3. Check GPU usage"
echo " 4. Exit"
echo ""
# On EOF (e.g. stdin closed) fall through to the "Invalid option" arm
# rather than exiting without explanation.
read -r -p "Select option [1-4]: " OPTION || OPTION=""

case "$OPTION" in
  1)
    echo ""
    echo "Following training logs (Ctrl+C to stop)..."
    echo ""
    # -t allocates a TTY so Ctrl+C also stops the remote tail.
    ssh -i "$KEY_FILE" -t "$REMOTE" \
      'tail -f /home/ubuntu/training_medium.log'
    ;;
  2)
    echo ""
    ssh -i "$KEY_FILE" "$REMOTE" \
      'tail -n 50 /home/ubuntu/training_medium.log'
    ;;
  3)
    echo ""
    ssh -i "$KEY_FILE" "$REMOTE" \
      'nvidia-smi'
    ;;
  4)
    exit 0
    ;;
  *)
    die "Invalid option"
    ;;
esac