#!/bin/bash
#
# Monitor the GPT-2 Medium training run on the EC2 instance described by
# ~/.seriguela/medium_instance_info.txt: report the instance state, detect
# whether training has completed, and offer to tail logs or show GPU usage.
#
# Requires the `aws` CLI and `ssh` on PATH.

# Abort as soon as a command fails outside of a conditional context.
set -e

# ANSI colour escape sequences. They are stored as literal backslash
# sequences, so they must be printed with `echo -e` (or printf '%b').
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Reset: switches the terminal back to its default colour.
NC='\033[0m'

# File from which the instance ID, public IP and key name are read below.
INFO_FILE="${HOME}/.seriguela/medium_instance_info.txt"

# Without the info file there is nothing to monitor: tell the user how to
# create an instance and stop.
if [[ ! -f "$INFO_FILE" ]]; then
  echo "No medium training instance found!"
  echo "Launch one first with: bash scripts/aws/launch_medium_training.sh"
  exit 1
fi

#######################################
# Print the first whitespace-delimited token that follows "<label>:" on the
# first line of a file containing that label.
# Tolerates leading indentation, repeated spaces or tabs after the colon and
# a trailing carriage return. Prints nothing when the label is absent or the
# file is not readable.
# Arguments: $1 - label without the trailing colon (e.g. "Instance ID")
#            $2 - path of the file to search
# Outputs:   the value on stdout (possibly nothing)
# Returns:   0 always, so a missing field never aborts the script under
#            `set -e`; callers see an empty value instead.
#######################################
read_info_field() {
  local label=$1
  local file=$2

  # Never hand awk an empty or unreadable path: some awk implementations
  # fall back to reading stdin when the file operand is an empty string.
  [[ -r "$file" ]] || return 0

  awk -v label="${label}:" '
    {
      pos = index($0, label)
      if (pos == 0) next
      rest = substr($0, pos + length(label))
      sub(/\r$/, "", rest)
      if (split(rest, tok, " ") > 0) print tok[1]
      exit
    }
  ' "$file"
}

# Connection details of the training instance, as recorded in the info file.
INSTANCE_ID=$(read_info_field "Instance ID" "$INFO_FILE")
PUBLIC_IP=$(read_info_field "Public IP" "$INFO_FILE")
KEY_NAME=$(read_info_field "Key Name" "$INFO_FILE")

#######################################
# Print the monitoring header followed by a blank line.
# Globals:   INSTANCE_ID (read), PUBLIC_IP (read)
# Outputs:   six lines on stdout
#######################################
print_banner() {
  printf '%s\n' \
    "==========================================" \
    "Monitoring GPT-2 Medium Training" \
    "==========================================" \
    "Instance ID: $INSTANCE_ID" \
    "Public IP: $PUBLIC_IP" \
    ""
}

print_banner

# Ask EC2 for the current state of the instance.
# The assignment is guarded with `||` on purpose: under `set -e` a failing
# AWS CLI call (missing credentials, unknown instance ID, no network) would
# otherwise terminate the script right here, without any message, because
# the CLI's stderr is discarded.
STATUS=$(aws ec2 describe-instances \
  --instance-ids "$INSTANCE_ID" \
  --query "Reservations[0].Instances[0].State.Name" \
  --output text 2>/dev/null) || STATUS=""

# Anything other than "running" (including a failed lookup) means there is
# nothing to connect to.
if [[ "$STATUS" != "running" ]]; then
  echo "Instance is not running (status: ${STATUS:-unknown})"
  exit 1
fi

echo -e "${GREEN}Instance is running${NC}"
echo ""

echo "Checking training status..."

# Probe for the ~/.training_complete marker file on the instance. The remote
# command always prints "yes" or "no" and exits 0, so a non-zero status from
# ssh means the connection itself failed. That case is reported explicitly;
# otherwise `set -e` would end the script silently, since ssh's stderr is
# discarded here.
if ! COMPLETE=$(ssh -i "${HOME}/.ssh/${KEY_NAME}.pem" \
    -o StrictHostKeyChecking=no "ubuntu@${PUBLIC_IP}" \
    'test -f ~/.training_complete && echo "yes" || echo "no"' 2>/dev/null); then
  echo "Could not reach ubuntu@${PUBLIC_IP} over SSH (key: ~/.ssh/${KEY_NAME}.pem)" >&2
  exit 1
fi

if [[ "$COMPLETE" == "yes" ]]; then
  echo -e "${GREEN}Training is COMPLETE!${NC}"
  echo ""
  echo "Results:"
  # Showing the results file is best-effort: if it cannot be read, still
  # print the download hint below instead of aborting under `set -e`.
  ssh -i "${HOME}/.ssh/${KEY_NAME}.pem" "ubuntu@${PUBLIC_IP}" \
    'cat ~/training_results.txt' \
    || echo "(could not read ~/training_results.txt on the instance)" >&2
  echo ""
  echo "Download model with:"
  echo " scp -i ~/.ssh/${KEY_NAME}.pem -r ubuntu@${PUBLIC_IP}:~/seriguela/output/gpt2_medium_700K_json ./"
  exit 0
fi

# Training has not finished: show the interactive menu.
echo -e "${YELLOW}Training in progress...${NC}"
printf '%s\n' \
  "" \
  "Options:" \
  " 1. Watch live logs (Ctrl+C to exit)" \
  " 2. Check last 50 lines" \
  " 3. Check GPU usage" \
  " 4. Exit" \
  ""

# -r keeps backslashes in the answer literal. `read` returns non-zero on
# end-of-input; the `|| :` keeps `set -e` from killing the script silently in
# that case, leaving OPTION empty (or holding a final line that had no
# trailing newline) for the option dispatch that follows.
read -r -p "Select option [1-4]: " OPTION || :

# Run the action chosen at the prompt. Every remote action connects as the
# `ubuntu` user with the key file ~/.ssh/<KEY_NAME>.pem.
case "$OPTION" in
  1)
    echo ""
    echo "Following training logs (Ctrl+C to stop)..."
    echo ""
    # -t allocates a terminal for the remote `tail -f`.
    ssh -i "${HOME}/.ssh/${KEY_NAME}.pem" -t "ubuntu@${PUBLIC_IP}" \
      'tail -f /home/ubuntu/training_medium.log'
    ;;
  2)
    echo ""
    ssh -i "${HOME}/.ssh/${KEY_NAME}.pem" "ubuntu@${PUBLIC_IP}" \
      'tail -n 50 /home/ubuntu/training_medium.log'
    ;;
  3)
    echo ""
    ssh -i "${HOME}/.ssh/${KEY_NAME}.pem" "ubuntu@${PUBLIC_IP}" \
      'nvidia-smi'
    ;;
  4)
    exit 0
    ;;
  *)
    echo "Invalid option"
    exit 1
    ;;
esac