File size: 2,589 Bytes
a1190da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/bash
# Monitor GPT-2 Medium training in real-time.
#
# Reads the instance details stored in ~/.seriguela/medium_instance_info.txt
# (lines containing "Instance ID:", "Public IP:" and "Key Name:"), asks the
# AWS CLI whether that EC2 instance is running, then checks over SSH whether
# ~/.training_complete exists on it.
#   - Training complete: prints ~/training_results.txt and a download hint.
#   - Training running:  shows a menu (follow the log, last 50 log lines,
#                        nvidia-smi, or exit).
#
# Usage:    run without arguments.
# Requires: aws, ssh, and the private key ~/.ssh/<Key Name>.pem
# Exit:     0 on success or when "Exit" is chosen; non-zero on any failure.

set -euo pipefail

GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Get last instance info
INFO_FILE="${HOME}/.seriguela/medium_instance_info.txt"

if [[ ! -f "$INFO_FILE" ]]; then
    echo "No medium training instance found!"
    echo "Launch one first with: bash scripts/aws/launch_medium_training.sh"
    exit 1
fi

#######################################
# Print the third space-separated field of the first line in INFO_FILE
# that contains the given label.
# Globals:   INFO_FILE (read)
# Arguments: $1 - literal label to search for, e.g. "Instance ID:"
# Outputs:   the field on stdout; an empty line when the label is absent
#######################################
info_field() {
    local label=$1
    # grep exits non-zero when nothing matches; `|| true` keeps pipefail from
    # aborting here so the caller can report the empty value itself.
    { grep -m1 -F -- "$label" "$INFO_FILE" || true; } | cut -d' ' -f3
}

INSTANCE_ID=$(info_field "Instance ID:")
PUBLIC_IP=$(info_field "Public IP:")
KEY_NAME=$(info_field "Key Name:")

# Fail early with a clear message instead of an opaque aws/ssh error later.
if [[ -z "$INSTANCE_ID" || -z "$PUBLIC_IP" || -z "$KEY_NAME" ]]; then
    echo "Could not read Instance ID, Public IP and Key Name from $INFO_FILE" >&2
    exit 1
fi

SSH_TARGET="ubuntu@${PUBLIC_IP}"
# NOTE(review): StrictHostKeyChecking=no skips host-key verification. It is
# applied to every call so that later connections behave like the first one;
# consider "accept-new" if a changed host key should be treated as an error.
SSH_OPTS=(-i "${HOME}/.ssh/${KEY_NAME}.pem" -o StrictHostKeyChecking=no)

echo "=========================================="
echo "Monitoring GPT-2 Medium Training"
echo "=========================================="
echo "Instance ID: $INSTANCE_ID"
echo "Public IP: $PUBLIC_IP"
echo ""

# Check if instance is running. stderr is left visible so credential/region
# problems reported by the AWS CLI are not hidden.
if ! STATUS=$(aws ec2 describe-instances \
    --instance-ids "$INSTANCE_ID" \
    --query "Reservations[0].Instances[0].State.Name" \
    --output text); then
    echo "Failed to query the state of instance $INSTANCE_ID with the AWS CLI" >&2
    exit 1
fi

if [[ "$STATUS" != "running" ]]; then
    echo "Instance is not running (status: $STATUS)"
    exit 1
fi

echo -e "${GREEN}Instance is running${NC}"
echo ""

# Check if training is complete
echo "Checking training status..."
# -n keeps ssh from consuming this script's stdin (needed by the menu below).
# stderr is discarded to hide ssh's host-key notices; a connection failure is
# reported explicitly instead of letting `set -e` end the script silently.
if ! COMPLETE=$(ssh -n "${SSH_OPTS[@]}" "$SSH_TARGET" \
    'test -f ~/.training_complete && echo "yes" || echo "no"' 2>/dev/null); then
    echo "Could not reach ${SSH_TARGET} over SSH" >&2
    exit 1
fi

if [[ "$COMPLETE" == "yes" ]]; then
    echo -e "${GREEN}Training is COMPLETE!${NC}"
    echo ""
    echo "Results:"
    ssh "${SSH_OPTS[@]}" "$SSH_TARGET" 'cat ~/training_results.txt'
    echo ""
    echo "Download model with:"
    echo "  scp -i ~/.ssh/${KEY_NAME}.pem -r ubuntu@${PUBLIC_IP}:~/seriguela/output/gpt2_medium_700K_json ./"
    exit 0
fi

echo -e "${YELLOW}Training in progress...${NC}"
echo ""
echo "Options:"
echo "  1. Watch live logs (Ctrl+C to exit)"
echo "  2. Check last 50 lines"
echo "  3. Check GPU usage"
echo "  4. Exit"
echo ""
OPTION=""
# On EOF `read` returns non-zero; fall through to the "Invalid option" branch
# rather than exiting without a message.
read -r -p "Select option [1-4]: " OPTION || true

case "$OPTION" in
    1)
        echo ""
        echo "Following training logs (Ctrl+C to stop)..."
        echo ""
        # -t allocates a terminal on the remote side for the tail -f session.
        ssh "${SSH_OPTS[@]}" -t "$SSH_TARGET" \
            'tail -f /home/ubuntu/training_medium.log'
        ;;
    2)
        echo ""
        ssh "${SSH_OPTS[@]}" "$SSH_TARGET" \
            'tail -n 50 /home/ubuntu/training_medium.log'
        ;;
    3)
        echo ""
        ssh "${SSH_OPTS[@]}" "$SSH_TARGET" \
            'nvidia-smi'
        ;;
    4)
        exit 0
        ;;
    *)
        echo "Invalid option"
        exit 1
        ;;
esac