#!/bin/bash
#
# Training monitor: polls for a running training process, prints progress
# taken from the training log, and writes a notification file when the
# process is gone.

# Abort on the first unhandled command failure.
set -e

# ANSI colour escape sequences used by the print_* helpers below.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # reset to the terminal default

# The helpers expand backslash escapes only in the colour codes (%b).
# The caller's message goes through %s, so text that happens to contain
# backslashes (paths, log excerpts) is printed verbatim instead of being
# reinterpreted as \t, \n, \c, ...

# print_status MESSAGE - green HH:MM:SS timestamp followed by MESSAGE.
print_status() {
  printf '%b[%s]%b %s\n' "$GREEN" "$(date '+%H:%M:%S')" "$NC" "$1"
}

# print_warning MESSAGE - yellow HH:MM:SS timestamp followed by MESSAGE.
print_warning() {
  printf '%b[%s]%b %s\n' "$YELLOW" "$(date '+%H:%M:%S')" "$NC" "$1"
}

# print_error MESSAGE - red HH:MM:SS timestamp followed by MESSAGE.
print_error() {
  printf '%b[%s]%b %s\n' "$RED" "$(date '+%H:%M:%S')" "$NC" "$1"
}

# print_header TITLE - blue banner: TITLE between two rules, with a blank
# line before and after.
print_header() {
  local rule='========================================'
  printf '\n%b%s\n%s\n%s%b\n\n' "$BLUE" "$rule" "$1" "$rule" "$NC"
}
# ---------------------------------------------------------------------------
# Configuration.
# Each setting keeps its previous value as the default but can be overridden
# from the environment, e.g.:  CHECK_INTERVAL=10 LOG_FILE=/tmp/run.log ./monitor.sh
# ---------------------------------------------------------------------------
PROJECT_DIR="${PROJECT_DIR:-/home/ubuntu/seriguela}"
LOG_FILE="${LOG_FILE:-$HOME/training_success.log}"          # training log that is scanned for progress
MONITOR_LOG="${MONITOR_LOG:-$HOME/monitor_output.log}"
TRAINING_PID=""                                             # runtime state, filled in by get_training_pid
CHECK_INTERVAL="${CHECK_INTERVAL:-60}"                      # seconds between polls
MODEL_PATH="${MODEL_PATH:-./output/Se124M_700K_infix}"      # relative to PROJECT_DIR
DATASET_REPO="${DATASET_REPO:-augustocsc/sintetico_natural}"
DATA_DIR="${DATA_DIR:-700K}"
DATA_COLUMN="${DATA_COLUMN:-i_prompt_n}"

# Everything below uses paths relative to the project root (MODEL_PATH, venv),
# so fail loudly if the directory or the virtualenv is missing.
if ! cd "$PROJECT_DIR"; then
  printf 'Cannot enter project directory: %s\n' "$PROJECT_DIR" >&2
  exit 1
fi

if [ ! -f venv/bin/activate ]; then
  printf 'Virtualenv activation script not found: %s/venv/bin/activate\n' "$PROJECT_DIR" >&2
  exit 1
fi
source venv/bin/activate
# get_training_pid
#   Looks for processes whose full command line contains
#   "python scripts/train.py" and stores their PID(s) in the global
#   TRAINING_PID. Several PIDs are joined with single spaces so the value
#   fits on one status line. TRAINING_PID is set to the empty string when
#   nothing matches.
# Globals:  TRAINING_PID (written)
# Returns:  always 0
get_training_pid() {
  local pids
  # pgrep never reports itself, so no "grep -v grep" step is needed.
  # It exits non-zero when nothing matches; treat that as "no PID" rather
  # than letting `set -e` abort the monitor.
  pids=$(pgrep -f 'python scripts/train\.py') || pids=""
  TRAINING_PID=${pids//$'\n'/ }
}
# is_training_running
#   Refreshes TRAINING_PID via get_training_pid and reports whether a
#   training process was found.
# Returns:  0 when TRAINING_PID is non-empty, 1 otherwise
is_training_running() {
  get_training_pid
  [ -n "$TRAINING_PID" ]
}
# get_progress
#   Prints the most recent progress percentage found in the last 100 lines
#   of LOG_FILE. A percentage is a run of digits immediately followed by
#   "%|". Prints 0 when the log is missing or contains no such marker.
# Globals:  LOG_FILE (read)
# Outputs:  an integer without the percent sign, on stdout
get_progress() {
  local pct=""
  if [[ -f "$LOG_FILE" ]]; then
    # grep -o emits every "NN%|" occurrence on its own line, so the last
    # one is the newest even when many updates share one physical line.
    # Extracting the whole digit run avoids the greedy-regex trap where
    # only the final digit is captured.
    pct=$(tail -n 100 "$LOG_FILE" \
      | grep -oE '[0-9]+%[|]' \
      | tail -n 1 \
      | tr -d '%|') || pct=""
  fi
  printf '%s\n' "${pct:-0}"
}
# get_training_stats [TOTAL_STEPS]
#   Prints the last line, among the final 100 lines of LOG_FILE, that
#   contains a "<digits>/<TOTAL_STEPS>" step counter. Prints an empty line
#   when no such line exists, and nothing at all when LOG_FILE is missing.
# Arguments: $1 - total number of steps to look for (default: 21882).
#                 It is inserted into a regular expression, so pass digits only.
# Globals:   LOG_FILE (read)
get_training_stats() {
  local total_steps="${1:-21882}"
  local last_line=""
  if [[ -f "$LOG_FILE" ]]; then
    last_line=$(tail -n 100 "$LOG_FILE" \
      | grep -E "[0-9]+/${total_steps}" \
      | tail -n 1) || last_line=""
    printf '%s\n' "$last_line"
  fi
}
# send_notification TITLE MESSAGE
#   Shows TITLE as a banner and MESSAGE on stdout, then overwrites
#   $HOME/training_notification.txt with the same information plus a
#   timestamp.
# Arguments: $1 - notification title
#            $2 - notification body (may span several lines)
send_notification() {
  local heading="$1"
  local body="$2"
  local rule="================================================================================"

  print_header "$heading"
  echo "$body"

  # File layout: rule, title, timestamp, rule, blank, body, blank, rule.
  printf '%s\n%s\n%s\n%s\n\n%s\n\n%s\n' \
    "$rule" \
    "$heading" \
    "$(date '+%Y-%m-%d %H:%M:%S')" \
    "$rule" \
    "$body" \
    "$rule" > "$HOME/training_notification.txt"

  print_status "Notification saved to: $HOME/training_notification.txt"
}
# ---------------------------------------------------------------------------
# Main monitoring loop.
# Polls every CHECK_INTERVAL seconds while a training process exists. Once it
# is gone, classifies the run as success or failure from the log contents,
# writes a notification and, on success, runs the analysis script.
# ---------------------------------------------------------------------------
print_header "Training Monitor Started"
print_status "Monitoring training process..."
print_status "Log file: $LOG_FILE"
print_status "Check interval: ${CHECK_INTERVAL}s"

START_TIME=$(date +%s)
LAST_PROGRESS=0

while true; do
  if is_training_running; then
    CURRENT_PROGRESS=$(get_progress)
    TRAINING_STATS=$(get_training_stats)

    print_status "Training running (PID: $TRAINING_PID) - Progress: ${CURRENT_PROGRESS}%"

    if [ -n "$TRAINING_STATS" ]; then
      echo " $TRAINING_STATS"
    fi

    # GPU statistics are informational only: a missing or failing nvidia-smi
    # must not terminate the monitor under `set -e`.
    GPU_INFO=$(nvidia-smi --query-gpu=utilization.gpu,memory.used \
      --format=csv,noheader,nounits 2>/dev/null) || GPU_INFO="unavailable"
    echo " GPU: $GPU_INFO"

    # Remembered so the failure report can show how far training got.
    LAST_PROGRESS=$CURRENT_PROGRESS
    sleep "$CHECK_INTERVAL"
  else
    # Elapsed time is measured from monitor start, not from training start.
    END_TIME=$(date +%s)
    DURATION=$((END_TIME - START_TIME))
    HOURS=$((DURATION / 3600))
    MINUTES=$(((DURATION % 3600) / 60))

    print_header "Training Process Ended"

    # Success is inferred from the log: either the "Training finished"
    # marker or a progress bar that reached 100%.
    if grep -q "Training finished" "$LOG_FILE" 2>/dev/null || \
       grep -q "100%|" "$LOG_FILE" 2>/dev/null; then

      print_status "Training completed successfully!"
      print_status "Total time: ${HOURS}h ${MINUTES}m"

      FINAL_METRICS=$(tail -200 "$LOG_FILE" | grep -E "(train_loss|eval_loss)" | tail -5)

      send_notification "✅ Training Completed Successfully" \
"Training Duration: ${HOURS}h ${MINUTES}m
Model: GPT-2 (124M) with LoRA
Dataset: 700K infix
Output: $MODEL_PATH

Final Metrics:
$FINAL_METRICS

Wandb Dashboard:
https://wandb.ai/symbolic-gression/seriguela_700K_test

Starting automatic analysis...
"

      print_header "Starting Automatic Analysis"
      # The pipeline's status is tee's, so read the analysis script's own
      # exit code from PIPESTATUS instead of always claiming success.
      bash "$PROJECT_DIR/scripts/aws/analyze_model.sh" "$MODEL_PATH" "$DATA_COLUMN" 2>&1 \
        | tee "$HOME/analysis_output.log"
      ANALYSIS_RC=${PIPESTATUS[0]}

      if [ "$ANALYSIS_RC" -eq 0 ]; then
        print_status "Analysis complete! Check: $HOME/analysis_output.log"
      else
        print_error "Analysis failed (exit code $ANALYSIS_RC). Check: $HOME/analysis_output.log"
      fi

    else

      print_error "Training ended unexpectedly!"

      # First ten error-looking lines from the tail of the log.
      ERRORS=$(tail -50 "$LOG_FILE" | grep -E "(Error|Exception|Traceback)" | head -10)

      send_notification "❌ Training Failed or Interrupted" \
"Training Duration: ${HOURS}h ${MINUTES}m
Last Progress: ${LAST_PROGRESS}%

Possible Errors:
$ERRORS

Check full log: $LOG_FILE
"
    fi

    break
  fi
done

print_status "Monitor finished. Check notification file: $HOME/training_notification.txt"