#!/bin/bash
#
# Training monitor.
#
# Polls for a running `python scripts/train.py` process, reports progress read
# from the training log and, once the process is gone, writes a notification
# file describing the outcome.

# Abort on any unhandled command failure. `pipefail` and `-u` are deliberately
# NOT enabled: several pipelines in this script end in grep, which exits 1 on
# "no match", and that is an expected outcome here.
set -e

# ANSI colour escape sequences, stored in backslash form; they are expanded
# only when printed.
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # reset / "no colour"

#######################################
# Internal: print "<colour>[HH:MM:SS]<reset> <message>" on stdout.
# Globals:   NC (read)
# Arguments: $1 - colour escape sequence in backslash form (e.g. "$GREEN")
#            $2 - message; printed verbatim, backslashes are NOT interpreted
#######################################
_print_tagged() {
  # %b expands the escapes in the colour codes only; %s keeps the message
  # literal so paths or log excerpts containing backslashes are not mangled.
  printf '%b[%s]%b %s\n' "$1" "$(date '+%H:%M:%S')" "$NC" "${2-}"
}

# Timestamped informational message (green tag). $1 - message.
print_status() { _print_tagged "$GREEN" "${1-}"; }

# Timestamped warning message (yellow tag). $1 - message.
print_warning() { _print_tagged "$YELLOW" "${1-}"; }

# Timestamped error message (red tag), written to stdout like the others.
# $1 - message.
print_error() { _print_tagged "$RED" "${1-}"; }

# Blue banner: blank line, rule, title, rule, blank line. $1 - title.
print_header() {
  printf '\n%b========================================\n%s\n========================================%b\n\n' \
    "$BLUE" "${1-}" "$NC"
}

# --- Configuration ------------------------------------------------------------
# Every setting may be overridden from the environment; the defaults are the
# values this script was originally hard-coded with.
PROJECT_DIR="${PROJECT_DIR:-/home/ubuntu/seriguela}"   # project root the monitor runs from
LOG_FILE="${LOG_FILE:-$HOME/training_success.log}"     # training log that is parsed for progress
MONITOR_LOG="${MONITOR_LOG:-$HOME/monitor_output.log}" # not referenced in the visible part of this script
TRAINING_PID=""                                        # runtime state, filled in by get_training_pid
CHECK_INTERVAL="${CHECK_INTERVAL:-60}"                 # seconds between polls
MODEL_PATH="${MODEL_PATH:-./output/Se124M_700K_infix}" # relative to PROJECT_DIR; passed to the analysis script
DATASET_REPO="${DATASET_REPO:-augustocsc/sintetico_natural}" # not referenced in the visible part of this script
DATA_DIR="${DATA_DIR:-700K}"                           # not referenced in the visible part of this script
DATA_COLUMN="${DATA_COLUMN:-i_prompt_n}"               # passed to the analysis script

# --- Environment --------------------------------------------------------------
# Work from the project root: MODEL_PATH and the virtualenv path are relative
# to it. Fail with an explicit message rather than relying on `set -e` alone.
cd "$PROJECT_DIR" || {
  printf 'Cannot enter project directory: %s\n' "$PROJECT_DIR" >&2
  exit 1
}

if [[ ! -f venv/bin/activate ]]; then
  printf 'Virtual environment not found: %s/venv/bin/activate\n' "$PROJECT_DIR" >&2
  exit 1
fi

# shellcheck source=/dev/null
source venv/bin/activate

#######################################
# Look up the training process.
# Globals:   TRAINING_PID (written) - PID of every process whose command line
#            matches "python scripts/train.py", one per line; empty if none.
# Returns:   always 0.
#######################################
get_training_pid() {
  # pgrep -f matches against the full command line and never reports itself,
  # so the old `ps | grep | grep -v grep` filtering is unnecessary.
  # pgrep exits 1 when nothing matches; `|| true` keeps that expected outcome
  # from aborting the script under `set -e`.
  TRAINING_PID=$(pgrep -f "python scripts/train.py") || true
}

#######################################
# Tell whether a training process currently exists.
# Globals:   TRAINING_PID (refreshed through get_training_pid)
# Returns:   0 if at least one training PID was found, 1 otherwise.
#######################################
is_training_running() {
  get_training_pid
  # The status of this test is the function's return value.
  [[ -n "$TRAINING_PID" ]]
}

#######################################
# Print the most recent progress percentage found in the training log.
# Globals:   LOG_FILE (read)
# Outputs:   the integer immediately preceding the last "%|" within the final
#            100 lines of LOG_FILE; "0" when the file is missing or contains
#            no such marker.
# Returns:   always 0.
#######################################
get_progress() {
  local percent=""

  if [[ -f "$LOG_FILE" ]]; then
    # -o prints each "NN%|" match on its own line, so `tail -n 1` selects the
    # last one even when several updates share one physical line. (The earlier
    # sed expression used a greedy ".*" and kept only the final digit, turning
    # 45% into 5 and 100% into 0.)
    # -a: treat the log as text even if it contains bytes grep would otherwise
    # classify as binary.
    percent=$(tail -n 100 -- "$LOG_FILE" | grep -aoE '[0-9]+%\|' | tail -n 1) || true
    percent=${percent%'%|'}
  fi

  printf '%s\n' "${percent:-0}"
}

#######################################
# Print the most recent log entry that carries a "<step>/<total>" counter.
# Globals:   LOG_FILE (read), TOTAL_STEPS (read, optional)
# Arguments: $1 - total step count to look for (optional). Falls back to
#                 TOTAL_STEPS, then to 21882, the value previously hard-coded.
# Outputs:   the last matching entry among the final 100 lines of LOG_FILE;
#            nothing when the file is missing or nothing matches.
# Returns:   always 0.
#######################################
get_training_stats() {
  local total_steps="${1:-${TOTAL_STEPS:-21882}}"

  [[ -f "$LOG_FILE" ]] || return 0

  # NOTE(review): the `tr` assumes the progress bar may redraw itself with
  # carriage returns, leaving many updates on one physical line; splitting on
  # them returns only the latest update. It is a no-op for plain logs.
  # -a: treat the log as text even if grep would classify it as binary.
  tail -n 100 -- "$LOG_FILE" \
    | tr '\r' '\n' \
    | grep -aE "[0-9]+/${total_steps}" \
    | tail -n 1
}

#######################################
# Announce an event on stdout and persist it to a notification file.
# Globals:   HOME (read)
# Arguments: $1 - title
#            $2 - message body (may span several lines)
# Outputs:   banner with the title, the message, and a confirmation line on
#            stdout; overwrites $HOME/training_notification.txt with the title,
#            the current timestamp and the message.
# Returns:   0 on success, 1 if the notification file could not be written.
#######################################
send_notification() {
  local title="$1"
  local message="$2"
  local notification_file="$HOME/training_notification.txt"

  print_header "$title"
  # printf rather than echo: a message such as "-n" must be printed literally.
  printf '%s\n' "$message"

  # The here-document body must stay at column 0; its text is written verbatim.
  if ! cat > "$notification_file" << EOF
================================================================================
$title
$(date '+%Y-%m-%d %H:%M:%S')
================================================================================

$message

================================================================================
EOF
  then
    print_error "Could not write notification file: $notification_file"
    return 1
  fi

  print_status "Notification saved to: $notification_file"
}

# --- Main monitoring loop -----------------------------------------------------
# While a training process exists: report progress every CHECK_INTERVAL
# seconds. Once it is gone: classify the run from the log contents, write a
# notification and, on success, run the analysis script.
print_header "Training Monitor Started"
print_status "Monitoring training process..."
print_status "Log file: $LOG_FILE"
print_status "Check interval: ${CHECK_INTERVAL}s"

# NOTE: this is the moment the monitor started, not the training itself, so
# the reported "Training Duration" is really the time spent monitoring.
START_TIME=$(date +%s)
LAST_PROGRESS=0

while true; do
  if is_training_running; then
    CURRENT_PROGRESS=$(get_progress)
    # Guard against an empty result so the status line and the failure report
    # always carry a number.
    CURRENT_PROGRESS=${CURRENT_PROGRESS:-0}
    TRAINING_STATS=$(get_training_stats)

    print_status "Training running (PID: $TRAINING_PID) - Progress: ${CURRENT_PROGRESS}%"

    if [[ -n "$TRAINING_STATS" ]]; then
      echo " $TRAINING_STATS"
    fi

    # GPU statistics are informational only: a missing or failing nvidia-smi
    # must not terminate the monitor under `set -e`.
    GPU_INFO=$(nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv,noheader,nounits) \
      || GPU_INFO="unavailable"
    echo " GPU: $GPU_INFO"

    LAST_PROGRESS=$CURRENT_PROGRESS
    sleep "$CHECK_INTERVAL"
  else
    END_TIME=$(date +%s)
    DURATION=$((END_TIME - START_TIME))
    HOURS=$((DURATION / 3600))
    MINUTES=$(((DURATION % 3600) / 60))

    print_header "Training Process Ended"

    # The run counts as successful when the log contains "Training finished"
    # or a "100%|" marker anywhere.
    # NOTE(review): if the log also contains other bars that reach 100%
    # (e.g. per-evaluation bars), a crashed run could be classified as
    # successful - confirm against a real log.
    if grep -q "Training finished" "$LOG_FILE" 2>/dev/null || \
       grep -q "100%|" "$LOG_FILE" 2>/dev/null; then

      print_status "Training completed successfully!"
      print_status "Total time: ${HOURS}h ${MINUTES}m"

      # Last five loss lines from the tail of the log.
      FINAL_METRICS=$(tail -200 "$LOG_FILE" | grep -E "(train_loss|eval_loss)" | tail -5)

      # The message text below is emitted verbatim and must stay at column 0.
      send_notification "✅ Training Completed Successfully" \
        "Training Duration: ${HOURS}h ${MINUTES}m
Model: GPT-2 (124M) with LoRA
Dataset: 700K infix
Output: $MODEL_PATH

Final Metrics:
$FINAL_METRICS

Wandb Dashboard:
https://wandb.ai/symbolic-gression/seriguela_700K_test

Starting automatic analysis...
"

      print_header "Starting Automatic Analysis"
      bash "$PROJECT_DIR/scripts/aws/analyze_model.sh" "$MODEL_PATH" "$DATA_COLUMN" 2>&1 \
        | tee "$HOME/analysis_output.log"
      # The pipeline's own status is tee's; take the analysis script's status
      # from PIPESTATUS so a failed analysis is not reported as complete.
      ANALYSIS_STATUS=${PIPESTATUS[0]}

      if ((ANALYSIS_STATUS == 0)); then
        print_status "Analysis complete! Check: $HOME/analysis_output.log"
      else
        print_error "Analysis failed (exit code ${ANALYSIS_STATUS}). Check: $HOME/analysis_output.log"
      fi

    else
      print_error "Training ended unexpectedly!"

      # Up to ten error-looking lines from the last 50 lines of the log.
      ERRORS=$(tail -50 "$LOG_FILE" | grep -E "(Error|Exception|Traceback)" | head -10)

      # The message text below is emitted verbatim and must stay at column 0.
      send_notification "❌ Training Failed or Interrupted" \
        "Training Duration: ${HOURS}h ${MINUTES}m
Last Progress: ${LAST_PROGRESS}%

Possible Errors:
$ERRORS

Check full log: $LOG_FILE
"
    fi

    break
  fi
done

print_status "Monitor finished. Check notification file: $HOME/training_notification.txt"