#!/bin/bash
#
# Launch Adaptive Block Forcing (ABF) training via `accelerate`.
#
# Prints system/environment diagnostics, exports GPU/debug settings, then
# runs train.py through `accelerate launch`.
#
# Overridable environment variables (defaults preserve original behavior):
#   CONFIG_FILE      training config           (default: config/llada.yaml)
#   ACC_CONFIG       accelerate config         (default: config/acc_config)
#   NUM_PROCESSES    worker process count      (default: 1)
#   MAIN_PORT        accelerate rendezvous port (default: 29577)
#   CUDA_VISIBLE_DEVICES, DEBUGPY              (defaults: 0, 0)

# Fail fast: exit on error, on unset variables, and on pipeline failures.
set -euo pipefail

# Single source of truth for launch parameters — the banner below and the
# actual `accelerate launch` call both read these, so they cannot drift.
readonly CONFIG_FILE="${CONFIG_FILE:-config/llada.yaml}"
readonly ACC_CONFIG="${ACC_CONFIG:-config/acc_config}"
readonly NUM_PROCESSES="${NUM_PROCESSES:-1}"
readonly MAIN_PORT="${MAIN_PORT:-29577}"

echo "=========================================="
echo "Starting Adaptive Block Forcing Training"
echo "=========================================="

# System diagnostics (best-effort: python may be absent at this point).
echo "System Information:"
echo "  Hostname: $(hostname)"
echo "  Date: $(date)"
echo "  User: $(whoami)"
echo "  Working Directory: $(pwd)"
echo "  Python Version: $(python --version 2>/dev/null || echo 'Python not found')"
echo ""

# Activate micromamba environment
# echo "Activating micromamba environment 'abf'..."
# eval "$(micromamba shell hook --shell bash)"
# micromamba activate abf

# Environment diagnostics. CONDA_DEFAULT_ENV may be unset (e.g. micromamba
# activation above is commented out), so default it to keep `set -u` happy.
echo "Environment Information:"
echo "  Active Environment: ${CONDA_DEFAULT_ENV:-}"
echo "  Python Path: $(command -v python || echo 'python not on PATH')"
echo "  Python Version: $(python --version)"
echo "  CUDA Available: $(python -c 'import torch; print(torch.cuda.is_available())' 2>/dev/null || echo 'PyTorch not available')"
if python -c 'import torch' 2>/dev/null; then
  echo "  CUDA Version: $(python -c 'import torch; print(torch.version.cuda)')"
  echo "  GPU Count: $(python -c 'import torch; print(torch.cuda.device_count())')"
fi
echo ""

# GPU/debug settings — overridable from the caller's environment.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
export DEBUGPY="${DEBUGPY:-0}"
# export CUDA_LAUNCH_BLOCKING=1

echo "Starting training with the following configuration:"
echo "  CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "  DEBUGPY: $DEBUGPY"
echo "  Config File: $CONFIG_FILE"
echo "  Accelerate Config: $ACC_CONFIG"
echo "  Number of Processes: $NUM_PROCESSES"
echo "  Main Process Port: $MAIN_PORT"
echo ""

# Launch training. `set -e` aborts the script if accelerate exits non-zero,
# so the completion banner only prints on success.
echo "Launching training..."
accelerate launch \
  --config_file "$ACC_CONFIG" \
  --num_processes "$NUM_PROCESSES" \
  --main_process_port "$MAIN_PORT" \
  train.py --config "$CONFIG_FILE"

echo ""
echo "=========================================="
echo "Training completed!"
echo "=========================================="