Bailan-Alex's picture
Upload folder using huggingface_hub
4f2b2f4 verified
#!/bin/bash
# Stop at the first command that exits with a non-zero status.
set -e

# Write one line of text to stdout, followed by a newline.
say() {
  printf '%s\n' "$1"
}

# Opening banner.
banner_rule="=========================================="
say "$banner_rule"
say "Starting Adaptive Block Forcing Training"
say "$banner_rule"

# Where, when and as whom the script is running.
say "System Information:"
say " Hostname: $(hostname)"
say " Date: $(date)"
say " User: $(whoami)"
say " Working Directory: $(pwd)"
# If `python --version` fails, report a fixed message instead of an error.
say " Python Version: $(python --version 2>/dev/null || echo 'Python not found')"
say ""
# Activate micromamba environment
# echo "Activating micromamba environment 'abf'..."
# eval "$(micromamba shell hook --shell bash)"
# micromamba activate abf
# Print environment information
#
# Reports the active environment name, the `python` that will be used, its
# version, and what PyTorch says about CUDA.
#
# The three torch queries run in ONE interpreter: importing torch is slow, and
# asking each question in its own `python -c` repeats that import every time.
#
# Globals:  CONDA_DEFAULT_ENV (read; printed as empty when unset)
# Outputs:  the report on stdout, terminated by a blank line
# Returns:  0; a missing python or torch is reported in the text, not as an error
print_environment_info() {
  local torch_report

  echo "Environment Information:"
  echo " Active Environment: ${CONDA_DEFAULT_ENV:-}"
  # `command -v` is a shell builtin, so this works even where `which` is absent.
  echo " Python Path: $(command -v python)"
  echo " Python Version: $(python --version)"

  # stderr is discarded so a missing torch shows the fallback line rather than
  # a traceback. A non-zero exit means torch could not be imported or queried.
  if torch_report=$(python -c 'import torch
print(" CUDA Available: %s" % torch.cuda.is_available())
print(" CUDA Version: %s" % torch.version.cuda)
print(" GPU Count: %s" % torch.cuda.device_count())' 2>/dev/null); then
    printf '%s\n' "$torch_report"
  else
    echo " CUDA Available: PyTorch not available"
  fi
  echo ""
}

print_environment_info
# Print the run configuration, launch training through `accelerate launch`,
# and print a completion banner once it succeeds.
#
# Every setting is defined once and feeds both the printed summary and the
# real command line, so the log cannot disagree with what was executed.
# The defaults reproduce the original fixed invocation.
#
# Environment (all optional):
#   TRAIN_CONFIG       value for train.py --config         (default: config/llada.yaml)
#   ACCELERATE_CONFIG  value for --config_file             (default: config/acc_config)
#   NUM_PROCESSES      value for --num_processes           (default: 1)
#   MAIN_PROCESS_PORT  value for --main_process_port       (default: 29577)
# Globals written: CUDA_VISIBLE_DEVICES, DEBUGPY (both exported)
# Outputs:  configuration summary and banners on stdout
# Returns:  the launch command's exit status if it fails (banner is skipped),
#           otherwise 0
run_training() {
  local train_config=${TRAIN_CONFIG:-config/llada.yaml}
  local accelerate_config=${ACCELERATE_CONFIG:-config/acc_config}
  local num_processes=${NUM_PROCESSES:-1}
  local main_process_port=${MAIN_PROCESS_PORT:-29577}

  # Set environment variables
  # Only CUDA device 0 is made visible; widen this if NUM_PROCESSES is raised.
  export CUDA_VISIBLE_DEVICES=0
  # Exported for child processes; its consumer is not part of this script.
  export DEBUGPY=0
  # export CUDA_LAUNCH_BLOCKING=1

  # Built as an array so values containing spaces stay single arguments.
  local -a launch_cmd=(
    accelerate launch
    --config_file "$accelerate_config"
    --num_processes "$num_processes"
    --main_process_port "$main_process_port"
    train.py --config "$train_config"
  )

  echo "Starting training with the following configuration:"
  echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
  echo " DEBUGPY: $DEBUGPY"
  echo " Config File: $train_config"
  echo " Accelerate Config: $accelerate_config"
  echo " Number of Processes: $num_processes"
  echo " Main Process Port: $main_process_port"
  echo ""

  # Launch training
  echo "Launching training..."
  # Propagate a failure explicitly so "Training completed!" is never printed
  # after a failed run, whether or not errexit is active.
  "${launch_cmd[@]}" || return

  echo ""
  echo "=========================================="
  echo "Training completed!"
  echo "=========================================="
}

run_training