File size: 1,921 Bytes
4f2b2f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/bash

# Launch Adaptive Block Forcing (ABF) training via HuggingFace Accelerate.
#
# Optional environment overrides (defaults preserve the original behavior):
#   CUDA_VISIBLE_DEVICES   GPU selection                 (default: 0)
#   DEBUGPY                debugpy toggle read by train  (default: 0)
#   TRAIN_CONFIG           training config file          (default: config/llada.yaml)
#   ACC_CONFIG             accelerate config file        (default: config/acc_config)
#   NUM_PROCESSES          accelerate process count      (default: 1)
#   MAIN_PROCESS_PORT      accelerate rendezvous port    (default: 29577)

# Strict mode: exit on error, on use of an unset variable, and on any
# failure inside a pipeline (not just the last stage).
set -euo pipefail

# Configurable parameters (previously hard-coded inline).
TRAIN_CONFIG=${TRAIN_CONFIG:-config/llada.yaml}
ACC_CONFIG=${ACC_CONFIG:-config/acc_config}
NUM_PROCESSES=${NUM_PROCESSES:-1}
MAIN_PROCESS_PORT=${MAIN_PROCESS_PORT:-29577}

# Print script start
echo "=========================================="
echo "Starting Adaptive Block Forcing Training"
echo "=========================================="

# Print system information (best-effort: python may not be on PATH yet).
echo "System Information:"
echo "  Hostname: $(hostname)"
echo "  Date: $(date)"
echo "  User: $(id -un)"
echo "  Working Directory: $(pwd)"
echo "  Python Version: $(python --version 2>/dev/null || echo 'Python not found')"
echo ""

# Activate micromamba environment
# echo "Activating micromamba environment 'abf'..."
# eval "$(micromamba shell hook --shell bash)"
# micromamba activate abf

# Print environment information. CONDA_DEFAULT_ENV may be unset (activation
# above is commented out), so expand with a default to survive `set -u`.
echo "Environment Information:"
echo "  Active Environment: ${CONDA_DEFAULT_ENV:-<none>}"
echo "  Python Path: $(command -v python || echo 'python not on PATH')"
echo "  Python Version: $(python --version 2>/dev/null || echo 'Python not found')"
echo "  CUDA Available: $(python -c 'import torch; print(torch.cuda.is_available())' 2>/dev/null || echo 'PyTorch not available')"
if python -c 'import torch' 2>/dev/null; then
    echo "  CUDA Version: $(python -c 'import torch; print(torch.version.cuda)')"
    echo "  GPU Count: $(python -c 'import torch; print(torch.cuda.device_count())')"
fi
echo ""

# Export runtime environment variables (overridable; original defaults kept).
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
export DEBUGPY=${DEBUGPY:-0}
# export CUDA_LAUNCH_BLOCKING=1   # uncomment for synchronous CUDA error reporting

echo "Starting training with the following configuration:"
echo "  CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
echo "  DEBUGPY: $DEBUGPY"
echo "  Config File: $TRAIN_CONFIG"
echo "  Accelerate Config: $ACC_CONFIG"
echo "  Number of Processes: $NUM_PROCESSES"
echo "  Main Process Port: $MAIN_PROCESS_PORT"
echo ""

# Launch training; under `set -e` a non-zero exit from accelerate aborts the
# script before the "completed" banner is printed.
echo "Launching training..."
accelerate launch \
  --config_file "$ACC_CONFIG" \
  --num_processes "$NUM_PROCESSES" \
  --main_process_port "$MAIN_PROCESS_PORT" \
  train.py --config "$TRAIN_CONFIG"

echo ""
echo "=========================================="
echo "Training completed!"
echo "=========================================="