#!/usr/bin/env bash
#
# Free GPU memory left behind by earlier runs, then launch training.
#
# Usage: bash <this-script> [CONFIG]
#   CONFIG  config file handed to run_train.py via --cfg.
#           Default: train_config/ace_plus_fft_lora.yaml
#           If you have a memory-optimized config (for example
#           train_config/ace_plus_fft_lora_low_mem.yaml), pass it here.
#
# Environment:
#   PYTORCH_CUDA_ALLOC_CONF  used as-is when already set; otherwise
#                            defaults to expandable_segments:True.
#
# Must be run from the directory that contains run_train.py.

set -euo pipefail

readonly DEFAULT_CFG="train_config/ace_plus_fft_lora.yaml"
# Seconds to let processes react to SIGTERM before SIGKILL is sent.
readonly TERM_GRACE_SECONDS=3
# Seconds to wait after the reset attempt before reading GPU status.
readonly SETTLE_SECONDS=5

# Print a warning on stderr without aborting the script.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Stop running Python processes so they release their GPU memory.
# NOTE: the pattern "python" matches every process whose name contains it
# (python3, ipython, ...), not just earlier training runs. Do not launch
# this script from inside a Python process you want to keep alive.
stop_python_processes() {
  echo "Stopping any running Python processes..."
  # Ask politely first so processes can flush files and free resources;
  # pkill exits non-zero when nothing matched, which is not an error here.
  if pkill -TERM python; then
    sleep "${TERM_GRACE_SECONDS}"
    # Force-kill whatever ignored SIGTERM; no match left is the good case.
    pkill -KILL python || true
  fi
}

# Best-effort GPU reset followed by a status report.
reset_gpu() {
  echo "Clearing GPU cache..."
  if command -v nvidia-smi >/dev/null 2>&1; then
    # The reset normally needs root privileges and a GPU that no process
    # is using, so a failure is reported but does not stop the run.
    nvidia-smi --gpu-reset \
      || warn "GPU reset failed (usually needs root and an idle GPU); continuing"
  else
    warn "nvidia-smi not found; skipping GPU reset"
  fi

  # Wait a moment for cleanup.
  sleep "${SETTLE_SECONDS}"

  echo "Current GPU memory status:"
  if command -v nvidia-smi >/dev/null 2>&1; then
    nvidia-smi || warn "could not query GPU status"
  fi
}

# Export allocator settings and start the training run.
# Arguments: $1 - config path for run_train.py
run_training() {
  local -r cfg=$1

  # Let the caching allocator grow segments instead of fragmenting memory;
  # a value already exported by the caller takes precedence.
  export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
  # PYTORCH_NO_CUDA_MEMORY_CACHING is deliberately NOT set here: it turns
  # the caching allocator off (a debugging aid), which makes the setting
  # above ineffective and slows training considerably. Export it yourself
  # before calling this script if you need it for debugging.

  echo "Starting training..."
  python run_train.py --cfg "${cfg}"
}

main() {
  local -r cfg=${1:-${DEFAULT_CFG}}

  stop_python_processes
  reset_gpu
  run_training "${cfg}"
}

main "$@"