Spaces:

Bailan-Alex
/

Adaptive-Block-Forcing

No application file

App Files Files Community

Adaptive-Block-Forcing / train.sh

Bailan-Alex

Upload folder using huggingface_hub

4f2b2f4 verified 4 months ago

raw

history blame contribute delete

1.92 kB

	#!/bin/bash
	#
	# Launches Adaptive Block Forcing training through `accelerate launch`.
	# Before launching, it prints host and Python diagnostics so the log records
	# where the run started and with which interpreter.
	# The paths used further down (config/..., train.py) are relative, so run this
	# script from the directory that contains them.

	# Exit on any error
	set -e

	# Prints the version reported by the `python` found on PATH, or the
	# placeholder 'Python not found' when the interpreter is missing or fails.
	# stderr is discarded so a "command not found" message never reaches the log.
	python_version_or_fallback() {
	  python --version 2>/dev/null || echo 'Python not found'
	}

	# Print script start
	echo "=========================================="
	echo "Starting Adaptive Block Forcing Training"
	echo "=========================================="

	# Print system information
	echo "System Information:"
	echo " Hostname: $(hostname)"
	echo " Date: $(date)"
	echo " User: $(whoami)"
	echo " Working Directory: $(pwd)"
	echo " Python Version: $(python_version_or_fallback)"
	echo ""

	# Activate micromamba environment
	# echo "Activating micromamba environment 'abf'..."
	# eval "$(micromamba shell hook --shell bash)"
	# micromamba activate abf

	# Print environment information: the active conda/micromamba environment, the
	# interpreter in use, and what PyTorch reports about CUDA.

	# Prints the result of torch.cuda.is_available(), or the placeholder
	# 'PyTorch not available' when python or the torch import fails.
	torch_cuda_available_or_fallback() {
	  python -c 'import torch; print(torch.cuda.is_available())' 2>/dev/null \
	    || echo 'PyTorch not available'
	}

	echo "Environment Information:"
	# CONDA_DEFAULT_ENV may be unset (the activation step above is commented out);
	# default it to empty so this line prints the same text either way.
	echo " Active Environment: ${CONDA_DEFAULT_ENV:-}"
	# `command -v` is a shell builtin, so the lookup does not depend on the
	# external `which` tool being installed.
	echo " Python Path: $(command -v python)"
	echo " Python Version: $(python --version)"
	echo " CUDA Available: $(torch_cuda_available_or_fallback)"
	# The remaining details are only printed when torch can actually be imported.
	if python -c 'import torch' 2>/dev/null; then
	  echo " CUDA Version: $(python -c 'import torch; print(torch.version.cuda)')"
	  echo " GPU Count: $(python -c 'import torch; print(torch.cuda.device_count())')"
	fi
	echo ""

	# Set environment variables
	# Expose only GPU 0 to CUDA; this matches the single process launched below.
	export CUDA_VISIBLE_DEVICES=0
	# NOTE(review): presumably read by train.py to toggle a debugpy session; confirm.
	export DEBUGPY=0
	# export CUDA_LAUNCH_BLOCKING=1

	# Launch parameters, defined once so the summary printed below cannot drift
	# from the command that actually runs.
	readonly TRAIN_CONFIG="config/llada.yaml"
	readonly ACCELERATE_CONFIG="config/acc_config"
	readonly NUM_PROCESSES=1
	readonly MAIN_PROCESS_PORT=29577

	echo "Starting training with the following configuration:"
	echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
	echo " DEBUGPY: $DEBUGPY"
	echo " Config File: $TRAIN_CONFIG"
	echo " Accelerate Config: $ACCELERATE_CONFIG"
	echo " Number of Processes: $NUM_PROCESSES"
	echo " Main Process Port: $MAIN_PROCESS_PORT"
	echo ""

	# Launch training
	# The command is held in an array so each argument stays a single word.
	launch_cmd=(
	  accelerate launch
	  --config_file "$ACCELERATE_CONFIG"
	  --num_processes "$NUM_PROCESSES"
	  --main_process_port "$MAIN_PROCESS_PORT"
	  train.py --config "$TRAIN_CONFIG"
	)
	echo "Launching training..."
	"${launch_cmd[@]}"

	# With `set -e` in effect (enabled at the top of the script), this banner is
	# reached only when the launch exited successfully.
	echo ""
	echo "=========================================="
	echo "Training completed!"
	echo "=========================================="