#!/bin/bash
# Distributed training launch script for Phi-4 training
# This script uses torchrun to launch multi-GPU training

# Number of GPUs to use (first positional argument; defaults to 4)
NUM_GPUS=${1:-4}
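# A default derived from the machine itself would better match the idea of
# "all available" GPUs -- a minimal sketch, assuming nvidia-smi is on PATH:
#   NUM_GPUS=${1:-$(nvidia-smi --list-gpus | wc -l)}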
# Check that torchrun is available before attempting to launch
if ! command -v torchrun &> /dev/null; then
    echo "torchrun command not found. Make sure PyTorch is installed properly."
    echo "Try: pip install 'torch>=2.0.0'"
    exit 1
fi
| echo "Launching distributed training with $NUM_GPUS GPUs..." | |
| # Launch the distributed training | |
| torchrun --nproc_per_node=$NUM_GPUS \ | |
| --master_port=29500 \ | |
| run_transformers_training.py \ | |
| --config transformers_config.json | |
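# The same entry point extends to multi-node jobs via torchrun's rendezvous
# flags -- an illustrative sketch only, shown for the first of two nodes
# (MASTER_ADDR is a placeholder, not part of this repo's config):
#   torchrun --nnodes=2 --node_rank=0 \
#       --master_addr=$MASTER_ADDR --master_port=29500 \
#       --nproc_per_node=$NUM_GPUS \
#       run_transformers_training.py --config transformers_config.json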
# Check exit status (capture $? immediately, before another command overwrites it)
EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
    echo "Distributed training completed successfully!"
else
    echo "Distributed training failed with exit code $EXIT_CODE"
fi
exit $EXIT_CODE
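# Usage -- the file name below is an assumption; adjust it to however this
# script is saved in the repo:
#   bash launch_distributed.sh      # default: 4 GPUs
#   bash launch_distributed.sh 8    # override the GPU count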