@echo off
REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM ======================================================================

REM Set the number of GPUs to use (defaults to 4 if no argument is given)
set NUM_GPUS=%1
if "%NUM_GPUS%"=="" set NUM_GPUS=4
echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...

REM Check if Python is available
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)
REM Check if PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch not properly installed. Please install with:
    echo     pip install torch^>=2.0.0
    exit /b 1
)
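REM Additional check (an assumption, not in the original script): warn if no CUDA
REM devices are visible, since multi-GPU training will likely fail without them.
python -c "import torch; assert torch.cuda.is_available()" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo WARNING: torch.cuda.is_available returned False; multi-GPU training may fail.
)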
REM Check if torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)
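REM Note: the NCCL backend is not available on Windows; torch.distributed uses the
REM gloo backend there, so the training script must select its backend accordingly.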
echo Environment checks passed. Starting distributed training...
echo.

REM Launch the distributed training
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json
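REM Note: "python -m torch.distributed.run" is the module form of the "torchrun"
REM entry point (PyTorch ^>= 1.10), so the launch line above is equivalent to:
REM torchrun --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json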
REM Capture and check the exit status of the training run
set TRAIN_EXIT=%ERRORLEVEL%
if %TRAIN_EXIT% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %TRAIN_EXIT%
)
echo.
echo Training logs are available in the ./results directory.
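REM Return the captured training exit code to the caller (an addition; the
REM original script always exited with 0 here).
exit /b %TRAIN_EXIT%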