#!/bin/bash
#
# BitTransformerLM massive-scale training launcher.
# Trains a 680M-parameter reversible transformer with safety telemetry on
# WikiText-103 plus real corpus data, distributed across 4x NVIDIA L4 GPUs.

set -e  # abort the launcher on the first failing command

| echo "π BITTRANSFORMERLM MASSIVE SCALE TRAINING LAUNCHER" |
| echo "==================================================" |
| echo "Target: 680 MILLION parameters" |
| echo "Hardware: 4x NVIDIA L4 GPUs (23GB each)" |
| echo "Dataset: WikiText-103 + Real Corpus Data" |
| echo "Architecture: Reversible Transformer with Safety Telemetry" |
| echo "" |
|
|
# Expose all four GPUs and tune the CUDA allocator and NCCL for 4-way
# data-parallel training
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
export NCCL_DEBUG=INFO
export NCCL_TREE_THRESHOLD=0

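# Quick visibility check (a sketch, not part of the original launcher):
# nvidia-smi should list the four L4 devices exposed above; '|| true' keeps a
# missing binary from aborting the script under 'set -e'.
nvidia-smi --query-gpu=index,name,memory.total --format=csv || true
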
# Hugging Face access token for dataset downloads (override via environment)
export HF_TOKEN="${HF_TOKEN:-your-token-here}"

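# Early warning (a sketch): if the placeholder token above was never
# overridden, WikiText-103 downloads from the Hugging Face Hub may fail.
if [ "$HF_TOKEN" = "your-token-here" ]; then
    echo "WARNING: HF_TOKEN is still the placeholder; Hub downloads may fail."
fi
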
# Work from the repository checkout
cd /data/BitTransformerLM/BitTransformerLM

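# Fail-fast check (a sketch, assuming the entry point lives in this directory
# as the torchrun call below expects):
if [ ! -f massive_scale_training.py ]; then
    echo "ERROR: massive_scale_training.py not found in $(pwd)" >&2
    exit 1
fi
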
# Checkpoint output directory
mkdir -p /data/checkpoints

echo "Checking GPU availability..."
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    print(f'  GPU {i}: {torch.cuda.get_device_name(i)} ({torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB)')
"

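# Hard requirement (a sketch): the torchrun launch below starts exactly four
# worker processes, one per GPU, so abort early if fewer devices are visible.
GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")
if [ "$GPU_COUNT" -lt 4 ]; then
    echo "ERROR: expected 4 visible GPUs, found $GPU_COUNT" >&2
    exit 1
fi
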
| echo "" |
| echo "π Model Configuration Preview:" |
| echo " β’ Parameters: 679,630,848 (680M)" |
| echo " β’ d_model: 1536" |
| echo " β’ Layers: 24 (reversible)" |
| echo " β’ Attention Heads: 24" |
| echo " β’ Feed Forward: 6144" |
| echo " β’ Sequence Length: 2048" |
| echo " β’ Batch Size: 4 per GPU (16 total)" |
| echo " β’ Gradient Accumulation: 32 steps" |
| echo " β’ Effective Batch Size: 512" |
| echo "" |
|
|
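# Back-of-envelope parameter check (a sketch): 24 layers x (4*d^2 attention +
# 2*d*d_ff feed-forward) with d=1536, d_ff=6144 gives ~679.5M core weights;
# embeddings, norms, and the reversible blocks' bookkeeping make up the
# remainder of the 679,630,848 figure above.
python -c "d, d_ff, L = 1536, 6144, 24; print(f'{L * (4*d*d + 2*d*d_ff):,} core parameters')"
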
| echo "π― Starting distributed training..." |
| echo " Use Ctrl+C to stop training safely" |
| echo "" |
|
|
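# Interrupt handling (a sketch): torchrun propagates termination signals to
# its workers, so Ctrl+C reaches the trainer; this trap only logs the event
# and assumes massive_scale_training.py checkpoints before exiting.
trap 'echo ""; echo "Interrupt received -- waiting for the trainer to shut down..."' INT
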
# Launch one training process per GPU on this single node
torchrun \
    --nproc_per_node=4 \
    --master_port=29500 \
    --nnodes=1 \
    --node_rank=0 \
    massive_scale_training.py \
    --world-size 4 \
    --port 29500

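# Post-run convenience (a sketch, assuming the trainer writes its checkpoints
# under /data/checkpoints): surface the newest files for quick verification.
ls -lt /data/checkpoints 2>/dev/null | head -5 || true
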
| echo "" |
| echo "π Training completed!" |
| echo "Check /data/checkpoints/ for saved models" |
| echo "Check /data/massive_scale_training.log for detailed logs" |