#!/bin/bash
#
# BitTransformerLM optimized massive-scale training launcher.
#
# Configures GPUs, allocator behavior, and thread counts, then (further down
# in this script) runs massive_scale_simple.py on WikiText-103.
#
# Required env: HF_TOKEN — HuggingFace token for dataset/model downloads.
#               Falls back to a placeholder with a warning if unset.

# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

echo "BITTRANSFORMERLM OPTIMIZED MASSIVE SCALE TRAINING"
echo "====================================================="
echo "Target: 680 MILLION parameters (CONFIRMED!)"
echo "Hardware: Multi-GPU with DataParallel"
echo "Dataset: WikiText-103 with bit-level encoding"
echo "Optimizations: ALL ENABLED!"
echo ""

# GPU / CPU runtime tuning.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# expandable_segments reduces fragmentation for large, variable-size allocs.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12

# HuggingFace credentials. Keep the historical placeholder fallback for
# backward compatibility, but warn loudly since downloads will likely fail.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "WARNING: HF_TOKEN is not set; using placeholder token" >&2
fi
export HF_TOKEN="${HF_TOKEN:-your-token-here}"
|
# Work from the repository root; fail fast with a clear message if the
# expected layout is missing (a bare `cd` failure message is easy to miss).
cd /data/BitTransformerLM/BitTransformerLM || {
  echo "ERROR: repo directory /data/BitTransformerLM/BitTransformerLM not found" >&2
  exit 1
}

# Ensure the checkpoint output directory exists before training starts.
mkdir -p /data/checkpoints
|
# Report visible CUDA devices so the training log records the hardware used.
# A quoted heredoc ('PY') keeps the Python source literal — no shell expansion
# can corrupt the f-strings, unlike the previous double-quoted `python -c "…"`.
echo "Hardware Check:"
python - <<'PY'
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
PY
|
# Summarize the enabled optimizations and the run configuration.
# NOTE(review): the original glyphs here were mojibake (broken checkmarks and
# bullets split across lines); reconstructed with plain ASCII markers.
echo ""
echo "OPTIMIZATIONS ENABLED:"
echo "  [x] Reversible Layers (50% memory savings)"
echo "  [x] Gradient Checkpointing"
echo "  [x] Mixed Precision (FP16)"
echo "  [x] Memory-Mapped Dataset Loading"
echo "  [x] Safety Telemetry (K, C, S metrics)"
echo "  [x] Bit-Native Processing"
echo "  [x] DataParallel Multi-GPU"
echo ""

echo "Training Configuration:"
echo "  * Parameters: 679,962,626 (680M)"
echo "  * Architecture: d_model=1536, layers=24, heads=24"
echo "  * Batch Size: 2 per GPU"
echo "  * Gradient Accumulation: 16 steps"
echo "  * Effective Batch Size: 128"
echo "  * Learning Rate: 3e-4 with OneCycle"
echo "  * Dataset: WikiText-103 (2000 training samples)"
echo ""
| |
|
echo "Starting optimized training..."
echo "  This version should train successfully!"
echo ""

# Launch the training run. Under `set -e` (enabled at the top of this script)
# a non-zero exit here aborts the script, so the success message below is
# only printed when training actually completed.
python massive_scale_simple.py

echo ""
echo "Training completed successfully!"
echo "Check /data/checkpoints/ for saved models"