#!/bin/bash
#
# BitTransformerLM optimized massive-scale training launcher.
# Expects: multi-GPU CUDA host, repo checkout under /data/BitTransformerLM,
# and massive_scale_simple.py at the repo root.

# Fail fast: abort on command errors, unset variables, and failures
# anywhere inside a pipeline (plain `set -e` misses the latter two).
set -euo pipefail
|
|
# Startup banner (garbled emoji removed; keep output plain ASCII).
echo "BITTRANSFORMERLM OPTIMIZED MASSIVE SCALE TRAINING"
echo "====================================================="
echo "Target: 680 MILLION parameters (CONFIRMED!)"
echo "Hardware: Multi-GPU with DataParallel"
echo "Dataset: WikiText-103 with bit-level encoding"
echo "Optimizations: ALL ENABLED!"
echo ""
|
|
| |
| export CUDA_VISIBLE_DEVICES=0,1,2,3 |
| export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True |
| export OMP_NUM_THREADS=12 |
|
|
| |
| export HF_TOKEN="${HF_TOKEN:-your-token-here}" |
|
|
| |
| cd /data/BitTransformerLM/BitTransformerLM |
|
|
| |
| mkdir -p /data/checkpoints |
|
|
# Report CUDA availability and per-GPU name/memory before training starts.
# NOTE: the loop body inside the embedded Python must stay indented;
# the flattened version would be a SyntaxError.
echo "Hardware Check:"
python -c "
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
"
|
|
# Feature summary. The original checkmark glyphs were mojibake that split
# each string literal across two physical lines; rebuilt as one echo per item.
echo ""
echo "OPTIMIZATIONS ENABLED:"
echo "  [x] Reversible Layers (50% memory savings)"
echo "  [x] Gradient Checkpointing"
echo "  [x] Mixed Precision (FP16)"
echo "  [x] Memory-Mapped Dataset Loading"
echo "  [x] Safety Telemetry (K, C, S metrics)"
echo "  [x] Bit-Native Processing"
echo "  [x] DataParallel Multi-GPU"
echo ""
|
|
# Run configuration summary (bullet mojibake replaced with ASCII dashes).
# Effective batch = 2/GPU x 4 GPUs x 16 accumulation steps = 128.
echo "Training Configuration:"
echo "  - Parameters: 679,962,626 (680M)"
echo "  - Architecture: d_model=1536, layers=24, heads=24"
echo "  - Batch Size: 2 per GPU"
echo "  - Gradient Accumulation: 16 steps"
echo "  - Effective Batch Size: 128"
echo "  - Learning Rate: 3e-4 with OneCycle"
echo "  - Dataset: WikiText-103 (2000 training samples)"
echo ""
|
|
echo "Starting optimized training..."
echo "  This version should train successfully!"
echo ""

# Run the training job and check its exit status explicitly, so the
# success banner cannot print after a failed run (and the script does
# not depend on strict-mode options set elsewhere).
if ! python massive_scale_simple.py; then
  echo "ERROR: training exited with a non-zero status" >&2
  exit 1
fi

echo ""
echo "Training completed successfully!"
echo "Check /data/checkpoints/ for saved models"