#!/usr/bin/env bash
#
# Launch TRUE 1.21B-parameter BitTransformerLM training across 4 GPUs
# using FSDP full sharding (parameters sharded, not replicated).
#
# Steps:
#   1. Configure GPU/runtime environment variables.
#   2. Run a CUDA hardware sanity check (GPU count, per-GPU and total VRAM).
#   3. Print the run configuration, then launch true_1b_training.py.
#
# Environment:
#   HF_TOKEN - Hugging Face access token; must be provided by the caller.
#              Never hardcode credentials here.
#
# NOTE(review): the original file was table/pipe-mangled with mojibake emoji
# in its echo strings; user-facing text has been restored as plain ASCII.

set -euo pipefail

readonly REPO_DIR="/data/BitTransformerLM/BitTransformerLM"
readonly TRAIN_SCRIPT="true_1b_training.py"

echo "TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "PROPER FSDP SHARDING (not duplication!)"
echo "Based on proven 680M success"
echo "Full training + inference testing"
echo ""

# GPU / runtime environment.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# expandable_segments reduces fragmentation in the CUDA caching allocator.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12

# Require the token from the environment instead of exporting a placeholder
# (the old "your-token-here" default would be sent as a real credential).
if [[ -n "${HF_TOKEN:-}" ]]; then
  export HF_TOKEN
else
  echo "WARNING: HF_TOKEN is not set; gated model/dataset downloads may fail." >&2
fi

cd "$REPO_DIR" || { echo "ERROR: cannot cd to $REPO_DIR" >&2; exit 1; }

echo "Hardware Check:"
# Quoted heredoc: no shell expansion inside the Python snippet.
python - <<'PY'
import torch

print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU Count: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)")
total = sum(
    torch.cuda.get_device_properties(i).total_memory
    for i in range(torch.cuda.device_count())
)
print(f"Total VRAM: {total / 1024**3:.1f}GB")
PY

echo ""
echo "TRUE 1.21B CONFIGURATION:"
echo "  Parameters: 1,210,000,000+ (1.21B)"
echo "  Architecture: d_model=2048, layers=24, heads=32"
echo "  Memory Strategy: FSDP Full Sharding across 4 GPUs"
echo "  Sequence Length: 512 (optimized from 680M success)"
echo "  Mixed Precision: FP16"
echo "  Safety Telemetry: K, C, S metrics enabled"
echo "  All Optimizations: Reversible + Checkpointing + Chunked Attention"
echo ""

echo "Starting TRUE 1.21B parameter training..."
echo "  This WILL work - we've proven the capability!"
echo ""

# set -e aborts the script if training fails, so the success banner below
# is only reached on a clean exit.
python "$TRAIN_SCRIPT"

echo ""
echo "TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "Check /data/true_1b_results.json for full results"
echo "Model checkpoint saved for inference"
echo "Inference testing completed"