Commit: Remove launch_true_1b.sh — cleanup for open-source launch.
File launch_true_1b.sh: DELETED (+0 −59). The removed script follows:

@@ -1,59 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
#
# Launch TRUE 1.21B Parameter BitTransformerLM Training
# =====================================================
#
# PROPER FSDP sharding across 4 GPUs + inference testing.
#
# Required environment:
#   HF_TOKEN  - Hugging Face access token. Must be set by the caller;
#               it is never hardcoded in this script.

# Strict mode: exit on error, on unset variables, and on any failed
# stage of a pipeline (the original used bare `set -e`).
set -euo pipefail

echo "TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
echo "================================================="
echo "PROPER FSDP SHARDING (not duplication!)"
echo "Based on proven 680M success"
echo "Full training + inference testing"
echo ""

# Optimal environment setup
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=12

# Fail fast with a clear message if the token is missing, instead of
# exporting a "your-token-here" placeholder that would only fail later
# inside the Hugging Face client.
: "${HF_TOKEN:?HF_TOKEN must be set in the environment}"
export HF_TOKEN

# Explicit diagnostic if the checkout is not where we expect it.
cd /data/BitTransformerLM/BitTransformerLM || {
  echo "ERROR: /data/BitTransformerLM/BitTransformerLM not found" >&2
  exit 1
}

echo "Hardware Check:"
# Quoted here-doc: no shell expansion inside the Python program, so the
# f-strings and braces survive intact (the original embedded a
# multi-line program in a double-quoted `python -c "…"`).
python - <<'PY'
import torch
print(f'CUDA Available: {torch.cuda.is_available()}')
print(f'GPU Count: {torch.cuda.device_count()}')
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f'  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
total = sum(torch.cuda.get_device_properties(i).total_memory
            for i in range(torch.cuda.device_count()))
print(f'Total VRAM: {total / 1024**3:.1f}GB')
PY

echo ""
echo "TRUE 1.21B CONFIGURATION:"
echo "  Parameters: 1,210,000,000+ (1.21B)"
echo "  Architecture: d_model=2048, layers=24, heads=32"
echo "  Memory Strategy: FSDP Full Sharding across 4 GPUs"
echo "  Sequence Length: 512 (optimized from 680M success)"
echo "  Mixed Precision: FP16"
echo "  Safety Telemetry: K, C, S metrics enabled"
echo "  All Optimizations: Reversible + Checkpointing + Chunked Attention"
echo ""

echo "Starting TRUE 1.21B parameter training..."
echo "  This WILL work - we've proven the capability!"
echo ""

# Launch training (set -e aborts the script if training fails, so the
# completion banner below only prints on success).
python true_1b_training.py

echo ""
echo "TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
echo "Check /data/true_1b_results.json for full results"
echo "Model checkpoint saved for inference"
echo "Inference testing completed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|