WCNegentropy committed on
Commit
236faea
·
verified ·
1 Parent(s): 6e6ed3d

Remove launch_true_1b.sh - cleanup for OS launch

Browse files
Files changed (1) hide show
  1. launch_true_1b.sh +0 -59
launch_true_1b.sh DELETED
@@ -1,59 +0,0 @@
1
- #!/bin/bash
2
- #
3
- # Launch TRUE 1.21B Parameter BitTransformerLM Training
4
- # ====================================================
5
- #
6
- # PROPER FSDP sharding across 4 GPUs + inference testing!
7
- #
8
-
9
- set -e
10
-
11
- echo "🔥 TRUE 1.21B PARAMETER BITTRANSFORMERLM TRAINING"
12
- echo "================================================="
13
- echo "🎯 PROPER FSDP SHARDING (not duplication!)"
14
- echo "✅ Based on proven 680M success"
15
- echo "🚀 Full training + inference testing"
16
- echo ""
17
-
18
- # Optimal environment setup
19
- export CUDA_VISIBLE_DEVICES=0,1,2,3
20
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
21
- export OMP_NUM_THREADS=12
22
- export HF_TOKEN="${HF_TOKEN:-your-token-here}"
23
-
24
- cd /data/BitTransformerLM/BitTransformerLM
25
-
26
- echo "🔍 Hardware Check:"
27
- python -c "
28
- import torch
29
- print(f'CUDA Available: {torch.cuda.is_available()}')
30
- print(f'GPU Count: {torch.cuda.device_count()}')
31
- for i in range(torch.cuda.device_count()):
32
- props = torch.cuda.get_device_properties(i)
33
- print(f' GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
34
- print(f'Total VRAM: {sum(torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())) / 1024**3:.1f}GB')
35
- "
36
-
37
- echo ""
38
- echo "⚙️ TRUE 1.21B CONFIGURATION:"
39
- echo " 🎯 Parameters: 1,210,000,000+ (1.21B)"
40
- echo " 📐 Architecture: d_model=2048, layers=24, heads=32"
41
- echo " 🧠 Memory Strategy: FSDP Full Sharding across 4 GPUs"
42
- echo " 🔄 Sequence Length: 512 (optimized from 680M success)"
43
- echo " ⚡ Mixed Precision: FP16"
44
- echo " 🛡️ Safety Telemetry: K, C, S metrics enabled"
45
- echo " 🔧 All Optimizations: Reversible + Checkpointing + Chunked Attention"
46
- echo ""
47
-
48
- echo "🚀 Starting TRUE 1.21B parameter training..."
49
- echo " This WILL work - we've proven the capability!"
50
- echo ""
51
-
52
- # Launch training
53
- python true_1b_training.py
54
-
55
- echo ""
56
- echo "🏆 TRUE 1.21B BITTRANSFORMERLM TRAINING COMPLETED!"
57
- echo "📊 Check /data/true_1b_results.json for full results"
58
- echo "💾 Model checkpoint saved for inference"
59
- echo "🧪 Inference testing completed"