WCNegentropy committed on
Commit abf885c · verified · 1 Parent(s): fb197fa

Remove launch_massive_scale.sh - cleanup for OS launch

Files changed (1)
  1. launch_massive_scale.sh +0 -75
launch_massive_scale.sh DELETED
@@ -1,75 +0,0 @@
-#!/bin/bash
-#
-# BitTransformerLM Massive Scale Training Launcher
-# =================================================
-#
-# Launches 680M-parameter BitTransformerLM training across 4x NVIDIA L4 GPUs
-# with FSDP (Fully Sharded Data Parallel) for maximum efficiency.
-#
-
-set -e  # Exit on any error
-
-echo "🚀 BITTRANSFORMERLM MASSIVE SCALE TRAINING LAUNCHER"
-echo "=================================================="
-echo "Target: 680 MILLION parameters"
-echo "Hardware: 4x NVIDIA L4 GPUs (23GB each)"
-echo "Dataset: WikiText-103 + Real Corpus Data"
-echo "Architecture: Reversible Transformer with Safety Telemetry"
-echo ""
-
-# Set environment variables
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
-export NCCL_DEBUG=INFO
-export NCCL_TREE_THRESHOLD=0
-
-# Set HuggingFace token
-export HF_TOKEN="${HF_TOKEN:-your-token-here}"
-
-# Change to BitTransformerLM directory
-cd /data/BitTransformerLM/BitTransformerLM
-
-# Create checkpoint directory
-mkdir -p /data/checkpoints
-
-# Check GPU availability
-echo "🔍 Checking GPU availability..."
-python -c "
-import torch
-print(f'CUDA Available: {torch.cuda.is_available()}')
-print(f'GPU Count: {torch.cuda.device_count()}')
-for i in range(torch.cuda.device_count()):
-    print(f'  GPU {i}: {torch.cuda.get_device_name(i)} ({torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f}GB)')
-"
-
-echo ""
-echo "📊 Model Configuration Preview:"
-echo "  • Parameters: 679,630,848 (680M)"
-echo "  • d_model: 1536"
-echo "  • Layers: 24 (reversible)"
-echo "  • Attention Heads: 24"
-echo "  • Feed Forward: 6144"
-echo "  • Sequence Length: 2048"
-echo "  • Batch Size: 4 per GPU (16 total)"
-echo "  • Gradient Accumulation: 32 steps"
-echo "  • Effective Batch Size: 512"
-echo ""
-
-echo "🎯 Starting distributed training..."
-echo "   Use Ctrl+C to stop training safely"
-echo ""
-
-# Launch distributed training with torchrun
-torchrun \
-    --nproc_per_node=4 \
-    --master_port=29500 \
-    --nnodes=1 \
-    --node_rank=0 \
-    massive_scale_training.py \
-    --world-size 4 \
-    --port 29500
-
-echo ""
-echo "🏁 Training completed!"
-echo "Check /data/checkpoints/ for saved models"
-echo "Check /data/massive_scale_training.log for detailed logs"