WCNegentropy committed on
Commit
6e6ed3d
·
verified ·
1 Parent(s): abf885c

Remove launch_optimized.sh - cleanup for OS launch

Browse files
Files changed (1) hide show
  1. launch_optimized.sh +0 -74
launch_optimized.sh DELETED
@@ -1,74 +0,0 @@
1
- #!/bin/bash
2
- #
3
- # BitTransformerLM OPTIMIZED Massive Scale Training Launcher
4
- # ==========================================================
5
- #
6
- # Launches 680M parameter BitTransformerLM with ALL optimizations enabled!
7
- # Uses DataParallel for reliable multi-GPU training.
8
- #
9
-
10
- set -e # Exit on any error
11
-
12
- echo "🚀 BITTRANSFORMERLM OPTIMIZED MASSIVE SCALE TRAINING"
13
- echo "====================================================="
14
- echo "Target: 680 MILLION parameters (CONFIRMED!)"
15
- echo "Hardware: Multi-GPU with DataParallel"
16
- echo "Dataset: WikiText-103 with bit-level encoding"
17
- echo "Optimizations: ALL ENABLED!"
18
- echo ""
19
-
20
- # Set environment variables for optimal performance
21
- export CUDA_VISIBLE_DEVICES=0,1,2,3
22
- export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
23
- export OMP_NUM_THREADS=12
24
-
25
- # Set HuggingFace token
26
- export HF_TOKEN="${HF_TOKEN:-your-token-here}"
27
-
28
- # Change to BitTransformerLM directory
29
- cd /data/BitTransformerLM/BitTransformerLM
30
-
31
- # Create checkpoint directory
32
- mkdir -p /data/checkpoints
33
-
34
- echo "πŸ” Hardware Check:"
35
- python -c "
36
- import torch
37
- print(f'CUDA Available: {torch.cuda.is_available()}')
38
- print(f'GPU Count: {torch.cuda.device_count()}')
39
- for i in range(torch.cuda.device_count()):
40
- props = torch.cuda.get_device_properties(i)
41
- print(f' GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f}GB)')
42
- "
43
-
44
- echo ""
45
- echo "βš™οΈ OPTIMIZATIONS ENABLED:"
46
- echo " βœ… Reversible Layers (50% memory savings)"
47
- echo " βœ… Gradient Checkpointing"
48
- echo " βœ… Mixed Precision (FP16)"
49
- echo " βœ… Memory-Mapped Dataset Loading"
50
- echo " βœ… Safety Telemetry (K, C, S metrics)"
51
- echo " βœ… Bit-Native Processing"
52
- echo " βœ… DataParallel Multi-GPU"
53
- echo ""
54
-
55
- echo "📊 Training Configuration:"
56
- echo "  • Parameters: 679,962,626 (680M)"
57
- echo "  • Architecture: d_model=1536, layers=24, heads=24"
58
- echo "  • Batch Size: 2 per GPU"
59
- echo "  • Gradient Accumulation: 16 steps"
60
- echo "  • Effective Batch Size: 128"
61
- echo "  • Learning Rate: 3e-4 with OneCycle"
62
- echo "  • Dataset: WikiText-103 (2000 training samples)"
63
- echo ""
64
-
65
- echo "🎯 Starting optimized training..."
66
- echo " This version should train successfully!"
67
- echo ""
68
-
69
- # Launch optimized training
70
- python massive_scale_simple.py
71
-
72
- echo ""
73
- echo "🏁 Training completed successfully!"
74
- echo "Check /data/checkpoints/ for saved models"