WCNegentropy committed
Commit e260177 · verified · 1 Parent(s): bd02910

Remove single_gpu_massive.py - cleanup for OS launch

Files changed (1)
  1. single_gpu_massive.py +0 -149
single_gpu_massive.py DELETED
@@ -1,149 +0,0 @@
-#!/usr/bin/env python3
-"""
-BitTransformerLM Single GPU 680M Parameter Training
-===================================================
-
-PROOF OF CONCEPT: 680M parameter model on single GPU to validate everything works!
-"""
-
-import os
-import sys
-import time
-import logging
-from datetime import datetime
-
-import torch
-import torch.nn.functional as F
-from torch.utils.data import DataLoader
-from datasets import load_dataset
-
-from bit_transformer.model import BitTransformerLM
-from bit_transformer.bit_io import text_to_bits
-from bit_transformer.utils import set_dropout
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
-logger = logging.getLogger(__name__)
-
-
-def main():
-    """Single GPU 680M parameter training - PROOF IT WORKS!"""
-
-    logger.info("🚀 SINGLE GPU 680M PARAMETER BITTRANSFORMERLM PROOF OF CONCEPT!")
-    logger.info("=" * 70)
-
-    # Model configuration - SAME AS BEFORE
-    config = {
-        "d_model": 1536,
-        "nhead": 24,
-        "num_layers": 24,
-        "dim_feedforward": 6144,
-        "max_seq_len": 2048,
-        "lambda_K": 1.0,
-        "lambda_C": 1.0,
-        "lambda_S": 1.0,
-        "reversible": True,
-        "use_checkpoint": True,
-        "use_autocast": True,
-        "chunk_size": None,
-        "full_attn_logging": False,
-    }
-
-    # Create model
-    logger.info("🏗️ Creating 680M parameter model...")
-    model = BitTransformerLM(**config)
-    params = sum(p.numel() for p in model.parameters())
-    logger.info(f"✅ Model created: {params:,} parameters ({params/1e6:.1f}M)")
-
-    # Move to GPU
-    device = torch.device('cuda:0')
-    model = model.to(device)
-    logger.info(f"✅ Model moved to {device}")
-
-    # Simple dataset
-    logger.info("📚 Creating simple dataset...")
-
-    class SimpleDataset(torch.utils.data.Dataset):
-        def __init__(self, num_samples=100):
-            self.num_samples = num_samples
-            self.seq_len = 2048
-
-        def __len__(self):
-            return self.num_samples
-
-        def __getitem__(self, idx):
-            # Create simple alternating bit patterns
-            pattern = [0, 1, 1, 0] * (self.seq_len // 4)
-            if len(pattern) > self.seq_len:
-                pattern = pattern[:self.seq_len]
-            elif len(pattern) < self.seq_len:
-                pattern.extend([0] * (self.seq_len - len(pattern)))
-
-            input_bits = torch.tensor(pattern[:-1], dtype=torch.long)
-            target_bits = torch.tensor(pattern[1:], dtype=torch.long)
-
-            return input_bits, target_bits
-
-    dataset = SimpleDataset(100)
-    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
-    logger.info(f"✅ Dataset created: {len(dataset)} samples")
-
-    # Optimizer
-    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
-    scaler = torch.amp.GradScaler('cuda')
-
-    logger.info("🎯 Starting training...")
-    model.train()
-    set_dropout(model, 0.1)
-
-    start_time = time.time()
-
-    for step, (input_ids, labels) in enumerate(dataloader):
-        if step >= 50:  # Just prove it works for 50 steps
-            break
-
-        input_ids = input_ids.to(device)
-        labels = labels.to(device)
-
-        optimizer.zero_grad()
-
-        # Forward pass with mixed precision
-        with torch.amp.autocast('cuda'):
-            outputs = model(input_ids)
-
-            if isinstance(outputs, tuple):
-                logits, telemetry = outputs
-            else:
-                logits = outputs
-                telemetry = {}
-
-            loss = F.cross_entropy(logits.view(-1, 2), labels.view(-1))
-
-        # Backward pass
-        scaler.scale(loss).backward()
-        scaler.unscale_(optimizer)
-        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-        scaler.step(optimizer)
-        scaler.update()
-
-        if step % 10 == 0:
-            elapsed = time.time() - start_time
-            memory_used = torch.cuda.memory_allocated(0) / (1024**3)
-
-            logger.info(
-                f"Step {step:2d} | "
-                f"Loss: {loss.item():.4f} | "
-                f"K: {telemetry.get('negentropy', 0):.3f} | "
-                f"C: {telemetry.get('lz_complexity', 0):.3f} | "
-                f"S: {telemetry.get('symbiosis', 0):.3f} | "
-                f"Mem: {memory_used:.1f}GB | "
-                f"Time: {elapsed:.1f}s"
-            )
-            start_time = time.time()
-
-    logger.info("🏆 SUCCESS! 680M parameter BitTransformerLM trained successfully!")
-    logger.info("✅ Single GPU training PROVEN!")
-    logger.info("✅ Ready for proper multi-GPU scaling!")
-
-
-if __name__ == "__main__":
-    main()
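
For reference, a quick back-of-the-envelope check that the deleted config plausibly lands near the advertised 680M parameters. This is a minimal sketch assuming a standard transformer block layout (four d_model × d_model attention projections plus two feed-forward matrices per layer); BitTransformerLM's reversible blocks, embeddings, and telemetry heads will shift the exact count somewhat:

    # Rough parameter estimate for the config in the deleted script.
    # Assumes a vanilla transformer block, not BitTransformerLM's exact internals.
    d_model, num_layers, dim_feedforward = 1536, 24, 6144

    attn_params = 4 * d_model * d_model          # Q, K, V, and output projections
    ffn_params = 2 * d_model * dim_feedforward   # up- and down-projection matrices
    per_layer = attn_params + ffn_params         # ~28.3M per layer

    total = num_layers * per_layer               # 679,477,248 ≈ 679.5M
    print(f"~{total / 1e6:.1f}M parameters")

The result, roughly 679.5M, is consistent with the script's "680M parameter" claim before counting biases, norms, and embedding tables.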