WCNegentropy committed on
Commit
984b78b
·
verified ·
1 Parent(s): 58b962e

Remove working_1b_demo.py - cleanup for OS launch

Browse files
Files changed (1) hide show
  1. working_1b_demo.py +0 -357
working_1b_demo.py DELETED
@@ -1,357 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- BitTransformerLM Working 1B Parameter Demo
4
- ==========================================
5
-
6
- GUARANTEED TO WORK: Simplified 1B parameter model with complete training + inference demo.
7
- Based on all our proven optimizations from the 680M success.
8
- """
9
-
10
- import torch
11
- import torch.nn.functional as F
12
- import torch.nn as nn
13
- import logging
14
- import time
15
- import json
16
- from datetime import datetime
17
-
18
- from bit_transformer.model import BitTransformerLM
19
- from bit_transformer.bit_io import text_to_bits, bits_to_text
20
- from bit_transformer.utils import set_dropout
21
-
22
- logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
23
- logger = logging.getLogger(__name__)
24
-
25
-
26
- def main():
27
- """Working 1B parameter BitTransformerLM demo with training + inference."""
28
-
29
- logger.info("🎯 WORKING 1B PARAMETER BITTRANSFORMERLM DEMO")
30
- logger.info("=" * 55)
31
- logger.info("✅ Guaranteed to work based on proven optimizations")
32
- logger.info("🚀 Full training + inference demonstration")
33
-
34
- # Working 1B configuration (simplified from 1.21B)
35
- config = {
36
- "d_model": 1792, # Optimized size
37
- "nhead": 28, # Divisible by d_model
38
- "num_layers": 20, # Slightly fewer layers
39
- "dim_feedforward": 7168, # 4x d_model
40
- "max_seq_len": 256, # Conservative sequence length
41
- "lambda_K": 0.05, # Minimal telemetry impact
42
- "lambda_C": 0.05,
43
- "lambda_S": 0.05,
44
- "reversible": True, # All optimizations ON
45
- "use_checkpoint": True,
46
- "use_autocast": True,
47
- "chunk_size": 64, # Small chunks for memory
48
- "full_attn_logging": False,
49
- }
50
-
51
- logger.info("🏗️ Creating Working 1B Parameter Model...")
52
- for k, v in config.items():
53
- logger.info(f" {k}: {v}")
54
-
55
- # Create model
56
- model = BitTransformerLM(**config)
57
- params = sum(p.numel() for p in model.parameters())
58
- logger.info(f"✅ Model: {params:,} parameters ({params/1e9:.2f}B)")
59
-
60
- # Move to GPU with DataParallel for multi-GPU
61
- if torch.cuda.device_count() > 1:
62
- logger.info(f"🚀 Using {torch.cuda.device_count()} GPUs with DataParallel")
63
- model = model.cuda()
64
- model = nn.DataParallel(model, device_ids=[0]) # Only GPU 0 to avoid memory issues
65
- device = torch.device('cuda')
66
- else:
67
- model = model.cuda()
68
- device = torch.device('cuda')
69
-
70
- # Training setup
71
- logger.info("⚙️ Setting up training...")
72
- optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
73
- scaler = torch.amp.GradScaler('cuda')
74
-
75
- # Simple training data
76
- logger.info("📚 Creating training data...")
77
- training_texts = [
78
- "Hello world, this is a test of the BitTransformerLM model.",
79
- "The quick brown fox jumps over the lazy dog repeatedly.",
80
- "In the beginning was the word, and the word was data.",
81
- "Artificial intelligence is transforming our world today.",
82
- "Large language models are revolutionizing natural language processing.",
83
- ]
84
-
85
- # Convert to bit sequences
86
- training_data = []
87
- seq_len = config["max_seq_len"]
88
-
89
- for text in training_texts:
90
- bits = text_to_bits(text)
91
- if len(bits) > seq_len:
92
- bits = bits[:seq_len]
93
- elif len(bits) < seq_len:
94
- bits.extend([0] * (seq_len - len(bits)))
95
-
96
- input_bits = torch.tensor(bits[:-1], dtype=torch.long)
97
- target_bits = torch.tensor(bits[1:], dtype=torch.long)
98
- training_data.append((input_bits, target_bits))
99
-
100
- logger.info(f"✅ Training data: {len(training_data)} sequences of {seq_len-1} bits each")
101
-
102
- # Training loop
103
- logger.info("🎯 Starting Working 1B Parameter Training...")
104
- model.train()
105
- set_dropout(model, 0.1)
106
-
107
- start_time = time.time()
108
- training_results = []
109
-
110
- for epoch in range(5): # Short demo training
111
- epoch_loss = 0.0
112
-
113
- for step, (input_bits, target_bits) in enumerate(training_data):
114
- input_bits = input_bits.unsqueeze(0).to(device)
115
- target_bits = target_bits.unsqueeze(0).to(device)
116
-
117
- optimizer.zero_grad()
118
-
119
- # Forward pass
120
- with torch.amp.autocast('cuda'):
121
- outputs = model(input_bits)
122
-
123
- if isinstance(outputs, tuple):
124
- logits, telemetry = outputs
125
- else:
126
- logits = outputs
127
- telemetry = {}
128
-
129
- loss = F.cross_entropy(logits.view(-1, 2), target_bits.view(-1))
130
-
131
- # Backward pass
132
- scaler.scale(loss).backward()
133
- scaler.unscale_(optimizer)
134
- torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
135
- scaler.step(optimizer)
136
- scaler.update()
137
-
138
- epoch_loss += loss.item()
139
-
140
- # Memory monitoring
141
- if step == 0: # First step of each epoch
142
- memory_used = torch.cuda.memory_allocated(0) / (1024**3)
143
- logger.info(
144
- f"Epoch {epoch+1} | "
145
- f"Loss: {loss.item():.4f} | "
146
- f"K: {telemetry.get('negentropy', 0):.3f} | "
147
- f"C: {telemetry.get('lz_complexity', 0):.3f} | "
148
- f"S: {telemetry.get('symbiosis', 0):.3f} | "
149
- f"Mem: {memory_used:.1f}GB"
150
- )
151
-
152
- avg_loss = epoch_loss / len(training_data)
153
- # Safe telemetry conversion
154
- safe_telemetry = {}
155
- for k, v in telemetry.items():
156
- try:
157
- if isinstance(v, torch.Tensor):
158
- if v.numel() == 1:
159
- safe_telemetry[k] = float(v.item())
160
- else:
161
- safe_telemetry[k] = float(v.mean().item())
162
- else:
163
- safe_telemetry[k] = v
164
- except:
165
- safe_telemetry[k] = str(v)
166
-
167
- training_results.append({
168
- 'epoch': epoch + 1,
169
- 'average_loss': avg_loss,
170
- 'telemetry': safe_telemetry
171
- })
172
-
173
- logger.info(f"Epoch {epoch+1} completed - Avg Loss: {avg_loss:.4f}")
174
-
175
- training_time = time.time() - start_time
176
- logger.info(f"✅ Training completed in {training_time:.1f} seconds!")
177
-
178
- # Inference demonstration
179
- logger.info("🧪 Testing 1B Parameter Model Inference...")
180
- model.eval()
181
- set_dropout(model, 0.0)
182
-
183
- inference_results = []
184
- test_prompts = [
185
- "Hello",
186
- "The future of AI",
187
- "Once upon a time",
188
- "In a world where",
189
- "The answer is"
190
- ]
191
-
192
- with torch.no_grad():
193
- for i, prompt in enumerate(test_prompts):
194
- try:
195
- # Convert prompt to bits
196
- prompt_bits = text_to_bits(prompt)
197
- if len(prompt_bits) > config["max_seq_len"] - 50:
198
- prompt_bits = prompt_bits[:config["max_seq_len"] - 50]
199
-
200
- input_sequence = torch.tensor(prompt_bits, dtype=torch.long).unsqueeze(0).to(device)
201
-
202
- # Generate continuation
203
- generated_bits = prompt_bits.copy()
204
-
205
- for _ in range(30): # Generate 30 additional bits
206
- if len(generated_bits) >= config["max_seq_len"] - 1:
207
- break
208
-
209
- current_input = torch.tensor(generated_bits, dtype=torch.long).unsqueeze(0).to(device)
210
-
211
- with torch.amp.autocast('cuda'):
212
- outputs = model(current_input)
213
- if isinstance(outputs, tuple):
214
- logits, _ = outputs
215
- else:
216
- logits = outputs
217
-
218
- # Get next bit prediction
219
- next_logits = logits[0, -1, :]
220
- next_bit_prob = torch.softmax(next_logits, dim=-1)
221
- next_bit = torch.multinomial(next_bit_prob, 1).item() # Sample for variety
222
-
223
- generated_bits.append(next_bit)
224
-
225
- # Convert back to text
226
- try:
227
- generated_text = bits_to_text(generated_bits)
228
- # Clean up text for display
229
- generated_text = generated_text.replace('\x00', '').replace('\n', ' ').strip()
230
- if len(generated_text) > 100:
231
- generated_text = generated_text[:100] + "..."
232
- except:
233
- generated_text = f"[Generated {len(generated_bits)} bits]"
234
-
235
- result = {
236
- 'prompt': prompt,
237
- 'input_bits': len(prompt_bits),
238
- 'generated_bits': len(generated_bits),
239
- 'output': generated_text,
240
- 'success': True
241
- }
242
-
243
- inference_results.append(result)
244
- logger.info(f"Test {i+1}: '{prompt}' -> '{generated_text}'")
245
-
246
- except Exception as e:
247
- logger.error(f"Inference {i+1} failed: {e}")
248
- inference_results.append({
249
- 'prompt': prompt,
250
- 'error': str(e),
251
- 'success': False
252
- })
253
-
254
- # Save comprehensive results with proper serialization
255
- def make_serializable(obj):
256
- """Convert tensors to Python types for JSON serialization"""
257
- if isinstance(obj, torch.Tensor):
258
- if obj.numel() == 1:
259
- return float(obj.item())
260
- else:
261
- return float(obj.mean().item())
262
- elif isinstance(obj, dict):
263
- return {k: make_serializable(v) for k, v in obj.items()}
264
- elif isinstance(obj, list):
265
- return [make_serializable(v) for v in obj]
266
- else:
267
- return obj
268
-
269
- final_results = {
270
- 'timestamp': datetime.now().isoformat(),
271
- 'model_config': config,
272
- 'model_parameters': int(params),
273
- 'training_time_seconds': float(training_time),
274
- 'training_results': make_serializable(training_results),
275
- 'inference_results': make_serializable(inference_results),
276
- 'hardware_info': {
277
- 'gpu_count': torch.cuda.device_count(),
278
- 'gpu_names': [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
279
- 'peak_memory_gb': float(torch.cuda.max_memory_allocated(0) / (1024**3)),
280
- }
281
- }
282
-
283
- # Save results
284
- with open('/data/working_1b_results.json', 'w') as f:
285
- json.dump(final_results, f, indent=2)
286
-
287
- logger.info("🏆 WORKING 1B PARAMETER DEMO COMPLETED SUCCESSFULLY!")
288
- logger.info(f"📊 Model: {params:,} parameters ({params/1e9:.2f}B)")
289
- logger.info(f"⚡ Training: {training_time:.1f}s across {len(training_results)} epochs")
290
- logger.info(f"🧪 Inference: {len([r for r in inference_results if r.get('success', False)])} successful tests")
291
- logger.info(f"💾 Peak Memory: {final_results['hardware_info']['peak_memory_gb']:.1f}GB")
292
- logger.info("📋 Full results saved to /data/working_1b_results.json")
293
-
294
- # Interactive chat demo
295
- logger.info("\n🎉 BONUS: Interactive Chat with 1B Model!")
296
- logger.info("=" * 45)
297
-
298
- chat_history = []
299
- chat_prompts = ["Hi there", "What do you think about AI?", "Tell me something interesting"]
300
-
301
- for prompt in chat_prompts:
302
- logger.info(f"Human: {prompt}")
303
-
304
- try:
305
- # Simple generation (similar to above)
306
- prompt_bits = text_to_bits(prompt)[-50:] # Use last 50 bits as context
307
- input_tensor = torch.tensor(prompt_bits, dtype=torch.long).unsqueeze(0).to(device)
308
-
309
- generated = prompt_bits.copy()
310
- for _ in range(40): # Generate response
311
- if len(generated) >= config["max_seq_len"] - 1:
312
- break
313
-
314
- current = torch.tensor(generated[-100:], dtype=torch.long).unsqueeze(0).to(device) # Last 100 bits context
315
-
316
- with torch.amp.autocast('cuda'):
317
- outputs = model(current)
318
- if isinstance(outputs, tuple):
319
- logits, _ = outputs
320
- else:
321
- logits = outputs
322
-
323
- next_logits = logits[0, -1, :]
324
- next_bit = torch.multinomial(torch.softmax(next_logits, dim=-1), 1).item()
325
- generated.append(next_bit)
326
-
327
- # Convert to text
328
- response_bits = generated[len(prompt_bits):] # Only the generated part
329
- try:
330
- response = bits_to_text(response_bits).replace('\x00', '').strip()[:50]
331
- if not response:
332
- response = "[Generated binary response]"
333
- except:
334
- response = f"[Generated {len(response_bits)} bit response]"
335
-
336
- logger.info(f"BitTransformerLM: {response}")
337
- chat_history.append({'human': prompt, 'model': response})
338
-
339
- except Exception as e:
340
- logger.info(f"BitTransformerLM: [Error: {e}]")
341
- chat_history.append({'human': prompt, 'model': f"Error: {e}"})
342
-
343
- logger.info("\n🎊 MISSION ACCOMPLISHED!")
344
- logger.info("✅ 1B+ parameter BitTransformerLM: PROVEN TO WORK!")
345
- logger.info("✅ Training: SUCCESSFUL")
346
- logger.info("✅ Inference: FUNCTIONAL")
347
- logger.info("✅ Interactive Chat: DEMONSTRATED")
348
-
349
- return True
350
-
351
-
352
if __name__ == "__main__":
    # Run the demo and report the outcome on stdout.
    message = (
        "\n🏆 COMPLETE SUCCESS: 1B+ Parameter BitTransformerLM fully functional!"
        if main()
        else "\n❌ Demo failed - but we've already proven capability!"
    )
    print(message)