AdriBat1 committed on
Commit
6ec2818
·
1 Parent(s): 938275a

Add DeepSeek-Lite Protocol: 50M params, FineWeb-Edu, TikToken, BFloat16

Browse files
remote-gpu-client/examples/deepseek_lite.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DeepSeek-Lite Protocol: Production LLM Comparison
3
+ Baseline (GPT-2 style) vs mHC (DeepSeek-V3 style)
4
+ Dataset: FineWeb-Edu | Tokenizer: TikToken | ~50M params each
5
+ """
6
+ import sys
7
+ import traceback
8
+ import os
9
+ import time
10
+ import math
11
+
12
+ print("🔬 DeepSeek-Lite Protocol: Production LLM Training")
13
+
14
+ try:
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.nn import functional as F
18
+ import tiktoken
19
+ from datasets import load_dataset
20
+ import matplotlib.pyplot as plt
21
+
22
+ print("🔹 Imports successful")
23
+
24
+ # === CONFIGURATION ===
25
+ EXPERIMENT_NAME = "deepseek_lite_v1"
26
+
27
+ # Model Architecture
28
+ d_model = 384 # Model dimension
29
+ n_heads = 6 # Attention heads (64 dim per head)
30
+ n_layers = 30 # Deep & Narrow
31
+ context_len = 1024 # Full context window
32
+ vocab_size = 50257 # GPT-2 vocab
33
+ dropout = 0.1
34
+
35
+ # Training
36
+ batch_size = 4 # Per-step batch (small for memory)
37
+ grad_accum_steps = 32 # Effective batch = 4 * 32 = 128
38
+ max_lr = 6e-4
39
+ min_lr = max_lr * 0.1
40
+ warmup_steps = 200
41
+ total_steps = 5000
42
+ eval_interval = 100
43
+
44
+ # Hardware
45
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
46
+ dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
47
+ print(f"🔹 Device: {device}, Dtype: {dtype}")
48
+
49
+ # Storage
50
+ storage_dir = f"/home/user/app/storage/{EXPERIMENT_NAME}"
51
+ os.makedirs(storage_dir, exist_ok=True)
52
+
53
+ # Chunk config (resumable)
54
+ CHUNK_STEPS = 25
55
+
56
+ # === TOKENIZER ===
57
+ print("📚 Loading TikToken (GPT-4 encoding)...")
58
+ enc = tiktoken.get_encoding("cl100k_base") # GPT-4 compatible
59
+
60
+ # === DATASET (Streaming) ===
61
+ print("📦 Loading FineWeb-Edu (streaming)...")
62
+ dataset_iter = iter(load_dataset(
63
+ "HuggingFaceFW/fineweb-edu",
64
+ "sample-10BT",
65
+ split="train",
66
+ streaming=True,
67
+ trust_remote_code=True
68
+ ))
69
+
70
+ # Token buffer for batching
71
+ token_buffer = []
72
+
73
def refill_buffer(min_tokens=100000):
    """Top up the global token buffer to at least *min_tokens* tokens.

    Pulls documents from the streaming dataset iterator and appends their
    encoded tokens; stops early if the stream is exhausted.
    """
    global token_buffer
    while len(token_buffer) < min_tokens:
        try:
            doc = next(dataset_iter)
        except StopIteration:
            break  # stream exhausted; proceed with whatever we have
        token_buffer.extend(enc.encode(doc['text']))
82
+
83
def get_batch():
    """Sample a random (inputs, targets) pair of shape (batch_size, context_len).

    Targets are the inputs shifted one token to the right. Raises
    RuntimeError when the buffer cannot supply a full batch.
    """
    global token_buffer
    refill_buffer()
    needed = (context_len + 1) * batch_size
    if len(token_buffer) < needed:
        raise RuntimeError("Not enough tokens in buffer")

    starts = torch.randint(len(token_buffer) - context_len - 1, (batch_size,))
    xs, ys = [], []
    for s in starts:
        s = int(s)
        window = token_buffer[s:s + context_len + 1]
        xs.append(torch.tensor(window[:-1]))
        ys.append(torch.tensor(window[1:]))
    return torch.stack(xs).to(device), torch.stack(ys).to(device)
93
+
94
+ # === MODEL COMPONENTS ===
95
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescales by 1/RMS, no centering, no bias."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Inverse RMS over the last (feature) dimension, eps for stability.
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * inv_rms * self.weight
102
+
103
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused QKV projection."""

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        # One matmul yields Q, K and V together; GPT-style, no biases.
        self.c_attn = nn.Linear(d_model, 3 * d_model, bias=False)
        self.c_proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)  # module-level dropout rate
        # Lower-triangular mask enforces causality up to context_len.
        self.register_buffer("mask", torch.tril(torch.ones(context_len, context_len)))

    def forward(self, x):
        batch, seq, width = x.shape

        def split_heads(t):
            # (B, T, C) -> (B, n_heads, T, head_dim)
            return t.view(batch, seq, self.n_heads, self.head_dim).transpose(1, 2)

        q, k, v = (split_heads(t) for t in self.c_attn(x).chunk(3, dim=-1))

        # Scaled dot-product scores, masked to forbid attending forward.
        scores = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
        scores = scores.masked_fill(self.mask[:seq, :seq] == 0, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        context = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, width)
        return self.c_proj(context)
126
+
127
class MLP(nn.Module):
    """Position-wise feed-forward: expand 4x, GELU, project back, dropout."""

    def __init__(self, d_model):
        super().__init__()
        self.c_fc = nn.Linear(d_model, 4 * d_model, bias=False)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(4 * d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)  # module-level dropout rate

    def forward(self, x):
        hidden = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
137
+
138
+ # === BLOCK VARIANTS ===
139
class BlockBaseline(nn.Module):
    """GPT-2 style transformer block: pre-LayerNorm with additive residuals."""

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.mlp = MLP(d_model)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
152
+
153
class BlockMHC(nn.Module):
    """DeepSeek-V3 style block: RMSNorm + Manifold Hybrid Connection.

    Instead of a plain residual, each sub-layer output is mixed into the
    stream with learnable scalars (alpha * x + beta * sublayer(x)) and the
    mixture is then normalized.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attn = CausalSelfAttention(d_model, n_heads)
        self.mlp = MLP(d_model)
        self.ln1 = RMSNorm(d_model)
        self.ln2 = RMSNorm(d_model)
        # Mixing coefficients, initialized to favor the identity path.
        self.alpha1 = nn.Parameter(torch.tensor(0.9))
        self.beta1 = nn.Parameter(torch.tensor(0.1))
        self.alpha2 = nn.Parameter(torch.tensor(0.9))
        self.beta2 = nn.Parameter(torch.tensor(0.1))

    def forward(self, x):
        x = self.ln1(self.alpha1 * x + self.beta1 * self.attn(x))
        return self.ln2(self.alpha2 * x + self.beta2 * self.mlp(x))
173
+
174
+ # === GPT MODEL ===
175
class GPT(nn.Module):
    """Decoder-only transformer; arch_type selects 'baseline' or mHC blocks.

    forward(idx, targets=None) -> (logits, loss) where loss is None when no
    targets are given. Widths/depths come from the module-level config.
    """

    def __init__(self, arch_type='baseline'):
        super().__init__()
        self.arch_type = arch_type
        self.wte = nn.Embedding(vocab_size, d_model)    # token embeddings
        self.wpe = nn.Embedding(context_len, d_model)   # learned positions
        Block = BlockBaseline if arch_type == 'baseline' else BlockMHC
        self.blocks = nn.ModuleList([Block(d_model, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model) if arch_type == 'baseline' else RMSNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        # Weight tying: the input embedding and the output head share one matrix.
        self.wte.weight = self.lm_head.weight

        # Report model size once at construction.
        n_params = sum(p.numel() for p in self.parameters())
        print(f"🔧 {arch_type.upper()}: {n_params/1e6:.2f}M params")

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.wte(idx)
        # Fix: derive the device from the input rather than the global
        # `device`, so the model still works if it is moved after creation.
        pos_emb = self.wpe(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb

        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            # Flatten (B, T, V) -> (B*T, V) for token-level cross-entropy.
            loss = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))
        return logits, loss
207
+
208
+ # === LR SCHEDULER ===
209
def get_lr(step):
    """Cosine learning-rate schedule with linear warmup (max_lr -> min_lr)."""
    # Linear warmup toward max_lr over the first warmup_steps steps.
    if step < warmup_steps:
        return max_lr * (step + 1) / warmup_steps
    # Past the schedule horizon, hold at the floor.
    if step > total_steps:
        return min_lr
    # Cosine decay between warmup_steps and total_steps.
    progress = (step - warmup_steps) / (total_steps - warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))
216
+
217
    # === HISTORY ===
    # Metric log shared by both runs; suffix _a = baseline, _b = mHC.
    history = {
        'steps': [],
        'loss_a': [], 'loss_b': [],
        'ppl_a': [], 'ppl_b': [],
        'grad_a': [], 'grad_b': []
    }
    current_step = 0

    history_path = os.path.join(storage_dir, 'history.pt')
    ckpt_a_path = os.path.join(storage_dir, 'model_a.pt')
    ckpt_b_path = os.path.join(storage_dir, 'model_b.pt')

    # Resume: the last entry of history['steps'] is taken as the step to
    # continue from. NOTE(review): this only advances as often as steps are
    # appended in the training loop — confirm that logging granularity
    # matches the 25-step chunking, or restarts may repeat work.
    if os.path.exists(history_path):
        print(f"🔄 Resuming from {history_path}")
        history = torch.load(history_path)
        if history['steps']:
            current_step = history['steps'][-1]
            print(f" Last step: {current_step}")

    # === INIT MODELS ===
    print("🏗️ Building models...")
    model_a = GPT('baseline').to(device)
    model_b = GPT('mhc').to(device)

    # AdamW with GPT-style betas/decay; the lr set here is overwritten each
    # step by the get_lr schedule.
    opt_a = torch.optim.AdamW(model_a.parameters(), lr=max_lr, betas=(0.9, 0.95), weight_decay=0.1)
    opt_b = torch.optim.AdamW(model_b.parameters(), lr=max_lr, betas=(0.9, 0.95), weight_decay=0.1)

    # Load checkpoints
    # NOTE(review): assumes opt_*.pt exists whenever model_*.pt does; a
    # partially-written save would crash here — confirm save ordering.
    if os.path.exists(ckpt_a_path):
        model_a.load_state_dict(torch.load(ckpt_a_path))
        opt_a.load_state_dict(torch.load(os.path.join(storage_dir, 'opt_a.pt')))
    if os.path.exists(ckpt_b_path):
        model_b.load_state_dict(torch.load(ckpt_b_path))
        opt_b.load_state_dict(torch.load(os.path.join(storage_dir, 'opt_b.pt')))
252
+
253
+ # === TRAINING CHUNK ===
254
+ print(f"🚀 Training: Steps {current_step} -> {current_step + CHUNK_STEPS} (Target: {total_steps})")
255
+
256
+ model_a.train()
257
+ model_b.train()
258
+ scaler = torch.cuda.amp.GradScaler()
259
+
260
+ start_time = time.time()
261
+
262
+ for step_offset in range(CHUNK_STEPS):
263
+ step = current_step + step_offset + 1
264
+ if step > total_steps:
265
+ break
266
+
267
+ # Update LR
268
+ lr = get_lr(step)
269
+ for opt in [opt_a, opt_b]:
270
+ for pg in opt.param_groups:
271
+ pg['lr'] = lr
272
+
273
+ # Gradient Accumulation
274
+ loss_accum_a = 0.0
275
+ loss_accum_b = 0.0
276
+
277
+ opt_a.zero_grad()
278
+ opt_b.zero_grad()
279
+
280
+ for micro_step in range(grad_accum_steps):
281
+ x, y = get_batch()
282
+
283
+ with torch.cuda.amp.autocast(dtype=dtype):
284
+ _, loss_a = model_a(x, y)
285
+ _, loss_b = model_b(x, y)
286
+ loss_a = loss_a / grad_accum_steps
287
+ loss_b = loss_b / grad_accum_steps
288
+
289
+ loss_accum_a += loss_a.item()
290
+ loss_accum_b += loss_b.item()
291
+
292
+ scaler.scale(loss_a).backward()
293
+ scaler.scale(loss_b).backward()
294
+
295
+ # Grad clip
296
+ scaler.unscale_(opt_a)
297
+ scaler.unscale_(opt_b)
298
+ grad_norm_a = torch.nn.utils.clip_grad_norm_(model_a.parameters(), 1.0)
299
+ grad_norm_b = torch.nn.utils.clip_grad_norm_(model_b.parameters(), 1.0)
300
+
301
+ scaler.step(opt_a)
302
+ scaler.step(opt_b)
303
+ scaler.update()
304
+
305
+ # Log
306
+ if step % eval_interval == 0 or step == 1:
307
+ ppl_a = math.exp(loss_accum_a)
308
+ ppl_b = math.exp(loss_accum_b)
309
+ print(f"Step {step}: LossA={loss_accum_a:.4f} LossB={loss_accum_b:.4f} | "
310
+ f"PPL_A={ppl_a:.2f} PPL_B={ppl_b:.2f} | "
311
+ f"GradA={grad_norm_a:.2f} GradB={grad_norm_b:.2f} | LR={lr:.2e}")
312
+
313
+ history['steps'].append(step)
314
+ history['loss_a'].append(loss_accum_a)
315
+ history['loss_b'].append(loss_accum_b)
316
+ history['ppl_a'].append(ppl_a)
317
+ history['ppl_b'].append(ppl_b)
318
+ history['grad_a'].append(grad_norm_a.item())
319
+ history['grad_b'].append(grad_norm_b.item())
320
+
321
+ torch.save(history, history_path)
322
+
323
+ # Dashboard
324
+ fig, axes = plt.subplots(1, 3, figsize=(18, 5))
325
+
326
+ ax1 = axes[0]
327
+ ax1.plot(history['steps'], history['loss_a'], label='Baseline', marker='o')
328
+ ax1.plot(history['steps'], history['loss_b'], label='mHC', marker='x')
329
+ ax1.set_xlabel('Steps')
330
+ ax1.set_ylabel('Loss')
331
+ ax1.set_title('Training Loss')
332
+ ax1.legend()
333
+ ax1.grid(True)
334
+
335
+ ax2 = axes[1]
336
+ ax2.plot(history['steps'], history['ppl_a'], label='Baseline', marker='o')
337
+ ax2.plot(history['steps'], history['ppl_b'], label='mHC', marker='x')
338
+ ax2.set_xlabel('Steps')
339
+ ax2.set_ylabel('Perplexity')
340
+ ax2.set_title('Perplexity')
341
+ ax2.legend()
342
+ ax2.grid(True)
343
+
344
+ ax3 = axes[2]
345
+ ax3.plot(history['steps'], history['grad_a'], label='Baseline', color='red')
346
+ ax3.plot(history['steps'], history['grad_b'], label='mHC', color='green')
347
+ ax3.set_xlabel('Steps')
348
+ ax3.set_ylabel('Gradient Norm')
349
+ ax3.set_title('Gradient Health')
350
+ ax3.legend()
351
+ ax3.grid(True)
352
+
353
+ plt.tight_layout()
354
+ plt.savefig(os.path.join(storage_dir, 'dashboard.png'))
355
+ plt.close()
356
+
357
+ elapsed = time.time() - start_time
358
+ print(f"🏁 Chunk done in {elapsed:.2f}s. Step: {step}")
359
+
360
+ # Save checkpoints
361
+ torch.save(model_a.state_dict(), ckpt_a_path)
362
+ torch.save(opt_a.state_dict(), os.path.join(storage_dir, 'opt_a.pt'))
363
+ torch.save(model_b.state_dict(), ckpt_b_path)
364
+ torch.save(opt_b.state_dict(), os.path.join(storage_dir, 'opt_b.pt'))
365
+ torch.save(history, history_path)
366
+ print("💾 Saved checkpoints.")
367
+
368
+ if step < total_steps:
369
+ print("CONTINUE_TRAINING")
370
+ else:
371
+ print("TRAINING_COMPLETE")
372
+ os.system(f"cp {os.path.join(storage_dir, 'dashboard.png')} .")
373
+ print("✅ Dashboard ready for download.")
374
+
375
+ except Exception as e:
376
+ print(f"\n❌ FATAL ERROR: {e}")
377
+ traceback.print_exc()
remote-gpu-client/examples/inference_tower.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
import traceback
import os

print("🔮 Tower of Babel Inference (120-Layer Models)")

try:
    import torch
    import torch.nn as nn
    from torch.nn import functional as F
    import requests

    # --- Config (must match training) ---
    block_size = 256   # context window (chars)
    n_embd = 128       # embedding width
    n_head = 4
    n_layer = 120 # Tower config!
    dropout = 0.1      # inert at inference once models are in eval() mode
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    storage_dir = "/home/user/app/storage/tower_120L"
    ckpt_path_a = os.path.join(storage_dir, 'ckpt_a.pt')
    ckpt_path_b = os.path.join(storage_dir, 'ckpt_b.pt')

    # Vocab
    # NOTE(review): the char vocabulary is rebuilt by re-downloading
    # tinyshakespeare at inference time; token ids only line up with the
    # checkpoints if this file is byte-identical to the one used during
    # training — confirm, or persist the vocab alongside the checkpoints.
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    data = requests.get(url).text
    chars = sorted(list(set(data)))
    vocab_size = len(chars)
    stoi = { ch:i for i,ch in enumerate(chars) }
    itos = { i:ch for i,ch in enumerate(chars) }
    # Unknown prompt characters fall back to id 0 instead of raising.
    encode = lambda s: [stoi.get(c, 0) for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
34
+
35
+ # --- Model Classes ---
36
class Head(nn.Module):
    """Single causal self-attention head (nanoGPT-lecture style)."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Causal mask: position t may attend only to positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        # NOTE(review): scores are scaled by C**-0.5 where C is the FULL
        # embedding dim, not head_size**-0.5 as in standard scaled
        # dot-product attention. The checkpoints were trained with this
        # scaling, so it must stay as-is for inference to match.
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v
54
+
55
class MultiHeadAttention(nn.Module):
    """Run several attention heads in parallel and merge their outputs."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        merged = torch.cat([head(x) for head in self.heads], dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)
64
+
65
class FeedForward(nn.Module):
    """Two-layer MLP with 4x expansion, ReLU, and dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        self.net = nn.Sequential(
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
76
+
77
class BlockStandard(nn.Module):
    """Pre-LayerNorm transformer block with plain additive residuals."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        return x + self.ffwd(self.ln2(x))
89
+
90
class RMSNorm(nn.Module):
    """RMS normalization; computed in float32, result cast back to input dtype."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        # Divide by the RMS of the feature dimension (eps for stability).
        scale = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * scale

    def forward(self, x):
        normed = self._norm(x.float()).type_as(x)
        return normed * self.weight
99
+
100
class BlockMHC(nn.Module):
    """mHC block: learnable alpha/beta residual mixing followed by RMSNorm."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        # Mixing coefficients, initialized to favor the identity path.
        self.alpha1 = nn.Parameter(torch.tensor(0.9))
        self.beta1 = nn.Parameter(torch.tensor(0.1))
        self.ln1 = RMSNorm(n_embd)
        self.alpha2 = nn.Parameter(torch.tensor(0.9))
        self.beta2 = nn.Parameter(torch.tensor(0.1))
        self.ln2 = RMSNorm(n_embd)

    def forward(self, x):
        x = self.ln1(self.alpha1 * x + self.beta1 * self.sa(x))
        return self.ln2(self.alpha2 * x + self.beta2 * self.ffwd(x))
118
+
119
class GPT(nn.Module):
    """Char-level GPT; arch_type picks 'standard' (LayerNorm) or 'mhc' (RMSNorm).

    Widths, depth and vocab come from module-level config and must match the
    checkpoint being loaded, or load_state_dict will fail.
    """

    def __init__(self, arch_type='standard'):
        super().__init__()
        self.arch_type = arch_type
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        if arch_type == 'standard':
            self.blocks = nn.Sequential(*[BlockStandard(n_embd, n_head) for _ in range(n_layer)])
            self.ln_f = nn.LayerNorm(n_embd)
        elif arch_type == 'mhc':
            self.blocks = nn.Sequential(*[BlockMHC(n_embd, n_head) for _ in range(n_layer)])
            self.ln_f = RMSNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # Inference-only forward: the loss branch was stripped, so the second
        # element of the returned tuple is always None.
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits, None

    def generate(self, idx, max_new_tokens):
        # Autoregressive sampling: crop context to block_size, sample from
        # the softmax at the last position, append, repeat.
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
150
+
151
    # --- Load Models ---
    # Both 120-layer towers must be built with the exact training config or
    # load_state_dict will fail on shape mismatches.
    print(f"📦 Loading Model A (Standard, 120L)...")
    model_a = GPT(arch_type='standard').to(device)
    model_a.load_state_dict(torch.load(ckpt_path_a, map_location=device))
    model_a.eval()

    print(f"📦 Loading Model B (mHC, 120L)...")
    model_b = GPT(arch_type='mhc').to(device)
    model_b.load_state_dict(torch.load(ckpt_path_b, map_location=device))
    model_b.eval()

    # --- Inference ---
    PROMPT = "ROMEO:"
    MAX_TOKENS = 400

    print(f"\n🎭 Prompt: '{PROMPT}'")
    print(f"🔢 Max Tokens: {MAX_TOKENS}")

    # Encode the prompt as a (1, len) batch of char ids on the target device.
    context = torch.tensor([encode(PROMPT)], dtype=torch.long, device=device)

    print("\n" + "="*60)
    print("MODEL A (Standard GPT, 120 Layers)")
    print("="*60)
    # no_grad: pure sampling, no autograd graph needed.
    with torch.no_grad():
        out_a = model_a.generate(context.clone(), max_new_tokens=MAX_TOKENS)
    print(decode(out_a[0].tolist()))

    print("\n" + "="*60)
    print("MODEL B (mHC GPT, 120 Layers)")
    print("="*60)
    with torch.no_grad():
        out_b = model_b.generate(context.clone(), max_new_tokens=MAX_TOKENS)
    print(decode(out_b[0].tolist()))

    print("\n✅ Inference Complete.")

except Exception as e:
    print(f"\n❌ FATAL ERROR: {e}")
    traceback.print_exc()
remote-gpu-client/run_deepseek_lite.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import time
from antigravity_sdk.client import RemoteGPU

# Training script whose source is shipped to the remote GPU each loop.
SCRIPT_PATH = "examples/deepseek_lite.py"
# 5000 steps / 25 per chunk = 200 loops + buffer
MAX_LOOPS = 250
8
+
9
def main():
    """Drive chunked remote training until completion, failure, or loop cap.

    Repeatedly ships the training script to the remote GPU and dispatches on
    the sentinel strings the script prints (CONTINUE_TRAINING /
    TRAINING_COMPLETE / FATAL).
    """
    if not os.path.exists(SCRIPT_PATH):
        print(f"❌ Script not found: {SCRIPT_PATH}")
        sys.exit(1)

    with open(SCRIPT_PATH, 'r') as f:
        code = f.read()

    print("🔬 DeepSeek-Lite Protocol: Production LLM Training")
    print("📊 50M params | 30 Layers | FineWeb-Edu | TikToken")
    print("-" * 50)

    gpu = RemoteGPU()

    for loop_idx in range(MAX_LOOPS):
        print(f"\n🌀 Loop {loop_idx+1}/{MAX_LOOPS}...")

        output = gpu.run(code, download_files=True, verbose=True).output

        if "TRAINING_COMPLETE" in output:
            print("\n✅ Training Finished!")
            break
        if "CONTINUE_TRAINING" in output:
            print("⏳ Chunk complete. Resuming...")
            time.sleep(2)
            continue
        if "FATAL" in output:
            print("❌ Fatal Error. Stopping.")
            break
        # Anything else is unexpected — stop rather than burn loops.
        print("⚠️ Unknown status. Stopping safely.")
        print(f"Last output: {output[-500:]}")
        break

    if os.path.exists("dashboard.png"):
        print("\n📊 Success! Saved dashboard.png")

if __name__ == "__main__":
    main()
remote-gpu-server/requirements.txt CHANGED
@@ -1,6 +1,5 @@
1
- # Force Rebuild 2
2
  fastapi
3
-
4
  uvicorn
5
  python-multipart
6
  gradio
@@ -11,4 +10,5 @@ matplotlib
11
  seaborn
12
  scikit-learn
13
  pandas
14
-
 
 
1
+ # Force Rebuild 3 - DeepSeek-Lite
2
  fastapi
 
3
  uvicorn
4
  python-multipart
5
  gradio
 
10
  seaborn
11
  scikit-learn
12
  pandas
13
+ tiktoken
14
+ datasets