ikaganacar committed on
Commit
7557c9f
·
1 Parent(s): d7d2fb2

Better Configuration Implementation

Browse files
Model_Architecture/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "max_batch_size": 8,
4
+ "max_seq_len": 2048,
5
+ "dtype": "bf16",
6
+ "scale_fmt": null,
7
+ "vocab_size": 102400,
8
+ "dim": 1024,
9
+ "inter_dim": 4096,
10
+ "moe_inter_dim": 1024,
11
+ "n_layers": 20,
12
+ "n_dense_layers": 3,
13
+ "n_heads": 12,
14
+ "n_routed_experts": 6,
15
+ "n_shared_experts": 1,
16
+ "n_activated_experts": 2,
17
+ "route_scale": 1.0,
18
+ "use_routing_bias": true,
19
+ "q_lora_rank": 0,
20
+ "kv_lora_rank": 512,
21
+ "qk_nope_head_dim": 128,
22
+ "qk_rope_head_dim": 64,
23
+ "v_head_dim": 128,
24
+ "original_seq_len": 4096,
25
+ "rope_theta": 10000.0,
26
+ "rope_factor": 40,
27
+ "beta_fast": 32,
28
+ "beta_slow": 1,
29
+ "mscale": 1.0,
30
+ "tokenizer_name": "gpt2"
31
+ },
32
+ "training": {
33
+ "learning_rate": 3e-4,
34
+ "weight_decay": 0.1,
35
+ "beta1": 0.9,
36
+ "beta2": 0.95,
37
+ "grad_clip": 1.0,
38
+ "warmup_steps": 1000,
39
+ "total_steps": 50000,
40
+ "expert_rotation_steps": 2000,
41
+ "gradient_accumulation_steps": 16,
42
+ "eval_every": 1000,
43
+ "save_every": 5000,
44
+ "save_dir": "./checkpoints",
45
+ "log_every": 100,
46
+ "dtype": "bf16",
47
+ "compile": true
48
+ },
49
+ "data": {
50
+ "train_file": "./data/train.txt",
51
+ "val_file": "./data/val.txt",
52
+ "stride": 512
53
+ },
54
+ "logging": {
55
+ "use_wandb": true,
56
+ "project_name": "sequential-moe",
57
+ "run_name": "moe-12gb-gpu"
58
+ }
59
+ }
Model_Architecture/generation.py CHANGED
@@ -128,25 +128,20 @@ def token_ids_to_text(token_ids, tokenizer):
128
  #####################################
129
 
130
  if __name__ == "__main__":
 
 
 
131
  # Example configuration - smaller model for testing
132
- args = ModelArgs(
133
- max_batch_size=4,
134
- max_seq_len=1024,
135
- vocab_size=50257, # GPT-2 vocab size
136
- dim=768,
137
- inter_dim=3072,
138
- moe_inter_dim=768,
139
- n_layers=12,
140
- n_dense_layers=1,
141
- n_heads=12,
142
- n_routed_experts=8,
143
- n_shared_experts=2,
144
- n_activated_experts=2,
145
- kv_lora_rank=256,
146
- qk_nope_head_dim=64,
147
- qk_rope_head_dim=32,
148
- v_head_dim=64,
149
- )
150
 
151
  # Initialize model and tokenizer
152
  print("Initializing model...")
@@ -154,7 +149,8 @@ if __name__ == "__main__":
154
  model = ismail(args)
155
  model.eval()
156
 
157
- tokenizer = tiktoken.get_encoding("gpt2")
 
158
 
159
  # Example 1: Greedy generation (argmax)
160
  print(f"\n{'='*60}")
 
128
  #####################################
129
 
130
  if __name__ == "__main__":
131
+ import json
132
+ from pathlib import Path
133
+
134
  # Example configuration - smaller model for testing
135
+ config_path = Path("config.json")
136
+ if config_path.exists():
137
+ with open(config_path) as f:
138
+ config = json.load(f)
139
+ print(f"✅ Loaded config from {config_path}")
140
+ args = ModelArgs(**config["model"])
141
+ else:
142
+ print("⚠️ config.json not found, using default ModelArgs")
143
+ args = ModelArgs()
144
+
 
 
 
 
 
 
 
 
145
 
146
  # Initialize model and tokenizer
147
  print("Initializing model...")
 
149
  model = ismail(args)
150
  model.eval()
151
 
152
+ tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
153
+ tokenizer = tiktoken.get_encoding(tokenizer_name)
154
 
155
  # Example 1: Greedy generation (argmax)
156
  print(f"\n{'='*60}")
Model_Architecture/model.py CHANGED
@@ -52,6 +52,8 @@ class ModelArgs:
52
  beta_slow: int = 1
53
  mscale: float = 1.
54
 
 
 
55
  # others
56
  world_size = 1
57
  rank = 0
@@ -304,9 +306,8 @@ class Gate(nn.Module):
304
  indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1]
305
  weights = original_scores.gather(1, indices)
306
 
307
- # Normalize weights if using sigmoid
308
- if self.score_func == "sigmoid":
309
- weights = weights / weights.sum(dim=-1, keepdim=True)
310
 
311
  # Apply route scaling
312
  weights = weights * self.route_scale
@@ -387,10 +388,9 @@ class MoE(nn.Module):
387
 
388
  # Select top-k experts
389
  weights, indices = torch.topk(router_probs, self.n_activated_experts, dim=-1)
390
-
391
- # Normalize weights
392
- if self.gate.score_func == "sigmoid":
393
- weights = weights / weights.sum(dim=-1, keepdim=True)
394
  weights = weights * self.gate.route_scale
395
 
396
  # Sequential Training Mode
@@ -468,10 +468,19 @@ class Block(nn.Module):
468
  self.attn_norm = RMSNorm(args.dim)
469
  self.ffn_norm = RMSNorm(args.dim)
470
 
471
- def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]) -> torch.Tensor:
472
  x = x + self.attn(self.attn_norm(x), start_pos, freqs_cis, mask)
473
- x = x + self.ffn(self.ffn_norm(x))
474
- return x
 
 
 
 
 
 
 
 
 
475
 
476
 
477
  #####################################
@@ -492,6 +501,12 @@ class ismail(nn.Module):
492
 
493
  self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
494
 
 
 
 
 
 
 
495
  def forward(self, tokens: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
496
  bsz, seqlen = tokens.shape
497
  h = self.tok_embeddings(tokens)
@@ -504,9 +519,17 @@ class ismail(nn.Module):
504
  mask = torch.triu(mask, diagonal=1)
505
  mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
506
 
 
507
  for layer in self.layers:
508
- h = layer(h, start_pos, freqs_cis, mask)
 
 
 
509
  h = self.norm(h)
510
  output = self.output(h)
 
 
 
 
511
  return output
512
 
 
52
  beta_slow: int = 1
53
  mscale: float = 1.
54
 
55
+ tokenizer_name: str = "gpt2" #
56
+
57
  # others
58
  world_size = 1
59
  rank = 0
 
306
  indices = torch.topk(scores, self.n_activated_experts, dim=-1)[1]
307
  weights = original_scores.gather(1, indices)
308
 
309
+ # Normalize weights (sigmoid always needs normalization)
310
+ weights = weights / weights.sum(dim=-1, keepdim=True)
 
311
 
312
  # Apply route scaling
313
  weights = weights * self.route_scale
 
388
 
389
  # Select top-k experts
390
  weights, indices = torch.topk(router_probs, self.n_activated_experts, dim=-1)
391
+
392
+ # Normalize weights (sigmoid always needs normalization)
393
+ weights = weights / weights.sum(dim=-1, keepdim=True)
 
394
  weights = weights * self.gate.route_scale
395
 
396
  # Sequential Training Mode
 
468
  self.attn_norm = RMSNorm(args.dim)
469
  self.ffn_norm = RMSNorm(args.dim)
470
 
471
+ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
472
  x = x + self.attn(self.attn_norm(x), start_pos, freqs_cis, mask)
473
+
474
+ # Handle both MLP (returns single output) and MoE (returns output + loss)
475
+ ffn_result = self.ffn(self.ffn_norm(x))
476
+ if isinstance(ffn_result, tuple):
477
+ ffn_out, lb_loss = ffn_result
478
+ else:
479
+ ffn_out = ffn_result
480
+ lb_loss = None
481
+
482
+ x = x + ffn_out
483
+ return x, lb_loss
484
 
485
 
486
  #####################################
 
501
 
502
  self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
503
 
504
+ def set_active_expert(self, expert_idx: Optional[int]):
505
+ """Set active expert for all MoE layers (for sequential training)"""
506
+ for layer in self.layers:
507
+ if isinstance(layer.ffn, MoE):
508
+ layer.ffn.set_active_expert(expert_idx)
509
+
510
  def forward(self, tokens: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
511
  bsz, seqlen = tokens.shape
512
  h = self.tok_embeddings(tokens)
 
519
  mask = torch.triu(mask, diagonal=1)
520
  mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
521
 
522
+ total_lb_loss = 0.0
523
  for layer in self.layers:
524
+ h, lb_loss = layer(h, start_pos, freqs_cis, mask)
525
+ if lb_loss is not None:
526
+ total_lb_loss += lb_loss
527
+
528
  h = self.norm(h)
529
  output = self.output(h)
530
+
531
+ # Return output and total load balancing loss if training
532
+ if self.training and total_lb_loss > 0:
533
+ return output, total_lb_loss
534
  return output
535
 
Model_Architecture/model_size.py CHANGED
@@ -219,8 +219,19 @@ def estimate_model_size(args: ModelArgs):
219
 
220
 
221
  if __name__ == "__main__":
222
- # Load default configuration
223
- args = ModelArgs()
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  # Run estimation
226
  results = estimate_model_size(args)
 
219
 
220
 
221
  if __name__ == "__main__":
222
+ import json
223
+ from pathlib import Path
224
+
225
+ # Try to load from config.json, otherwise use defaults
226
+ config_path = Path(__file__).parent / "config.json"
227
+ if config_path.exists():
228
+ print(f"📄 Loading configuration from {config_path}")
229
+ with open(config_path) as f:
230
+ config = json.load(f)
231
+ args = ModelArgs(**config["model"])
232
+ else:
233
+ print("⚠️ config.json not found, using default ModelArgs")
234
+ args = ModelArgs()
235
 
236
  # Run estimation
237
  results = estimate_model_size(args)
Model_Architecture/train.py CHANGED
@@ -31,29 +31,37 @@ except ImportError:
31
  HAS_BNB = False
32
  print("⚠️ bitsandbytes not installed. Run 'pip install bitsandbytes' for memory-efficient optimizer.")
33
 
34
- # Configuration
35
  DEFAULT_CONFIG = {
36
  "model": {
37
- "vocab_size": 32000, # Reduced from 102400
 
 
 
 
38
  "dim": 1024,
39
  "inter_dim": 4096,
40
  "moe_inter_dim": 1024,
41
- "n_layers": 16,
42
- "n_dense_layers": 1, # Only first layer dense
43
- "n_heads": 16, # Increased for better parallelism
44
- # MoE
45
  "n_routed_experts": 6,
46
  "n_shared_experts": 1,
47
  "n_activated_experts": 2,
48
- # MLA
49
- "q_lora_rank": 128, # Enable Q LoRA
 
50
  "kv_lora_rank": 512,
51
- "qk_nope_head_dim": 64,
52
- "qk_rope_head_dim": 32,
53
- "v_head_dim": 64,
54
- # Sequence
55
- "max_seq_len": 2048, # Start shorter
56
- "max_batch_size": 4,
 
 
 
 
57
  },
58
  "training": {
59
  "learning_rate": 3e-4,
@@ -237,22 +245,25 @@ def evaluate(model, val_loader, device, config):
237
  model.eval()
238
  total_loss = 0.0
239
  total_tokens = 0
240
-
241
  with torch.no_grad():
242
  for input_ids, target_ids in val_loader:
243
  input_ids = input_ids.to(device)
244
  target_ids = target_ids.to(device)
245
-
246
- logits, lb_loss = model(input_ids, start_pos=0)
 
 
 
247
  loss = F.cross_entropy(
248
  logits.view(-1, logits.size(-1)),
249
  target_ids.view(-1),
250
  ignore_index=-1,
251
  )
252
-
253
  total_loss += loss.item() * target_ids.numel()
254
  total_tokens += target_ids.numel()
255
-
256
  model.train()
257
  return total_loss / total_tokens
258
 
@@ -286,22 +297,29 @@ def train_step(model, batch, device, config, scaler=None):
286
  input_ids, target_ids = batch
287
  input_ids = input_ids.to(device, non_blocking=True)
288
  target_ids = target_ids.to(device, non_blocking=True)
289
-
290
  # Forward pass
291
  with torch.cuda.amp.autocast(enabled=(config["training"]["dtype"] == "bf16")):
292
- logits, lb_loss = model(input_ids, start_pos=0)
293
-
 
 
 
 
 
 
 
294
  # Main language modeling loss
295
  lm_loss = F.cross_entropy(
296
  logits.view(-1, logits.size(-1)),
297
  target_ids.view(-1),
298
  ignore_index=-1,
299
  )
300
-
301
  # Total loss with load balancing
302
  total_loss = lm_loss + config["training"].get("lb_loss_coef", 0.01) * lb_loss
303
-
304
- return total_loss, lm_loss, lb_loss
305
 
306
 
307
  def main():
 
31
  HAS_BNB = False
32
  print("⚠️ bitsandbytes not installed. Run 'pip install bitsandbytes' for memory-efficient optimizer.")
33
 
34
+ # Configuration - matches ModelArgs defaults
35
  DEFAULT_CONFIG = {
36
  "model": {
37
+ "max_batch_size": 8,
38
+ "max_seq_len": 2048,
39
+ "dtype": "bf16",
40
+ "scale_fmt": None,
41
+ "vocab_size": 102400,
42
  "dim": 1024,
43
  "inter_dim": 4096,
44
  "moe_inter_dim": 1024,
45
+ "n_layers": 20,
46
+ "n_dense_layers": 3,
47
+ "n_heads": 12,
 
48
  "n_routed_experts": 6,
49
  "n_shared_experts": 1,
50
  "n_activated_experts": 2,
51
+ "route_scale": 1.0,
52
+ "use_routing_bias": True,
53
+ "q_lora_rank": 0,
54
  "kv_lora_rank": 512,
55
+ "qk_nope_head_dim": 128,
56
+ "qk_rope_head_dim": 64,
57
+ "v_head_dim": 128,
58
+ "original_seq_len": 4096,
59
+ "rope_theta": 10000.0,
60
+ "rope_factor": 40,
61
+ "beta_fast": 32,
62
+ "beta_slow": 1,
63
+ "mscale": 1.0,
64
+ "tokenizer_name": "gpt2",
65
  },
66
  "training": {
67
  "learning_rate": 3e-4,
 
245
  model.eval()
246
  total_loss = 0.0
247
  total_tokens = 0
248
+
249
  with torch.no_grad():
250
  for input_ids, target_ids in val_loader:
251
  input_ids = input_ids.to(device)
252
  target_ids = target_ids.to(device)
253
+
254
+ # Model returns just logits in eval mode (no lb_loss)
255
+ output = model(input_ids, start_pos=0)
256
+ logits = output if not isinstance(output, tuple) else output[0]
257
+
258
  loss = F.cross_entropy(
259
  logits.view(-1, logits.size(-1)),
260
  target_ids.view(-1),
261
  ignore_index=-1,
262
  )
263
+
264
  total_loss += loss.item() * target_ids.numel()
265
  total_tokens += target_ids.numel()
266
+
267
  model.train()
268
  return total_loss / total_tokens
269
 
 
297
  input_ids, target_ids = batch
298
  input_ids = input_ids.to(device, non_blocking=True)
299
  target_ids = target_ids.to(device, non_blocking=True)
300
+
301
  # Forward pass
302
  with torch.cuda.amp.autocast(enabled=(config["training"]["dtype"] == "bf16")):
303
+ output = model(input_ids, start_pos=0)
304
+
305
+ # Handle model output (tuple in training mode with MoE, single tensor otherwise)
306
+ if isinstance(output, tuple):
307
+ logits, lb_loss = output
308
+ else:
309
+ logits = output
310
+ lb_loss = 0.0
311
+
312
  # Main language modeling loss
313
  lm_loss = F.cross_entropy(
314
  logits.view(-1, logits.size(-1)),
315
  target_ids.view(-1),
316
  ignore_index=-1,
317
  )
318
+
319
  # Total loss with load balancing
320
  total_loss = lm_loss + config["training"].get("lb_loss_coef", 0.01) * lb_loss
321
+
322
+ return total_loss, lm_loss, lb_loss if isinstance(lb_loss, float) else lb_loss.item()
323
 
324
 
325
  def main():