ikaganacar committed
Commit f8b2ed5 · 1 Parent(s): 8118e19
Model_Architecture/config.json CHANGED
@@ -30,7 +30,7 @@
     "tokenizer_name": "turkish"
   },
   "training": {
-    "learning_rate": 3e-4,
+    "learning_rate": 3e-5,
     "weight_decay": 0.1,
     "beta1": 0.9,
     "beta2": 0.95,
Model_Architecture/model.py CHANGED
@@ -129,7 +129,7 @@ def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] =
 
 
 class Linear(nn.Module):
-    dtype = torch.bfloat16
+    dtype = torch.float32
     scale_fmt: Optional[str] = None
 
     def __init__(self, in_features: int, out_features: int, bias: bool = False, dtype = None):
@@ -187,7 +187,7 @@ class RMSNorm(nn.Module):
         super().__init__()
         self.dim = dim
         self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.bfloat16))
+        self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))
 
     def forward(self, x: torch.Tensor):
         output = F.rms_norm(x, (self.dim,), self.weight, self.eps)
@@ -500,12 +500,17 @@ class ismail(nn.Module):
         self.n_layers = args.n_layers
 
         self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)
+        nn.init.normal_(self.tok_embeddings.weight, mean=0.0, std=0.02)
+
         self.layers = nn.ModuleList([Block(i, args) for i in range(args.n_layers)])
         self.norm = RMSNorm(args.dim)
         self.output = Linear(args.dim, args.vocab_size, bias=False)
         self.use_checkpointing = False
 
         self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
+
+        if hasattr(self.output, 'weight'):
+            nn.init.normal_(self.output.weight, mean=0.0, std=0.02 / math.sqrt(args.n_layers))
 
     def set_active_expert(self, expert_idx: Optional[int]):
         """Set active expert for all MoE layers (for sequential training)"""
Model_Architecture/train.py CHANGED
@@ -294,9 +294,21 @@ def save_checkpoint(model, optimizer, step, config, expert_idx=None):
 def train_step(model, input_mb, target_mb, device, config, scaler=None):
     """Process a SINGLE micro-batch (already sliced)"""
 
+    # 🚨 Validate data before processing
     if input_mb.size(0) == 0:
         return 0.0, 0.0
 
+    # Check for invalid token IDs (outside vocab range)
+    vocab_size = config["model"]["vocab_size"]
+    if input_mb.max() >= vocab_size or target_mb.max() >= vocab_size:
+        print(f"🚨 Invalid token detected! Max token: {input_mb.max().item()}, Vocab size: {vocab_size}")
+        return 0.0, 0.0
+
+    # Check for NaN in data
+    if torch.isnan(input_mb).any() or torch.isnan(target_mb).any():
+        print("🚨 NaN detected in input data!")
+        return 0.0, 0.0
+
     input_mb = input_mb.to(device, non_blocking=True)
     target_mb = target_mb.to(device, non_blocking=True)
 
@@ -309,9 +321,11 @@ def train_step(model, input_mb, target_mb, device, config, scaler=None):
     logits = output
     lb_loss = 0.0
 
-    # 🚨 DEBUG: Check for NaN in logits
+    # 🚨 Check for NaN in logits before computing loss
     if torch.isnan(logits).any():
-        print(f"🚨 NaN detected in logits! Scale: {logits.abs().max().item():.2f}")
+        print(f"🚨 NaN detected in logits! Scale: {logits.abs().max().item()}")
+        print(f"   Input range: [{input_mb.min().item()}, {input_mb.max().item()}]")
+        return 0.0, 0.0
 
     lm_loss = F.cross_entropy(
         logits.view(-1, logits.size(-1)),
@@ -319,19 +333,22 @@ def train_step(model, input_mb, target_mb, device, config, scaler=None):
         ignore_index=-1,
     )
 
+    # 🚨 Check for NaN in loss components
+    if torch.isnan(lm_loss):
+        print(f"🚨 NaN in lm_loss!")
+        return 0.0, 0.0
+
     accum_steps = config["training"]["gradient_accumulation_steps"]
     if isinstance(lb_loss, float):
         total_loss = lm_loss / accum_steps
     else:
+        if torch.isnan(lb_loss):
+            print(f"🚨 NaN in lb_loss! Setting to 0")
+            lb_loss = 0.0
         lb_loss_coef = config["training"].get("lb_loss_coef", 0.01)
         total_loss = (lm_loss + lb_loss_coef * lb_loss) / accum_steps
 
-    # 🚨 DEBUG: Check for NaN in total loss
-    if torch.isnan(total_loss):
-        print(f"🚨 NaN in total_loss! lm_loss: {lm_loss.item():.4f}, lb_loss: {lb_loss}")
-        return 0.0, 0.0
-
-    # Backward
+    # Backward with NaN check
     if scaler is not None:
         scaler.scale(total_loss).backward()
     else:
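
Since train_step() divides the loss by gradient_accumulation_steps internally, the caller is expected to backprop every micro-batch and step the optimizer only once per accumulation window. A hypothetical outer loop, assuming `model`, `optimizer`, `micro_batches`, `device`, `config`, and `scaler` are set up elsewhere in train.py (these names are illustrative, not taken from this diff):

accum_steps = config["training"]["gradient_accumulation_steps"]
optimizer.zero_grad(set_to_none=True)
for i, (input_mb, target_mb) in enumerate(micro_batches):
    # Each call backpropagates its already-scaled share of the loss.
    lm, lb = train_step(model, input_mb, target_mb, device, config, scaler)
    if (i + 1) % accum_steps == 0:
        if scaler is not None:
            scaler.step(optimizer)      # unscale gradients, then step
            scaler.update()
        else:
            optimizer.step()
        optimizer.zero_grad(set_to_none=True)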