Commit ·
f8b2ed5
1
Parent(s): 8118e19
Fixes
Browse files
- Model_Architecture/config.json +1 -1
- Model_Architecture/model.py +7 -2
- Model_Architecture/train.py +25 -8
Model_Architecture/config.json
CHANGED
|
@@ -30,7 +30,7 @@
|
|
| 30 |
"tokenizer_name": "turkish"
|
| 31 |
},
|
| 32 |
"training": {
|
| 33 |
-
"learning_rate": 3e-
|
| 34 |
"weight_decay": 0.1,
|
| 35 |
"beta1": 0.9,
|
| 36 |
"beta2": 0.95,
|
|
|
|
| 30 |
"tokenizer_name": "turkish"
|
| 31 |
},
|
| 32 |
"training": {
|
| 33 |
+
"learning_rate": 3e-5,
|
| 34 |
"weight_decay": 0.1,
|
| 35 |
"beta1": 0.9,
|
| 36 |
"beta2": 0.95,
|
Model_Architecture/model.py
CHANGED
|
@@ -129,7 +129,7 @@ def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] =
|
|
| 129 |
|
| 130 |
|
| 131 |
class Linear(nn.Module):
|
| 132 |
-
dtype = torch.
|
| 133 |
scale_fmt: Optional[str] = None
|
| 134 |
|
| 135 |
def __init__(self, in_features: int, out_features: int, bias: bool = False, dtype = None):
|
|
@@ -187,7 +187,7 @@ class RMSNorm(nn.Module):
|
|
| 187 |
super().__init__()
|
| 188 |
self.dim = dim
|
| 189 |
self.eps = eps
|
| 190 |
-
self.weight = nn.Parameter(torch.ones(dim, dtype=torch.
|
| 191 |
|
| 192 |
def forward(self, x: torch.Tensor):
|
| 193 |
output = F.rms_norm(x, (self.dim,), self.weight, self.eps)
|
|
@@ -500,12 +500,17 @@ class ismail(nn.Module):
|
|
| 500 |
self.n_layers = args.n_layers
|
| 501 |
|
| 502 |
self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)
|
|
|
|
|
|
|
| 503 |
self.layers = nn.ModuleList([Block(i, args) for i in range(args.n_layers)])
|
| 504 |
self.norm = RMSNorm(args.dim)
|
| 505 |
self.output = Linear(args.dim, args.vocab_size, bias=False)
|
| 506 |
self.use_checkpointing = False
|
| 507 |
|
| 508 |
self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
def set_active_expert(self, expert_idx: Optional[int]):
|
| 511 |
"""Set active expert for all MoE layers (for sequential training)"""
|
|
|
|
| 129 |
|
| 130 |
|
| 131 |
class Linear(nn.Module):
|
| 132 |
+
dtype = torch.float32
|
| 133 |
scale_fmt: Optional[str] = None
|
| 134 |
|
| 135 |
def __init__(self, in_features: int, out_features: int, bias: bool = False, dtype = None):
|
|
|
|
| 187 |
super().__init__()
|
| 188 |
self.dim = dim
|
| 189 |
self.eps = eps
|
| 190 |
+
self.weight = nn.Parameter(torch.ones(dim, dtype=torch.float32))
|
| 191 |
|
| 192 |
def forward(self, x: torch.Tensor):
|
| 193 |
output = F.rms_norm(x, (self.dim,), self.weight, self.eps)
|
|
|
|
| 500 |
self.n_layers = args.n_layers
|
| 501 |
|
| 502 |
self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)
|
| 503 |
+
nn.init.normal_(self.tok_embeddings.weight, mean=0.0, std=0.02)
|
| 504 |
+
|
| 505 |
self.layers = nn.ModuleList([Block(i, args) for i in range(args.n_layers)])
|
| 506 |
self.norm = RMSNorm(args.dim)
|
| 507 |
self.output = Linear(args.dim, args.vocab_size, bias=False)
|
| 508 |
self.use_checkpointing = False
|
| 509 |
|
| 510 |
self.register_buffer("freqs_cis", precompute_freqs_cis(args), persistent=False)
|
| 511 |
+
|
| 512 |
+
if hasattr(self.output, 'weight'):
|
| 513 |
+
nn.init.normal_(self.output.weight, mean=0.0, std=0.02 / math.sqrt(args.n_layers))
|
| 514 |
|
| 515 |
def set_active_expert(self, expert_idx: Optional[int]):
|
| 516 |
"""Set active expert for all MoE layers (for sequential training)"""
|
Model_Architecture/train.py
CHANGED
|
@@ -294,9 +294,21 @@ def save_checkpoint(model, optimizer, step, config, expert_idx=None):
|
|
| 294 |
def train_step(model, input_mb, target_mb, device, config, scaler=None):
|
| 295 |
"""Process a SINGLE micro-batch (already sliced)"""
|
| 296 |
|
|
|
|
| 297 |
if input_mb.size(0) == 0:
|
| 298 |
return 0.0, 0.0
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
input_mb = input_mb.to(device, non_blocking=True)
|
| 301 |
target_mb = target_mb.to(device, non_blocking=True)
|
| 302 |
|
|
@@ -309,9 +321,11 @@ def train_step(model, input_mb, target_mb, device, config, scaler=None):
|
|
| 309 |
logits = output
|
| 310 |
lb_loss = 0.0
|
| 311 |
|
| 312 |
-
# 🚨
|
| 313 |
if torch.isnan(logits).any():
|
| 314 |
-
print(f"🚨 NaN detected in logits! Scale: {logits.abs().max().item()}")
|
|
|
|
|
|
|
| 315 |
|
| 316 |
lm_loss = F.cross_entropy(
|
| 317 |
logits.view(-1, logits.size(-1)),
|
|
@@ -319,19 +333,22 @@ def train_step(model, input_mb, target_mb, device, config, scaler=None):
|
|
| 319 |
ignore_index=-1,
|
| 320 |
)
|
| 321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
accum_steps = config["training"]["gradient_accumulation_steps"]
|
| 323 |
if isinstance(lb_loss, float):
|
| 324 |
total_loss = lm_loss / accum_steps
|
| 325 |
else:
|
|
|
|
|
|
|
|
|
|
| 326 |
lb_loss_coef = config["training"].get("lb_loss_coef", 0.01)
|
| 327 |
total_loss = (lm_loss + lb_loss_coef * lb_loss) / accum_steps
|
| 328 |
|
| 329 |
-
#
|
| 330 |
-
if torch.isnan(total_loss):
|
| 331 |
-
print(f"🚨 NaN in total_loss! lm_loss: {lm_loss.item():.4f}, lb_loss: {lb_loss}")
|
| 332 |
-
return 0.0, 0.0
|
| 333 |
-
|
| 334 |
-
# Backward
|
| 335 |
if scaler is not None:
|
| 336 |
scaler.scale(total_loss).backward()
|
| 337 |
else:
|
|
|
|
| 294 |
def train_step(model, input_mb, target_mb, device, config, scaler=None):
|
| 295 |
"""Process a SINGLE micro-batch (already sliced)"""
|
| 296 |
|
| 297 |
+
# 🚨 Validate data before processing
|
| 298 |
if input_mb.size(0) == 0:
|
| 299 |
return 0.0, 0.0
|
| 300 |
|
| 301 |
+
# Check for invalid token IDs (outside vocab range)
|
| 302 |
+
vocab_size = config["model"]["vocab_size"]
|
| 303 |
+
if input_mb.max() >= vocab_size or target_mb.max() >= vocab_size:
|
| 304 |
+
print(f"🚨 Invalid token detected! Max token: {input_mb.max().item()}, Vocab size: {vocab_size}")
|
| 305 |
+
return 0.0, 0.0
|
| 306 |
+
|
| 307 |
+
# Check for NaN in data
|
| 308 |
+
if torch.isnan(input_mb).any() or torch.isnan(target_mb).any():
|
| 309 |
+
print("🚨 NaN detected in input data!")
|
| 310 |
+
return 0.0, 0.0
|
| 311 |
+
|
| 312 |
input_mb = input_mb.to(device, non_blocking=True)
|
| 313 |
target_mb = target_mb.to(device, non_blocking=True)
|
| 314 |
|
|
|
|
| 321 |
logits = output
|
| 322 |
lb_loss = 0.0
|
| 323 |
|
| 324 |
+
# 🚨 Check for NaN in logits before computing loss
|
| 325 |
if torch.isnan(logits).any():
|
| 326 |
+
print(f"🚨 NaN detected in logits! Scale: {logits.abs().max().item()}")
|
| 327 |
+
print(f" Input range: [{input_mb.min().item()}, {input_mb.max().item()}]")
|
| 328 |
+
return 0.0, 0.0
|
| 329 |
|
| 330 |
lm_loss = F.cross_entropy(
|
| 331 |
logits.view(-1, logits.size(-1)),
|
|
|
|
| 333 |
ignore_index=-1,
|
| 334 |
)
|
| 335 |
|
| 336 |
+
# 🚨 Check for NaN in loss components
|
| 337 |
+
if torch.isnan(lm_loss):
|
| 338 |
+
print(f"🚨 NaN in lm_loss!")
|
| 339 |
+
return 0.0, 0.0
|
| 340 |
+
|
| 341 |
accum_steps = config["training"]["gradient_accumulation_steps"]
|
| 342 |
if isinstance(lb_loss, float):
|
| 343 |
total_loss = lm_loss / accum_steps
|
| 344 |
else:
|
| 345 |
+
if torch.isnan(lb_loss):
|
| 346 |
+
print(f"🚨 NaN in lb_loss! Setting to 0")
|
| 347 |
+
lb_loss = 0.0
|
| 348 |
lb_loss_coef = config["training"].get("lb_loss_coef", 0.01)
|
| 349 |
total_loss = (lm_loss + lb_loss_coef * lb_loss) / accum_steps
|
| 350 |
|
| 351 |
+
# Backward with NaN check
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
if scaler is not None:
|
| 353 |
scaler.scale(total_loss).backward()
|
| 354 |
else:
|