ikaganacar committed on
Commit
a050405
·
1 Parent(s): b9d0f81
.gitignore CHANGED
@@ -1 +1,6 @@
1
  *.pyc
 
 
 
 
 
 
1
  *.pyc
2
+ Model_Architecture/wandb
3
+ *.pt
4
+ Model_Architecture/checkpoints/step_8000_expert_best.pt.old
5
+ Model_Architecture/data/train.txt
6
+ Model_Architecture/data/val.txt
Model_Architecture/config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "model": {
3
- "max_batch_size": 8,
4
  "max_seq_len": 512,
5
  "dtype": "fp32",
6
  "scale_fmt": null,
@@ -45,7 +45,9 @@
45
  "save_dir": "./checkpoints",
46
  "log_every": 100,
47
  "dtype": "fp32",
48
- "compile": false
 
 
49
  },
50
  "data": {
51
  "train_file": "./data/train.txt",
 
1
  {
2
  "model": {
3
+ "max_batch_size": 16,
4
  "max_seq_len": 512,
5
  "dtype": "fp32",
6
  "scale_fmt": null,
 
45
  "save_dir": "./checkpoints",
46
  "log_every": 100,
47
  "dtype": "fp32",
48
+ "compile": false,
49
+ "max_val_batches": 50,
50
+ "val_batch_size_multiplier": 4
51
  },
52
  "data": {
53
  "train_file": "./data/train.txt",
Model_Architecture/data.py CHANGED
@@ -7,6 +7,7 @@ from tqdm import tqdm
7
  import mmap
8
  import numpy as np
9
  import os
 
10
 
11
  from model import ModelArgs
12
 
@@ -257,6 +258,8 @@ def create_dataloader(
257
  max_samples: Optional[int] = None,
258
  use_turkish_tokenizer: bool = True,
259
  use_memory_efficient: bool = True, # NEW: Use memory-efficient loading by default
 
 
260
  ) -> DataLoader:
261
 
262
  # Select tokenizer based on user preference
@@ -310,10 +313,21 @@ def create_dataloader(
310
  except Exception as e:
311
  raise RuntimeError(f"Failed to create dataset: {e}")
312
 
 
 
 
 
 
 
 
 
 
 
 
313
  # Create DataLoader with optimized settings
314
  dataloader = DataLoader(
315
  dataset,
316
- batch_size=args.max_batch_size,
317
  shuffle=shuffle,
318
  drop_last=drop_last,
319
  num_workers=num_workers,
 
7
  import mmap
8
  import numpy as np
9
  import os
10
+ import json
11
 
12
  from model import ModelArgs
13
 
 
258
  max_samples: Optional[int] = None,
259
  use_turkish_tokenizer: bool = True,
260
  use_memory_efficient: bool = True, # NEW: Use memory-efficient loading by default
261
+ is_val: bool = True
262
+
263
  ) -> DataLoader:
264
 
265
  # Select tokenizer based on user preference
 
313
  except Exception as e:
314
  raise RuntimeError(f"Failed to create dataset: {e}")
315
 
316
+ config_path = Path("config.json")
317
+
318
+ with open(config_path,"r") as f:
319
+ config = json.load(f)
320
+ val_batch_size = config["model"]["max_batch_size"] #* config["training"].get("val_batch_size_multiplier", 4)
321
+
322
+ if is_val:
323
+ batch_size = val_batch_size
324
+ else:
325
+ batch_size = args.max_batch_size
326
+
327
  # Create DataLoader with optimized settings
328
  dataloader = DataLoader(
329
  dataset,
330
+ batch_size=batch_size,
331
  shuffle=shuffle,
332
  drop_last=drop_last,
333
  num_workers=num_workers,
Model_Architecture/data/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": "uonlp/CulturaX",
3
+ "subset": "tr",
4
+ "use_small": true,
5
+ "total_documents": 1471979,
6
+ "train_samples": 1398380,
7
+ "val_samples": 73599,
8
+ "train_ratio": 0.95,
9
+ "seed": 2357,
10
+ "train_file": "/mnt/2tb_ssd/ismAIl/Model_Architecture/data/train.txt",
11
+ "val_file": "/mnt/2tb_ssd/ismAIl/Model_Architecture/data/val.txt"
12
+ }
Model_Architecture/train.py CHANGED
@@ -243,6 +243,7 @@ def load_data(config):
243
  shuffle=False,
244
  drop_last=True,
245
  use_memory_efficient=True, # Use memory-efficient loading
 
246
  )
247
 
248
  print(f"✅ Train batches: {len(train_loader)}")
@@ -256,25 +257,38 @@ def evaluate(model, val_loader, device, config):
256
  model.eval()
257
  total_loss = 0.0
258
  total_tokens = 0
259
-
 
 
 
 
 
 
260
  with torch.no_grad():
261
- for input_ids, target_ids in val_loader:
262
- input_ids = input_ids.to(device)
263
- target_ids = target_ids.to(device)
264
-
265
- # Model returns just logits in eval mode (no lb_loss)
266
- output = model(input_ids, start_pos=0)
267
- logits = output if not isinstance(output, tuple) else output[0]
268
-
269
- loss = F.cross_entropy(
270
- logits.view(-1, logits.size(-1)),
271
- target_ids.view(-1),
272
- ignore_index=-1,
273
- )
274
-
 
 
 
 
275
  total_loss += loss.item() * target_ids.numel()
276
  total_tokens += target_ids.numel()
277
-
 
 
 
278
  model.train()
279
  return total_loss / total_tokens
280
 
@@ -284,17 +298,16 @@ def save_checkpoint(model, optimizer, step, config, expert_idx=None):
284
  save_dir = Path(config["training"]["save_dir"])
285
  save_dir.mkdir(parents=True, exist_ok=True)
286
 
287
- # Create checkpoint name
288
- if expert_idx is not None:
289
- ckpt_name = f"step_{step}_expert_{expert_idx}.pt"
290
- else:
291
- ckpt_name = f"step_{step}.pt"
292
-
293
  ckpt_path = save_dir / ckpt_name
294
 
 
 
 
 
295
  checkpoint = {
296
  "step": step,
297
- "model_state_dict": model.state_dict(),
298
  "optimizer_state_dict": optimizer.state_dict(),
299
  "config": config,
300
  }
@@ -406,11 +419,33 @@ def main():
406
 
407
  # Resume from checkpoint
408
  if args.resume:
 
409
  ckpt = torch.load(args.resume, map_location=device)
410
- model.load_state_dict(ckpt["model_state_dict"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  optimizer.load_state_dict(ckpt["optimizer_state_dict"])
412
  step = ckpt["step"]
413
- print(f"✅ Resumed from step {step}\n")
414
 
415
  # ✅ FIX: Only create scaler for FP16, not BF16 or FP32
416
  training_dtype = config["training"]["dtype"].lower()
@@ -570,4 +605,8 @@ def main():
570
 
571
 
572
  if __name__ == "__main__":
573
- main()
 
 
 
 
 
243
  shuffle=False,
244
  drop_last=True,
245
  use_memory_efficient=True, # Use memory-efficient loading
246
+ is_val = True
247
  )
248
 
249
  print(f"✅ Train batches: {len(train_loader)}")
 
257
  model.eval()
258
  total_loss = 0.0
259
  total_tokens = 0
260
+ max_batches = config["training"].get("max_val_batches", 50) # Only 50 batches
261
+
262
+ # Add progress bar
263
+ from tqdm import tqdm
264
+ pbar = tqdm(total=max_batches, desc="📊 Validating", ncols=80)
265
+
266
+ val_dtype = config["training"]["dtype"]
267
  with torch.no_grad():
268
+ for i, (input_ids, target_ids) in enumerate(val_loader):
269
+ if i >= max_batches:
270
+ break
271
+
272
+ input_ids = input_ids.to(device, non_blocking=True)
273
+ target_ids = target_ids.to(device, non_blocking=True)
274
+
275
+ # Use autocast for speed (only enabled when the training dtype is 'bf16')
276
+ with torch.amp.autocast(device_type='cuda', enabled=(val_dtype == 'bf16')):
277
+ output = model(input_ids, start_pos=0)
278
+ logits = output[0] if isinstance(output, tuple) else output
279
+
280
+ loss = F.cross_entropy(
281
+ logits.view(-1, logits.size(-1)),
282
+ target_ids.view(-1),
283
+ ignore_index=-1,
284
+ )
285
+
286
  total_loss += loss.item() * target_ids.numel()
287
  total_tokens += target_ids.numel()
288
+ pbar.update(1)
289
+ pbar.set_postfix({'loss': f'{loss.item():.3f}'})
290
+
291
+ pbar.close()
292
  model.train()
293
  return total_loss / total_tokens
294
 
 
298
  save_dir = Path(config["training"]["save_dir"])
299
  save_dir.mkdir(parents=True, exist_ok=True)
300
 
301
+ ckpt_name = f"step_{step}_expert_{expert_idx}.pt" if expert_idx is not None else f"step_{step}.pt"
 
 
 
 
 
302
  ckpt_path = save_dir / ckpt_name
303
 
304
+ # 🔥 Exclude cache buffers - they should be reinitialized from config
305
+ state_dict = model.state_dict()
306
+ filtered_state_dict = {k: v for k, v in state_dict.items() if 'cache' not in k.lower()}
307
+
308
  checkpoint = {
309
  "step": step,
310
+ "model_state_dict": filtered_state_dict,
311
  "optimizer_state_dict": optimizer.state_dict(),
312
  "config": config,
313
  }
 
419
 
420
  # Resume from checkpoint
421
  if args.resume:
422
+ print(f"📥 Loading checkpoint from {args.resume}...")
423
  ckpt = torch.load(args.resume, map_location=device)
424
+
425
+ # Create model with current config (ensures correct cache sizes)
426
+ model, model_args = setup_model(config, device)
427
+
428
+ # Copy checkpoint tensors into the fresh model, skipping shape-mismatched cache buffers (any other mismatch raises)
429
+ model_state_dict = model.state_dict()
430
+ loaded_state_dict = ckpt["model_state_dict"]
431
+
432
+ skip_count = 0
433
+ for name, param in loaded_state_dict.items():
434
+ if name in model_state_dict:
435
+ if model_state_dict[name].shape != param.shape:
436
+ if "cache" in name: # Skip cache buffers
437
+ skip_count += 1
438
+ continue
439
+ else:
440
+ raise RuntimeError(f"Shape mismatch {name}: {param.shape} vs {model_state_dict[name].shape}")
441
+ model_state_dict[name].copy_(param)
442
+ else:
443
+ print(f"⚠️ Unexpected parameter: {name}")
444
+
445
+ model.load_state_dict(model_state_dict, strict=False)
446
  optimizer.load_state_dict(ckpt["optimizer_state_dict"])
447
  step = ckpt["step"]
448
+ print(f"✅ Resumed from step {step} (skipped {skip_count} cache buffers)\n")
449
 
450
  # ✅ FIX: Only create scaler for FP16, not BF16 or FP32
451
  training_dtype = config["training"]["dtype"].lower()
 
605
 
606
 
607
  if __name__ == "__main__":
608
+ main()
609
+
610
+
611
+
612
+