Commit
·
a050405
1
Parent(s):
b9d0f81
FIXED
Browse files- .gitignore +5 -0
- Model_Architecture/config.json +4 -2
- Model_Architecture/data.py +15 -1
- Model_Architecture/data/dataset_info.json +12 -0
- Model_Architecture/train.py +65 -26
.gitignore
CHANGED
|
@@ -1 +1,6 @@
|
|
| 1 |
*.pyc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.pyc
|
| 2 |
+
Model_Architecture/wandb
|
| 3 |
+
*.pt
|
| 4 |
+
Model_Architecture/checkpoints/step_8000_expert_best.pt.old
|
| 5 |
+
Model_Architecture/data/train.txt
|
| 6 |
+
Model_Architecture/data/val.txt
|
Model_Architecture/config.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"model": {
|
| 3 |
-
"max_batch_size":
|
| 4 |
"max_seq_len": 512,
|
| 5 |
"dtype": "fp32",
|
| 6 |
"scale_fmt": null,
|
|
@@ -45,7 +45,9 @@
|
|
| 45 |
"save_dir": "./checkpoints",
|
| 46 |
"log_every": 100,
|
| 47 |
"dtype": "fp32",
|
| 48 |
-
"compile": false
|
|
|
|
|
|
|
| 49 |
},
|
| 50 |
"data": {
|
| 51 |
"train_file": "./data/train.txt",
|
|
|
|
| 1 |
{
|
| 2 |
"model": {
|
| 3 |
+
"max_batch_size": 16,
|
| 4 |
"max_seq_len": 512,
|
| 5 |
"dtype": "fp32",
|
| 6 |
"scale_fmt": null,
|
|
|
|
| 45 |
"save_dir": "./checkpoints",
|
| 46 |
"log_every": 100,
|
| 47 |
"dtype": "fp32",
|
| 48 |
+
"compile": false,
|
| 49 |
+
"max_val_batches": 50,
|
| 50 |
+
"val_batch_size_multiplier": 4
|
| 51 |
},
|
| 52 |
"data": {
|
| 53 |
"train_file": "./data/train.txt",
|
Model_Architecture/data.py
CHANGED
|
@@ -7,6 +7,7 @@ from tqdm import tqdm
|
|
| 7 |
import mmap
|
| 8 |
import numpy as np
|
| 9 |
import os
|
|
|
|
| 10 |
|
| 11 |
from model import ModelArgs
|
| 12 |
|
|
@@ -257,6 +258,8 @@ def create_dataloader(
|
|
| 257 |
max_samples: Optional[int] = None,
|
| 258 |
use_turkish_tokenizer: bool = True,
|
| 259 |
use_memory_efficient: bool = True, # NEW: Use memory-efficient loading by default
|
|
|
|
|
|
|
| 260 |
) -> DataLoader:
|
| 261 |
|
| 262 |
# Select tokenizer based on user preference
|
|
@@ -310,10 +313,21 @@ def create_dataloader(
|
|
| 310 |
except Exception as e:
|
| 311 |
raise RuntimeError(f"Failed to create dataset: {e}")
|
| 312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
# Create DataLoader with optimized settings
|
| 314 |
dataloader = DataLoader(
|
| 315 |
dataset,
|
| 316 |
-
batch_size=
|
| 317 |
shuffle=shuffle,
|
| 318 |
drop_last=drop_last,
|
| 319 |
num_workers=num_workers,
|
|
|
|
| 7 |
import mmap
|
| 8 |
import numpy as np
|
| 9 |
import os
|
| 10 |
+
import json
|
| 11 |
|
| 12 |
from model import ModelArgs
|
| 13 |
|
|
|
|
| 258 |
max_samples: Optional[int] = None,
|
| 259 |
use_turkish_tokenizer: bool = True,
|
| 260 |
use_memory_efficient: bool = True, # NEW: Use memory-efficient loading by default
|
| 261 |
+
is_val: bool = True
|
| 262 |
+
|
| 263 |
) -> DataLoader:
|
| 264 |
|
| 265 |
# Select tokenizer based on user preference
|
|
|
|
| 313 |
except Exception as e:
|
| 314 |
raise RuntimeError(f"Failed to create dataset: {e}")
|
| 315 |
|
| 316 |
+
config_path = Path("config.json")
|
| 317 |
+
|
| 318 |
+
with open(config_path,"r") as f:
|
| 319 |
+
config = json.load(f)
|
| 320 |
+
val_batch_size = config["model"]["max_batch_size"] #* config["training"].get("val_batch_size_multiplier", 4)
|
| 321 |
+
|
| 322 |
+
if is_val:
|
| 323 |
+
batch_size = val_batch_size
|
| 324 |
+
else:
|
| 325 |
+
batch_size = args.max_batch_size
|
| 326 |
+
|
| 327 |
# Create DataLoader with optimized settings
|
| 328 |
dataloader = DataLoader(
|
| 329 |
dataset,
|
| 330 |
+
batch_size=batch_size,
|
| 331 |
shuffle=shuffle,
|
| 332 |
drop_last=drop_last,
|
| 333 |
num_workers=num_workers,
|
Model_Architecture/data/dataset_info.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"dataset": "uonlp/CulturaX",
|
| 3 |
+
"subset": "tr",
|
| 4 |
+
"use_small": true,
|
| 5 |
+
"total_documents": 1471979,
|
| 6 |
+
"train_samples": 1398380,
|
| 7 |
+
"val_samples": 73599,
|
| 8 |
+
"train_ratio": 0.95,
|
| 9 |
+
"seed": 2357,
|
| 10 |
+
"train_file": "/mnt/2tb_ssd/ismAIl/Model_Architecture/data/train.txt",
|
| 11 |
+
"val_file": "/mnt/2tb_ssd/ismAIl/Model_Architecture/data/val.txt"
|
| 12 |
+
}
|
Model_Architecture/train.py
CHANGED
|
@@ -243,6 +243,7 @@ def load_data(config):
|
|
| 243 |
shuffle=False,
|
| 244 |
drop_last=True,
|
| 245 |
use_memory_efficient=True, # Use memory-efficient loading
|
|
|
|
| 246 |
)
|
| 247 |
|
| 248 |
print(f"✅ Train batches: {len(train_loader)}")
|
|
@@ -256,25 +257,38 @@ def evaluate(model, val_loader, device, config):
|
|
| 256 |
model.eval()
|
| 257 |
total_loss = 0.0
|
| 258 |
total_tokens = 0
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
with torch.no_grad():
|
| 261 |
-
for input_ids, target_ids in val_loader:
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
total_loss += loss.item() * target_ids.numel()
|
| 276 |
total_tokens += target_ids.numel()
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
| 278 |
model.train()
|
| 279 |
return total_loss / total_tokens
|
| 280 |
|
|
@@ -284,17 +298,16 @@ def save_checkpoint(model, optimizer, step, config, expert_idx=None):
|
|
| 284 |
save_dir = Path(config["training"]["save_dir"])
|
| 285 |
save_dir.mkdir(parents=True, exist_ok=True)
|
| 286 |
|
| 287 |
-
|
| 288 |
-
if expert_idx is not None:
|
| 289 |
-
ckpt_name = f"step_{step}_expert_{expert_idx}.pt"
|
| 290 |
-
else:
|
| 291 |
-
ckpt_name = f"step_{step}.pt"
|
| 292 |
-
|
| 293 |
ckpt_path = save_dir / ckpt_name
|
| 294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
checkpoint = {
|
| 296 |
"step": step,
|
| 297 |
-
"model_state_dict":
|
| 298 |
"optimizer_state_dict": optimizer.state_dict(),
|
| 299 |
"config": config,
|
| 300 |
}
|
|
@@ -406,11 +419,33 @@ def main():
|
|
| 406 |
|
| 407 |
# Resume from checkpoint
|
| 408 |
if args.resume:
|
|
|
|
| 409 |
ckpt = torch.load(args.resume, map_location=device)
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
optimizer.load_state_dict(ckpt["optimizer_state_dict"])
|
| 412 |
step = ckpt["step"]
|
| 413 |
-
print(f"✅ Resumed from step {step}\n")
|
| 414 |
|
| 415 |
# ✅ FIX: Only create scaler for FP16, not BF16 or FP32
|
| 416 |
training_dtype = config["training"]["dtype"].lower()
|
|
@@ -570,4 +605,8 @@ def main():
|
|
| 570 |
|
| 571 |
|
| 572 |
if __name__ == "__main__":
|
| 573 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
shuffle=False,
|
| 244 |
drop_last=True,
|
| 245 |
use_memory_efficient=True, # Use memory-efficient loading
|
| 246 |
+
is_val = True
|
| 247 |
)
|
| 248 |
|
| 249 |
print(f"✅ Train batches: {len(train_loader)}")
|
|
|
|
| 257 |
model.eval()
|
| 258 |
total_loss = 0.0
|
| 259 |
total_tokens = 0
|
| 260 |
+
max_batches = config["training"].get("max_val_batches", 50) # Only 50 batches
|
| 261 |
+
|
| 262 |
+
# Add progress bar
|
| 263 |
+
from tqdm import tqdm
|
| 264 |
+
pbar = tqdm(total=max_batches, desc="📊 Validating", ncols=80)
|
| 265 |
+
|
| 266 |
+
val_dtype = config["training"]["dtype"]
|
| 267 |
with torch.no_grad():
|
| 268 |
+
for i, (input_ids, target_ids) in enumerate(val_loader):
|
| 269 |
+
if i >= max_batches:
|
| 270 |
+
break
|
| 271 |
+
|
| 272 |
+
input_ids = input_ids.to(device, non_blocking=True)
|
| 273 |
+
target_ids = target_ids.to(device, non_blocking=True)
|
| 274 |
+
|
| 275 |
+
# Use autocast for speed
|
| 276 |
+
with torch.amp.autocast(device_type='cuda', enabled=(val_dtype == 'bf16')):
|
| 277 |
+
output = model(input_ids, start_pos=0)
|
| 278 |
+
logits = output[0] if isinstance(output, tuple) else output
|
| 279 |
+
|
| 280 |
+
loss = F.cross_entropy(
|
| 281 |
+
logits.view(-1, logits.size(-1)),
|
| 282 |
+
target_ids.view(-1),
|
| 283 |
+
ignore_index=-1,
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
total_loss += loss.item() * target_ids.numel()
|
| 287 |
total_tokens += target_ids.numel()
|
| 288 |
+
pbar.update(1)
|
| 289 |
+
pbar.set_postfix({'loss': f'{loss.item():.3f}'})
|
| 290 |
+
|
| 291 |
+
pbar.close()
|
| 292 |
model.train()
|
| 293 |
return total_loss / total_tokens
|
| 294 |
|
|
|
|
| 298 |
save_dir = Path(config["training"]["save_dir"])
|
| 299 |
save_dir.mkdir(parents=True, exist_ok=True)
|
| 300 |
|
| 301 |
+
ckpt_name = f"step_{step}_expert_{expert_idx}.pt" if expert_idx is not None else f"step_{step}.pt"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
ckpt_path = save_dir / ckpt_name
|
| 303 |
|
| 304 |
+
# 🔥 Exclude cache buffers - they should be reinitialized from config
|
| 305 |
+
state_dict = model.state_dict()
|
| 306 |
+
filtered_state_dict = {k: v for k, v in state_dict.items() if 'cache' not in k.lower()}
|
| 307 |
+
|
| 308 |
checkpoint = {
|
| 309 |
"step": step,
|
| 310 |
+
"model_state_dict": filtered_state_dict,
|
| 311 |
"optimizer_state_dict": optimizer.state_dict(),
|
| 312 |
"config": config,
|
| 313 |
}
|
|
|
|
| 419 |
|
| 420 |
# Resume from checkpoint
|
| 421 |
if args.resume:
|
| 422 |
+
print(f"📥 Loading checkpoint from {args.resume}...")
|
| 423 |
ckpt = torch.load(args.resume, map_location=device)
|
| 424 |
+
|
| 425 |
+
# Create model with current config (ensures correct cache sizes)
|
| 426 |
+
model, model_args = setup_model(config, device)
|
| 427 |
+
|
| 428 |
+
# Load state dict but skip/resize mismatched buffers
|
| 429 |
+
model_state_dict = model.state_dict()
|
| 430 |
+
loaded_state_dict = ckpt["model_state_dict"]
|
| 431 |
+
|
| 432 |
+
skip_count = 0
|
| 433 |
+
for name, param in loaded_state_dict.items():
|
| 434 |
+
if name in model_state_dict:
|
| 435 |
+
if model_state_dict[name].shape != param.shape:
|
| 436 |
+
if "cache" in name: # Skip cache buffers
|
| 437 |
+
skip_count += 1
|
| 438 |
+
continue
|
| 439 |
+
else:
|
| 440 |
+
raise RuntimeError(f"Shape mismatch {name}: {param.shape} vs {model_state_dict[name].shape}")
|
| 441 |
+
model_state_dict[name].copy_(param)
|
| 442 |
+
else:
|
| 443 |
+
print(f"⚠️ Unexpected parameter: {name}")
|
| 444 |
+
|
| 445 |
+
model.load_state_dict(model_state_dict, strict=False)
|
| 446 |
optimizer.load_state_dict(ckpt["optimizer_state_dict"])
|
| 447 |
step = ckpt["step"]
|
| 448 |
+
print(f"✅ Resumed from step {step} (skipped {skip_count} cache buffers)\n")
|
| 449 |
|
| 450 |
# ✅ FIX: Only create scaler for FP16, not BF16 or FP32
|
| 451 |
training_dtype = config["training"]["dtype"].lower()
|
|
|
|
| 605 |
|
| 606 |
|
| 607 |
if __name__ == "__main__":
|
| 608 |
+
main()
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
|