kgrabko committed on
Commit
cfffde9
·
verified ·
1 Parent(s): 95d9995

Upload load_JiRack5_RedPajama_140b.py

Browse files
Files changed (1) hide show
  1. load_JiRack5_RedPajama_140b.py +87 -0
load_JiRack5_RedPajama_140b.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # COPYRIGHT (C) 2025 KONSTANTIN VLADIMIROVICH GRABKO. ALL RIGHTS RESERVED.
3
+ # PATENT PENDING | CMS MANHATTAN JIRACK TECHNOLOGY
4
+ # ==============================================================================
5
+ # Version 4.1 - 140B Dense | RedPajama-Data-1T Integration
6
+ # Optimized for 160 Layers & SwiGLU-Attention (SWA) Fusion
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from transformers import AutoTokenizer
11
+ from datasets import load_dataset
12
+ from torch.cuda.amp import autocast, GradScaler
13
+ import os
14
+
15
+ # Import the Dense Architecture
16
+ from JiRackPyTorch_GPT5_class_140b import JiRackPyTorch
17
+
18
+ # --- CMS MANHATTAN CONFIGURATION ---
19
+ CHECKPOINT_DIR = "checkpoints_jirack_140b_dense"  # presumably the output directory for saved checkpoints (relative path) - confirm against train()
19
+ SAVE_INTERVAL = 500  # presumably the checkpoint cadence; the unit (micro-batches vs optimizer updates) is not stated here - TODO confirm
20
+ GRAD_ACCUM_STEPS = 64 # Micro-batches accumulated per optimizer step; train() divides each loss by this value. Author's rationale: high accumulation to stabilize the 160-layer gradient
21
+ BLOCK_SIZE = 2048  # max_length passed to the tokenizer in train(): every example is truncated/padded to this many tokens
22
+ LEARNING_RATE = 4.0e-6  # lr handed to torch.optim.AdamW in train()
24
+
25
def _save_checkpoint(model, optimizer, step):
    """Atomically write model and optimizer state for ``step`` under CHECKPOINT_DIR.

    Args:
        model: The training model, optionally wrapped in ``nn.DataParallel``.
        optimizer: The optimizer whose state is stored alongside the weights.
        step: Number of optimizer updates completed; used in the file name.

    Returns:
        The path of the checkpoint file that was written.
    """
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    # DataParallel prefixes every key with "module."; save the wrapped module
    # so the file loads straight into a bare model.
    bare_model = model.module if isinstance(model, nn.DataParallel) else model
    final_path = os.path.join(CHECKPOINT_DIR, f"jirack_140b_step_{step}.pt")
    tmp_path = final_path + ".tmp"
    torch.save(
        {
            "step": step,
            "model": bare_model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        tmp_path,
    )
    # Rename only after the write finished so an interrupted save never
    # leaves a truncated file under the final name.
    os.replace(tmp_path, final_path)
    return final_path


def train():
    """Train the JiRack 140B dense model on streamed RedPajama-Data-1T text.

    Loads the ``togethercomputer/RedPajama-INCITE-Base-3B`` tokenizer (using
    EOS as the pad token when none is defined), streams the ``train`` split of
    ``togethercomputer/RedPajama-Data-1T``, builds ``JiRackPyTorch()`` with
    gradient checkpointing enabled, and optimizes it with AdamW.

    Each streamed example is one micro-batch, truncated/padded to
    ``BLOCK_SIZE`` tokens. Gradients are accumulated over ``GRAD_ACCUM_STEPS``
    micro-batches, clipped to a global norm of 0.5, and then applied. A
    checkpoint is written every ``SAVE_INTERVAL`` optimizer updates (not
    micro-batches), always right after an update so no half-accumulated
    gradients are in flight.

    Returns:
        None. The loop runs until the dataset stream is exhausted.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Disabled explicitly on CPU-only hosts instead of relying on the
    # warn-and-disable fallback of the CUDA AMP helpers.
    scaler = GradScaler(enabled=use_cuda)

    # --- REDPAJAMA INTEGRATION ---
    tokenizer = AutoTokenizer.from_pretrained("togethercomputer/RedPajama-INCITE-Base-3B")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Streaming mode: examples are fetched lazily instead of downloading the
    # whole corpus up front.
    print("Connecting to RedPajama-Data-1T (Streaming Mode)...")
    dataset = load_dataset("togethercomputer/RedPajama-Data-1T", split="train", streaming=True)

    model = JiRackPyTorch()
    model.gradient_checkpointing_enable()  # trade recomputation for lower activation memory

    # Move the weights first, then wrap: DataParallel expects the module to
    # live on its primary device by the time forward() is called.
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

    model.train()
    print("--- Training Started: JiRack 140B Dense ---")

    optimizer.zero_grad(set_to_none=True)
    optimizer_steps = 0

    for current_step, example in enumerate(dataset):
        # RedPajama records carry their content under the "text" key.
        # NOTE(review): padding="max_length" with pad_token == eos_token means
        # padded positions are also fed in as targets below; whether the
        # model's loss ignores them is not visible from this file - confirm.
        tokens = tokenizer(
            example["text"],
            truncation=True,
            max_length=BLOCK_SIZE,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = tokens["input_ids"].to(device)

        with autocast(enabled=use_cuda, dtype=torch.bfloat16):
            # The model returns a 3-tuple; only the loss is needed here.
            _, loss, _ = model(input_ids, targets=input_ids)
            # .mean() collapses the per-replica losses DataParallel gathers;
            # dividing keeps the accumulated gradient an average.
            loss = loss.mean() / GRAD_ACCUM_STEPS

        scaler.scale(loss).backward()

        if (current_step + 1) % GRAD_ACCUM_STEPS == 0:
            # Unscale before clipping so the 0.5 threshold applies to the
            # true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            optimizer_steps += 1

            if optimizer_steps % SAVE_INTERVAL == 0:
                saved_path = _save_checkpoint(model, optimizer, optimizer_steps)
                # Leading newline: the progress line below ends with '\r'.
                print(f"\nCheckpoint saved: {saved_path}")

        if current_step % 5 == 0:
            vram = torch.cuda.memory_reserved() / 1e9 if use_cuda else 0.0
            print(f"CMS 140B | Step {current_step} | Loss: {loss.item()*GRAD_ACCUM_STEPS:.4f} | VRAM: {vram:.1f}GB", end='\r')
83
+
84
if __name__ == "__main__":
    # Allocator tuning (the author targets a Tesla M10 32GB). It is set before
    # train() makes its first CUDA call, and setdefault() keeps any value the
    # operator already exported instead of overwriting it.
    os.environ.setdefault(
        "PYTORCH_CUDA_ALLOC_CONF",
        "expandable_segments:True,max_split_size_mb:64",
    )
    train()