kgrabko committed on
Commit 53af61e · verified · 1 Parent(s): fbedeb2

Upload load_JiRack5_ThePile_7b.py

Files changed (1)
  1. load_JiRack5_ThePile_7b.py +92 -0
load_JiRack5_ThePile_7b.py ADDED
@@ -0,0 +1,92 @@
+ # ==============================================================================
+ # COPYRIGHT (C) 2025 KONSTANTIN VLADIMIROVICH GRABKO. ALL RIGHTS RESERVED.
+ # PATENT PENDING | CMS MANHATTAN JIRACK TECHNOLOGY
+ # ==============================================================================
+ # Version 3.7 - 7B Agile Titan | Pile-Uncopyrighted & Llama Tokenizer
+ # Optimized for 32 Layers | 4,096 Width | 32k Vocab
+
+ import torch
+ import torch.nn as nn
+ from transformers import AutoTokenizer
+ from datasets import load_dataset
+ from torch.cuda.amp import autocast, GradScaler
+ import os
+
+ # Import the 7B Agile Architecture
+ from JiRackPyTorch_GPT5_class_7b import JiRackPyTorch
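+ # (JiRackPyTorch_GPT5_class_7b is a local module and must sit alongside this script.)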
+
+ # --- CMS MANHATTAN 7B CONFIGURATION ---
+ CHECKPOINT_DIR = "checkpoints_jirack_7b_agile"
+ SAVE_INTERVAL = 1000
+ GRAD_ACCUM_STEPS = 8       # Lower for 7B; enables faster weight updates
+ BLOCK_SIZE = 4096          # Full context window support
+ LEARNING_RATE = 3.0e-4     # Robust LR for 7B scale
+
+ def train():
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     scaler = GradScaler()
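+     # NOTE: GradScaler exists to prevent fp16 gradient underflow; with the bfloat16
+     # autocast used below it is effectively a no-op, but it is harmless to keep.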
+
+     # 1. TOKENIZER: Standard Llama-7B Vocabulary
+     tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
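+     # The Llama tokenizer ships without a pad token, so EOS doubles as padding here;
+     # if JiRackPyTorch does not mask pad positions, padded tokens also enter the loss.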
+
+     # 2. DATASET: Pile-Uncopyrighted (Streaming)
+     print("Connecting to monology/pile-uncopyrighted (Streaming)...")
+     dataset = load_dataset("monology/pile-uncopyrighted", split="train", streaming=True)
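+     # Streaming iterates The Pile shard by shard without downloading it in full;
+     # each example arrives as a dict with a "text" field.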
+
+     # 3. MODEL INITIALIZATION
+     # Architecture: 32 Layers | 4,096 Hidden Dim | SWA Fusion
+     model = JiRackPyTorch(vocab_size=len(tokenizer))
+
+     # Optional: Enable for M10 hardware to maximize batch size
+     model.gradient_checkpointing_enable()
+
+     if torch.cuda.device_count() > 1:
+         model = nn.DataParallel(model)
+     model.to(device)
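+     # With DataParallel, each replica returns its own scalar loss, so the forward pass
+     # below calls .mean(); saved state_dict keys will also carry a "module." prefix.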
+
+     # Optimizer: Higher weight decay (0.1) to regularize the smaller 7B model
+     optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.1)
+
+     model.train()
+     print("--- [AGILE TITAN] Training Started: JiRack 7B ---")
+
+     for current_step, example in enumerate(dataset):
+         tokens = tokenizer(
+             example["text"],
+             truncation=True,
+             max_length=BLOCK_SIZE,
+             padding="max_length",
+             return_tensors="pt"
+         )
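+         # Padding to max_length makes every example a full 4,096-token block,
+         # processed one sequence at a time (effective micro-batch size of 1).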
+
+         input_ids = tokens["input_ids"].to(device)
+
+         # SWA Fusion Forward Pass
+         with autocast(dtype=torch.bfloat16):
+             logits, loss, _ = model(input_ids, targets=input_ids)
+             loss = loss.mean() / GRAD_ACCUM_STEPS
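+             # targets=input_ids assumes JiRackPyTorch shifts labels internally for
+             # next-token prediction; dividing by GRAD_ACCUM_STEPS keeps the accumulated
+             # gradient equal to the average over the accumulation window.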
+
+         scaler.scale(loss).backward()
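+         # Gradients accumulate across GRAD_ACCUM_STEPS micro-batches before the
+         # optimizer step below.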
+
+         if (current_step + 1) % GRAD_ACCUM_STEPS == 0:
+             scaler.unscale_(optimizer)
+             # Clip at 1.0 for the 7B's higher learning rate
+             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+             scaler.step(optimizer)
+             scaler.update()
+             optimizer.zero_grad()
+
+         if current_step % 20 == 0:
+             vram = torch.cuda.memory_reserved() / 1e9
+             print(f"JiRack 7B | Step {current_step} | Loss: {loss.item()*GRAD_ACCUM_STEPS:.4f} | VRAM: {vram:.1f}GB", end='\r')
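+             # Multiplying by GRAD_ACCUM_STEPS undoes the scaling above so the printed
+             # loss is per-example; memory_reserved() reports VRAM held by the allocator.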
+
+         if current_step % SAVE_INTERVAL == 0 and current_step > 0:
+             if not os.path.exists(CHECKPOINT_DIR):
+                 os.makedirs(CHECKPOINT_DIR)
+             torch.save(model.state_dict(), f"{CHECKPOINT_DIR}/step_{current_step}.pt")
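+             # Only model weights are checkpointed; optimizer and scaler state would be
+             # needed to resume training exactly where it left off.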
+
+ if __name__ == "__main__":
+     # Optimize for Tesla M10 (32GB chunks)
+     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
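+     # expandable_segments reduces allocator fragmentation on long runs; the variable
+     # must be set before the first CUDA allocation, which is why it precedes train().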
+     train()