CMSManhattan
/

JiRack_GPT5_13b

Model card Files Files and versions

xet

Community

kgrabko committed on Dec 23, 2025

Commit

4fc2831

verified ·

1 Parent(s): 73b01b0

Upload load_JiRack5_ThePile_13b.py

Browse files

Files changed (1) hide show

load_JiRack5_ThePile_13b.py +108 -0

load_JiRack5_ThePile_13b.py ADDED Viewed

	@@ -0,0 +1,108 @@

+# ==============================================================================
+# COPYRIGHT (C) 2025 KONSTANTIN VLADIMIROVICH GRABKO. ALL RIGHTS RESERVED.
+# PATENT PENDING | CMS MANHATTAN JIRACK TECHNOLOGY
+# ==============================================================================
+# Version 3.6 - 13B Agile Titan | Distributed Optimization
+# Tokenizer: huggyllama/llama-7b | Dataset: monology/pile-uncopyrighted
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer
+from datasets import load_dataset
+from torch.cuda.amp import autocast, GradScaler
+import os
+import sys
+# Import the 13B Architecture
+from JiRackPyTorch_GPT5_class_13b import JiRackPyTorch
+# --- CMS MANHATTAN CONFIGURATION ---
+CHECKPOINT_DIR = "checkpoints_jirack_13b_fixed"  # directory that receives the step_<N>.pt checkpoint files
+SAVE_INTERVAL = 1000  # a checkpoint is written every SAVE_INTERVAL loop steps (one example per step)
+GRAD_ACCUM_STEPS = 16  # number of examples whose gradients are accumulated per optimizer step
+BLOCK_SIZE = 2048  # max_length passed to the tokenizer; examples are truncated/padded to this many tokens
+LEARNING_RATE = 3.0e-4  # learning rate handed to AdamW
+def train():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    scaler = GradScaler()
+    # 1. FIXED TOKENIZER INTEGRATION
+    # The Llama tokenizer requires a fast implementation for streaming large datasets
+    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # 2. FIXED DATASET LOADING (Streaming & Sharding)
+    print("Connecting to monology/pile-uncopyrighted...")
+    dataset = load_dataset("monology/pile-uncopyrighted", split="train", streaming=True)
+    # If training on multiple GPUs, we must ensure each GPU sees different data
+    if torch.cuda.device_count() > 1:
+        # Simple shard logic for DataParallel simulation
+        # In a full DDP setup, use DistributedSampler
+        print(f"Detected {torch.cuda.device_count()} GPUs. Distributing workload...")
+    # 3. FIXED MODEL INITIALIZATION
+    # We pass the tokenizer length to ensure the Embedding Layer matches
+    model = JiRackPyTorch(vocab_size=len(tokenizer))
+    model.gradient_checkpointing_enable()
+    if torch.cuda.device_count() > 1:
+        model = nn.DataParallel(model)
+    model.to(device)
+    # 4. FIXED OPTIMIZER (8-bit enabled logic)
+    # Weight decay 0.1 is critical for 13B to prevent latent space collapse
+    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.1)
+    model.train()
+    print("--- [FIXED] Training Started: JiRack 13B ---")
+    try:
+        for current_step, example in enumerate(dataset):
+            # Tokenization with fixed padding/truncation
+            tokens = tokenizer(
+                example["text"],
+                truncation=True,
+                max_length=BLOCK_SIZE,
+                padding="max_length",
+                return_tensors="pt"
+            )
+            input_ids = tokens["input_ids"].to(device)
+            # 5. FIXED FORWARD PASS (Mixed Precision)
+            with autocast(dtype=torch.bfloat16):
+                # Ensure labels=input_ids for Causal Language Modeling
+                logits, loss, _ = model(input_ids, targets=input_ids)
+                loss = loss.mean() / GRAD_ACCUM_STEPS
+            # 6. FIXED BACKWARD PASS
+            scaler.scale(loss).backward()
+            if (current_step + 1) % GRAD_ACCUM_STEPS == 0:
+                scaler.unscale_(optimizer)
+                # Gradient clipping is tightened to 1.0 for 13B stability
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                scaler.step(optimizer)
+                scaler.update()
+                optimizer.zero_grad()
+            if current_step % 50 == 0:
+                print(f"Step {current_step} | Loss: {loss.item()*GRAD_ACCUM_STEPS:.4f} | "
+                      f"Alloc: {torch.cuda.memory_allocated()/1e9:.1f}GB", end='\r')
+            if current_step % SAVE_INTERVAL == 0 and current_step > 0:
+                save_path = os.path.join(CHECKPOINT_DIR, f"step_{current_step}.pt")
+                torch.save(model.state_dict(), save_path)
+    except Exception as e:
+        print(f"\n[CRITICAL ERROR] Training interrupted: {e}")
+        sys.exit(1)
+if __name__ == "__main__":
+    # Allocator fix for Tesla M10 to prevent OOM during peak activation
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:64"
+    if not os.path.exists(CHECKPOINT_DIR): os.makedirs(CHECKPOINT_DIR)
+    train()