kgrabko committed on
Commit 701b6b5 · verified · 1 Parent(s): 53bee7b

Upload load_JiRack5_SlimPajama_3b_safetensors.py

load_JiRack5_SlimPajama_3b_safetensors.py ADDED
@@ -0,0 +1,140 @@
+ # Copyright (c) 2025 CMS Manhattan
+ # All rights reserved.
+ # Author: Konstantin Vladimirovich Grabko
+ # Email: grabko@cmsmanhattan.com
+ # Phone: +1(516)777-0945
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, version 3 of the License.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, see <https://www.gnu.org/licenses/>.
+ #
+ # Additional terms:
+ # Any commercial use or distribution of this software or derivative works
+ # requires explicit written permission from the copyright holder.
+
+ import os
+ import torch
+ # torch.cuda.amp is deprecated in recent PyTorch releases; torch.amp exposes
+ # the same autocast/GradScaler with an explicit device argument.
+ from torch.amp import autocast, GradScaler
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
+ from safetensors.torch import save_file, load_file
+ from JiRackPyTorch_GPT5_class_3b import JiRackPyTorch, VOCAB_SIZE, MAX_SEQ_LEN
+
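+ # JiRackPyTorch_GPT5_class_3b is a local module expected to sit next to this
+ # script; the training loop below assumes its forward pass returns
+ # (logits, loss, extra) when called with targets.
+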
+ # --- Checkpoint settings ---
+ SAVE_DIR = "jirack_weights"
+ SHARD_1 = "model-00001-of-00002.safetensors"
+ SHARD_2 = "model-00002-of-00002.safetensors"
+
+ def save_sharded_safetensors(model, directory):
+     os.makedirs(directory, exist_ok=True)
+     state_dict = model.state_dict()
+     keys = list(state_dict.keys())
+     mid = len(keys) // 2
+
+     shard1 = {k: state_dict[k] for k in keys[:mid]}
+     shard2 = {k: state_dict[k] for k in keys[mid:]}
+
+     save_file(shard1, os.path.join(directory, SHARD_1))
+     save_file(shard2, os.path.join(directory, SHARD_2))
+     print(f"--- [CHECKPOINT] Model shards saved to {directory} ---")
+
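+ # NOTE: the split above is by key count, not by tensor size, so the two shards
+ # can differ a lot in bytes (embedding tables dominate). The file names follow
+ # the Hugging Face sharding convention, which normally also writes a
+ # model.safetensors.index.json mapping each key to its shard; this script
+ # hard-codes both shard paths in the loader below instead.
+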
+ def load_sharded_safetensors(model, directory):
+     p1 = os.path.join(directory, SHARD_1)
+     p2 = os.path.join(directory, SHARD_2)
+     if os.path.exists(p1) and os.path.exists(p2):
+         print(f"--- [RESUME] Loading weights from {directory} ---")
+         sd = {}
+         sd.update(load_file(p1))
+         sd.update(load_file(p2))
+         model.load_state_dict(sd)
+         return True
+     return False
+
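+ # load_file() returns tensors on CPU by default; model.load_state_dict() then
+ # copies them into the (possibly CUDA-resident) parameters in place. The load
+ # is strict by default, so a mismatched or partial checkpoint raises instead
+ # of silently resuming from the wrong weights.
+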
+ def train():
+     # 1. Setup Device & Model
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = JiRackPyTorch().to(device)
+
+     # Print authorship info before starting
+     print("--- JiRack Engine Start ---")
+     print(f"--- {model.get_author_info()} ---")
+
+     # Try to resume training from an existing checkpoint
+     load_sharded_safetensors(model, SAVE_DIR)
+
+     # 2. Dataset & Tokenizer
+     print("Loading Dataset: SlimPajama...")
+     dataset = load_dataset("cerebras/SlimPajama-627B", split="train", streaming=True)
+     tokenizer = AutoTokenizer.from_pretrained("gpt2")
+     tokenizer.pad_token = tokenizer.eos_token
+
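+     # streaming=True iterates SlimPajama lazily instead of materializing the
+     # full 627B-token corpus on disk. Samples arrive in corpus order; an
+     # optional dataset.shuffle(buffer_size=...) would decorrelate neighboring
+     # documents at the cost of extra buffering.
+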
+     # 3. Hyperparameters
+     lr = 2e-4
+     batch_size = 2   # Micro-batch (samples per forward pass)
+     grad_accum = 16  # Effective batch = 2 * 16 = 32 samples
+     max_steps = 100000
+
+     optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.1)
+     scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=max_steps)
+     # Loss scaling for mixed precision. Scaling is what float16 needs; with
+     # the bfloat16 autocast below it is redundant but harmless.
+     scaler = GradScaler(device.type)
+
+     model.train()
+     data_iter = iter(dataset)
+
+     print(f"Starting Training Loop. Target steps: {max_steps}")
+
+     for step in range(1, max_steps + 1):
+         optimizer.zero_grad(set_to_none=True)
+         total_loss = 0
+
+         for _ in range(grad_accum):
+             # Pull `batch_size` samples from the stream, restarting the
+             # iterator if it is ever exhausted. (The original fetched a single
+             # sample here, leaving batch_size unused.)
+             texts = []
+             while len(texts) < batch_size:
+                 try:
+                     texts.append(next(data_iter)["text"])
+                 except StopIteration:
+                     data_iter = iter(dataset)
+
+             tokens = tokenizer(texts, truncation=True, max_length=MAX_SEQ_LEN + 1,
+                                padding="max_length", return_tensors="pt").input_ids.to(device)
+
+             # Next-token prediction: inputs and targets are shifted by one
+             x = tokens[:, :-1]
+             y = tokens[:, 1:]
+
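+             # NOTE: padding uses the eos token and no attention mask is passed
+             # to the model, so unless the model masks pad positions internally
+             # the loss is also computed on padding. Passing the tokenizer's
+             # attention_mask, or setting padded targets to an ignore index,
+             # would avoid that.
+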
+             with autocast(device.type, dtype=torch.bfloat16):
+                 logits, loss, _ = model(x, targets=y)
+                 loss = loss / grad_accum
+
+             scaler.scale(loss).backward()
+             total_loss += loss.item()
+
+         # Step Optimizer
+         scaler.unscale_(optimizer)
+         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+         scaler.step(optimizer)
+         scaler.update()
+         scheduler.step()
+
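+         # Note on ordering: scaler.unscale_() runs before clip_grad_norm_ so
+         # the 1.0 threshold applies to true gradient magnitudes rather than
+         # loss-scaled ones; scaler.step() then skips the update if any grads
+         # are inf/nan.
+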
+         # Logging
+         if step % 10 == 0:
+             print(f"Step: {step} | Loss: {total_loss:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")
+
+         # Sharded Saving
+         if step % 500 == 0:
+             save_sharded_safetensors(model, SAVE_DIR)
+
+ if __name__ == "__main__":
+     try:
+         train()
+     except Exception as e:
+         print(f"CRITICAL ERROR: {e}")
+         # Save weights even on a crash, if possible. As written this cannot
+         # work: `model` only exists inside train(), so the call below would
+         # raise a NameError if uncommented.
+         # save_sharded_safetensors(model, "jirack_crash_recovery")
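+
+ # A minimal sketch of workable crash recovery (a hypothetical restructuring,
+ # not the author's code): build the model at top level and pass it into
+ # train(model), so the except-block can still reach it:
+ #
+ #     model = JiRackPyTorch().to("cuda" if torch.cuda.is_available() else "cpu")
+ #     try:
+ #         train(model)  # train() would take the model as a parameter
+ #     except Exception:
+ #         save_sharded_safetensors(model, "jirack_crash_recovery")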