CMSManhattan
/

JiRackTernary_70b

Model card Files Files and versions

xet

Community

kgrabko commited on Dec 22, 2025

Commit

f13eb01

verified ·

1 Parent(s): 4e9d00e

Upload train_70b_heavy_mixed_val_data.py

Browse files

Files changed (1) hide show

train_70b_heavy_mixed_val_data.py +151 -0

train_70b_heavy_mixed_val_data.py ADDED Viewed

	@@ -0,0 +1,151 @@

+# ==============================================================================
+# COPYRIGHT (C) 2025 KONSTANTIN VLADIMIROVICH GRABKO. ALL RIGHTS RESERVED.
+# PATENT PENDING | CMS MANHATTAN JIRACK TECHNOLOGY
+#
+# This software is licensed under the Commercial License Agreement V.1.2.
+# Any use, modification, or distribution of this code requires compliance with
+# the terms found in the LICENSE.md file in the root directory.
+#
+# NO PATENTING RIGHTS: Users are strictly prohibited from filing patent claims
+# based on the BRE or SWA architectures disclosed herein.
+# Contact: grabko@cmsmanhattan.com | +1 (516) 777-0945
+# ==============================================================================
+##
+## Mixes The Pile with custom cultural (client) data for fine-tuning, giving priority to the client data.
+##
+import torch
+import os
+import random
+import json
+from torch.utils.data import DataLoader, IterableDataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset # Загрузка The Pile
+import accelerate
+# --- CONFIGURATION ---
+MODEL_ID = "./models/ternary_70b_init"  # path the tokenizer and the model are both loaded from
+GENERAL_DATA_LINK = "monology/pile-uncopyrighted" # Dataset ID of The Pile
+CLIENT_DATA_FILE = "cultural_finetune.jsonl"    # Client data file (JSON Lines); "your evolutionary index"
+OUTPUT_DIR = "./models/checkpoints_70b"  # directory that receives the periodic checkpoints
+MIX_RATIO = 0.4         # 40% cultural (client) data, 60% The Pile
+LEARNING_RATE = 5e-6    # Kept low for stability of the 70B model
+SAVE_STEPS = 50         # Save more often for Sidecar on heavy models
+# --- BUILT-IN MIXER (keeps the script self-contained) ---
+class CMSDataMixer(IterableDataset):
+    def __init__(self, tokenizer, client_file, pile_link, mix_ratio=0.4):
+        self.tokenizer = tokenizer
+        self.mix_ratio = mix_ratio
+        # Стриминг The Pile
+        print(f">>> [MIXER] Streaming general knowledge from: {pile_link}")
+        self.pile_stream = load_dataset(pile_link, split="train", streaming=True)
+        # Загрузка клиентских данных
+        self.cultural_data = []
+        if os.path.exists(client_file):
+            with open(client_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    self.cultural_data.append(json.loads(line))
+            print(f">>> [MIXER] Loaded {len(self.cultural_data)} client samples.")
+        else:
+            print(f"⚠️ ERROR: {client_file} not found!")
+    def __iter__(self):
+        pile_iterator = iter(self.pile_stream)
+        while True:
+            if random.random() < self.mix_ratio and self.cultural_data:
+                sample = random.choice(self.cultural_data)
+                text = f"Question: {sample['question']}\nAnswer: {sample['answer']}"
+            else:
+                try:
+                    sample = next(pile_iterator)
+                    text = sample['text']
+                except StopIteration:
+                    pile_iterator = iter(self.pile_stream)
+                    continue
+            tokens = self.tokenizer(
+                text, truncation=True, max_length=512, padding="max_length", return_tensors="pt"
+            )
+            yield {
+                "input_ids": tokens["input_ids"].squeeze(0),
+                "labels": tokens["input_ids"].squeeze(0)
+            }
+# --- MAIN 70B TRAINING LOOP ---
+def train_heavy():
+    # Настройка акселератора для распределения 70B по кластеру Tesla M10
+    accelerator = accelerate.Accelerator(gradient_accumulation_steps=4)
+    device = accelerator.device
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+    # 1. Токенайзер
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # 2. Загрузка модели 70B
+    # device_map="auto" критически важен здесь для распределения слоев
+    print(f">>> Loading 70B model layers across GPUs using Accelerate...")
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
+    )
+    # 3. Инициализация миксера
+    dataset = CMSDataMixer(tokenizer, CLIENT_DATA_FILE, GENERAL_DATA_LINK, mix_ratio=MIX_RATIO)
+    loader = DataLoader(dataset, batch_size=1, pin_memory=True)
+    # Оптимизатор
+    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
+    # Подготовка через accelerate
+    model, optimizer, loader = accelerator.prepare(model, optimizer, loader)
+    print(f">>> CMS Heavy Engine (70B) Started.")
+    print(f">>> Mixed Strategy: {int(MIX_RATIO*100)}% Client Focus / {int((1-MIX_RATIO)*100)}% Pile.")
+    model.train()
+    for step, batch in enumerate(loader):
+        try:
+            with accelerator.accumulate(model):
+                outputs = model(**batch)
+                loss = outputs.loss
+                if torch.isnan(loss):
+                    print(f"⚠️ NaN loss at step {step}. Skipping...")
+                    continue
+                accelerator.backward(loss)
+                optimizer.step()
+                optimizer.zero_grad()
+            if step % 10 == 0 and accelerator.is_main_process:
+                print(f"📊 Step {step} | Loss: {loss.item():.4f}")
+            # Сохранение для Sidecar
+            if step > 0 and step % SAVE_STEPS == 0 and accelerator.is_main_process:
+                save_path = os.path.join(OUTPUT_DIR, f"checkpoint_step_{step}")
+                print(f">>> Exporting 70B state: {save_path}")
+                accelerator.save_state(save_path)
+                torch.cuda.empty_cache()
+        except RuntimeError as e:
+            if "out of memory" in str(e):
+                print(f"❌ OOM on Step {step}. Clearing cache...")
+                torch.cuda.empty_cache()
+                continue
+            else:
+                raise e
+if __name__ == "__main__":  # start training only when run as a script, not on import
+    train_heavy()