# Copyright (c) 2025 CMS Manhattan
# All rights reserved.
# Author: Konstantin Vladimirovich Grabko
# Email: grabko@cmsmanhattan.com
# Phone: +1(516)777-0945
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Additional terms:
# Any commercial use or distribution of this software or derivative works
# requires explicit written permission from the copyright holder.

import math
from pathlib import Path

import torch
import torch.nn as nn
from torch.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Import your model (the same class that was used for the JIT export)
from your_model_file import JiRack_H4_L2  # <- the module that defines the model class (the one provided last)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load the regular model (NOT the JIT one!)
model = JiRack_H4_L2().to(device)

# Load weights from the JIT-converted model (they are compatible!)
state_dict = torch.load(
    "models/JiRack_H4_L2_V50257_D768_MSL8192_FF3072.pt",
    map_location=device,
)
model.load_state_dict(state_dict)
print("Weights loaded from the .pt file")

# Training hyperparameters (can be a bit more aggressive now)
BATCH_SIZE = 12
SEQ_LEN = 256
EPOCHS = 10
LR = 5e-5
WARMUP_STEPS = 100

# Your dataset (random-data placeholder; replace with a real one)
class DummyDataset(Dataset):
    def __init__(self, n=10000):
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        x = torch.randint(0, 50257, (SEQ_LEN,))
        return x, x.roll(-1)  # next-token prediction (last target wraps around)

train_loader = DataLoader(
    DummyDataset(),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
# GradScaler only makes sense on CUDA; disable it on CPU so the script still runs
scaler = GradScaler("cuda", enabled=device.type == "cuda")
criterion = nn.CrossEntropyLoss()

global_step = 0
model.train()

for epoch in range(1, EPOCHS + 1):
    total_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")
    for xb, yb in pbar:
        global_step += 1
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        with autocast("cuda", enabled=device.type == "cuda"):
            logits = model(xb)  # plain forward pass, no past_kv
            loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Linear LR warmup (<= so the final warmup step reaches the full LR)
        if global_step <= WARMUP_STEPS:
            lr_scale = global_step / WARMUP_STEPS
            for pg in optimizer.param_groups:
                pg["lr"] = LR * lr_scale

        total_loss += loss.item()
        pbar.set_postfix({
            "loss": f"{loss.item():.4f}",
            "ppl": f"{math.exp(loss.item()):.1f}",
        })

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} finished | Average loss: {avg_loss:.4f} | Perplexity: {math.exp(avg_loss):.2f}\n")

# After training: save the weights, plus a JIT version for inference
Path("models").mkdir(exist_ok=True)  # make sure the output directory exists
torch.save(model.state_dict(), "models/JiRack_H4_L2_finetuned.pt")
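# Optional, a minimal sketch: also checkpoint the optimizer, scaler, and step
# counter so a run can be resumed exactly where it stopped. The .ckpt path is
# illustrative, not part of the original pipeline.
torch.save(
    {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scaler": scaler.state_dict(),
        "global_step": global_step,
    },
    "models/JiRack_H4_L2_finetuned.ckpt",
)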
# Export to JIT (now the trained model)
class JITWrapper(nn.Module):
    def __init__(self, m):
        super().__init__()
        self.m = m

    def forward(self, x):
        return self.m(x)

# Trace on CPU: the model is moved to CPU below, so the example input must be on CPU too
dummy = torch.randint(0, 50257, (1, 256))
traced = torch.jit.trace(JITWrapper(model.cpu().eval()), dummy)
traced.save("models/JiRack_H4_L2_finetuned.script.pt")
print("Trained model saved + exported to JIT for inference")
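# Quick sanity check, a minimal sketch: reload the traced module and confirm it
# produces logits. Assumes the forward returns a tensor whose first two
# dimensions are (batch, seq_len); adjust if your model's output differs.
loaded = torch.jit.load("models/JiRack_H4_L2_finetuned.script.pt")
with torch.no_grad():
    out = loaded(torch.randint(0, 50257, (1, 256)))
print(f"JIT sanity check: output shape {tuple(out.shape)}")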