# Copyright (c) 2025 CMS Manhattan
# All rights reserved.
# Author: Konstantin Vladimirovich Grabko
# Email: grabko@cmsmanhattan.com
# Phone: +1(516)777-0945
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Additional terms:
# Any commercial use or distribution of this software or derivative works
# requires explicit written permission from the copyright holder.

import math
from pathlib import Path

import torch
import torch.nn as nn
from torch.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Import your model (the same class that was used for the JIT export)
from your_model_file import JiRack_H4_L2  # <- the module that defines the model class (the one provided last)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Load the regular model (NOT the JIT one!)
model = JiRack_H4_L2().to(device)

# Load weights from the JIT-converted model (they are compatible!)
state_dict = torch.load(
    "models/JiRack_H4_L2_V50257_D768_MSL8192_FF3072.pt",
    map_location=device,
)
model.load_state_dict(state_dict)
print("Weights loaded from the .pt file")

# Training hyperparameters (can be a bit more aggressive now)
BATCH_SIZE = 12
SEQ_LEN = 256
EPOCHS = 10
LR = 5e-5
WARMUP_STEPS = 100

# Your dataset (random-data placeholder; replace with a real one)
class DummyDataset(Dataset):
    def __init__(self, n=10000):
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        x = torch.randint(0, 50257, (SEQ_LEN,))
        return x, x.roll(-1)  # next-token prediction (last target wraps around)

train_loader = DataLoader(
    DummyDataset(),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
# GradScaler only makes sense on CUDA; disable it on CPU so the script still runs
scaler = GradScaler("cuda", enabled=device.type == "cuda")
criterion = nn.CrossEntropyLoss()

global_step = 0
model.train()

for epoch in range(1, EPOCHS + 1):
    total_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")
    for xb, yb in pbar:
        global_step += 1
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        with autocast("cuda", enabled=device.type == "cuda"):
            logits = model(xb)  # plain forward pass, no past_kv
            loss = criterion(logits.view(-1, logits.size(-1)), yb.view(-1))

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Linear LR warmup (<= so the final warmup step reaches the full LR)
        if global_step <= WARMUP_STEPS:
            lr_scale = global_step / WARMUP_STEPS
            for pg in optimizer.param_groups:
                pg["lr"] = LR * lr_scale

        total_loss += loss.item()
        pbar.set_postfix({
            "loss": f"{loss.item():.4f}",
            "ppl": f"{math.exp(loss.item()):.1f}",
        })

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} finished | Average loss: {avg_loss:.4f} | Perplexity: {math.exp(avg_loss):.2f}\n")

# After training: save the weights, plus a JIT version for inference
Path("models").mkdir(exist_ok=True)  # make sure the output directory exists
torch.save(model.state_dict(), "models/JiRack_H4_L2_finetuned.pt")
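# Optional, a minimal sketch: also checkpoint the optimizer, scaler, and step
# counter so a run can be resumed exactly where it stopped. The .ckpt path is
# illustrative, not part of the original pipeline.
torch.save(
    {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scaler": scaler.state_dict(),
        "global_step": global_step,
    },
    "models/JiRack_H4_L2_finetuned.ckpt",
)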
# Export to JIT (now the trained model)
class JITWrapper(nn.Module):
    def __init__(self, m):
        super().__init__()
        self.m = m

    def forward(self, x):
        return self.m(x)

# Trace on CPU: the model is moved to CPU below, so the example input must be on CPU too
dummy = torch.randint(0, 50257, (1, 256))
traced = torch.jit.trace(JITWrapper(model.cpu().eval()), dummy)
traced.save("models/JiRack_H4_L2_finetuned.script.pt")
print("Trained model saved + exported to JIT for inference")
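# Quick sanity check, a minimal sketch: reload the traced module and confirm it
# produces logits. Assumes the forward returns a tensor whose first two
# dimensions are (batch, seq_len); adjust if your model's output differs.
loaded = torch.jit.load("models/JiRack_H4_L2_finetuned.script.pt")
with torch.no_grad():
    out = loaded(torch.randint(0, 50257, (1, 256)))
print(f"JIT sanity check: output shape {tuple(out.shape)}")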