# JiRack_GPT3_1b / fine_tune1b_with_validation_no_torchscript.py
# Copyright (c) 2025 CMS Manhattan
# All rights reserved.
# Author: Konstantin Vladimirovich Grabko
# Email: grabko@cmsmanhattan.com
# Phone: +1(516)777-0945
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Additional terms:
# Any commercial use or distribution of this software or derivative works
# requires explicit written permission from the copyright holder.
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast
from tqdm import tqdm
import shutil
import math
from pathlib import Path
import re
import logging
from torch.amp import GradScaler, autocast
# --- ADDED: Silence the tokenizer warning about overly long sequences ---
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
# -----------------------------------------------------------------------
# Make sure this file contains the STABILITY FIXES (FP32 attention, _init_weights)!
from gpt_jit_modern_1b import JiRackPyTorch
# ============================= SETTINGS =============================
# --- SETTINGS (device-independent) ---
TRAIN_SEQ_LEN = 64
BATCH_SIZE = 1
ACCUM_STEPS = 32 # Effective batch size = 32
EPOCHS = 500
LEARNING_RATE = 1e-6
WEIGHT_DECAY = 0.01
GRAD_CLIP = 1.0
VAL_SPLIT_RATIO = 0.05
KEEP_LAST_EPOCHS = 3
# ====================================================================
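# For reference, derived from the settings above: each optimizer update accumulates
# BATCH_SIZE * ACCUM_STEPS = 1 * 32 = 32 sequences, i.e. 32 * TRAIN_SEQ_LEN = 2,048 tokens.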
# 💻 Device Configuration: auto-detect
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
USE_AMP = True
AUTOCAST_DTYPE = torch.float16
print(f"Using device: {device} (GPU). AMP (FP16) enabled for efficiency.")
elif device.type == 'cpu':
USE_AMP = False
AUTOCAST_DTYPE = torch.float32
print(f"Using device: {device} (CPU). WARNING: Training 1.2B model on CPU will be extremely slow.")
else:
USE_AMP = False
AUTOCAST_DTYPE = torch.float32
print(f"Using device: {device}. AMP disabled.")
# === PATHS ===
BASE_MODEL_PATH = Path("models/gpt_modern_1b_class.state_dict.pt")
LAST_TRAINED_PATH = Path("models/gpt_last_modern_1b_class.state_dict.pt")
BACKUP_DIR = Path("models/backups")
BACKUP_DIR.mkdir(exist_ok=True, parents=True)
RAW_PATH = Path("datasets/dialogues_text.txt")
CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")
# === DATASET CLEANING ===
# Re-clean only when the clean file is missing, or when the raw file exists and is newer
if not CLEAN_PATH.exists() or (RAW_PATH.exists() and RAW_PATH.stat().st_mtime > CLEAN_PATH.stat().st_mtime):
print("Cleaning dataset...")
try:
text = RAW_PATH.read_text(encoding="utf-8")
text = re.sub(r' {2,}', ' ', text)
text = text.replace(" \n", "\n").replace("\n ", "\n")
CLEAN_PATH.write_text(text, encoding="utf-8")
print(f"Done → {CLEAN_PATH}")
except FileNotFoundError:
print(f"ERROR: Raw dataset not found at {RAW_PATH}")
sys.exit(1)
DATASET_PATH = CLEAN_PATH
OUTPUT_DIR = Path("build/fine_tuning_output")
MODEL_SAVE_NAME = "pytorch_model.bin"
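# Checkpoint layout produced by train(): OUTPUT_DIR/epoch_<N>/pytorch_model.bin is written
# after every epoch (only the newest KEEP_LAST_EPOCHS directories are kept), and
# LAST_TRAINED_PATH is overwritten with the latest state_dict each epoch as well.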
# ============================= DATASET =============================
class TextDataset(Dataset):
def __init__(self, text_file, seq_len=TRAIN_SEQ_LEN, split='train'):
self.seq_len = seq_len
try:
tokenizer = GPT2TokenizerFast.from_pretrained("./tokenizer", local_files_only=True)
except Exception:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
text = Path(text_file).read_text(encoding="utf-8")
tokens = tokenizer.encode(text)
sequences = []
for i in range(0, len(tokens) - seq_len, seq_len):
sequences.append(tokens[i:i + seq_len + 1])
split_idx = int(len(sequences) * (1 - VAL_SPLIT_RATIO))
if split == 'train':
self.data = sequences[:split_idx]
else:
self.data = sequences[split_idx:]
print(f"{split.upper()} sequences: {len(self.data):,}")
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
seq = self.data[idx]
return torch.tensor(seq[:-1], dtype=torch.long), torch.tensor(seq[1:], dtype=torch.long)
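# Illustrative example of what __getitem__ returns: for a stored window
# seq = [t0, t1, ..., t64] (TRAIN_SEQ_LEN + 1 tokens), the item is
#     x = [t0, ..., t63]  (model input)
#     y = [t1, ..., t64]  (target = input shifted right by one token, for next-token prediction)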
def evaluate(model, loader):
model.eval()
total_loss = 0
criterion = nn.CrossEntropyLoss()
# autocast is only active when USE_AMP=True (i.e. on GPU)
with torch.no_grad(), autocast(device_type=device.type, enabled=USE_AMP, dtype=AUTOCAST_DTYPE):
for x, y in loader:
x, y = x.to(device), y.to(device)
logits = model(x)
if isinstance(logits, tuple):
logits = logits[0]
input_logits = logits.contiguous().view(-1, logits.size(-1))
target_labels = y.contiguous().view(-1)[:input_logits.size(0)]
# The loss is always computed in FP32 for numerical accuracy
loss = criterion(input_logits.float(), target_labels)
total_loss += loss.item()
model.train()
return total_loss / len(loader)
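# Note: the perplexity reported in the logs is exp(mean cross-entropy loss); the training
# progress bar clamps the exponent at 10 purely to avoid float overflow early in training.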
def train():
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print("Loading model...")
model = JiRackPyTorch().to(device)
# GradScaler is created unconditionally but is only active when USE_AMP=True
scaler = GradScaler(enabled=USE_AMP, device=device.type)
# =========================================================================
# 🔥 WEIGHT LOADING TEMPORARILY DISABLED
# =========================================================================
print("Starting from scratch — random weights (Skipping state_dict load for stability test!)")
# =========================================================================
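# To resume from a previously saved checkpoint instead of random weights, a minimal sketch
# (assumes the saved state_dict matches the current JiRackPyTorch architecture):
#     if LAST_TRAINED_PATH.exists():
#         model.load_state_dict(torch.load(LAST_TRAINED_PATH, map_location=device))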
model.train()
train_dataset = TextDataset(DATASET_PATH, split='train')
val_dataset = TextDataset(DATASET_PATH, split='val')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=0)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
print("\nFULL TRAINING STARTED! No LoRA, no compromises — we're training the whole thing!\n")
print(f"Batch size: {BATCH_SIZE * ACCUM_STEPS} (effective) | LR: {LEARNING_RATE} | AMP: {USE_AMP} ({AUTOCAST_DTYPE})")
for epoch in range(1, EPOCHS + 1):
total_loss = 0
pbar = tqdm(train_loader, desc=f"Epoch {epoch} [TRAIN]")
for step, (x, y) in enumerate(pbar):
x, y = x.to(device), y.to(device)
# 1. Forward pass and loss under AMP (GPU only)
with autocast(device_type=device.type, enabled=USE_AMP, dtype=AUTOCAST_DTYPE):
logits = model(x)
if isinstance(logits, tuple):
logits = logits[0]
input_logits = logits.contiguous().view(-1, logits.size(-1))
target_labels = y.contiguous().view(-1)[:input_logits.size(0)]
loss = criterion(input_logits.float(), target_labels)
loss = loss / ACCUM_STEPS
# NaN check
if torch.isnan(loss).any():
print(f"\n[FATAL ERROR] Loss became NaN at step {step}. Stopping training.")
raise RuntimeError("Loss became NaN during training, stopping.")
# 2. Backward pass through the GradScaler
scaler.scale(loss).backward()
total_loss += loss.item() * ACCUM_STEPS
if (step + 1) % ACCUM_STEPS == 0 or (step + 1) == len(train_loader):
# 3. Optimizer step through the GradScaler
if USE_AMP:
scaler.unscale_(optimizer) # Undo the loss scaling so gradients are clipped at their true magnitude (GPU only)
torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
# Update the tqdm progress bar
current_avg_loss = total_loss / (step + 1)
ppl_val = math.exp(min(current_avg_loss, 10))
pbar.set_postfix({"loss (avg)": f"{current_avg_loss:.4f}", "ppl": f"{ppl_val:.2f}"})
avg_train_loss = total_loss / len(train_loader)
val_loss = evaluate(model, val_loader)
print(f"\nEpoch {epoch}")
print(f" Train loss: {avg_train_loss:.4f} | PPL: {math.exp(avg_train_loss):.2f}")
print(f" Val loss: {val_loss:.4f} | PPL: {math.exp(val_loss):.2f}")
# Save checkpoint
save_dir = OUTPUT_DIR / f"epoch_{epoch}"
save_dir.mkdir(exist_ok=True, parents=True)
torch.save(model.state_dict(), save_dir / MODEL_SAVE_NAME)
torch.save(model.state_dict(), LAST_TRAINED_PATH)
# Keep only the last N epochs to save disk space
epochs_dirs = sorted([p for p in OUTPUT_DIR.iterdir() if p.is_dir() and p.name.startswith("epoch_")], key=lambda p: int(p.name.split("_")[-1]))  # sort numerically so epoch_10 comes after epoch_9
for old in epochs_dirs[:-KEEP_LAST_EPOCHS]:
shutil.rmtree(old)
print("\nDONE! Full model trained. You are now the emperor of fine-tuning.")
if __name__ == "__main__":
try:
train()
except RuntimeError as e:
if "Loss became NaN" in str(e):
print("\n[CRITICAL FAILURE] Training stopped due to NaN loss.")
print("Action: Revisit JiRackPyTorch weight initialization (reduce STD further) or reduce LEARNING_RATE to 1e-6.")
sys.exit(1)
elif "CUDA out of memory" in str(e):
print("\n[CRITICAL FAILURE] CUDA Out of Memory.")
print("Action: Current configuration BATCH_SIZE=1, AMP=FP16 is the minimum memory usage possible. Try reducing TRAIN_SEQ_LEN from 256 to 128.")
sys.exit(1)
raise
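# Typical invocation (assuming datasets/dialogues_text.txt and the gpt_jit_modern_1b module
# are available):
#     python fine_tune1b_with_validation_no_torchscript.py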