CMSManhattan
/

JiRack_GPT3_empty

Model card Files Files and versions

xet

Community

kgrabko committed on Dec 19, 2025

Commit

54b8edf

verified ·

1 Parent(s): 9a7388e

Update fine_tune_jit_with_validation_cuda_1b.py

Browse files

Files changed (1) hide show

fine_tune_jit_with_validation_cuda_1b.py +486 -464

fine_tune_jit_with_validation_cuda_1b.py CHANGED Viewed

@@ -1,465 +1,487 @@
-#!/usr/bin/env python3
-"""
-# install tokenizer before run
-mkdir -p tokenizer
-wget -O tokenizer/tokenizer.json https://huggingface.co/gpt2/resolve/main/tokenizer.json
-wget -O tokenizer/vocab.json https://huggingface.co/gpt2/resolve/main/vocab.json
-wget -O tokenizer/merges.txt https://huggingface.co/gpt2/resolve/main/merges.txt
-wget -O tokenizer/tokenizer_config.json https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
-Updated fine-tuning script, version "prefer Python nn.Module with gradient checkpointing".
-What it does:
-- Tries to load a local Python implementation of the model (as torch.nn.Module). If found — uses it and
-  enables gradient_checkpointing (if the model supports it).
-- If no Python model class is found — falls back to JIT ScriptModule (as before).
-- If the original weights are only available as JIT, attempts to extract state_dict() from the ScriptModule
-  and load it into the nn.Module (best-effort).
-- Saves the final trained model as a JIT torch.jit.save at the end, or as state_dict if something fails.
-- Saves the tokenizer locally (./tokenizer) and uses it. Gives a helpful message if the tokenizer is missing.
-- Supports AMP (autocast + GradScaler) on GPU.
-- Optional support for bitsandbytes 8-bit optimizer (if installed).
-- Comments and console messages are in English.
-Before running: if you have a Python file with the model implementation
-(for example gpt_modern_1b.py or gpt_modern_1b_class.py), place it in the same folder
-and make sure it contains a class named JiRackPyTorch (or one of the other names the script looks for).
-If no such file exists — the script will just use the JIT model as before.
-"""
-import os
-os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128,garbage_collection_threshold:0.6")
-import sys
-import importlib
-import math
-import shutil
-import re
-from pathlib import Path
-from typing import Optional
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import IterableDataset, DataLoader
-from transformers import GPT2TokenizerFast
-from tqdm import tqdm
-from torch.cuda.amp import GradScaler, autocast
-# ========================= SETTINGS =========================
-# Hyper-parameters; each one can be overridden through an environment variable of the same name.
-TRAIN_SEQ_LEN = int(os.environ.get("TRAIN_SEQ_LEN", 64))
-BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 1))
-EPOCHS = int(os.environ.get("EPOCHS", 999))
-LEARNING_RATE = float(os.environ.get("LEARNING_RATE", 6e-6))
-WEIGHT_DECAY = float(os.environ.get("WEIGHT_DECAY", 0.01))
-GRAD_CLIP = float(os.environ.get("GRAD_CLIP", 1.0))
-# presumably consumed by cleanup_old_epochs(), which is not defined in the visible part of the file — TODO confirm
-KEEP_LAST_EPOCHS = int(os.environ.get("KEEP_LAST_EPOCHS", 3))
-VAL_SPLIT_RATIO = float(os.environ.get("VAL_SPLIT_RATIO", 0.05))
-# Model artefacts: base JIT model, most recently trained JIT model, and an optional plain state_dict.
-BASE_MODEL_PATH = Path("models/gpt_modern_1b_class.script.pt")
-LAST_TRAINED_PATH = Path("models/gpt_1b_last_trained.script.pt")
-PT_STATE_DICT_PATH = Path("models/gpt_modern_1b_class.state_dict.pt")
-BACKUP_DIR = Path("models/backups")
-# Import-time side effect: the backup directory is created as soon as the module is loaded.
-BACKUP_DIR.mkdir(parents=True, exist_ok=True)
-# Raw training text and its whitespace-normalised copy (written by train() when missing).
-RAW_PATH = Path("datasets/dialogues_text.txt")
-CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")
-TOKENIZER_LOCAL_DIR = Path("./tokenizer")
-OUTPUT_DIR = Path("build/fine_tuning_output")
-MODEL_SAVE_NAME = "gpt_finetuned.script.pt"
-# Single global device used by every function below: first CUDA GPU if available, otherwise CPU.
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {device}")
-# ========================= Tokenizer helper =========================
-def _load_tokenizer_local(tokenizer_name: str = "gpt2"):
-    """
-    Load a GPT2TokenizerFast from local files (``local_files_only=True``).
-
-    Candidate locations are tried in this order: the TOKENIZER_PATH environment
-    variable (when set), TOKENIZER_LOCAL_DIR, ``tokenizer_name`` and "./".
-    The first candidate that loads is returned; when it has no pad token, the
-    EOS token is assigned as pad token.
-
-    Raises:
-        RuntimeError: when none of the candidates can be loaded.
-    """
-    candidates = []
-    env_path = os.environ.get("TOKENIZER_PATH")
-    if env_path:
-        candidates.append(env_path)
-    candidates.append(str(TOKENIZER_LOCAL_DIR))
-    candidates.append(tokenizer_name)
-    candidates.append("./")
-    for cand in candidates:
-        try:
-            tok = GPT2TokenizerFast.from_pretrained(cand, local_files_only=True)
-            if getattr(tok, "pad_token", None) is None:
-                tok.pad_token = tok.eos_token
-            print(f"Tokenizer loaded from: {cand}")
-            return tok
-        except Exception:
-            # NOTE(review): the reason for the failure is discarded here, so the
-            # RuntimeError below cannot tell the user why a candidate was rejected.
-            continue
-    raise RuntimeError(
-        "Local tokenizer not found. Place tokenizer.json or (vocab.json + merges.txt) into ./tokenizer\n"
-        "OR set the path via TOKENIZER_PATH environment variable.\n"
-        "Example: export TOKENIZER_PATH=/path/to/tokenizer\n"
-        "If you have internet access, you can temporarily use transformers.GPT2TokenizerFast.from_pretrained('gpt2')"
-    )
-# ========================= Dataset =========================
-class LazyTextDataset(IterableDataset):
-    """
-    Fixed-length next-token windows over one tokenized text file.
-
-    The whole file is read and tokenized once in ``__init__``. The token list
-    is cut into consecutive, non-overlapping windows of ``seq_len`` tokens; the
-    trailing ``val_ratio`` share of the windows is the 'val' split and the
-    windows before it are the 'train' split.
-    """
-    def __init__(self, text_file: Path, seq_len: int = TRAIN_SEQ_LEN, tokenizer_name: str = "gpt2",
-                 split_type: str = 'train', val_ratio: float = VAL_SPLIT_RATIO):
-        self.seq_len = seq_len
-        self.tokenizer = _load_tokenizer_local(tokenizer_name)
-        self.text_file = Path(text_file)
-        self.split_type = split_type
-        self.val_ratio = val_ratio
-        print(f"Loading and tokenizing {self.text_file} (one-time tokenization into ids)...")
-        with open(self.text_file, "r", encoding="utf-8") as f:
-            data = f.read()
-        self.tokens = self.tokenizer.encode(data)
-        # One token is held back because every label window is the input window shifted by one.
-        total_tokens = max(0, len(self.tokens) - 1)
-        total_batches = total_tokens // seq_len if seq_len > 0 else 0
-        # NOTE(review): val_ratio is not range-checked; a value outside [0, 1]
-        # produces a negative train_size or val_size.
-        val_size = int(total_batches * val_ratio)
-        train_size = total_batches - val_size
-        # start/stop are window indices (not token indices) delimiting this split.
-        if split_type == 'train':
-            self.start = 0
-            self.stop = train_size
-        elif split_type == 'val':
-            self.start = train_size
-            self.stop = train_size + val_size
-        else:
-            raise ValueError("split_type must be 'train' or 'val'")
-        self.total_sequences = max(0, self.stop - self.start)
-        print(f"Split {split_type}: {self.total_sequences} sequences (out of {total_batches})")
-    def __iter__(self):
-        # Yields (input, label) pairs of 1-D long tensors, each seq_len tokens long;
-        # the label window starts one token after the input window.
-        # NOTE(review): seq_len == 0 makes range() raise ValueError (zero step).
-        for i in range(self.start * self.seq_len, self.stop * self.seq_len, self.seq_len):
-            if i + self.seq_len + 1 > len(self.tokens):
-                break
-            input_seq = torch.tensor(self.tokens[i: i + self.seq_len], dtype=torch.long)
-            label_seq = torch.tensor(self.tokens[i + 1: i + self.seq_len + 1], dtype=torch.long)
-            yield input_seq, label_seq
-    def __len__(self):
-        # Number of windows in this split.
-        return self.total_sequences
-# ========================= Try to load Python nn.Module model =========================
-def try_load_python_model():
-    """
-    Attempt to find and import a local Python model implementation (nn.Module).
-
-    Each module name in ``candidates_modules`` is probed in order; inside the
-    first importable module, each name in ``candidates_class_names`` is looked
-    up and instantiated with no arguments.
-
-    Returns (model_instance, source_description) for the first class that
-    instantiates successfully, or (None, None) when nothing usable is found.
-    """
-    candidates_modules = [
-        "gpt_modern_1b_class",
-        "gpt_modern_1b",
-        "gpt_modern_1b_class_fixed",
-        "model", "ji_rack_model"
-    ]
-    candidates_class_names = [
-        "JiRackPyTorch",
-        "JiRackPyTorch1B",
-        "GPTModel",
-        "JiRackModel"
-    ]
-    for modname in candidates_modules:
-        try:
-            # NOTE(review): the file only does `import importlib`; importlib.util is
-            # available here only if something else already imported that submodule.
-            spec = importlib.util.find_spec(modname)
-            if spec is None:
-                continue
-            mod = importlib.import_module(modname)
-            for cls_name in candidates_class_names:
-                if hasattr(mod, cls_name):
-                    cls = getattr(mod, cls_name)
-                    try:
-                        inst = cls()
-                        print(f"Loaded Python model class {cls_name} from module {modname}")
-                        return inst, f"python:{modname}.{cls_name}"
-                    except Exception as e:
-                        print(f"Found class {cls_name} in {modname} but instantiation failed: {e}")
-                        continue
-        except Exception:
-            # NOTE(review): import errors inside a candidate module are swallowed
-            # without any message, so a broken model file is skipped silently.
-            continue
-    return None, None
-# ========================= Utility: load weights from JIT script into nn.Module =========================
-def load_weights_from_script_to_module(script_path: Path, module_model: nn.Module):
-    """
-    Best-effort: extract state_dict from a ScriptModule and load it into a regular nn.Module.
-
-    The script is loaded on CPU, its state_dict is read and passed to
-    ``module_model.load_state_dict(..., strict=False)``. Every failure is
-    printed and reported through the return value instead of being raised.
-
-    Returns True on success, False when loading the script, reading its
-    state_dict or load_state_dict raised.
-    """
-    try:
-        script_mod = torch.jit.load(script_path, map_location="cpu")
-    except Exception as e:
-        print(f"Cannot load script {script_path}: {e}")
-        return False
-    try:
-        sd = script_mod.state_dict()
-    except Exception as e:
-        print(f"ScriptModule.state_dict() failed: {e}")
-        return False
-    try:
-        # NOTE(review): with strict=False the missing/unexpected key lists returned by
-        # load_state_dict are ignored, so True is returned even if no key matched.
-        module_model.load_state_dict(sd, strict=False)
-        print("Weights successfully loaded from ScriptModule into Python nn.Module (strict=False).")
-        return True
-    except Exception as e:
-        print(f"load_state_dict failed: {e}")
-        return False
-# ========================= Helper to get logits from any model type =========================
-def get_logits_from_model(model, inputs: torch.Tensor):
-    """
-    Call ``model`` on ``inputs`` (moved to the global ``device``) and return its output.
-
-    When the model returns a tuple or list, only its first element is returned;
-    any other return value is passed through unchanged.
-    """
-    inputs = inputs.to(device)
-    out = model(inputs)
-    if isinstance(out, (tuple, list)):
-        return out[0]
-    return out
-# ========================= Evaluation =========================
-def evaluate(model, dataloader, criterion):
-    """
-    Return the mean per-batch loss of ``model`` over ``dataloader``.
-
-    The model is switched to eval mode, every batch is run under
-    ``torch.no_grad()`` and the model is switched back to train mode at the end.
-    Returns 0.0 when the dataloader yields no batches.
-    """
-    model.eval()
-    total_loss = 0.0
-    count = 0
-    with torch.no_grad():
-        for inputs, targets in dataloader:
-            inputs, targets = inputs.to(device), targets.to(device)
-            logits = get_logits_from_model(model, inputs)
-            # Flatten to (rows, vocab) / (rows,) as expected by the criterion.
-            logits = logits.contiguous().view(-1, logits.size(-1))
-            # Targets are truncated to the number of logit rows.
-            targets = targets.contiguous().view(-1)[:logits.shape[0]]
-            loss = criterion(logits, targets)
-            total_loss += float(loss.item())
-            count += 1
-    # NOTE(review): train mode is forced here even if the caller passed a model in eval mode.
-    model.train()
-    return total_loss / max(1, count)
-# ========================= Training loop =========================
-def train():
-    """
-    Run the whole fine-tuning job: load a model, prepare the dataset, train for
-    EPOCHS epochs with validation and a checkpoint after each epoch, then save
-    the final model and tokenizer and refresh LAST_TRAINED_PATH.
-
-    Returns early (None) when neither a Python model class nor a JIT model
-    file is available.
-
-    Raises:
-        FileNotFoundError: when neither CLEAN_PATH nor RAW_PATH exists.
-    """
-    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-    print("Loading model...")
-    python_model, python_source = try_load_python_model()
-    model = None
-    model_source = None
-    # Prefer Python nn.Module if available
-    if python_model is not None:
-        model = python_model
-        model_source = python_source
-        loaded = False
-        # Try to load latest weights (state_dict first, then JIT → state_dict)
-        if PT_STATE_DICT_PATH.exists():
-            try:
-                sd = torch.load(PT_STATE_DICT_PATH, map_location="cpu")
-                model.load_state_dict(sd, strict=False)
-                print(f"Loaded state_dict from {PT_STATE_DICT_PATH}")
-                loaded = True
-            except Exception as e:
-                print(f"Failed to load state_dict from {PT_STATE_DICT_PATH}: {e}")
-        if not loaded and LAST_TRAINED_PATH.exists():
-            if load_weights_from_script_to_module(LAST_TRAINED_PATH, model):
-                loaded = True
-        if not loaded and BASE_MODEL_PATH.exists():
-            if load_weights_from_script_to_module(BASE_MODEL_PATH, model):
-                loaded = True
-        # NOTE(review): when `loaded` is still False here, training continues with
-        # whatever weights the class constructor produced; no warning is printed.
-    else:
-        # Fallback to JIT ScriptModule
-        if LAST_TRAINED_PATH.exists():
-            model = torch.jit.load(LAST_TRAINED_PATH, map_location=device)
-            model_source = f"jit:{LAST_TRAINED_PATH}"
-        elif BASE_MODEL_PATH.exists():
-            model = torch.jit.load(BASE_MODEL_PATH, map_location=device)
-            model_source = f"jit:{BASE_MODEL_PATH}"
-        else:
-            print("ERROR: No model found (neither Python module nor JIT). Place a model file or Python implementation.")
-            return
-    print(f"Model loaded from: {model_source}")
-    # If we are using a real nn.Module → move to device + enable gradient checkpointing if possible
-    # NOTE(review): torch.jit.ScriptModule is itself an nn.Module subclass, so this
-    # check presumably is also True for models from torch.jit.load and the
-    # ScriptModule branch below may never run — TODO confirm.
-    is_python_module = isinstance(model, nn.Module)
-    if is_python_module:
-        model.to(device)
-        model.train()
-        try:
-            model.gradient_checkpointing_enable()
-            print("Gradient checkpointing ENABLED on Python nn.Module.")
-        except Exception:
-            try:
-                model.gradient_checkpointing = True
-                print("Set attribute gradient_checkpointing = True (best-effort).")
-            except Exception:
-                print("Gradient checkpointing not available on this Python model.")
-    else:
-        # ScriptModule path
-        try:
-            model.to(device)
-        except Exception:
-            print("Warning: model.to(device) failed for ScriptModule; trying best-effort buffer move.")
-        model.train()
-        print("Training on ScriptModule (gradient checkpointing not available).")
-    # ========================= Dataset preparation =========================
-    # The cleaned copy is produced once: runs of spaces are collapsed and spaces
-    # directly before/after a newline are removed.
-    if not CLEAN_PATH.exists():
-        if not RAW_PATH.exists():
-            raise FileNotFoundError(f"Missing dataset {RAW_PATH}")
-        print("Cleaning raw dataset → cleaned version...")
-        text = RAW_PATH.read_text(encoding="utf-8")
-        text = re.sub(r" {2,}", " ", text)
-        text = text.replace(" \n", "\n").replace("\n ", "\n")
-        CLEAN_PATH.write_text(text, encoding="utf-8")
-        print(f"Cleaned dataset saved → {CLEAN_PATH}")
-    # Each dataset instance reads and tokenizes CLEAN_PATH on its own.
-    train_dataset = LazyTextDataset(CLEAN_PATH, seq_len=TRAIN_SEQ_LEN, split_type='train', val_ratio=VAL_SPLIT_RATIO)
-    val_dataset   = LazyTextDataset(CLEAN_PATH, seq_len=TRAIN_SEQ_LEN, split_type='val',   val_ratio=VAL_SPLIT_RATIO)
-    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=0)
-    val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=0)
-    # ========================= Optimizer (try 8-bit first) =========================
-    try:
-        import bitsandbytes as bnb  # type: ignore
-        try:
-            optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
-        except Exception:
-            optimizer = bnb.optim.Adam8bit(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
-        print("Using bitsandbytes 8-bit optimizer.")
-    except Exception:
-        optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
-        print("Using standard torch.optim.AdamW (bitsandbytes not available).")
-    criterion = nn.CrossEntropyLoss()
-    # AMP (GradScaler + autocast) is only active when training on CUDA.
-    scaler = GradScaler(enabled=(device.type == 'cuda'))
-    if device.type == 'cuda':
-        torch.cuda.empty_cache()
-    total_steps = (len(train_dataset) // BATCH_SIZE) * EPOCHS if len(train_dataset) > 0 else 0
-    print(f"\nSTARTING training: epochs={EPOCHS}, approx. steps={total_steps}, examples={len(train_dataset)}")
-    print(f"Batch size={BATCH_SIZE}, seq_len={TRAIN_SEQ_LEN}, device={device}, AMP={'on' if device.type=='cuda' else 'off'}")
-    global_step = 0
-    for epoch in range(1, EPOCHS + 1):
-        print(f"\n=== Epoch {epoch}/{EPOCHS} ===")
-        epoch_loss = 0.0
-        pbar = tqdm(train_loader, desc=f"Epoch {epoch} [TRAIN]", leave=False)
-        for inputs, targets in pbar:
-            inputs, targets = inputs.to(device), targets.to(device)
-            optimizer.zero_grad(set_to_none=True)
-            with autocast(enabled=(device.type == 'cuda')):
-                logits = get_logits_from_model(model, inputs)
-                logits = logits.contiguous().view(-1, logits.size(-1))
-                targets_view = targets.contiguous().view(-1)[:logits.shape[0]]
-                loss = criterion(logits, targets_view)
-            # Backward pass (AMP-safe)
-            if device.type == 'cuda':
-                try:
-                    scaler.scale(loss).backward()
-                    scaler.unscale_(optimizer)
-                except Exception as e:
-                    print("Scaled backward failed:", e)
-                    loss.backward()
-                try:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
-                except Exception:
-                    # NOTE(review): a failing gradient clip is ignored without any message.
-                    pass
-                try:
-                    scaler.step(optimizer)
-                    scaler.update()
-                except RuntimeError as e:
-                    print("RuntimeError in scaler.step():", e)
-                    print(torch.cuda.memory_summary())
-                    # Fallback without scaler
-                    try:
-                        scaler.unscale_(optimizer)
-                        optimizer.step()
-                    except Exception as e2:
-                        print("Fallback optimizer.step() failed:", e2)
-                        raise e
-            else:
-                loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
-                optimizer.step()
-            if device.type == 'cuda':
-                torch.cuda.empty_cache()
-            loss_val = float(loss.item())
-            epoch_loss += loss_val
-            global_step += 1
-            # The loss is capped at 10 before exp() for the progress-bar perplexity only.
-            pbar.set_postfix({"loss": f"{loss_val:.4f}", "ppl": f"{math.exp(min(loss_val, 10)):.2f}", "step": global_step})
-        # The average divides by the expected batch count, not by the number of batches actually seen.
-        avg_train_loss = epoch_loss / max(1, len(train_dataset) // BATCH_SIZE)
-        # NOTE(review): unlike the progress bar, exp() is not capped here or for the validation loss below.
-        print(f"[TRAIN] Avg loss: {avg_train_loss:.4f} | Perplexity: {math.exp(avg_train_loss):.2f}")
-        print("Running validation...")
-        val_loss = evaluate(model, val_loader, criterion)
-        print(f"[VAL] Avg loss: {val_loss:.4f} | Perplexity: {math.exp(val_loss):.2f}")
-        # Save checkpoint for this epoch
-        epoch_dir = OUTPUT_DIR / f"epoch{epoch}"
-        epoch_dir.mkdir(parents=True, exist_ok=True)
-        try:
-            if is_python_module:
-                model.eval()
-                # Dummy token ids for tracing; 50257 is hard-coded as the exclusive upper
-                # bound (presumably the GPT-2 vocabulary size — verify against the model).
-                dummy = torch.randint(0, 50257, (1, min(32, TRAIN_SEQ_LEN)), device=device)
-                try:
-                    traced = torch.jit.trace(model, dummy, strict=False)
-                    torch.jit.save(traced, epoch_dir / MODEL_SAVE_NAME)
-                    print(f"Exported traced JIT → {epoch_dir / MODEL_SAVE_NAME}")
-                except Exception as e:
-                    torch.save(model.state_dict(), epoch_dir / "state_dict.pt")
-                    print(f"Saved state_dict (trace failed): {e}")
-                model.train()
-            else:
-                torch.jit.save(model, epoch_dir / MODEL_SAVE_NAME)
-                print(f"Saved ScriptModule → {epoch_dir / MODEL_SAVE_NAME}")
-        except Exception as e:
-            print("Error while saving epoch model:", e)
-        # NOTE(review): cleanup_old_epochs is not defined in the visible part of this
-        # file; if it is missing at runtime this call raises NameError after the first epoch.
-        cleanup_old_epochs()
-    # ========================= Final model save =========================
-    final_dir = OUTPUT_DIR / "final"
-    final_dir.mkdir(parents=True, exist_ok=True)
-    try:
-        if is_python_module:
-            model.eval()
-            dummy = torch.randint(0, 50257, (1, min(32, TRAIN_SEQ_LEN)), device=device)
-            traced = torch.jit.trace(model, dummy, strict=False)
-            torch.jit.save(traced, final_dir / MODEL_SAVE_NAME)
-            print(f"Final traced JIT saved → {final_dir / MODEL_SAVE_NAME}")
-        else:
-            torch.jit.save(model, final_dir / MODEL_SAVE_NAME)
-            print(f"Final ScriptModule saved → {final_dir / MODEL_SAVE_NAME}")
-    except Exception:
-        # Any failure above (tracing or saving) ends up here and falls back to a plain state_dict.
-        torch.save(model.state_dict(), final_dir / "state_dict.pt")
-        print("Final model saved as state_dict (trace failed).")
-    # Save tokenizer with the final model
-    try:
-        train_dataset.tokenizer.save_pretrained(final_dir)
-    except Exception:
-        pass
-    # Backup previous last-trained model and update the "current" symlink/file
-    if LAST_TRAINED_PATH.exists():
-        # The backup name carries the modification time of the file being replaced.
-        backup_path = BACKUP_DIR / f"gpt_last_trained_backup_{int(LAST_TRAINED_PATH.stat().st_mtime)}.script.pt"
-        shutil.copy(LAST_TRAINED_PATH, backup_path)
-        print(f"Backed up previous last_trained → {backup_path}")
-    if (final_dir / MODEL_SAVE_NAME).exists():
-        shutil.copy(final_dir / MODEL_SAVE_NAME, LAST_TRAINED_PATH)
-        print(f"Copied final model → {LAST_TRAINED_PATH}")
-    elif (final_dir / "state_dict.pt").exists():
-        # NOTE(review): with_suffix() replaces only the last suffix (".pt"), so the target is
-        # "gpt_1b_last_trained.script.state_dict.pt", which is not PT_STATE_DICT_PATH and
-        # is not read back by the loading code above.
-        shutil.copy(final_dir / "state_dict.pt", LAST_TRAINED_PATH.with_suffix(".state_dict.pt"))
-    print("TRAINING COMPLETED.")
-# ========================= Entrypoint =========================
-# Script entry point: refuse to start when the raw training text is missing, otherwise run train().
-if __name__ == "__main__":
-    if not RAW_PATH.exists():
-        print(f"ERROR: dataset {RAW_PATH} not found. Place your training text there.")
-        sys.exit(1)
     train()

+# Copyright (c) 2025 CMS Manhattan
+# All rights reserved.
+# Author: Konstantin Vladimirovich Grabko
+# Email: grabko@cmsmanhattan.com
+# Phone: +1(516)777-0945
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Additional terms:
+# Any commercial use or distribution of this software or derivative works
+# requires explicit written permission from the copyright holder.
+#!/usr/bin/env python3
+"""
+# install tokenizer before run
+mkdir -p tokenizer
+wget -O tokenizer/tokenizer.json https://huggingface.co/gpt2/resolve/main/tokenizer.json
+wget -O tokenizer/vocab.json https://huggingface.co/gpt2/resolve/main/vocab.json
+wget -O tokenizer/merges.txt https://huggingface.co/gpt2/resolve/main/merges.txt
+wget -O tokenizer/tokenizer_config.json https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
+Updated fine-tuning script, version "prefer Python nn.Module with gradient checkpointing".
+What it does:
+- Tries to load a local Python implementation of the model (as torch.nn.Module). If found — uses it and
+  enables gradient_checkpointing (if the model supports it).
+- If no Python model class is found — falls back to JIT ScriptModule (as before).
+- If the original weights are only available as JIT, attempts to extract state_dict() from the ScriptModule
+  and load it into the nn.Module (best-effort).
+- Saves the final trained model as a JIT torch.jit.save at the end, or as state_dict if something fails.
+- Saves the tokenizer locally (./tokenizer) and uses it. Gives a helpful message if the tokenizer is missing.
+- Supports AMP (autocast + GradScaler) on GPU.
+- Optional support for bitsandbytes 8-bit optimizer (if installed).
+- Comments and console messages are in English.
+Before running: if you have a Python file with the model implementation
+(for example gpt_modern_1b.py or gpt_modern_1b_class.py), place it in the same folder
+and make sure it contains a class named JiRackPyTorch (or one of the other names the script looks for).
+If no such file exists — the script will just use the JIT model as before.
+"""
+import os
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128,garbage_collection_threshold:0.6")
+import sys
+import importlib
+import math
+import shutil
+import re
+from pathlib import Path
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import IterableDataset, DataLoader
+from transformers import GPT2TokenizerFast
+from tqdm import tqdm
+from torch.cuda.amp import GradScaler, autocast
+# ========================= SETTINGS =========================
+# Hyper-parameters; each one can be overridden through an environment variable of the same name.
+TRAIN_SEQ_LEN = int(os.environ.get("TRAIN_SEQ_LEN", 64))
+BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 1))
+EPOCHS = int(os.environ.get("EPOCHS", 999))
+LEARNING_RATE = float(os.environ.get("LEARNING_RATE", 6e-6))
+WEIGHT_DECAY = float(os.environ.get("WEIGHT_DECAY", 0.01))
+GRAD_CLIP = float(os.environ.get("GRAD_CLIP", 1.0))
+# presumably the number of epoch checkpoints to keep on disk; its consumer is not visible here — TODO confirm
+KEEP_LAST_EPOCHS = int(os.environ.get("KEEP_LAST_EPOCHS", 3))
+VAL_SPLIT_RATIO = float(os.environ.get("VAL_SPLIT_RATIO", 0.05))
+# Model artefacts: base JIT model, most recently trained JIT model, and an optional plain state_dict.
+BASE_MODEL_PATH = Path("models/gpt_modern_1b_class.script.pt")
+LAST_TRAINED_PATH = Path("models/gpt_1b_last_trained.script.pt")
+PT_STATE_DICT_PATH = Path("models/gpt_modern_1b_class.state_dict.pt")
+BACKUP_DIR = Path("models/backups")
+# Import-time side effect: the backup directory is created as soon as the module is loaded.
+BACKUP_DIR.mkdir(parents=True, exist_ok=True)
+# Raw training text and its cleaned counterpart.
+RAW_PATH = Path("datasets/dialogues_text.txt")
+CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")
+TOKENIZER_LOCAL_DIR = Path("./tokenizer")
+OUTPUT_DIR = Path("build/fine_tuning_output")
+MODEL_SAVE_NAME = "gpt_finetuned.script.pt"
+# Single global device used by every function below: first CUDA GPU if available, otherwise CPU.
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+# ========================= Tokenizer helper =========================
+def _load_tokenizer_local(tokenizer_name: str = "gpt2"):
+    """
+    Load a GPT2TokenizerFast from local files (``local_files_only=True``).
+
+    Candidate locations are tried in this order, duplicates removed: the
+    TOKENIZER_PATH environment variable (when set), TOKENIZER_LOCAL_DIR,
+    ``tokenizer_name`` and "./". The first candidate that loads is returned;
+    when it has no pad token, the EOS token is assigned as pad token.
+
+    Args:
+        tokenizer_name: tokenizer name or path tried after the local directory.
+
+    Returns:
+        The loaded tokenizer.
+
+    Raises:
+        RuntimeError: when no candidate can be loaded. The message lists every
+            location that was tried together with the reason it failed.
+    """
+    candidates = []
+    env_path = os.environ.get("TOKENIZER_PATH")
+    if env_path:
+        candidates.append(env_path)
+    candidates.extend([str(TOKENIZER_LOCAL_DIR), tokenizer_name, "./"])
+    # dict.fromkeys keeps the first occurrence and the order, so a location
+    # that appears twice (e.g. TOKENIZER_PATH=./tokenizer) is only tried once.
+    candidates = list(dict.fromkeys(candidates))
+    failures = []
+    for cand in candidates:
+        try:
+            tok = GPT2TokenizerFast.from_pretrained(cand, local_files_only=True)
+        except Exception as e:
+            # Keep the reason so the final error can explain why each location was rejected.
+            failures.append(f"  {cand}: {type(e).__name__}: {e}")
+            continue
+        if getattr(tok, "pad_token", None) is None:
+            tok.pad_token = tok.eos_token
+        print(f"Tokenizer loaded from: {cand}")
+        return tok
+    raise RuntimeError(
+        "Local tokenizer not found. Place tokenizer.json or (vocab.json + merges.txt) into ./tokenizer\n"
+        "OR set the path via TOKENIZER_PATH environment variable.\n"
+        "Example: export TOKENIZER_PATH=/path/to/tokenizer\n"
+        "If you have internet access, you can temporarily use transformers.GPT2TokenizerFast.from_pretrained('gpt2')"
+        "\nLocations tried:\n" + "\n".join(failures)
+    )
+# ========================= Dataset =========================
+class LazyTextDataset(IterableDataset):
+    """
+    Fixed-length next-token windows over one tokenized text file.
+
+    The whole file is read and tokenized once in ``__init__``. The token list
+    is cut into consecutive, non-overlapping windows of ``seq_len`` tokens; the
+    trailing ``val_ratio`` share of the windows is the 'val' split and the
+    windows before it are the 'train' split.
+
+    Args:
+        text_file: path of the UTF-8 text file to tokenize.
+        seq_len: number of tokens per window; a value <= 0 gives an empty dataset.
+        tokenizer_name: forwarded to ``_load_tokenizer_local``.
+        split_type: 'train' or 'val'.
+        val_ratio: share of windows reserved for validation; the resulting
+            window count is clamped to the range [0, total number of windows].
+
+    Raises:
+        ValueError: when ``split_type`` is neither 'train' nor 'val'.
+    """
+    def __init__(self, text_file: Path, seq_len: int = TRAIN_SEQ_LEN, tokenizer_name: str = "gpt2",
+                 split_type: str = 'train', val_ratio: float = VAL_SPLIT_RATIO):
+        super().__init__()
+        # Fail fast: reject a bad split name before the file is read and tokenized.
+        if split_type not in ('train', 'val'):
+            raise ValueError("split_type must be 'train' or 'val'")
+        self.seq_len = seq_len
+        self.tokenizer = _load_tokenizer_local(tokenizer_name)
+        self.text_file = Path(text_file)
+        self.split_type = split_type
+        self.val_ratio = val_ratio
+        print(f"Loading and tokenizing {self.text_file} (one-time tokenization into ids)...")
+        with open(self.text_file, "r", encoding="utf-8") as f:
+            data = f.read()
+        self.tokens = self.tokenizer.encode(data)
+        # One token is held back because every label window is the input window shifted by one.
+        total_tokens = max(0, len(self.tokens) - 1)
+        total_batches = total_tokens // seq_len if seq_len > 0 else 0
+        # Clamp so that a val_ratio outside [0, 1] can never yield a negative
+        # split boundary (a negative start would slice tokens from the wrong end).
+        val_size = min(total_batches, max(0, int(total_batches * val_ratio)))
+        train_size = total_batches - val_size
+        # start/stop are window indices (not token indices) delimiting this split.
+        if split_type == 'train':
+            self.start = 0
+            self.stop = train_size
+        else:
+            self.start = train_size
+            self.stop = train_size + val_size
+        self.total_sequences = max(0, self.stop - self.start)
+        print(f"Split {split_type}: {self.total_sequences} sequences (out of {total_batches})")
+    def __iter__(self):
+        """Yield (input, label) pairs of 1-D long tensors; the label is the input shifted by one token."""
+        # A non-positive seq_len means an empty dataset; range() would raise on a zero step.
+        if self.seq_len <= 0:
+            return
+        n_tokens = len(self.tokens)
+        for i in range(self.start * self.seq_len, self.stop * self.seq_len, self.seq_len):
+            end = i + self.seq_len
+            if end + 1 > n_tokens:
+                break
+            input_seq = torch.tensor(self.tokens[i:end], dtype=torch.long)
+            label_seq = torch.tensor(self.tokens[i + 1:end + 1], dtype=torch.long)
+            yield input_seq, label_seq
+    def __len__(self):
+        """Return the number of windows in this split."""
+        return self.total_sequences
+# ========================= Try to load Python nn.Module model =========================
+def try_load_python_model():
+    """
+    Attempt to find and import a local Python model implementation (nn.Module).
+
+    Each module name in ``candidates_modules`` is probed in order; inside every
+    importable module, each name in ``candidates_class_names`` is looked up and
+    instantiated with no arguments.
+
+    Returns:
+        (model_instance, source_description) for the first class that
+        instantiates successfully, or (None, None) when nothing usable is found.
+    """
+    # `import importlib` alone does not guarantee that the importlib.util
+    # submodule is loaded, so import it explicitly before calling find_spec().
+    import importlib.util
+
+    candidates_modules = [
+        "gpt_modern_1b_class",
+        "gpt_modern_1b",
+        "gpt_modern_1b_class_fixed",
+        "model", "ji_rack_model"
+    ]
+    candidates_class_names = [
+        "JiRackPyTorch",
+        "JiRackPyTorch1B",
+        "GPTModel",
+        "JiRackModel"
+    ]
+    for modname in candidates_modules:
+        try:
+            if importlib.util.find_spec(modname) is None:
+                continue
+            mod = importlib.import_module(modname)
+        except Exception as e:
+            # A candidate that exists but cannot be imported (syntax error,
+            # missing dependency, ...) is reported instead of silently skipped,
+            # otherwise the caller falls back to the JIT model without a hint why.
+            print(f"Module {modname} could not be imported: {e}")
+            continue
+        for cls_name in candidates_class_names:
+            cls = getattr(mod, cls_name, None)
+            if cls is None:
+                continue
+            try:
+                inst = cls()
+            except Exception as e:
+                print(f"Found class {cls_name} in {modname} but instantiation failed: {e}")
+                continue
+            print(f"Loaded Python model class {cls_name} from module {modname}")
+            return inst, f"python:{modname}.{cls_name}"
+    return None, None
+# ========================= Utility: load weights from JIT script into nn.Module =========================
+def load_weights_from_script_to_module(script_path: Path, module_model: nn.Module):
+    """
+    Best-effort: extract state_dict from a ScriptModule and load it into a regular nn.Module.
+
+    The script is loaded on CPU, its state_dict is read and passed to
+    ``module_model.load_state_dict(..., strict=False)``. Every failure is
+    printed and reported through the return value instead of being raised.
+
+    Args:
+        script_path: path of the TorchScript file to read the weights from.
+        module_model: module that receives the weights (modified in place).
+
+    Returns:
+        True when at least one of the module's state_dict entries was filled
+        from the script (or the module has no entries at all); False when the
+        script cannot be loaded, load_state_dict raises, or no key matches.
+    """
+    try:
+        script_mod = torch.jit.load(str(script_path), map_location="cpu")
+    except Exception as e:
+        print(f"Cannot load script {script_path}: {e}")
+        return False
+    try:
+        sd = script_mod.state_dict()
+    except Exception as e:
+        print(f"ScriptModule.state_dict() failed: {e}")
+        return False
+    try:
+        result = module_model.load_state_dict(sd, strict=False)
+    except Exception as e:
+        print(f"load_state_dict failed: {e}")
+        return False
+    # strict=False never raises on name mismatches, so check that something was
+    # actually loaded; otherwise the caller would train from untouched weights
+    # while believing the checkpoint had been applied.
+    expected = set(module_model.state_dict().keys())
+    matched = expected - set(result.missing_keys)
+    if expected and not matched:
+        print(f"No parameter names in {script_path} match the Python nn.Module; weights were NOT loaded.")
+        return False
+    if result.missing_keys or result.unexpected_keys:
+        print(f"Partial load: {len(result.missing_keys)} missing, {len(result.unexpected_keys)} unexpected keys.")
+    print("Weights successfully loaded from ScriptModule into Python nn.Module (strict=False).")
+    return True
+# ========================= Helper to get logits from any model type =========================
+def get_logits_from_model(model, inputs: torch.Tensor):
+    """
+    Call ``model`` on ``inputs`` (moved to the global ``device``) and return its output.
+
+    When the model returns a tuple or list, only its first element is returned;
+    any other return value is passed through unchanged.
+    """
+    output = model(inputs.to(device))
+    return output[0] if isinstance(output, (tuple, list)) else output
+# ========================= Evaluation =========================
+def evaluate(model, dataloader, criterion):
+    """
+    Return the mean per-batch loss of ``model`` over ``dataloader``.
+
+    The model is switched to eval mode and every batch is run under
+    ``torch.no_grad()``. The train/eval mode the model had on entry is restored
+    afterwards, also when a batch raises.
+
+    Args:
+        model: callable model accepted by ``get_logits_from_model``.
+        dataloader: iterable of (inputs, targets) batches.
+        criterion: loss callable taking (flattened logits, flattened targets).
+
+    Returns:
+        The summed batch losses divided by the number of batches, or 0.0 when
+        the dataloader yields no batches.
+    """
+    # Remember the caller's mode instead of unconditionally forcing train mode at the end.
+    was_training = getattr(model, "training", True)
+    model.eval()
+    total_loss = 0.0
+    count = 0
+    try:
+        with torch.no_grad():
+            for inputs, targets in dataloader:
+                inputs, targets = inputs.to(device), targets.to(device)
+                logits = get_logits_from_model(model, inputs)
+                # Flatten to (rows, vocab) / (rows,) as expected by the criterion.
+                logits = logits.contiguous().view(-1, logits.size(-1))
+                # Targets are truncated to the number of logit rows.
+                targets = targets.contiguous().view(-1)[:logits.shape[0]]
+                loss = criterion(logits, targets)
+                total_loss += float(loss.item())
+                count += 1
+    finally:
+        model.train(was_training)
+    return total_loss / max(1, count)
+# ========================= Training loop =========================
+def train():
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    print("Loading model...")
+    python_model, python_source = try_load_python_model()
+    model = None
+    model_source = None
+    # Prefer Python nn.Module if available
+    if python_model is not None:
+        model = python_model
+        model_source = python_source
+        loaded = False
+        # Try to load latest weights (state_dict first, then JIT → state_dict)
+        if PT_STATE_DICT_PATH.exists():
+            try:
+                sd = torch.load(PT_STATE_DICT_PATH, map_location="cpu")
+                model.load_state_dict(sd, strict=False)
+                print(f"Loaded state_dict from {PT_STATE_DICT_PATH}")
+                loaded = True
+            except Exception as e:
+                print(f"Failed to load state_dict from {PT_STATE_DICT_PATH}: {e}")
+        if not loaded and LAST_TRAINED_PATH.exists():
+            if load_weights_from_script_to_module(LAST_TRAINED_PATH, model):
+                loaded = True
+        if not loaded and BASE_MODEL_PATH.exists():
+            if load_weights_from_script_to_module(BASE_MODEL_PATH, model):
+                loaded = True
+    else:
+        # Fallback to JIT ScriptModule
+        if LAST_TRAINED_PATH.exists():
+            model = torch.jit.load(LAST_TRAINED_PATH, map_location=device)
+            model_source = f"jit:{LAST_TRAINED_PATH}"
+        elif BASE_MODEL_PATH.exists():
+            model = torch.jit.load(BASE_MODEL_PATH, map_location=device)
+            model_source = f"jit:{BASE_MODEL_PATH}"
+        else:
+            print("ERROR: No model found (neither Python module nor JIT). Place a model file or Python implementation.")
+            return
+    print(f"Model loaded from: {model_source}")
+    # If we are using a real nn.Module → move to device + enable gradient checkpointing if possible
+    is_python_module = isinstance(model, nn.Module)
+    if is_python_module:
+        model.to(device)
+        model.train()
+        try:
+            model.gradient_checkpointing_enable()
+            print("Gradient checkpointing ENABLED on Python nn.Module.")
+        except Exception:
+            try:
+                model.gradient_checkpointing = True
+                print("Set attribute gradient_checkpointing = True (best-effort).")
+            except Exception:
+                print("Gradient checkpointing not available on this Python model.")
+    else:
+        # ScriptModule path
+        try:
+            model.to(device)
+        except Exception:
+            print("Warning: model.to(device) failed for ScriptModule; trying best-effort buffer move.")
+        model.train()
+        print("Training on ScriptModule (gradient checkpointing not available).")
+    # ========================= Dataset preparation =========================
+    if not CLEAN_PATH.exists():
+        if not RAW_PATH.exists():
+            raise FileNotFoundError(f"Missing dataset {RAW_PATH}")
+        print("Cleaning raw dataset → cleaned version...")
+        text = RAW_PATH.read_text(encoding="utf-8")
+        text = re.sub(r" {2,}", " ", text)
+        text = text.replace(" \n", "\n").replace("\n ", "\n")
+        CLEAN_PATH.write_text(text, encoding="utf-8")
+        print(f"Cleaned dataset saved → {CLEAN_PATH}")
+    train_dataset = LazyTextDataset(CLEAN_PATH, seq_len=TRAIN_SEQ_LEN, split_type='train', val_ratio=VAL_SPLIT_RATIO)
+    val_dataset   = LazyTextDataset(CLEAN_PATH, seq_len=TRAIN_SEQ_LEN, split_type='val',   val_ratio=VAL_SPLIT_RATIO)
+    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=0)
+    val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=0)
+    # ========================= Optimizer (try 8-bit first) =========================
+    try:
+        import bitsandbytes as bnb  # type: ignore
+        try:
+            optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+        except Exception:
+            optimizer = bnb.optim.Adam8bit(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+        print("Using bitsandbytes 8-bit optimizer.")
+    except Exception:
+        optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+        print("Using standard torch.optim.AdamW (bitsandbytes not available).")
+    criterion = nn.CrossEntropyLoss()
+    scaler = GradScaler(enabled=(device.type == 'cuda'))
+    if device.type == 'cuda':
+        torch.cuda.empty_cache()
+    total_steps = (len(train_dataset) // BATCH_SIZE) * EPOCHS if len(train_dataset) > 0 else 0
+    print(f"\nSTARTING training: epochs={EPOCHS}, approx. steps={total_steps}, examples={len(train_dataset)}")
+    print(f"Batch size={BATCH_SIZE}, seq_len={TRAIN_SEQ_LEN}, device={device}, AMP={'on' if device.type=='cuda' else 'off'}")
+    global_step = 0
+    for epoch in range(1, EPOCHS + 1):
+        print(f"\n=== Epoch {epoch}/{EPOCHS} ===")
+        epoch_loss = 0.0
+        pbar = tqdm(train_loader, desc=f"Epoch {epoch} [TRAIN]", leave=False)
+        for inputs, targets in pbar:
+            inputs, targets = inputs.to(device), targets.to(device)
+            optimizer.zero_grad(set_to_none=True)
+            with autocast(enabled=(device.type == 'cuda')):
+                logits = get_logits_from_model(model, inputs)
+                logits = logits.contiguous().view(-1, logits.size(-1))
+                targets_view = targets.contiguous().view(-1)[:logits.shape[0]]
+                loss = criterion(logits, targets_view)
+            # Backward pass (AMP-safe)
+            if device.type == 'cuda':
+                try:
+                    scaler.scale(loss).backward()
+                    scaler.unscale_(optimizer)
+                except Exception as e:
+                    print("Scaled backward failed:", e)
+                    loss.backward()
+                try:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
+                except Exception:
+                    pass
+                try:
+                    scaler.step(optimizer)
+                    scaler.update()
+                except RuntimeError as e:
+                    print("RuntimeError in scaler.step():", e)
+                    print(torch.cuda.memory_summary())
+                    # Fallback without scaler
+                    try:
+                        scaler.unscale_(optimizer)
+                        optimizer.step()
+                    except Exception as e2:
+                        print("Fallback optimizer.step() failed:", e2)
+                        raise e
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
+                optimizer.step()
+            if device.type == 'cuda':
+                torch.cuda.empty_cache()
+            loss_val = float(loss.item())
+            epoch_loss += loss_val
+            global_step += 1
+            pbar.set_postfix({"loss": f"{loss_val:.4f}", "ppl": f"{math.exp(min(loss_val, 10)):.2f}", "step": global_step})
+        avg_train_loss = epoch_loss / max(1, len(train_dataset) // BATCH_SIZE)
+        print(f"[TRAIN] Avg loss: {avg_train_loss:.4f} | Perplexity: {math.exp(avg_train_loss):.2f}")
+        print("Running validation...")
+        val_loss = evaluate(model, val_loader, criterion)
+        print(f"[VAL] Avg loss: {val_loss:.4f} | Perplexity: {math.exp(val_loss):.2f}")
+        # Save checkpoint for this epoch
+        epoch_dir = OUTPUT_DIR / f"epoch{epoch}"
+        epoch_dir.mkdir(parents=True, exist_ok=True)
+        try:
+            if is_python_module:
+                model.eval()
+                dummy = torch.randint(0, 50257, (1, min(32, TRAIN_SEQ_LEN)), device=device)
+                try:
+                    traced = torch.jit.trace(model, dummy, strict=False)
+                    torch.jit.save(traced, epoch_dir / MODEL_SAVE_NAME)
+                    print(f"Exported traced JIT → {epoch_dir / MODEL_SAVE_NAME}")
+                except Exception as e:
+                    torch.save(model.state_dict(), epoch_dir / "state_dict.pt")
+                    print(f"Saved state_dict (trace failed): {e}")
+                model.train()
+            else:
+                torch.jit.save(model, epoch_dir / MODEL_SAVE_NAME)
+                print(f"Saved ScriptModule → {epoch_dir / MODEL_SAVE_NAME}")
+        except Exception as e:
+            print("Error while saving epoch model:", e)
+        cleanup_old_epochs()
+    # ========================= Final model save =========================
+    final_dir = OUTPUT_DIR / "final"
+    final_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        if is_python_module:
+            model.eval()
+            dummy = torch.randint(0, 50257, (1, min(32, TRAIN_SEQ_LEN)), device=device)
+            traced = torch.jit.trace(model, dummy, strict=False)
+            torch.jit.save(traced, final_dir / MODEL_SAVE_NAME)
+            print(f"Final traced JIT saved → {final_dir / MODEL_SAVE_NAME}")
+        else:
+            torch.jit.save(model, final_dir / MODEL_SAVE_NAME)
+            print(f"Final ScriptModule saved → {final_dir / MODEL_SAVE_NAME}")
+    except Exception:
+        torch.save(model.state_dict(), final_dir / "state_dict.pt")
+        print("Final model saved as state_dict (trace failed).")
+    # Save tokenizer with the final model
+    try:
+        train_dataset.tokenizer.save_pretrained(final_dir)
+    except Exception:
+        pass
+    # Backup previous last-trained model and update the "current" symlink/file
+    if LAST_TRAINED_PATH.exists():
+        backup_path = BACKUP_DIR / f"gpt_last_trained_backup_{int(LAST_TRAINED_PATH.stat().st_mtime)}.script.pt"
+        shutil.copy(LAST_TRAINED_PATH, backup_path)
+        print(f"Backed up previous last_trained → {backup_path}")
+    if (final_dir / MODEL_SAVE_NAME).exists():
+        shutil.copy(final_dir / MODEL_SAVE_NAME, LAST_TRAINED_PATH)
+        print(f"Copied final model → {LAST_TRAINED_PATH}")
+    elif (final_dir / "state_dict.pt").exists():
+        shutil.copy(final_dir / "state_dict.pt", LAST_TRAINED_PATH.with_suffix(".state_dict.pt"))
+    print("TRAINING COMPLETED.")
+# ========================= Entrypoint =========================
+if __name__ == "__main__":
+    if not RAW_PATH.exists():
+        print(f"ERROR: dataset {RAW_PATH} not found. Place your training text there.")
+        sys.exit(1)
     train()