CMSManhattan
/

JiRack_GPT3_33b

Model card Files Files and versions

xet

Community

kgrabko committed on Dec 5, 2025

Commit

355f97d

verified ·

1 Parent(s): 9a02192

Upload fine_tune_jit_with_validation_torch_script_cuda_33b.py

Browse files

Files changed (1) hide show

fine_tune_jit_with_validation_torch_script_cuda_33b.py +488 -0

fine_tune_jit_with_validation_torch_script_cuda_33b.py ADDED Viewed

	@@ -0,0 +1,488 @@

+#!/usr/bin/env python3
+# Copyright (c) 2025 CMS Manhattan
+# All rights reserved.
+#
+# This file is part of a project authored by CMS Manhattan. You may use, distribute, and modify
+# this code under the terms of the GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007
+# please read <http://www.gnu.org/licenses/>.
+"""
+mkdir -p tokenizer
+wget -O tokenizer/tokenizer.json https://huggingface.co/gpt2/resolve/main/tokenizer.json
+wget -O tokenizer/vocab.json https://huggingface.co/gpt2/resolve/main/vocab.json
+wget -O tokenizer/merges.txt https://huggingface.co/gpt2/resolve/main/merges.txt
+wget -O tokenizer/tokenizer_config.json https://huggingface.co/gpt2/resolve/main/tokenizer_config.json
+Updated fine-tuning script, version "prefer Python nn.Module with gradient checkpointing".
+What it does:
+- Tries to load a local Python model implementation (nn.Module). If found — uses it and
+  enables gradient_checkpointing (if implemented).
+- If the Python model class is not found, falls back to JIT ScriptModule (as before).
+- If the original weights are only available as JIT, attempts to extract state_dict() from the ScriptModule
+  and load it into the nn.Module (best-effort).
+- Saves the final trained model as a JIT (torch.jit.save) at the end, or saves state_dict if an error occurs.
+- Loads the tokenizer from local files only (TOKENIZER_PATH, ./tokenizer, the local gpt2 cache, or ./) and saves a copy next to the final model. If the tokenizer is missing, gives a helpful hint.
+- Supports AMP (autocast + GradScaler) for GPU.
+- Optional support for bitsandbytes 8-bit optimizer (if available).
+- Comments and messages are in English.
+Before running: if you have a file with the model implementation (e.g., gpt_modern_1b.py or gpt_modern_1b_class.py),
+place it in the same directory and make sure it contains a class named JiRackPyTorch or another name we're looking for.
+If not — the script will fall back to using the JIT model as before.
+"""
+import os
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128,garbage_collection_threshold:0.6")
+import sys
+import importlib
+import math
+import shutil
+import re
+from pathlib import Path
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import IterableDataset, DataLoader
+from transformers import GPT2TokenizerFast
+from tqdm import tqdm
+from torch.cuda.amp import GradScaler, autocast
# ========================= SETTINGS =========================
# Hyper-parameters. Each value can be overridden through an environment
# variable with the same name; the second argument is the built-in default.
TRAIN_SEQ_LEN = int(os.getenv("TRAIN_SEQ_LEN", 64))
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 1))
EPOCHS = int(os.getenv("EPOCHS", 999))
LEARNING_RATE = float(os.getenv("LEARNING_RATE", 6e-6))
WEIGHT_DECAY = float(os.getenv("WEIGHT_DECAY", 0.01))
GRAD_CLIP = float(os.getenv("GRAD_CLIP", 1.0))
KEEP_LAST_EPOCHS = int(os.getenv("KEEP_LAST_EPOCHS", 3))
VAL_SPLIT_RATIO = float(os.getenv("VAL_SPLIT_RATIO", 0.05))

# Model artefacts (paths are relative to the working directory).
BASE_MODEL_PATH = Path("models/gpt_modern_33b_class.script.pt")
LAST_TRAINED_PATH = Path("models/gpt_33b_last_trained.script.pt")
PT_STATE_DICT_PATH = Path("models/gpt_modern_33b_class.state_dict.pt")
BACKUP_DIR = Path("models/backups")
# The backup directory is created as soon as the module is loaded.
BACKUP_DIR.mkdir(parents=True, exist_ok=True)

# Training corpus: the raw text and its whitespace-normalised copy.
RAW_PATH = Path("datasets/dialogues_text.txt")
CLEAN_PATH = Path("datasets/dialogues_text_clean.txt")

# Tokenizer location and output layout.
TOKENIZER_LOCAL_DIR = Path("./tokenizer")
OUTPUT_DIR = Path("build/fine_tuning_output")
MODEL_SAVE_NAME = "gpt_finetuned.script.pt"

# Use the first CUDA device when one is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
# ========================= Tokenizer helper =========================
def _load_tokenizer_local(tokenizer_name: str = "gpt2"):
    """
    Load a GPT-2 fast tokenizer from local files only.

    Candidates are tried in this order: the ``TOKENIZER_PATH`` environment
    variable (if set), ``TOKENIZER_LOCAL_DIR``, ``tokenizer_name`` and ``./``.
    Every attempt passes ``local_files_only=True`` to ``from_pretrained``.

    Args:
        tokenizer_name: Name or path tried after the explicit directories.

    Returns:
        The first tokenizer that loads. Its ``pad_token`` is set to its
        ``eos_token`` when no pad token is defined.

    Raises:
        RuntimeError: If no candidate could be loaded. The message lists every
            candidate together with the reason it was rejected.
    """
    candidates = []
    env_path = os.environ.get("TOKENIZER_PATH")
    if env_path:
        candidates.append(env_path)
    candidates.extend([str(TOKENIZER_LOCAL_DIR), tokenizer_name, "./"])

    # Remember why each candidate failed so the final error is actionable
    # instead of silently discarding the underlying exceptions.
    failures = []
    for cand in candidates:
        try:
            tok = GPT2TokenizerFast.from_pretrained(cand, local_files_only=True)
            if getattr(tok, "pad_token", None) is None:
                tok.pad_token = tok.eos_token
            print(f"Tokenizer loaded from: {cand}")
            return tok
        except Exception as exc:
            # Only the first line of the error is kept to bound the report size.
            detail = str(exc).strip().splitlines()
            reason = detail[0] if detail else ""
            failures.append(f"  {cand}: {type(exc).__name__}: {reason}")
            continue

    raise RuntimeError(
        "Local tokenizer not found. Place tokenizer.json or (vocab.json + merges.txt) in ./tokenizer\n"
        "OR specify the path via TOKENIZER_PATH environment variable.\n"
        "Example: export TOKENIZER_PATH=/path/to/tokenizer\n"
        "If you have internet access, you can temporarily use transformers.GPT2TokenizerFast.from_pretrained('gpt2')"
        + "\nCandidates tried:\n" + "\n".join(failures)
    )
# ========================= Dataset =========================
class LazyTextDataset(IterableDataset):
    """
    Iterable dataset of fixed-length next-token-prediction pairs.

    The whole text file is read and tokenized once in ``__init__``. The token
    stream is cut into ``seq_len``-sized windows; the trailing ``val_ratio``
    share of the windows forms the ``'val'`` split and the rest the ``'train'``
    split. Each item is ``(input_ids, labels)`` where ``labels`` is the input
    window shifted by one token.

    Args:
        text_file: Path of the UTF-8 text corpus.
        seq_len: Number of tokens per sequence; must be positive.
        tokenizer_name: Passed through to ``_load_tokenizer_local``.
        split_type: ``'train'`` or ``'val'``.
        val_ratio: Fraction of sequences reserved for validation, in [0, 1].

    Raises:
        ValueError: On an unknown ``split_type``, a non-positive ``seq_len`` or
            a ``val_ratio`` outside [0, 1].
    """

    def __init__(self, text_file: Path, seq_len: int = TRAIN_SEQ_LEN, tokenizer_name: str = "gpt2",
                 split_type: str = 'train', val_ratio: float = VAL_SPLIT_RATIO):
        # Validate cheap arguments before the expensive read + tokenization.
        if split_type not in ('train', 'val'):
            raise ValueError("split_type must be 'train' or 'val'")
        if seq_len <= 0:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len}")
        if not 0.0 <= val_ratio <= 1.0:
            raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio}")
        super().__init__()

        self.seq_len = seq_len
        self.tokenizer = _load_tokenizer_local(tokenizer_name)
        self.text_file = Path(text_file)
        self.split_type = split_type
        self.val_ratio = val_ratio

        print(f"Loading and tokenizing {self.text_file} (one-time tokenization into ids)...")
        with open(self.text_file, "r", encoding="utf-8") as f:
            data = f.read()
        self.tokens = self.tokenizer.encode(data)

        # One token is held back because labels are the inputs shifted by one.
        total_tokens = max(0, len(self.tokens) - 1)
        total_batches = total_tokens // seq_len
        val_size = int(total_batches * val_ratio)
        train_size = total_batches - val_size

        # start/stop are expressed in sequences, not in tokens.
        if split_type == 'train':
            self.start = 0
            self.stop = train_size
        else:
            self.start = train_size
            self.stop = train_size + val_size

        self.total_sequences = max(0, self.stop - self.start)
        print(f"Split {split_type}: {self.total_sequences} sequences (out of {total_batches})")

    def __iter__(self):
        """Yield ``(input_ids, labels)`` LongTensor pairs of length ``seq_len``."""
        for i in range(self.start * self.seq_len, self.stop * self.seq_len, self.seq_len):
            # Defensive stop: the label window needs one token past the input.
            if i + self.seq_len + 1 > len(self.tokens):
                break
            input_seq = torch.tensor(self.tokens[i: i + self.seq_len], dtype=torch.long)
            label_seq = torch.tensor(self.tokens[i + 1: i + self.seq_len + 1], dtype=torch.long)
            yield input_seq, label_seq

    def __len__(self):
        """Return the number of sequences in this split."""
        return self.total_sequences
# ========================= Attempt to load Python nn.Module model =========================
def try_load_python_model():
    """
    Try to find and instantiate a local Python implementation of the model.

    A fixed list of module names is probed; the first importable module that
    exposes one of the known class names and whose class can be constructed
    with no arguments wins.

    Returns:
        ``(model_instance, source_str)`` where ``source_str`` has the form
        ``"python:<module>.<class>"``, or ``(None, None)`` if nothing usable
        was found.
    """
    # `import importlib` alone does not guarantee that the `util` submodule is
    # bound; import it explicitly so the lookup below cannot fail (and be
    # swallowed) with an AttributeError.
    import importlib.util

    candidates_modules = [
        # 33b module names mirror the model file names used by this script.
        "gpt_modern_33b_class",
        "gpt_modern_33b",
        "gpt_modern_1b_class",
        "gpt_modern_1b",
        "gpt_modern_1b_class_fixed",
        "model", "ji_rack_model"
    ]
    candidates_class_names = [
        "JiRackPyTorch",
        "JiRackPyTorch1B",
        "GPTModel",
        "JiRackModel"
    ]
    for modname in candidates_modules:
        try:
            if importlib.util.find_spec(modname) is None:
                continue
            mod = importlib.import_module(modname)
        except Exception as e:
            # A module that exists but fails to import is reported, not hidden.
            print(f"Skipping module {modname}: import failed: {e}")
            continue

        for cls_name in candidates_class_names:
            cls = getattr(mod, cls_name, None)
            if cls is None:
                continue
            try:
                inst = cls()
            except Exception as e:
                print(f"Found class {cls_name} in {modname} but failed to instantiate: {e}")
                continue
            print(f"Loaded Python model class {cls_name} from module {modname}")
            return inst, f"python:{modname}.{cls_name}"
    return None, None
# ========================= Utility: load weights from script -> module =========================
def load_weights_from_script_to_module(script_path: Path, module_model: nn.Module):
    """
    Best-effort copy of the weights stored in a TorchScript file into a module.

    The ScriptModule is loaded on the CPU, its ``state_dict()`` is extracted
    and applied to ``module_model`` with ``strict=False``.

    Args:
        script_path: Path of the file written by ``torch.jit.save``.
        module_model: Module that receives the weights.

    Returns:
        True when at least one parameter/buffer name matched and the load
        succeeded; False when the file cannot be loaded, no name matches, or
        ``load_state_dict`` raises (e.g. on a shape mismatch).
    """
    try:
        # str() keeps this working on torch versions that reject Path objects.
        script_mod = torch.jit.load(str(script_path), map_location="cpu")
    except Exception as e:
        print(f"Cannot load script at {script_path}: {e}")
        return False
    try:
        sd = script_mod.state_dict()
    except Exception as e:
        print(f"ScriptModule.state_dict() failed: {e}")
        return False

    # With strict=False a state_dict with zero matching names loads "fine"
    # while copying nothing; treat that as a failure instead of a success.
    matching = set(module_model.state_dict().keys()).intersection(sd.keys())
    if not matching:
        print("No parameter names of the ScriptModule match the Python nn.Module; nothing was loaded.")
        return False

    # Try to load into module_model
    try:
        result = module_model.load_state_dict(sd, strict=False)
    except Exception as e:
        print(f"load_state_dict failed: {e}")
        return False

    print("Weights successfully loaded from ScriptModule.state_dict() into Python nn.Module (strict=False).")
    missing = list(getattr(result, "missing_keys", []))
    unexpected = list(getattr(result, "unexpected_keys", []))
    if missing or unexpected:
        # Surface partial loads so silently uninitialised layers are noticed.
        print(f"  matched={len(matching)}, missing in script={len(missing)}, unexpected in script={len(unexpected)}")
    return True
# ========================= get_logits helper =========================
def get_logits_from_model(model, inputs: torch.Tensor):
    """
    Run ``model`` on ``inputs`` and return its primary output.

    ``inputs`` is moved to the module-level ``device`` first. When the model
    returns a tuple or list, its first element is returned; any other return
    value is passed through unchanged.
    """
    output = model(inputs.to(device))
    return output[0] if isinstance(output, (tuple, list)) else output
# ========================= EVALUATION =========================
def evaluate(model, dataloader, criterion):
    """
    Compute the mean per-batch loss of ``model`` over ``dataloader``.

    The model is put into eval mode and gradients are disabled for the pass.
    The mode the model was in before the call is restored afterwards, even if
    a batch raises.

    Args:
        model: Callable model accepted by ``get_logits_from_model``.
        dataloader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(logits_2d, targets_1d)``.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when the
        dataloader yields no batches (so an empty validation split is not
        mistaken for a perfect loss of 0.0).
    """
    was_training = bool(getattr(model, "training", True))
    model.eval()
    total_loss = 0.0
    count = 0
    try:
        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs, targets = inputs.to(device), targets.to(device)
                logits = get_logits_from_model(model, inputs)
                # Flatten to (positions, vocab) and trim targets to the same length.
                logits = logits.contiguous().view(-1, logits.size(-1))
                targets = targets.contiguous().view(-1)[:logits.shape[0]]
                loss = criterion(logits, targets)
                total_loss += float(loss.item())
                count += 1
    finally:
        model.train(was_training)
    if count == 0:
        return float("nan")
    return total_loss / count
# ========================= Checkpoint housekeeping =========================
def cleanup_old_epochs(output_dir=None, keep_last=None):
    """
    Delete all but the newest ``epoch<N>`` checkpoint directories.

    Args:
        output_dir: Directory holding the ``epoch<N>`` folders; defaults to
            ``OUTPUT_DIR``.
        keep_last: Number of highest-numbered epoch folders to keep; defaults
            to ``KEEP_LAST_EPOCHS``. A value <= 0 disables the cleanup.

    Returns:
        List of the directories that were removed.
    """
    root = Path(OUTPUT_DIR if output_dir is None else output_dir)
    keep = int(KEEP_LAST_EPOCHS if keep_last is None else keep_last)
    if keep <= 0 or not root.is_dir():
        return []

    # Sort numerically so that epoch10 is newer than epoch9.
    numbered = []
    for child in root.iterdir():
        match = re.fullmatch(r"epoch(\d+)", child.name)
        if match and child.is_dir():
            numbered.append((int(match.group(1)), child))
    numbered.sort()

    removed = []
    for _, stale in numbered[:-keep]:
        try:
            shutil.rmtree(stale)
        except OSError as e:
            print(f"Could not remove old checkpoint {stale}: {e}")
        else:
            removed.append(stale)
            print(f"Removed old checkpoint {stale}")
    return removed


def _perplexity(loss):
    """Return ``exp(loss)``, or ``inf`` when the exponential overflows."""
    try:
        return math.exp(loss)
    except OverflowError:
        return float("inf")


def _load_training_model():
    """Return ``(model, source)`` from the Python class or a JIT file, else ``(None, None)``."""
    model, source = try_load_python_model()
    if model is not None:
        loaded = False
        if PT_STATE_DICT_PATH.exists():
            try:
                # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
                sd = torch.load(PT_STATE_DICT_PATH, map_location="cpu")
                model.load_state_dict(sd, strict=False)
                print(f"Loaded state_dict from {PT_STATE_DICT_PATH}")
                loaded = True
            except Exception as e:
                print(f"Failed to load state_dict from {PT_STATE_DICT_PATH}: {e}")
        if (not loaded) and LAST_TRAINED_PATH.exists():
            loaded = bool(load_weights_from_script_to_module(LAST_TRAINED_PATH, model))
        if (not loaded) and BASE_MODEL_PATH.exists():
            loaded = bool(load_weights_from_script_to_module(BASE_MODEL_PATH, model))
        if not loaded:
            print("WARNING: no pretrained weights were loaded; the Python model keeps its initial weights.")
        return model, source

    # Fallback to ScriptModule (JIT): prefer the last trained model over the base one.
    for path in (LAST_TRAINED_PATH, BASE_MODEL_PATH):
        if path.exists():
            return torch.jit.load(str(path), map_location=device), f"jit:{path}"
    return None, None


def _ensure_clean_dataset():
    """Create ``CLEAN_PATH`` from ``RAW_PATH`` (whitespace normalisation) if it does not exist."""
    if CLEAN_PATH.exists():
        return
    if not RAW_PATH.exists():
        raise FileNotFoundError(f"Missing dataset {RAW_PATH}")
    print("Cleaning raw dataset to create cleaned version...")
    text = RAW_PATH.read_text(encoding="utf-8")
    text = re.sub(r" {2,}", " ", text)
    text = text.replace(" \n", "\n").replace("\n ", "\n")
    CLEAN_PATH.write_text(text, encoding="utf-8")
    print(f"Saved cleaned dataset → {CLEAN_PATH}")


def _build_optimizer(model):
    """Return a bitsandbytes 8-bit optimizer when usable, otherwise ``torch.optim.AdamW``."""
    try:
        import bitsandbytes as bnb  # type: ignore
        try:
            optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        except Exception:
            optimizer = bnb.optim.Adam8bit(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        print("Using bitsandbytes 8-bit optimizer.")
    except Exception:
        optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        print("Using torch.optim.AdamW (bitsandbytes not available).")
    return optimizer


def _save_checkpoint(model, is_python_module, target_dir, final=False):
    """
    Write the model into ``target_dir``.

    A ScriptModule is saved directly. A Python module is traced and saved as
    JIT; if tracing or saving fails, its ``state_dict`` is written to
    ``state_dict.pt`` instead. The module's train/eval mode is restored.
    """
    target = target_dir / MODEL_SAVE_NAME
    if not is_python_module:
        torch.jit.save(model, str(target))
        if final:
            print(f"Final ScriptModule saved to {target}")
        else:
            print(f"Saved ScriptModule to {target}")
        return

    was_training = bool(getattr(model, "training", True))
    model.eval()
    try:
        dummy = torch.randint(0, 50257, (1, min(32, TRAIN_SEQ_LEN)), device=device)
        try:
            traced = torch.jit.trace(model, dummy, strict=False)
            torch.jit.save(traced, str(target))
            if final:
                print(f"Final JIT saved to {target}")
            else:
                print(f"Exported traced JIT to {target}")
        except Exception as e:
            torch.save(model.state_dict(), target_dir / "state_dict.pt")
            if final:
                print("Final model saved as state_dict (trace failed).")
                print(f"Trace error: {e}")
            else:
                print(f"Saved state_dict due to trace failure: {e}")
    finally:
        model.train(was_training)


# ========================= TRAINING LOOP =========================
def train():
    """
    Fine-tune the model on the cleaned text corpus and export the result.

    Steps: load the model (Python class preferred, JIT fallback), prepare the
    cleaned dataset, train for ``EPOCHS`` epochs with optional CUDA AMP,
    validate and checkpoint after every epoch, prune old epoch folders, then
    save the final model plus tokenizer and refresh ``LAST_TRAINED_PATH``
    (backing up the previous file first).

    Returns:
        None. Returns early (after printing an error) when no model is found
        or the training split yields no batches.

    Raises:
        FileNotFoundError: If neither the cleaned nor the raw dataset exists.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print("Loading model...")
    model, model_source = _load_training_model()
    if model is None:
        print("ERROR: No model found (neither Python module nor JIT). Place a model file or Python module.")
        return
    print(f"Model loaded from: {model_source}")

    # ScriptModule is itself an nn.Module subclass, so isinstance(model, nn.Module)
    # cannot tell the two apart; test for the JIT type explicitly.
    is_python_module = not isinstance(model, torch.jit.ScriptModule)
    if is_python_module:
        model.to(device)
        model.train()
        try:
            model.gradient_checkpointing_enable()
            print("Gradient checkpointing ENABLED on Python nn.Module.")
        except Exception:
            # try alternative attribute
            try:
                model.gradient_checkpointing = True
                print("Set attribute gradient_checkpointing = True (best-effort).")
            except Exception:
                print("Gradient checkpointing not available on this Python model.")
    else:
        # ScriptModule
        try:
            model.to(device)
        except Exception:
            print("Warning: model.to(device) failed for ScriptModule; attempting best-effort buffer moves.")
        model.train()
        print("Training on ScriptModule (gradient checkpointing not available).")

    # Dataset preparation
    _ensure_clean_dataset()
    train_dataset = LazyTextDataset(CLEAN_PATH, seq_len=TRAIN_SEQ_LEN, split_type='train', val_ratio=VAL_SPLIT_RATIO)
    val_dataset = LazyTextDataset(CLEAN_PATH, seq_len=TRAIN_SEQ_LEN, split_type='val', val_ratio=VAL_SPLIT_RATIO)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, num_workers=0)

    steps_per_epoch = len(train_dataset) // BATCH_SIZE
    if steps_per_epoch == 0:
        print("ERROR: the training split yields no batches; provide more text or lower TRAIN_SEQ_LEN/BATCH_SIZE.")
        return

    optimizer = _build_optimizer(model)
    criterion = nn.CrossEntropyLoss()

    use_amp = device.type == 'cuda'
    # GradScaler refuses to unscale float16 gradients, so scaling is only
    # enabled when the parameters are not stored in float16.
    has_fp16_params = any(p.dtype == torch.float16 for p in model.parameters())
    if use_amp and has_fp16_params:
        print("Model holds float16 parameters; gradient scaling is disabled.")
    scaler = GradScaler(enabled=(use_amp and not has_fp16_params))

    # Pre-clean GPU memory
    if use_amp:
        torch.cuda.empty_cache()

    total_steps = steps_per_epoch * EPOCHS
    print(f"\nSTARTING training: epochs={EPOCHS}, approx steps={total_steps}, examples={len(train_dataset)}")
    print(f"Batch size={BATCH_SIZE}, seq_len={TRAIN_SEQ_LEN}, device={device}, AMP={'on' if device.type=='cuda' else 'off'}")

    global_step = 0
    clip_warned = False
    for epoch in range(1, EPOCHS + 1):
        print(f"\n=== Epoch {epoch}/{EPOCHS} ===")
        epoch_loss = 0.0
        epoch_steps = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch} [TRAIN]", leave=False)
        for inputs, targets in pbar:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad(set_to_none=True)
            with autocast(enabled=use_amp):
                logits = get_logits_from_model(model, inputs)
                logits = logits.contiguous().view(-1, logits.size(-1))
                targets_view = targets.contiguous().view(-1)[:logits.shape[0]]
                loss = criterion(logits, targets_view)

            # Canonical AMP order: scaled backward -> unscale -> clip -> step -> update.
            # With a disabled scaler every call degrades to the plain operation,
            # so the same path serves CPU and CUDA.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            try:
                torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            except Exception as e:
                # Clipping stays best-effort, but the first failure is reported.
                if not clip_warned:
                    print("Warning: gradient clipping failed:", e)
                    clip_warned = True
            try:
                scaler.step(optimizer)
            except RuntimeError as e:
                print("RuntimeError during scaler.step():", e)
                if use_amp:
                    print(torch.cuda.memory_summary())
                raise
            scaler.update()

            if use_amp:
                torch.cuda.empty_cache()

            loss_val = float(loss.item())
            epoch_loss += loss_val
            epoch_steps += 1
            global_step += 1
            pbar.set_postfix({"loss": f"{loss_val:.4f}", "ppl": f"{math.exp(min(loss_val, 10)):.2f}", "step": f"{global_step}"})

        # Average over the steps that actually ran rather than an estimate.
        avg_train_loss = epoch_loss / max(1, epoch_steps)
        print(f"[TRAIN] Avg loss: {avg_train_loss:.4f} | PPL: {_perplexity(avg_train_loss):.2f}")
        print("Running validation...")
        val_loss = evaluate(model, val_loader, criterion)
        print(f"[VAL] Avg loss: {val_loss:.4f} | PPL: {_perplexity(val_loss):.2f}")

        # Save current epoch
        epoch_dir = OUTPUT_DIR / f"epoch{epoch}"
        epoch_dir.mkdir(parents=True, exist_ok=True)
        try:
            _save_checkpoint(model, is_python_module, epoch_dir)
        except Exception as e:
            print("Exception during model save:", e)
        cleanup_old_epochs()

    # Final model save
    final_dir = OUTPUT_DIR / "final"
    final_dir.mkdir(parents=True, exist_ok=True)
    try:
        _save_checkpoint(model, is_python_module, final_dir, final=True)
    except Exception as e:
        print("Exception during final model save:", e)
        try:
            torch.save(model.state_dict(), final_dir / "state_dict.pt")
        except Exception as e2:
            print("Fallback state_dict save failed:", e2)

    # Save tokenizer with the final model
    try:
        train_dataset.tokenizer.save_pretrained(final_dir)
    except Exception as e:
        print("Could not save tokenizer:", e)

    # Backup previous last_trained and update with new one
    if LAST_TRAINED_PATH.exists():
        backup_path = BACKUP_DIR / f"gpt_last_trained_backup_{int(LAST_TRAINED_PATH.stat().st_mtime)}.script.pt"
        try:
            shutil.copy(LAST_TRAINED_PATH, backup_path)
            print(f"Backed up previous last_trained -> {backup_path}")
        except Exception as e:
            print("Could not back up previous last_trained:", e)
    try:
        if (final_dir / MODEL_SAVE_NAME).exists():
            shutil.copy(final_dir / MODEL_SAVE_NAME, LAST_TRAINED_PATH)
            print(f"Copied final model to {LAST_TRAINED_PATH}")
        elif (final_dir / "state_dict.pt").exists():
            shutil.copy(final_dir / "state_dict.pt", LAST_TRAINED_PATH.with_suffix(".state_dict.pt"))
            print("Copied final state_dict to LAST_TRAINED_PATH with .state_dict.pt suffix")
    except Exception as e:
        print("Could not update last_trained model:", e)
    print("TRAINING COMPLETED.")
# ========================= Entrypoint =========================
if __name__ == "__main__":
    # train() only reads the raw corpus when the cleaned copy does not exist
    # yet, so abort only when both files are missing.
    if not CLEAN_PATH.exists() and not RAW_PATH.exists():
        print(f"ERROR: dataset {RAW_PATH} not found. Put your training text there.")
        sys.exit(1)
    train()