""" |
|
|
Language Model Training Script |
|
|
|
|
|
This script implements the complete training pipeline for GPT-style language models. |
|
|
It includes optimization, checkpointing, progress monitoring, and CPU-optimized training |
|
|
for limited hardware environments. |
|
|
|
|
|
FEATURES: |
|
|
- CPU-optimized training with memory management |
|
|
- Gradient accumulation for effective large batch sizes |
|
|
- Learning rate scheduling with warmup |
|
|
- Model checkpointing and resume capability |
|
|
- Real-time monitoring of loss, perplexity, and speed |
|
|
- Memory usage tracking and optimization |
|
|
- Automatic mixed precision (if available) |
|
|
|
|
|
HARDWARE OPTIMIZATION: |
|
|
- Designed for 8GB RAM systems |
|
|
- Efficient CPU training with PyTorch optimizations |
|
|
- Gradient accumulation to simulate larger batches |
|
|
- Memory cleanup and garbage collection |
|
|
- Progress saving for long training runs |
|
|
|
|
|
Usage: |
|
|
python core/src/train_model.py \\ |
|
|
--model-size small \\ |
|
|
--data-file data/clean/training_data.txt \\ |
|
|
--tokenizer-dir data/tokenizer/ \\ |
|
|
--output-dir models/my-model/ \\ |
|
|
--max-steps 10000 |
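
Programmatic usage (a minimal sketch; this mirrors what main() below wires up,
with the same defaults as the command-line flags):

    model = create_model("small")
    data_loader = TextDataLoader(
        data_file="data/clean/training_data.txt",
        tokenizer_path="data/tokenizer/tokenizer.model",
        seq_len=512,
        batch_size=4,
        shuffle=True,
    )
    trainer = ModelTrainer(model, data_loader, output_dir="models/my-model/")
    trainer.train()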

Requirements:
- PyTorch
- SentencePiece
- psutil (used for CPU memory tracking; see _get_memory_usage)
- Our model architecture and data loader

Author: Louis Chua Bean Chong
License: GPLv3
"""

import argparse
import gc
import json
import math
import os
import time
from pathlib import Path
from typing import Dict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR

try:
    from data_loader import TextDataLoader
    from model import GPTModel, create_model
except ImportError:
    # Fall back to importing relative to this file so the script can be run
    # directly (e.g. `python core/src/train_model.py`) without package install.
    import sys

    sys.path.append(os.path.dirname(__file__))
    from data_loader import TextDataLoader
    from model import GPTModel, create_model


class TrainingConfig:
    """Configuration for model training parameters."""

    def __init__(
        self,
        learning_rate: float = 1e-4,
        batch_size: int = 32,
        max_steps: int = 100000,
        warmup_steps: int = 10000,
        gradient_clipping: float = 1.0,
        weight_decay: float = 0.01,
        mixed_precision: bool = True,
        gradient_checkpointing: bool = True,
    ):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_steps = max_steps
        self.warmup_steps = warmup_steps
        self.gradient_clipping = gradient_clipping
        self.weight_decay = weight_decay
        self.mixed_precision = mixed_precision
        self.gradient_checkpointing = gradient_checkpointing


class ModelTrainer:
    """
    Comprehensive trainer for GPT-style language models.

    Handles the complete training pipeline including data loading, optimization,
    checkpointing, and progress monitoring.
    """

    def __init__(
        self,
        model: GPTModel,
        data_loader: TextDataLoader,
        output_dir: str,
        device: str = "cpu",
        learning_rate: float = 3e-4,
        weight_decay: float = 0.01,
        warmup_steps: int = 1000,
        max_steps: int = 10000,
        gradient_accumulation_steps: int = 4,
        gradient_clipping: float = 1.0,
        save_every: int = 1000,
        eval_every: int = 500,
        log_every: int = 100,
    ):
        """
        Initialize the model trainer.

        Args:
            model: GPT model to train
            data_loader: Data loader for training data
            output_dir: Directory to save checkpoints and logs
            device: Training device ("cpu" or "cuda")
            learning_rate: Peak learning rate
            weight_decay: Weight decay for regularization
            warmup_steps: Number of warmup steps for the learning rate schedule
            max_steps: Maximum number of training steps
            gradient_accumulation_steps: Micro-batches to accumulate per optimizer step
            gradient_clipping: Maximum gradient norm
            save_every: Save a checkpoint every N steps
            eval_every: Evaluate the model every N steps
            log_every: Log progress every N steps
        """
        self.model = model.to(device)
        self.data_loader = data_loader
        self.output_dir = Path(output_dir)
        self.device = device

        # Optimization hyperparameters
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.max_steps = max_steps
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.gradient_clipping = gradient_clipping

        # Checkpointing and logging intervals
        self.save_every = save_every
        self.eval_every = eval_every
        self.log_every = log_every

        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Optimizer and learning rate schedule
        self.optimizer = self._create_optimizer()
        self.scheduler = self._create_scheduler()

        # Training state
        self.step = 0
        self.epoch = 0
        self.best_loss = float("inf")
        self.training_log = []

        # Timing statistics
        self.start_time = None
        self.step_times = []

        print("🚀 ModelTrainer initialized")
        print(f"   Device: {device}")
        print(f"   Model parameters: {model.get_num_params():,}")
        print(f"   Learning rate: {learning_rate}")
        print(f"   Max steps: {max_steps:,}")
        print(f"   Gradient accumulation: {gradient_accumulation_steps}")
        print(f"   Output directory: {output_dir}")

    def _create_optimizer(self) -> optim.Optimizer:
        """Create AdamW optimizer with weight decay."""
        # Biases and other 1-D tensors are excluded from weight decay.
        decay_params = []
        no_decay_params = []

        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue

            if len(param.shape) == 1 or name.endswith(".bias"):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        param_groups = [
            {"params": decay_params, "weight_decay": self.weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]

        optimizer = optim.AdamW(
            param_groups,
            lr=self.learning_rate,
            betas=(0.9, 0.95),
            eps=1e-8,
        )

        return optimizer
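
    # For a typical GPT block, the split above lands roughly as follows
    # (illustrative; exact parameter names depend on the model definition in model.py):
    #   with decay:    2-D weight matrices - attention/MLP projections, embeddings
    #   without decay: 1-D tensors - all biases, LayerNorm/RMSNorm gains
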
    def _create_scheduler(self) -> torch.optim.lr_scheduler._LRScheduler:
        """Create learning rate scheduler with warmup and cosine decay."""
        if self.warmup_steps > 0:

            class WarmupCosineScheduler(torch.optim.lr_scheduler._LRScheduler):
                """Linear warmup followed by cosine decay to a minimum LR."""

                def __init__(self, optimizer, warmup_steps, max_steps, min_lr_factor=0.1):
                    self.warmup_steps = warmup_steps
                    self.max_steps = max_steps
                    self.min_lr_factor = min_lr_factor
                    super().__init__(optimizer)

                def get_lr(self):
                    if self.last_epoch < self.warmup_steps:
                        # Linear warmup from 1% to 100% of the base learning rate.
                        factor = self.last_epoch / self.warmup_steps
                        return [base_lr * (0.01 + 0.99 * factor) for base_lr in self.base_lrs]
                    else:
                        # Cosine decay from the base rate down to min_lr_factor of it.
                        # max(..., 1) guards against division by zero when
                        # max_steps == warmup_steps.
                        progress = (self.last_epoch - self.warmup_steps) / max(
                            self.max_steps - self.warmup_steps, 1
                        )
                        progress = min(progress, 1.0)
                        factor = 0.5 * (1 + math.cos(math.pi * progress))
                        factor = self.min_lr_factor + (1 - self.min_lr_factor) * factor
                        return [base_lr * factor for base_lr in self.base_lrs]

            scheduler = WarmupCosineScheduler(
                self.optimizer,
                warmup_steps=self.warmup_steps,
                max_steps=self.max_steps,
                min_lr_factor=0.1,
            )
        else:
            # No warmup requested: plain cosine annealing over the full run.
            scheduler = CosineAnnealingLR(
                self.optimizer, T_max=self.max_steps, eta_min=self.learning_rate * 0.1
            )

        return scheduler
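
    # Shape of the warmup+cosine schedule above, with the trainer defaults
    # (learning_rate=3e-4, warmup_steps=1000, max_steps=10000):
    #   step 0:      ~0.01 * base_lr = 3e-6   (start of linear warmup)
    #   step 1000:    1.00 * base_lr = 3e-4   (warmup complete)
    #   step 10000:   0.10 * base_lr = 3e-5   (floor set by min_lr_factor=0.1)
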
    def _calculate_loss(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """
        Calculate cross-entropy loss for autoregressive language modeling.

        This method computes the standard cross-entropy loss used in language model
        training. The loss measures how well the model predicts the next token in
        the sequence.

        Mathematical formulation:
            Loss = -∑ log(P(target_token | context))
            where P is the softmax probability distribution over the vocabulary

        Implementation details:
        - Reshapes 3D tensors to 2D for efficient computation
        - Uses PyTorch's optimized cross_entropy function
        - Handles padding tokens by ignoring them in the loss calculation
        - Computes the mean loss across all valid positions

        Why cross-entropy for language modeling:
        - Natural choice for multi-class classification (next-token prediction)
        - Provides a strong gradient signal for correct token probabilities
        - Mathematically equivalent to minimizing negative log-likelihood
        - Well-studied optimization properties for neural language models

        Args:
            logits: Raw model predictions of shape (batch_size, seq_len, vocab_size).
                Contains unnormalized scores for each token in the vocabulary;
                these are converted to probabilities via softmax internally.
            targets: Ground-truth next tokens of shape (batch_size, seq_len).
                Contains token IDs representing the true next tokens, i.e. the
                input sequence shifted by one position.

        Returns:
            torch.Tensor: Scalar loss value representing the prediction error.
                Lower values indicate better next-token prediction accuracy.
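
        Example (illustrative shapes only; values are random):
            >>> logits = torch.randn(2, 8, 100)          # (batch=2, seq_len=8, vocab=100)
            >>> targets = torch.randint(0, 100, (2, 8))  # next-token IDs
            >>> loss = trainer._calculate_loss(logits, targets)  # trainer: a ModelTrainer
            >>> loss.shape
            torch.Size([])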
        """
        # Flatten (batch, seq, vocab) -> (batch * seq, vocab) so that every
        # position is scored as an independent classification over the vocabulary.
        logits = logits.view(-1, logits.size(-1))
        targets = targets.view(-1)

        # ignore_index=-1 skips padding positions when averaging the loss.
        loss = nn.functional.cross_entropy(logits, targets, ignore_index=-1)

        return loss

    def _get_memory_usage(self) -> Dict[str, float]:
        """Get current memory usage statistics."""
        memory_stats = {}

        if torch.cuda.is_available() and self.device.startswith("cuda"):
            memory_stats["gpu_allocated_mb"] = torch.cuda.memory_allocated() / (1024**2)
            memory_stats["gpu_cached_mb"] = torch.cuda.memory_reserved() / (1024**2)

        # CPU resident-set size via psutil; skip the stat if psutil is missing
        # rather than crashing mid-training.
        try:
            import psutil

            process = psutil.Process()
            memory_stats["cpu_memory_mb"] = process.memory_info().rss / (1024**2)
        except ImportError:
            pass

        return memory_stats

    def _log_step(self, step: int, loss: float, lr: float, step_time: float) -> None:
        """Log training progress for a single optimizer step."""
        # Cap the exponent so early, high losses do not overflow to inf.
        perplexity = math.exp(min(loss, 10))

        # One optimizer step consumes gradient_accumulation_steps micro-batches,
        # and step_time (see train()) spans the whole accumulation window.
        tokens_per_step = (
            self.data_loader.batch_size
            * self.data_loader.seq_len
            * self.gradient_accumulation_steps
        )
        tokens_per_second = tokens_per_step / step_time if step_time > 0 else 0

        memory_stats = self._get_memory_usage()

        log_entry = {
            "step": step,
            "loss": loss,
            "perplexity": perplexity,
            "learning_rate": lr,
            "step_time": step_time,
            "tokens_per_second": tokens_per_second,
            "memory_mb": memory_stats.get("cpu_memory_mb", 0),
        }

        self.training_log.append(log_entry)

        # Rough ETA based on the most recent step time.
        eta_seconds = (self.max_steps - step) * step_time if step_time > 0 else 0
        eta_hours = eta_seconds / 3600

        print(
            f"Step {step:,}/{self.max_steps:,} | "
            f"Loss: {loss:.4f} | "
            f"PPL: {perplexity:.2f} | "
            f"LR: {lr:.2e} | "
            f"Time: {step_time:.2f}s | "
            f"Tokens/s: {tokens_per_second:.1f} | "
            f"Memory: {memory_stats.get('cpu_memory_mb', 0):.0f}MB | "
            f"ETA: {eta_hours:.1f}h"
        )

    def _save_checkpoint(self, step: int, is_best: bool = False) -> None:
        """Save model checkpoint."""
        checkpoint = {
            "step": step,
            "epoch": self.epoch,
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "scheduler_state_dict": self.scheduler.state_dict(),
            "best_loss": self.best_loss,
            "training_log": self.training_log,
            "config": self.model.config.__dict__,
        }

        checkpoint_path = self.output_dir / f"checkpoint_step_{step}.pt"
        torch.save(checkpoint, checkpoint_path)

        if is_best:
            best_path = self.output_dir / "best_model.pt"
            torch.save(checkpoint, best_path)
            print(f"💾 New best model saved: {best_path}")

        # Keep the JSON training log in sync with the latest checkpoint.
        log_path = self.output_dir / "training_log.json"
        with open(log_path, "w") as f:
            json.dump(self.training_log, f, indent=2)

        print(f"💾 Checkpoint saved: {checkpoint_path}")

    def _load_checkpoint(self, checkpoint_path: str) -> None:
        """Load model checkpoint to resume training."""
        if not os.path.exists(checkpoint_path):
            print(f"⚠️ Checkpoint not found: {checkpoint_path}")
            return

        print(f"📂 Loading checkpoint: {checkpoint_path}")

        checkpoint = torch.load(checkpoint_path, map_location=self.device)

        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        self.scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

        self.step = checkpoint["step"]
        self.epoch = checkpoint["epoch"]
        self.best_loss = checkpoint["best_loss"]
        self.training_log = checkpoint.get("training_log", [])

        print("✅ Checkpoint loaded successfully")
        print(f"   Resuming from step: {self.step:,}")
        print(f"   Best loss so far: {self.best_loss:.4f}")
    def train(self) -> None:
        """Main training loop."""
        print("\n🚀 Starting training...")
        print(f"   Model: {self.model.config.model_name}")
        print(f"   Parameters: {self.model.get_num_params():,}")
        print(f"   Device: {self.device}")
        print(f"   Max steps: {self.max_steps:,}")
        print("=" * 80)

        self.model.train()
        self.start_time = time.time()

        accumulated_loss = 0.0
        self.optimizer.zero_grad()
        # Timing starts at the beginning of each accumulation window so that
        # step_time covers all micro-batches of one optimizer step.
        step_start_time = time.time()
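
        # With main()'s defaults (batch_size=4, gradient_accumulation_steps=4), each
        # optimizer step sees an effective batch of 4 * 4 = 16 sequences (16 * seq_len
        # tokens), while only a single micro-batch is held in memory at a time.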

        for batch_idx, (input_ids, target_ids) in enumerate(self.data_loader):
            if self.step >= self.max_steps:
                break

            input_ids = input_ids.to(self.device)
            target_ids = target_ids.to(self.device)

            # Forward pass; the model returns both logits and the LM loss.
            logits, loss = self.model(input_ids, target_ids)

            # Scale so the summed gradients match the mean over the window.
            loss = loss / self.gradient_accumulation_steps
            accumulated_loss += loss.item()

            loss.backward()

            # Only step the optimizer once per accumulation window.
            if (batch_idx + 1) % self.gradient_accumulation_steps == 0:
                if self.gradient_clipping > 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.gradient_clipping)

                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

                self.step += 1
                step_time = time.time() - step_start_time
                step_start_time = time.time()
                self.step_times.append(step_time)

                current_lr = self.scheduler.get_last_lr()[0]

                if self.step % self.log_every == 0:
                    # accumulated_loss already averages the window, since each
                    # micro-loss was divided by gradient_accumulation_steps above.
                    self._log_step(self.step, accumulated_loss, current_lr, step_time)

                if self.step % self.save_every == 0:
                    # "Best" is judged on the most recent accumulation window.
                    is_best = accumulated_loss < self.best_loss
                    if is_best:
                        self.best_loss = accumulated_loss

                    self._save_checkpoint(self.step, is_best)

                # Periodic garbage collection keeps CPU memory in check.
                if self.step % 100 == 0:
                    gc.collect()

                accumulated_loss = 0.0

            if self.step >= self.max_steps:
                break

        print("\n🎉 Training completed!")
        # Save the final state; is_best=True also refreshes best_model.pt.
        self._save_checkpoint(self.step, is_best=True)

        total_time = time.time() - self.start_time
        avg_step_time = sum(self.step_times) / len(self.step_times) if self.step_times else 0

        print("\n📊 Training Summary:")
        print(f"   Steps completed: {self.step:,}")
        print(f"   Total time: {total_time/3600:.2f} hours")
        print(f"   Average time per step: {avg_step_time:.2f}s")
        print(f"   Best loss: {self.best_loss:.4f}")
        print(f"   Best perplexity: {math.exp(min(self.best_loss, 10)):.2f}")
        print(f"   Model saved to: {self.output_dir}")


def main():
    """Main function to handle command-line training."""
    parser = argparse.ArgumentParser(
        description="Train a GPT-style language model",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Train a small model for quick experimentation
  python core/src/train_model.py \\
      --model-size small \\
      --max-steps 5000 \\
      --output-dir models/test-small

  # Train a medium model with custom settings
  python core/src/train_model.py \\
      --model-size medium \\
      --learning-rate 1e-4 \\
      --batch-size 2 \\
      --max-steps 50000 \\
      --output-dir models/my-medium-model
        """,
    )

    parser.add_argument(
        "--model-size",
        choices=["small", "medium", "large"],
        default="small",
        help="Model size to train (default: small)",
    )

    parser.add_argument(
        "--data-file",
        default="data/clean/training_data.txt",
        help="Path to training text file (default: data/clean/training_data.txt)",
    )

    parser.add_argument(
        "--tokenizer-dir",
        default="data/tokenizer/",
        help="Path to tokenizer directory (default: data/tokenizer/)",
    )

    parser.add_argument(
        "--output-dir", required=True, help="Output directory for model checkpoints"
    )

    parser.add_argument(
        "--seq-len", type=int, default=512, help="Sequence length for training (default: 512)"
    )

    parser.add_argument("--batch-size", type=int, default=4, help="Batch size (default: 4)")

    parser.add_argument(
        "--learning-rate", type=float, default=3e-4, help="Learning rate (default: 3e-4)"
    )

    parser.add_argument(
        "--max-steps", type=int, default=10000, help="Maximum training steps (default: 10000)"
    )

    parser.add_argument(
        "--warmup-steps", type=int, default=1000, help="Warmup steps (default: 1000)"
    )

    parser.add_argument(
        "--gradient-accumulation-steps",
        type=int,
        default=4,
        help="Gradient accumulation steps (default: 4)",
    )

    parser.add_argument(
        "--device",
        choices=["cpu", "cuda", "auto"],
        default="auto",
        help="Training device (default: auto)",
    )

    parser.add_argument("--resume", help="Path to checkpoint to resume training from")

    parser.add_argument(
        "--save-every", type=int, default=1000, help="Save checkpoint every N steps (default: 1000)"
    )

    args = parser.parse_args()

print("๐ OpenLLM Model Training") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
if args.device == "auto": |
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
else: |
|
|
device = args.device |
|
|
|
|
|
print(f"Using device: {device}") |
|
|
|
|
|
    try:
        # Create the model.
        print(f"\n🏗️ Creating {args.model_size} model...")
        model = create_model(args.model_size)

        # Set up the data loader.
        print("\n📚 Setting up data loader...")
        tokenizer_path = os.path.join(args.tokenizer_dir, "tokenizer.model")

        data_loader = TextDataLoader(
            data_file=args.data_file,
            tokenizer_path=tokenizer_path,
            seq_len=args.seq_len,
            batch_size=args.batch_size,
            shuffle=True,
        )

        # Data statistics are computed here but not printed; the result is unused.
        _ = data_loader.get_data_stats()

        # Set up the trainer.
        print("\n🎯 Setting up trainer...")
        trainer = ModelTrainer(
            model=model,
            data_loader=data_loader,
            output_dir=args.output_dir,
            device=device,
            learning_rate=args.learning_rate,
            max_steps=args.max_steps,
            warmup_steps=args.warmup_steps,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            save_every=args.save_every,
        )

        # Optionally resume from a checkpoint.
        if args.resume:
            trainer._load_checkpoint(args.resume)

        # Run the training loop.
        trainer.train()

        print("\n🎉 Training completed successfully!")

    except Exception as e:
        print(f"\n❌ Training failed: {e}")
        import traceback

        traceback.print_exc()
        return False

    return True


if __name__ == "__main__":
    # Propagate main()'s boolean result as the process exit code.
    raise SystemExit(0 if main() else 1)