# brad-ai-1.12.2x/train_lora.py: LoRA fine-tuning of Qwen/Qwen2.5-0.5B-Instruct
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_LENGTH = 512
# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Improved LoRA configuration
lora_config = LoraConfig(
    r=16,              # Increased from 8 for more adapter capacity
    lora_alpha=32,     # Increased from 16 (keeps the alpha/r scaling at 2)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # All attention and MLP projections
    lora_dropout=0.1,  # Increased for stronger regularization
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
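# Only the low-rank adapter matrices are trainable; the base model weights stay
# frozen, which is what the parameter counts printed above reflect.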
# Load and split dataset
print("Loading dataset...")
dataset = load_dataset("json", data_files="train.jsonl")
# Split into train/validation (80/20)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")
def tokenize_function(examples):
    """Tokenize chat examples using the model's chat template."""
    texts = []
    for messages in examples["messages"]:
        # Render the message list into a single training string
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(text)
    # Tokenize with padding and truncation
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None,
    )
    # Labels mirror input_ids for causal LM, but padding positions are masked
    # with -100 so they do not contribute to the loss
    tokenized["labels"] = [
        [token if mask == 1 else -100 for token, mask in zip(ids, attention)]
        for ids, attention in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
# Tokenize datasets
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names,
)
# Improved training arguments
training_args = TrainingArguments(
    output_dir="./brad-ai-lora",
    # Training hyperparameters
    num_train_epochs=5,             # Increased from 3
    per_device_train_batch_size=2,  # Increased from 1
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 2 * 4 = 8
    # Learning rate and scheduling
    learning_rate=3e-4,             # Slightly increased
    lr_scheduler_type="cosine",     # Cosine decay instead of the default linear schedule
    warmup_ratio=0.1,               # Warm up over the first 10% of training steps
    # Optimization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Logging and evaluation
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,             # Keep at most 3 recent checkpoints (plus the best one)
    # Performance
    fp16=True,                      # Mixed-precision training
    dataloader_num_workers=2,
    # Monitoring
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Misc
    report_to="none",               # Change to "tensorboard" to enable logging
    seed=42,
)
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)
# Train the model
print("Starting training...")
trainer.train()
# Save the final model
print("Saving model...")
trainer.save_model("./brad-ai-lora-final")
tokenizer.save_pretrained("./brad-ai-lora-final")
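# Note: for a PEFT-wrapped model, save_model() writes only the LoRA adapter
# weights and adapter_config.json, not the full base model.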
# Evaluate final model
print("Final evaluation:")
eval_results = trainer.evaluate()
print(eval_results)
print("Training complete!")