from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import json
# Base model to fine-tune: instruction-tuned Qwen2.5 with 0.5B parameters.
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# Fixed sequence length for training examples; longer inputs are truncated,
# shorter ones padded up to this length (see tokenize_function below).
MAX_LENGTH = 512
# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Only fall back to EOS as the pad token when the tokenizer defines none.
# Unconditionally overwriting pad_token with eos_token makes padding
# indistinguishable from end-of-sequence, which (unless labels are masked)
# trains the model to emit EOS at every padded position.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half precision to reduce GPU memory use
    device_map="auto"           # let accelerate place layers automatically
)
# Improved LoRA configuration
lora_config = LoraConfig(
    r=16,  # LoRA rank — increased from 8 for better capacity
    lora_alpha=32,  # scaling factor — increased from 16 (alpha/r = 2 kept constant)
    # Adapt all attention projections plus the MLP projections, not just q/v.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # More modules
    lora_dropout=0.1,  # Increased for better regularization
    bias="none",  # do not train bias terms
    task_type="CAUSAL_LM"
)
# Wrap the base model with the LoRA adapters; only adapter weights are trainable.
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable vs. frozen.
model.print_trainable_parameters()
# Load and split dataset
print("Loading dataset...")
# Expects a local train.jsonl where each record has a "messages" field
# (see tokenize_function below).
dataset = load_dataset("json", data_files="train.jsonl")
# Split into train/validation (80/20); fixed seed for a reproducible split.
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")
def tokenize_function(examples):
    """Convert a batch of chat examples into model-ready token dicts.

    Args:
        examples: A batched dataset slice; ``examples["messages"]`` is a
            list of conversations, each in the message-list format expected
            by ``tokenizer.apply_chat_template``.

    Returns:
        A dict with ``input_ids``, ``attention_mask``, and ``labels``.
        Labels mirror ``input_ids``, except padding positions are set to
        -100 so the loss ignores them.
    """
    # Render each conversation to a single training string via the model's
    # chat template (complete dialogues, so no generation prompt appended).
    texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        for messages in examples["messages"]
    ]
    # Tokenize with truncation and fixed-length padding.
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None
    )
    # Causal-LM labels are the input ids, but padding must be masked with
    # -100 (the loss ignore index). Without this — and because pad_token is
    # set to eos_token in this script — every padded slot would train the
    # model to predict EOS, biasing it toward premature termination.
    tokenized["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
# Tokenize datasets
print("Tokenizing datasets...")
# Map in batches for speed; drop the raw columns so only the tokenizer's
# output fields (input_ids, attention_mask, labels) remain for the Trainer.
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)
# Improved training arguments
training_args = TrainingArguments(
    output_dir="./brad-ai-lora",
    # Training hyperparameters
    num_train_epochs=5,  # Increased from 3
    per_device_train_batch_size=2,  # Increased from 1
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 8
    # Learning rate and scheduling
    learning_rate=3e-4,  # Slightly increased
    lr_scheduler_type="cosine",  # Cosine decay instead of the default linear
    warmup_ratio=0.1,  # Warmup for 10% of training
    # Optimization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,  # gradient clipping
    # Logging and evaluation — eval/save every 50 steps, aligned so that
    # load_best_model_at_end can pick a saved checkpoint.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,  # Keep at most 3 checkpoints on disk (oldest are deleted)
    # Performance
    fp16=True,  # Mixed precision training
    dataloader_num_workers=2,
    # Monitoring — reload the checkpoint with the lowest eval_loss when done.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    # Misc
    report_to="none",  # Change to "tensorboard" if you want logging
    seed=42
)
# Create trainer
# NOTE(review): no data_collator is passed, so the Trainer uses its default
# collator — fine here because tokenize_function already pads to a fixed
# length and builds labels. The DataCollatorForLanguageModeling import at
# the top of the file is unused and could be removed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # NOTE(review): newer transformers versions deprecate `tokenizer=` in
    # favor of `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer
)
# Train the model
print("Starting training...")
trainer.train()
# Save the final model — with PEFT wrapped models this writes the LoRA
# adapter weights (plus config) rather than the full base model.
print("Saving model...")
trainer.save_model("./brad-ai-lora-final")
# Save the tokenizer alongside so the directory is self-contained for inference.
tokenizer.save_pretrained("./brad-ai-lora-final")
# Evaluate final model (the best checkpoint, since load_best_model_at_end=True).
print("Final evaluation:")
eval_results = trainer.evaluate()
print(eval_results)
print("Training complete!")