# brad-ai-1.12.2x/train_lora.py: LoRA fine-tuning of Qwen/Qwen2.5-0.5B-Instruct
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_LENGTH = 512
# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Improved LoRA configuration
lora_config = LoraConfig(
    r=16,              # Increased from 8 for more adapter capacity
    lora_alpha=32,     # Increased from 16 (keeps the alpha/r scaling at 2)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # All attention and MLP projections
    lora_dropout=0.1,  # Increased for stronger regularization
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
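# Only the low-rank adapter matrices are trainable; the base model weights stay
# frozen, which is what the parameter counts printed above reflect.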
# Load and split dataset
print("Loading dataset...")
dataset = load_dataset("json", data_files="train.jsonl")
# Split into train/validation (80/20)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")
def tokenize_function(examples):
    """Tokenize chat examples using the model's chat template."""
    texts = []
    for messages in examples["messages"]:
        # Render the message list into a single training string
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(text)
    # Tokenize with padding and truncation
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None,
    )
    # Labels mirror input_ids for causal LM, but padding positions are masked
    # with -100 so they do not contribute to the loss
    tokenized["labels"] = [
        [token if mask == 1 else -100 for token, mask in zip(ids, attention)]
        for ids, attention in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
# Tokenize datasets
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names,
)
# Improved training arguments
training_args = TrainingArguments(
    output_dir="./brad-ai-lora",
    # Training hyperparameters
    num_train_epochs=5,             # Increased from 3
    per_device_train_batch_size=2,  # Increased from 1
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 2 * 4 = 8
    # Learning rate and scheduling
    learning_rate=3e-4,             # Slightly increased
    lr_scheduler_type="cosine",     # Cosine decay instead of the default linear schedule
    warmup_ratio=0.1,               # Warm up over the first 10% of training steps
    # Optimization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Logging and evaluation
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,             # Keep at most 3 recent checkpoints (plus the best one)
    # Performance
    fp16=True,                      # Mixed-precision training
    dataloader_num_workers=2,
    # Monitoring
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Misc
    report_to="none",               # Change to "tensorboard" to enable logging
    seed=42,
)
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)
# Train the model
print("Starting training...")
trainer.train()
# Save the final model
print("Saving model...")
trainer.save_model("./brad-ai-lora-final")
tokenizer.save_pretrained("./brad-ai-lora-final")
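# Note: for a PEFT-wrapped model, save_model() writes only the LoRA adapter
# weights and adapter_config.json, not the full base model.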
# Evaluate final model
print("Final evaluation:")
eval_results = trainer.evaluate()
print(eval_results)
print("Training complete!")