from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import json

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_LENGTH = 512

# Load tokenizer and model
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Improved LoRA configuration
lora_config = LoraConfig(
    r=16,                       # Increased from 8 for better capacity
    lora_alpha=32,              # Increased from 16
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # More modules
    lora_dropout=0.1,           # Increased for better regularization
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load and split dataset
print("Loading dataset...")
dataset = load_dataset("json", data_files="train.jsonl")

# Split into train/validation (80/20)
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

def tokenize_function(examples):
    """Tokenize the examples with proper formatting"""
    texts = []
    for messages in examples["messages"]:
        # Apply chat template
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        texts.append(text)

    # Tokenize with padding and truncation
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors=None
    )

    # Labels are the same as input_ids for causal LM
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize datasets
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
tokenized_eval = eval_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)

# Improved training arguments
training_args = TrainingArguments(
    output_dir="./brad-ai-lora",

    # Training hyperparameters
    num_train_epochs=5,              # Increased from 3
    per_device_train_batch_size=2,   # Increased from 1
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # Effective batch size = 8

    # Learning rate and scheduling
    learning_rate=3e-4,              # Slightly increased
    lr_scheduler_type="cosine",      # Better than default
    warmup_ratio=0.1,                # Warmup for 10% of training

    # Optimization
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Logging and evaluation
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,              # Keep only best 3 checkpoints

    # Performance
    fp16=True,                       # Mixed precision training
    dataloader_num_workers=2,

    # Monitoring
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Misc
    report_to="none",                # Change to "tensorboard" if you want logging
    seed=42
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer
)

# Train the model
print("Starting training...")
trainer.train()

# Save the final model
print("Saving model...")
trainer.save_model("./brad-ai-lora-final")
tokenizer.save_pretrained("./brad-ai-lora-final")

# Evaluate final model
print("Final evaluation:")
eval_results = trainer.evaluate()
print(eval_results)

print("Training complete!")
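# --- Optional: quick inference check with the trained adapter ---
# A minimal sketch, not part of the training run above. It assumes the adapter
# was saved to ./brad-ai-lora-final by trainer.save_model() and loads it back
# onto a fresh copy of the base model via peft's PeftModel for one test
# generation. The test prompt below is hypothetical; swap in a message that
# matches your training data.
#
# from peft import PeftModel
#
# base_model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype=torch.float16,
#     device_map="auto"
# )
# tuned_model = PeftModel.from_pretrained(base_model, "./brad-ai-lora-final")
# tuned_model.eval()
#
# messages = [{"role": "user", "content": "Hello, who are you?"}]
# prompt = tokenizer.apply_chat_template(
#     messages, tokenize=False, add_generation_prompt=True
# )
# inputs = tokenizer(prompt, return_tensors="pt").to(tuned_model.device)
#
# with torch.no_grad():
#     outputs = tuned_model.generate(
#         **inputs,
#         max_new_tokens=128,
#         do_sample=True,
#         temperature=0.7
#     )
#
# # Strip the prompt tokens so only the generated reply is printed
# reply = tokenizer.decode(
#     outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
# )
# print(reply)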