import os

import torch
from datasets import load_dataset

# NOTE: unsloth must be imported before trl/transformers so its runtime
# patches (fused kernels, memory optimizations) get applied to them.
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments


def main():
    """Run the Unsloth QLoRA fine-tuning pipeline for MVM².

    Steps: load a pre-quantized 4-bit Llama-3-8B, attach LoRA adapters,
    load the generated JSONL chat dataset, fine-tune with SFTTrainer,
    and save the LoRA adapters + tokenizer locally.

    Raises:
        RuntimeError: if no CUDA device is available (4-bit loading needs one).
        FileNotFoundError: if the training dataset has not been generated yet.
    """
    print("Initializing Unsloth QLoRA Fine-Tuning Pipeline for MVM²...")

    # Fail fast with a clear message: bitsandbytes 4-bit quantization and the
    # GPU-stat printing below both require a CUDA device.
    if not torch.cuda.is_available():
        raise RuntimeError(
            "No CUDA GPU detected. QLoRA 4-bit fine-tuning requires a CUDA device "
            "(e.g. RTX 3090/4090, T4)."
        )

    # Configuration
    max_seq_length = 2048  # Good default for math problems
    dtype = None  # None => Unsloth auto-detects Float16 vs Bfloat16
    load_in_4bit = True  # 4-bit quantization to fit on consumer GPUs (RTX 3090/4090, T4)
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit"  # Excellent 8B model, pre-quantized

    # 1. Load Model with Unsloth (up to 2x faster, 70% less VRAM)
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    # 2. Add LoRA Adapters.
    # We only train 1-5% of the weights, targeting the projection layers
    # that handle attention and the MLP (reasoning-relevant) paths.
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,  # LoRA rank
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0,  # Optimization: 0 is faster in Unsloth
        bias = "none",
        use_gradient_checkpointing = "unsloth",  # Unsloth's memory-efficient variant
        random_state = 3407,
        use_rslora = False,
        loftq_config = None,
    )

    # 3. Load the generated MVM2 dataset
    dataset_path = "models/local_mvm2_adapter/mvm2_training_data.jsonl"
    if not os.path.exists(dataset_path):
        raise FileNotFoundError(
            f"Missing dataset {dataset_path}. Run generate_math_dataset.py first!"
        )

    dataset = load_dataset('json', data_files=dataset_path, split='train')

    # Render each list of chat messages into a single training string using
    # the model's own chat template (batched for speed).
    def format_chatml(examples):
        texts = []
        for messages in examples["messages"]:
            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            texts.append(text)
        return {"text": texts}

    dataset = dataset.map(format_chatml, batched=True)

    # 4. Supervised Fine-Tuning (SFT) Trainer
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        # Packing disabled; enabling it can make training ~5x faster for
        # short sequences, but concatenates examples into one context.
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,  # effective batch size = 8
            warmup_steps = 5,
            max_steps = 60,  # Set to roughly 1-2 epochs based on dataset size for real training
            learning_rate = 2e-4,
            # Prefer bf16 on GPUs that support it; otherwise fall back to fp16.
            fp16 = not torch.cuda.is_bf16_supported(),
            bf16 = torch.cuda.is_bf16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",  # 8-bit optimizer saves VRAM
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
        ),
    )

    # 5. Start Training
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    print("\nStarting QLoRA Fine-Tuning...")
    trainer_stats = trainer.train()
    # TrainOutput.metrics includes 'train_runtime' (seconds) per HF Trainer;
    # use .get so a missing key never crashes the save step below.
    runtime = trainer_stats.metrics.get("train_runtime")
    if runtime is not None:
        print(f"Training finished in {runtime:.1f} seconds.")

    # 6. Save Model (adapters only — the 4-bit base model is unchanged)
    save_path = "models/local_mvm2_adapter/lora_model"
    print(f"\nSaving LoRA adapters to {save_path}...")
    model.save_pretrained(save_path)  # Local saving
    tokenizer.save_pretrained(save_path)

    print("\n✅ Fine-Tuning Complete! You can now run the MVM2 Engine completely offline by switching 'use_local_model=True' in llm_agent.py.")


if __name__ == "__main__":
    main()