""" Model training script for financial LLM fine-tuning """ import torch import json from datetime import datetime from transformers import ( AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, default_data_collator, BitsAndBytesConfig ) from peft import LoraConfig, get_peft_model, TaskType def setup_model_and_tokenizer(config): """Setup model and tokenizer with quantization""" # Speed: enable TF32 on Ampere (A100) try: torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True print("✅ TF32 enabled for faster matmul") except Exception: pass # Clear GPU cache and check memory torch.cuda.empty_cache() total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9 allocated_memory = torch.cuda.memory_allocated() / 1e9 free_memory = total_memory - allocated_memory print(f"🔋 A100 Memory Status:") print(f" Total: {total_memory:.1f} GB") print(f" Free: {free_memory:.1f} GB") if free_memory < 15: print("⚠️ Warning: Low GPU memory, consider clearing cache") # Determine quantization mode (default to 8bit to avoid OOM) quantization = config.get("quantization") if quantization is None: quantization = "8bit" print(f"⚙️ Quantization mode: {quantization}") # Quantization config bnb_config = None if quantization == "4bit": bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, ) elif quantization == "8bit": bnb_config = BitsAndBytesConfig(load_in_8bit=True) # Load tokenizer print(f"Loading tokenizer: {config['model_name']}") tokenizer = AutoTokenizer.from_pretrained(config['model_name']) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right" # Ensure consistent padding # Load model print(f"Loading model: {config['model_name']}") model_kwargs = dict( device_map={"": 0}, # Force all layers to GPU 0 trust_remote_code=True, torch_dtype=torch.bfloat16, ) # Attention implementation selection: try flash-attn v2, else SDPA, else eager attn_pref = config.get("attn_impl") # "flash" | "sdpa" | "eager" | None chosen_attn = None if attn_pref == "flash": try: import flash_attn # noqa: F401 chosen_attn = "flash_attention_2" except Exception: print("⚠️ flash-attn not available; falling back to SDPA") chosen_attn = "sdpa" elif attn_pref == "sdpa": chosen_attn = "sdpa" elif attn_pref == "eager": chosen_attn = "eager" else: # Auto: prefer flash if importable, otherwise SDPA try: import flash_attn # noqa: F401 chosen_attn = "flash_attention_2" except Exception: chosen_attn = "sdpa" # Pass down to Transformers if supported (>=4.39 for Llama) model_kwargs["attn_implementation"] = chosen_attn print(f"✅ Attention implementation: {chosen_attn}") if bnb_config is not None: model_kwargs["quantization_config"] = bnb_config model = AutoModelForCausalLM.from_pretrained( config['model_name'], **model_kwargs, ) model.config.use_cache = False model.config.pretraining_tp = 1 # Ensure pad token id is set for training/eval if getattr(model.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None: model.config.pad_token_id = tokenizer.pad_token_id # Enable gradient checkpointing on the model to reduce memory try: if config.get('gradient_checkpointing', True): model.gradient_checkpointing_enable() print("✅ Model gradient checkpointing enabled") except Exception: pass # Check memory usage after model loading allocated_after = torch.cuda.memory_allocated() / 1e9 total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9 usage_percent = (allocated_after / total_memory) * 100 print(f"Model loaded successfully!") print(f"Model parameters: {model.num_parameters():,}") print(f"🔋 GPU Memory after loading: {allocated_after:.1f}/{total_memory:.1f} GB ({usage_percent:.1f}%)") if usage_percent > 85: print("⚠️ Warning: High GPU memory usage! Consider reducing batch size.") else: print("✅ GPU memory usage looks good for training!") return model, tokenizer def setup_lora(model, config): """Setup LoRA for efficient fine-tuning""" # LoRA configuration # Determine target modules based on model architecture if "DialoGPT" in config['model_name']: target_modules = ["c_attn", "c_proj"] elif "Llama" in config['model_name'] or "llama" in config['model_name']: # Llama 3.1 architecture - target all attention and MLP layers target_modules = [ "q_proj", "k_proj", "v_proj", "o_proj", # Attention layers "gate_proj", "up_proj", "down_proj" # MLP layers ] else: # Default for other transformer models target_modules = ["q_proj", "v_proj"] # Read LoRA hyperparameters with safe defaults lora_r = int(config.get('lora_r', 16)) lora_alpha = int(config.get('lora_alpha', 32)) lora_dropout = float(config.get('lora_dropout', 0.1)) lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, r=lora_r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, target_modules=target_modules, bias="none", ) # Apply LoRA to model model = get_peft_model(model, lora_config) model.print_trainable_parameters() print(f"LoRA configuration applied successfully!") print(f"Target modules: {target_modules}") print(f"LoRA params → r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}") return model def tokenize_dataset(dataset, tokenizer, config): """Tokenize the dataset""" def tokenize_function(examples): """Tokenize the texts""" # Tokenize with consistent padding and truncation tokenized = tokenizer( examples["text"], truncation=True, padding="max_length", max_length=config['max_length'], return_tensors=None, add_special_tokens=True, ) # Set labels (for causal LM, labels = input_ids) # Make sure labels are exactly the same as input_ids tokenized["labels"] = tokenized["input_ids"].copy() return tokenized # Apply tokenization print("Tokenizing dataset...") tokenized_dataset = dataset.map( tokenize_function, batched=True, remove_columns=dataset["train"].column_names, desc="Tokenizing", ) print("Tokenization complete!") # Debug: Check tokenized sample shapes sample = tokenized_dataset["train"][0] print(f"✅ Sample tokenized input_ids shape: {len(sample['input_ids'])}") print(f"✅ Sample tokenized labels shape: {len(sample['labels'])}") print(f"✅ Max length setting: {config['max_length']}") return tokenized_dataset def setup_training(model, tokenizer, tokenized_dataset, config): """Setup training arguments and trainer""" # Use default data collator since we're pre-padding during tokenization data_collator = default_data_collator import transformers transformers_version = transformers.__version__ print(f"🔧 Transformers version: {transformers_version}") use_eval_strategy = hasattr(TrainingArguments, '__dataclass_fields__') and \ 'eval_strategy' in str(TrainingArguments.__dataclass_fields__) eval_param_name = "eval_strategy" if use_eval_strategy else "evaluation_strategy" training_args_dict = { "output_dir": config['output_dir'], "per_device_train_batch_size": config['train_batch_size'], "per_device_eval_batch_size": config['eval_batch_size'], "gradient_accumulation_steps": config['gradient_accumulation_steps'], "num_train_epochs": config['num_epochs'], "learning_rate": config['learning_rate'], "logging_steps": config.get('logging_steps', 25), eval_param_name: "steps", "eval_steps": config.get('eval_steps', 50), # Save checkpoints frequently enough; default aligns with eval steps "save_steps": config.get('save_steps', config.get('eval_steps', 100)), "save_total_limit": 2, "remove_unused_columns": False, "push_to_hub": False, "report_to": None, "load_best_model_at_end": True, "group_by_length": True, "warmup_ratio": config.get('warmup_ratio', 0.03), "weight_decay": config.get('weight_decay', 0.01), "max_grad_norm": config.get('max_grad_norm', 1.0), "lr_scheduler_type": "cosine", "dataloader_num_workers": config.get('dataloader_num_workers', 2), "dataloader_pin_memory": True, "skip_memory_metrics": True, "log_level": "warning", "include_inputs_for_metrics": False, "prediction_loss_only": True, "gradient_checkpointing": config.get('gradient_checkpointing', True), } # Optionally force alignment so a checkpoint is always written at each eval step # This helps ensure the current best (by eval loss) has a corresponding checkpoint if config.get('align_save_with_eval', True): training_args_dict["save_steps"] = training_args_dict.get("eval_steps", training_args_dict.get("save_steps", 100)) use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported() if use_bf16: training_args_dict["bf16"] = True training_args_dict["fp16"] = False print("✅ Using bf16 precision") else: training_args_dict["fp16"] = True print("✅ Using fp16 precision") print(f"✅ Using {eval_param_name} parameter for evaluation") training_args = TrainingArguments(**training_args_dict) trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_dataset["train"], eval_dataset=tokenized_dataset["validation"], data_collator=data_collator, ) print("Trainer initialized!") print(f"Training samples: {len(tokenized_dataset['train'])}") print(f"Validation samples: {len(tokenized_dataset['validation'])}") # Validate data shapes to prevent tensor errors print("🔍 Validating data shapes...") train_sample = tokenized_dataset["train"][0] val_sample = tokenized_dataset["validation"][0] print(f"✅ Train sample - input_ids: {len(train_sample['input_ids'])}, labels: {len(train_sample['labels'])}") print(f"✅ Val sample - input_ids: {len(val_sample['input_ids'])}, labels: {len(val_sample['labels'])}") # Check a few more samples to ensure consistency for i in range(min(3, len(tokenized_dataset['train']))): sample = tokenized_dataset['train'][i] if len(sample['input_ids']) != config['max_length']: print(f"⚠️ Warning: Sample {i} has inconsistent length: {len(sample['input_ids'])} != {config['max_length']}") if len(sample['input_ids']) != len(sample['labels']): print(f"⚠️ Warning: Sample {i} input_ids and labels length mismatch: {len(sample['input_ids'])} != {len(sample['labels'])}") print("✅ Data validation complete!") return trainer def save_model_and_config(model, tokenizer, trainer, config): """Save the trained model and configuration""" print("Saving model...") # Save LoRA adapter trainer.save_model(config['save_dir']) tokenizer.save_pretrained(config['save_dir']) # Save configuration config_data = { "base_model": config['model_name'], "dataset": config['dataset_name'], "dataset_config": config['dataset_config'], "training_config": config, "lora_config": { "r": config['lora_r'], "alpha": config['lora_alpha'], "dropout": config['lora_dropout'] }, "training_date": datetime.now().isoformat() } with open(f"{config['save_dir']}/training_config.json", "w") as f: json.dump(config_data, f, indent=2, default=str) print(f"Model saved to {config['save_dir']}") # Evaluate on validation set print("Evaluating model on validation set...") test_results = trainer.evaluate() # Save evaluation results with open(f"{config['save_dir']}/test_results.json", "w") as f: json.dump(test_results, f, indent=2) print(f"Evaluation complete! Results saved to {config['save_dir']}/test_results.json") return test_results def run_training(config, processed_dataset): """Run the complete training pipeline""" print("🚀 Starting financial LLM fine-tuning...") print(f"Base model: {config['model_name']}") print(f"Dataset: {config['dataset_name']}") print(f"Training samples: {len(processed_dataset['train'])}") # Setup model and tokenizer model, tokenizer = setup_model_and_tokenizer(config) # Apply LoRA model = setup_lora(model, config) # Tokenize dataset tokenized_dataset = tokenize_dataset(processed_dataset, tokenizer, config) # Setup training trainer = setup_training(model, tokenizer, tokenized_dataset, config) # Start training print("Starting training...") print(f"Training will run for {config['num_epochs']} epochs") print(f"Effective batch size: {config['train_batch_size'] * config['gradient_accumulation_steps']}") trainer.train() print("Training completed!") # Save model and evaluate test_results = save_model_and_config(model, tokenizer, trainer, config) print("🎉 Fine-tuning complete! 🎉") print(f"✅ Model saved to: {config['save_dir']}") print(f"✅ Test perplexity: {test_results.get('eval_loss', 'N/A'):.4f}") return model, tokenizer, trainer if __name__ == "__main__": # Test configuration test_config = { "model_name": "microsoft/DialoGPT-medium", "dataset_name": "Josephgflowers/Finance-Instruct-500k", "dataset_config": "default", "max_length": 512, "train_batch_size": 2, "eval_batch_size": 2, "gradient_accumulation_steps": 8, "learning_rate": 2e-4, "num_epochs": 1, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.1, "output_dir": "./test-financial-lora", "save_dir": "./test-financial-final", "quantization": "8bit", # options: none | 8bit | 4bit "save_steps": 100, "eval_steps": 50, "logging_steps": 25, "gradient_checkpointing": True, "dataloader_num_workers": 2, # Added for testing } print("Testing training pipeline...") # This would require the processed dataset # model, tokenizer, trainer = run_training(test_config, processed_dataset) print("Training pipeline setup complete!")