# Llama_for_Finance / train_model.py
# Uploaded by TimberGu via huggingface_hub
# ("Upload train_model.py with huggingface_hub"), revision 5106722 (verified).
"""
Model training script for financial LLM fine-tuning
"""
import torch
import json
from datetime import datetime
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
default_data_collator,
BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType
def setup_model_and_tokenizer(config):
    """Load the base model and tokenizer described by *config*.

    Enables TF32, reports GPU memory, selects a quantization mode
    (defaults to 8-bit when config omits "quantization"), chooses an
    attention implementation (flash-attn v2 / SDPA / eager), and loads the
    model onto GPU 0 in bfloat16.

    NOTE(review): requires a CUDA device — the memory probes call
    torch.cuda APIs unconditionally and will raise on CPU-only machines.

    Args:
        config: dict with at least 'model_name'; optional keys
            'quantization' ("4bit" | "8bit" | other → disabled),
            'attn_impl' ("flash" | "sdpa" | "eager" | None for auto),
            'gradient_checkpointing' (default True).

    Returns:
        (model, tokenizer) tuple.
    """
    # Speed: enable TF32 on Ampere (A100); wrapped in try so older
    # torch builds without these attributes don't crash setup
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("โœ… TF32 enabled for faster matmul")
    except Exception:
        pass
    # Clear GPU cache and report memory headroom before loading
    torch.cuda.empty_cache()
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    allocated_memory = torch.cuda.memory_allocated() / 1e9
    free_memory = total_memory - allocated_memory
    print(f"๐Ÿ”‹ A100 Memory Status:")
    print(f" Total: {total_memory:.1f} GB")
    print(f" Free: {free_memory:.1f} GB")
    if free_memory < 15:
        print("โš ๏ธ Warning: Low GPU memory, consider clearing cache")
    # Determine quantization mode (default to 8bit to avoid OOM)
    quantization = config.get("quantization")
    if quantization is None:
        quantization = "8bit"
    print(f"โš™๏ธ Quantization mode: {quantization}")
    # Build bitsandbytes config; any other value (e.g. "none") leaves it None
    bnb_config = None
    if quantization == "4bit":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8bit":
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    # Load tokenizer
    print(f"Loading tokenizer: {config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    if tokenizer.pad_token is None:
        # Reuse EOS as padding so batching works for models without a pad token
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"  # Ensure consistent padding
    # Load model
    print(f"Loading model: {config['model_name']}")
    model_kwargs = dict(
        device_map={"": 0},  # Force all layers to GPU 0
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    # Attention implementation selection: try flash-attn v2, else SDPA, else eager
    attn_pref = config.get("attn_impl")  # "flash" | "sdpa" | "eager" | None
    chosen_attn = None
    if attn_pref == "flash":
        try:
            import flash_attn  # noqa: F401
            chosen_attn = "flash_attention_2"
        except Exception:
            print("โš ๏ธ flash-attn not available; falling back to SDPA")
            chosen_attn = "sdpa"
    elif attn_pref == "sdpa":
        chosen_attn = "sdpa"
    elif attn_pref == "eager":
        chosen_attn = "eager"
    else:
        # Auto: prefer flash if importable, otherwise SDPA
        try:
            import flash_attn  # noqa: F401
            chosen_attn = "flash_attention_2"
        except Exception:
            chosen_attn = "sdpa"
    # Pass down to Transformers if supported (>=4.39 for Llama)
    model_kwargs["attn_implementation"] = chosen_attn
    print(f"โœ… Attention implementation: {chosen_attn}")
    if bnb_config is not None:
        model_kwargs["quantization_config"] = bnb_config
    model = AutoModelForCausalLM.from_pretrained(
        config['model_name'],
        **model_kwargs,
    )
    # KV cache is useless during training and conflicts with checkpointing
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    # Ensure pad token id is set for training/eval
    if getattr(model.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id
    # Enable gradient checkpointing on the model to reduce memory;
    # best-effort: some architectures don't expose this hook
    try:
        if config.get('gradient_checkpointing', True):
            model.gradient_checkpointing_enable()
            print("โœ… Model gradient checkpointing enabled")
    except Exception:
        pass
    # Check memory usage after model loading
    allocated_after = torch.cuda.memory_allocated() / 1e9
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    usage_percent = (allocated_after / total_memory) * 100
    print(f"Model loaded successfully!")
    print(f"Model parameters: {model.num_parameters():,}")
    print(f"๐Ÿ”‹ GPU Memory after loading: {allocated_after:.1f}/{total_memory:.1f} GB ({usage_percent:.1f}%)")
    if usage_percent > 85:
        print("โš ๏ธ Warning: High GPU memory usage! Consider reducing batch size.")
    else:
        print("โœ… GPU memory usage looks good for training!")
    return model, tokenizer
def setup_lora(model, config):
    """Attach a LoRA adapter to *model* for parameter-efficient fine-tuning.

    Target modules are selected from the architecture implied by
    ``config['model_name']``; LoRA hyperparameters are read from ``config``
    with safe defaults (r=16, alpha=32, dropout=0.1).

    Args:
        model: the loaded base causal-LM model.
        config: dict with 'model_name' and optional 'lora_r',
            'lora_alpha', 'lora_dropout'.

    Returns:
        The PEFT-wrapped model.
    """
    model_name = config['model_name']
    # Determine target modules based on model architecture
    if "DialoGPT" in model_name:
        # GPT-2-style fused attention/projection layer names
        target_modules = ["c_attn", "c_proj"]
    elif "llama" in model_name.lower():
        # Fix: case-insensitive match also covers "LLaMA"/"LLAMA" repo ids,
        # which the previous "Llama"/"llama" substring pair missed.
        # Llama architecture - target all attention and MLP layers
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",  # Attention layers
            "gate_proj", "up_proj", "down_proj",     # MLP layers
        ]
    else:
        # Conservative default for other transformer models
        target_modules = ["q_proj", "v_proj"]
    # Read LoRA hyperparameters with safe defaults
    lora_r = int(config.get('lora_r', 16))
    lora_alpha = int(config.get('lora_alpha', 32))
    lora_dropout = float(config.get('lora_dropout', 0.1))
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
    )
    # Apply LoRA to model
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    print(f"LoRA configuration applied successfully!")
    print(f"Target modules: {target_modules}")
    print(f"LoRA params โ†’ r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
    return model
def tokenize_dataset(dataset, tokenizer, config):
    """Convert the raw text dataset into fixed-length token sequences.

    Each example is truncated/padded to ``config['max_length']`` and the
    labels are a copy of the input ids (standard causal-LM objective).

    Returns the tokenized dataset with columns input_ids / attention_mask /
    labels only (original columns are dropped).
    """
    max_len = config['max_length']

    def _encode(batch):
        # Fixed-length padding keeps every row the same shape, so the
        # default (non-padding) data collator can be used downstream.
        encoded = tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors=None,
            add_special_tokens=True,
        )
        # Causal LM objective: labels mirror the inputs exactly.
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        _encode,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Tokenizing",
    )
    print("Tokenization complete!")

    # Sanity-check one tokenized sample against the configured length.
    sample = tokenized_dataset["train"][0]
    print(f"โœ… Sample tokenized input_ids shape: {len(sample['input_ids'])}")
    print(f"โœ… Sample tokenized labels shape: {len(sample['labels'])}")
    print(f"โœ… Max length setting: {max_len}")
    return tokenized_dataset
def setup_training(model, tokenizer, tokenized_dataset, config):
    """Build TrainingArguments and a Trainer for the tokenized dataset.

    - Picks the correct evaluation-strategy keyword for the installed
      transformers version (`evaluation_strategy` was renamed to
      `eval_strategy`).
    - Chooses bf16 when the GPU supports it, otherwise fp16.
    - Optionally aligns save_steps with eval_steps so every evaluation
      point has a checkpoint (needed for load_best_model_at_end).
    - Validates tokenized sample lengths before returning.

    Returns:
        A configured transformers.Trainer.
    """
    # Use default data collator since we're pre-padding during tokenization
    data_collator = default_data_collator
    import transformers
    transformers_version = transformers.__version__
    print(f"๐Ÿ”ง Transformers version: {transformers_version}")
    # Fix: check exact field-name membership instead of substring-matching
    # the *stringified* dataclass fields — str(__dataclass_fields__) also
    # contains defaults and help text, so the old substring test could
    # produce false positives on unrelated text.
    dataclass_fields = getattr(TrainingArguments, '__dataclass_fields__', {})
    use_eval_strategy = 'eval_strategy' in dataclass_fields
    eval_param_name = "eval_strategy" if use_eval_strategy else "evaluation_strategy"
    training_args_dict = {
        "output_dir": config['output_dir'],
        "per_device_train_batch_size": config['train_batch_size'],
        "per_device_eval_batch_size": config['eval_batch_size'],
        "gradient_accumulation_steps": config['gradient_accumulation_steps'],
        "num_train_epochs": config['num_epochs'],
        "learning_rate": config['learning_rate'],
        "logging_steps": config.get('logging_steps', 25),
        eval_param_name: "steps",
        "eval_steps": config.get('eval_steps', 50),
        # Save checkpoints frequently enough; default aligns with eval steps
        "save_steps": config.get('save_steps', config.get('eval_steps', 100)),
        "save_total_limit": 2,
        "remove_unused_columns": False,
        "push_to_hub": False,
        "report_to": None,
        "load_best_model_at_end": True,
        "group_by_length": True,
        "warmup_ratio": config.get('warmup_ratio', 0.03),
        "weight_decay": config.get('weight_decay', 0.01),
        "max_grad_norm": config.get('max_grad_norm', 1.0),
        "lr_scheduler_type": "cosine",
        "dataloader_num_workers": config.get('dataloader_num_workers', 2),
        "dataloader_pin_memory": True,
        "skip_memory_metrics": True,
        "log_level": "warning",
        "include_inputs_for_metrics": False,
        "prediction_loss_only": True,
        "gradient_checkpointing": config.get('gradient_checkpointing', True),
    }
    # Optionally force alignment so a checkpoint is always written at each eval step
    # This helps ensure the current best (by eval loss) has a corresponding checkpoint
    if config.get('align_save_with_eval', True):
        training_args_dict["save_steps"] = training_args_dict.get("eval_steps", training_args_dict.get("save_steps", 100))
    # Precision: bf16 on supporting GPUs (Ampere+), otherwise fp16
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if use_bf16:
        training_args_dict["bf16"] = True
        training_args_dict["fp16"] = False
        print("โœ… Using bf16 precision")
    else:
        training_args_dict["fp16"] = True
        print("โœ… Using fp16 precision")
    print(f"โœ… Using {eval_param_name} parameter for evaluation")
    training_args = TrainingArguments(**training_args_dict)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )
    print("Trainer initialized!")
    print(f"Training samples: {len(tokenized_dataset['train'])}")
    print(f"Validation samples: {len(tokenized_dataset['validation'])}")
    # Validate data shapes to prevent tensor errors
    print("๐Ÿ” Validating data shapes...")
    train_sample = tokenized_dataset["train"][0]
    val_sample = tokenized_dataset["validation"][0]
    print(f"โœ… Train sample - input_ids: {len(train_sample['input_ids'])}, labels: {len(train_sample['labels'])}")
    print(f"โœ… Val sample - input_ids: {len(val_sample['input_ids'])}, labels: {len(val_sample['labels'])}")
    # Check a few more samples to ensure consistency
    for i in range(min(3, len(tokenized_dataset['train']))):
        sample = tokenized_dataset['train'][i]
        if len(sample['input_ids']) != config['max_length']:
            print(f"โš ๏ธ Warning: Sample {i} has inconsistent length: {len(sample['input_ids'])} != {config['max_length']}")
        if len(sample['input_ids']) != len(sample['labels']):
            print(f"โš ๏ธ Warning: Sample {i} input_ids and labels length mismatch: {len(sample['input_ids'])} != {len(sample['labels'])}")
    print("โœ… Data validation complete!")
    return trainer
def save_model_and_config(model, tokenizer, trainer, config):
    """Persist the LoRA adapter, tokenizer and run metadata, then evaluate.

    Writes into ``config['save_dir']``:
      - the adapter weights (via ``trainer.save_model``) and tokenizer files,
      - ``training_config.json`` with run metadata,
      - ``test_results.json`` with the metrics from ``trainer.evaluate()``
        (these are *validation* metrics despite the filename).

    Returns:
        The evaluation metrics dict from ``trainer.evaluate()``.
    """
    print("Saving model...")
    # Save LoRA adapter
    trainer.save_model(config['save_dir'])
    tokenizer.save_pretrained(config['save_dir'])
    # Save configuration. Fix: LoRA keys fall back to the same defaults used
    # in setup_lora(), so a partial config (which setup_lora tolerates) no
    # longer raises KeyError here.
    config_data = {
        "base_model": config['model_name'],
        "dataset": config['dataset_name'],
        "dataset_config": config['dataset_config'],
        "training_config": config,
        "lora_config": {
            "r": config.get('lora_r', 16),
            "alpha": config.get('lora_alpha', 32),
            "dropout": config.get('lora_dropout', 0.1),
        },
        "training_date": datetime.now().isoformat()
    }
    with open(f"{config['save_dir']}/training_config.json", "w") as f:
        # default=str stringifies any non-JSON-serializable config values
        json.dump(config_data, f, indent=2, default=str)
    print(f"Model saved to {config['save_dir']}")
    # Evaluate on validation set
    print("Evaluating model on validation set...")
    test_results = trainer.evaluate()
    # Save evaluation results
    with open(f"{config['save_dir']}/test_results.json", "w") as f:
        json.dump(test_results, f, indent=2)
    print(f"Evaluation complete! Results saved to {config['save_dir']}/test_results.json")
    return test_results
def run_training(config, processed_dataset):
    """Run the end-to-end fine-tuning pipeline.

    Steps: load model/tokenizer -> apply LoRA -> tokenize -> build trainer ->
    train -> save adapter and evaluate on the validation split.

    Args:
        config: training configuration dict (see the __main__ example).
        processed_dataset: dataset dict with "train" and "validation" splits.

    Returns:
        (model, tokenizer, trainer) tuple after training.
    """
    print("๐Ÿš€ Starting financial LLM fine-tuning...")
    print(f"Base model: {config['model_name']}")
    print(f"Dataset: {config['dataset_name']}")
    print(f"Training samples: {len(processed_dataset['train'])}")
    # Setup model and tokenizer (quantization, attention impl, etc.)
    model, tokenizer = setup_model_and_tokenizer(config)
    # Apply LoRA adapters for parameter-efficient fine-tuning
    model = setup_lora(model, config)
    # Tokenize dataset into fixed-length sequences
    tokenized_dataset = tokenize_dataset(processed_dataset, tokenizer, config)
    # Setup training
    trainer = setup_training(model, tokenizer, tokenized_dataset, config)
    # Start training
    print("Starting training...")
    print(f"Training will run for {config['num_epochs']} epochs")
    print(f"Effective batch size: {config['train_batch_size'] * config['gradient_accumulation_steps']}")
    trainer.train()
    print("Training completed!")
    # Save model and evaluate
    test_results = save_model_and_config(model, tokenizer, trainer, config)
    print("๐ŸŽ‰ Fine-tuning complete! ๐ŸŽ‰")
    print(f"โœ… Model saved to: {config['save_dir']}")
    # Fix: the original applied :.4f to the 'N/A' fallback string, raising
    # ValueError whenever 'eval_loss' was missing; also the value is a raw
    # loss, not a perplexity, so label it accordingly.
    eval_loss = test_results.get('eval_loss')
    if isinstance(eval_loss, (int, float)):
        print(f"โœ… Eval loss: {eval_loss:.4f}")
    else:
        print("โœ… Eval loss: N/A")
    return model, tokenizer, trainer
if __name__ == "__main__":
    # Smoke-test configuration: a small conversational model plus the
    # finance instruction dataset, sized for a quick single-epoch run.
    test_config = {
        # Model and data
        "model_name": "microsoft/DialoGPT-medium",
        "dataset_name": "Josephgflowers/Finance-Instruct-500k",
        "dataset_config": "default",
        "max_length": 512,
        # Optimization schedule
        "train_batch_size": 2,
        "eval_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "learning_rate": 2e-4,
        "num_epochs": 1,
        # LoRA hyperparameters
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        # Output locations
        "output_dir": "./test-financial-lora",
        "save_dir": "./test-financial-final",
        # Runtime knobs
        "quantization": "8bit",  # options: none | 8bit | 4bit
        "save_steps": 100,
        "eval_steps": 50,
        "logging_steps": 25,
        "gradient_checkpointing": True,
        "dataloader_num_workers": 2,  # Added for testing
    }
    print("Testing training pipeline...")
    # A real run would also need the processed dataset:
    # model, tokenizer, trainer = run_training(test_config, processed_dataset)
    print("Training pipeline setup complete!")