|
|
""" |
|
|
Model training script for financial LLM fine-tuning |
|
|
""" |
|
|
|
|
|
import torch |
|
|
import json |
|
|
from datetime import datetime |
|
|
from transformers import ( |
|
|
AutoTokenizer, |
|
|
AutoModelForCausalLM, |
|
|
TrainingArguments, |
|
|
Trainer, |
|
|
DataCollatorForLanguageModeling, |
|
|
default_data_collator, |
|
|
BitsAndBytesConfig |
|
|
) |
|
|
from peft import LoraConfig, get_peft_model, TaskType |
|
|
|
|
|
|
|
|
def setup_model_and_tokenizer(config):
    """Load the base model and tokenizer, optionally quantized.

    Args:
        config: dict with at least 'model_name'. Optional keys:
            'quantization' ("4bit" | "8bit" | anything else → none;
            defaults to "8bit"), 'attn_impl' ("flash" | "sdpa" | "eager"),
            'gradient_checkpointing' (bool, default True).

    Returns:
        (model, tokenizer) tuple ready for LoRA wrapping.
    """
    cuda_available = torch.cuda.is_available()

    # TF32 speeds up matmuls on Ampere+ GPUs with negligible accuracy loss.
    try:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        print("✅ TF32 enabled for faster matmul")
    except Exception:
        pass  # best-effort: some torch builds do not expose these flags

    # Report GPU memory headroom before loading. Guarded: the original
    # called torch.cuda.* unconditionally and crashed on CPU-only hosts.
    if cuda_available:
        torch.cuda.empty_cache()
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        allocated_memory = torch.cuda.memory_allocated() / 1e9
        free_memory = total_memory - allocated_memory

        print("📊 GPU Memory Status:")
        print(f"   Total: {total_memory:.1f} GB")
        print(f"   Free: {free_memory:.1f} GB")

        if free_memory < 15:
            print("⚠️ Warning: Low GPU memory, consider clearing cache")

    # Default to 8-bit quantization when the config does not specify one.
    quantization = config.get("quantization")
    if quantization is None:
        quantization = "8bit"
    print(f"⚙️ Quantization mode: {quantization}")

    bnb_config = None
    if quantization == "4bit":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8bit":
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    print(f"Loading tokenizer: {config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    if tokenizer.pad_token is None:
        # Causal LMs often ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    print(f"Loading model: {config['model_name']}")
    model_kwargs = dict(
        device_map={"": 0},
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    # Resolve the attention implementation. Explicit "sdpa"/"eager" win;
    # "flash" or no preference probes flash-attn and falls back to SDPA.
    attn_pref = config.get("attn_impl")
    if attn_pref == "sdpa":
        chosen_attn = "sdpa"
    elif attn_pref == "eager":
        chosen_attn = "eager"
    else:
        try:
            import flash_attn  # noqa: F401 — availability probe only
            chosen_attn = "flash_attention_2"
        except Exception:
            if attn_pref == "flash":
                # Only warn when the user explicitly asked for flash-attn.
                print("⚠️ flash-attn not available; falling back to SDPA")
            chosen_attn = "sdpa"

    model_kwargs["attn_implementation"] = chosen_attn
    print(f"✅ Attention implementation: {chosen_attn}")
    if bnb_config is not None:
        model_kwargs["quantization_config"] = bnb_config

    model = AutoModelForCausalLM.from_pretrained(
        config['model_name'],
        **model_kwargs,
    )

    # The KV-cache is useless during training and wastes memory.
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    # Keep model/tokenizer pad ids in sync so padding is handled consistently.
    if getattr(model.config, "pad_token_id", None) is None and tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Gradient checkpointing trades recompute for a large activation-memory win.
    try:
        if config.get('gradient_checkpointing', True):
            model.gradient_checkpointing_enable()
            print("✅ Model gradient checkpointing enabled")
    except Exception:
        pass  # best-effort: not every architecture supports it

    print("Model loaded successfully!")
    print(f"Model parameters: {model.num_parameters():,}")

    if cuda_available:
        allocated_after = torch.cuda.memory_allocated() / 1e9
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
        usage_percent = (allocated_after / total_memory) * 100

        print(f"📊 GPU Memory after loading: {allocated_after:.1f}/{total_memory:.1f} GB ({usage_percent:.1f}%)")

        if usage_percent > 85:
            print("⚠️ Warning: High GPU memory usage! Consider reducing batch size.")
        else:
            print("✅ GPU memory usage looks good for training!")

    return model, tokenizer
|
|
|
|
|
|
|
|
def setup_lora(model, config):
    """Wrap *model* with LoRA adapters for parameter-efficient fine-tuning.

    Target modules are picked per architecture family, or can be overridden
    explicitly via config['lora_target_modules'] (backward-compatible: absent
    key preserves the old per-family behavior). Hyperparameters default to
    r=16, alpha=32, dropout=0.1 when missing from the config.

    Args:
        model: the loaded base causal LM.
        config: training config dict (reads 'model_name' and lora_* keys).

    Returns:
        The PEFT-wrapped model; only adapter weights remain trainable.
    """
    # Allow an explicit override; otherwise choose modules by model family.
    target_modules = config.get('lora_target_modules')
    if target_modules is None:
        model_name = config['model_name']
        if "DialoGPT" in model_name:
            # GPT-2 style fused attention / projection layer names.
            target_modules = ["c_attn", "c_proj"]
        elif "llama" in model_name.lower():
            # Full attention + MLP coverage for Llama-family models.
            target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ]
        else:
            # Conservative default present in most transformer blocks.
            target_modules = ["q_proj", "v_proj"]

    lora_r = int(config.get('lora_r', 16))
    lora_alpha = int(config.get('lora_alpha', 32))
    lora_dropout = float(config.get('lora_dropout', 0.1))

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("LoRA configuration applied successfully!")
    print(f"Target modules: {target_modules}")
    print(f"LoRA params → r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
    return model
|
|
|
|
|
|
|
|
def tokenize_dataset(dataset, tokenizer, config):
    """Tokenize every split of *dataset* for causal-LM training.

    Sequences are padded/truncated to config['max_length']. Labels are a
    copy of input_ids with padding positions set to -100 so the loss
    ignores pad tokens. The original copied input_ids verbatim, which
    trained the model on padding — a real problem here since pad_token is
    aliased to eos_token in setup_model_and_tokenizer.

    Args:
        dataset: a DatasetDict with a 'text' column (at least a 'train' split).
        tokenizer: the model's tokenizer.
        config: reads 'max_length'.

    Returns:
        The tokenized DatasetDict with input_ids / attention_mask / labels.
    """

    def tokenize_function(examples):
        """Tokenize a batch of 'text' entries and attach masked labels."""
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=config['max_length'],
            return_tensors=None,
            add_special_tokens=True,
        )

        # -100 is the ignore_index of HF's cross-entropy loss: padding
        # positions (attention_mask == 0) must not contribute to the loss.
        tokenized["labels"] = [
            [tok if m == 1 else -100 for tok, m in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]

        return tokenized

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
        desc="Tokenizing",
    )

    print("Tokenization complete!")

    # Spot-check one sample so length problems surface before training.
    sample = tokenized_dataset["train"][0]
    print(f"✅ Sample tokenized input_ids shape: {len(sample['input_ids'])}")
    print(f"✅ Sample tokenized labels shape: {len(sample['labels'])}")
    print(f"✅ Max length setting: {config['max_length']}")

    return tokenized_dataset
|
|
|
|
|
|
|
|
def setup_training(model, tokenizer, tokenized_dataset, config):
    """Build TrainingArguments and a Trainer over the tokenized dataset.

    Handles the evaluation_strategy → eval_strategy rename across
    transformers versions and selects bf16 when the GPU supports it.

    Args:
        model: the (LoRA-wrapped) model to train.
        tokenizer: unused directly but kept for interface stability.
        tokenized_dataset: DatasetDict with 'train' and 'validation' splits.
        config: training hyperparameter dict.

    Returns:
        A configured transformers.Trainer.
    """
    # Samples are already padded to max_length with labels attached, so the
    # default collator (plain tensor stacking) is sufficient.
    data_collator = default_data_collator

    import transformers
    transformers_version = transformers.__version__
    print(f"🔧 Transformers version: {transformers_version}")

    # Probe the dataclass fields directly. The original did a substring
    # match on str(__dataclass_fields__), which is fragile against any
    # field whose repr happens to contain the probe text.
    ta_fields = getattr(TrainingArguments, '__dataclass_fields__', {})
    eval_param_name = "eval_strategy" if "eval_strategy" in ta_fields else "evaluation_strategy"

    training_args_dict = {
        "output_dir": config['output_dir'],
        "per_device_train_batch_size": config['train_batch_size'],
        "per_device_eval_batch_size": config['eval_batch_size'],
        "gradient_accumulation_steps": config['gradient_accumulation_steps'],
        "num_train_epochs": config['num_epochs'],
        "learning_rate": config['learning_rate'],
        "logging_steps": config.get('logging_steps', 25),
        eval_param_name: "steps",
        "eval_steps": config.get('eval_steps', 50),
        "save_steps": config.get('save_steps', config.get('eval_steps', 100)),
        "save_total_limit": 2,
        "remove_unused_columns": False,
        "push_to_hub": False,
        # "none" (string) disables reporting integrations; the original's
        # None means "report to everything available" in newer transformers.
        "report_to": "none",
        "load_best_model_at_end": True,
        "group_by_length": True,
        "warmup_ratio": config.get('warmup_ratio', 0.03),
        "weight_decay": config.get('weight_decay', 0.01),
        "max_grad_norm": config.get('max_grad_norm', 1.0),
        "lr_scheduler_type": "cosine",
        "dataloader_num_workers": config.get('dataloader_num_workers', 2),
        "dataloader_pin_memory": True,
        "skip_memory_metrics": True,
        "log_level": "warning",
        "include_inputs_for_metrics": False,
        "prediction_loss_only": True,
        "gradient_checkpointing": config.get('gradient_checkpointing', True),
    }

    # load_best_model_at_end requires save steps aligned with eval steps.
    if config.get('align_save_with_eval', True):
        training_args_dict["save_steps"] = training_args_dict.get(
            "eval_steps", training_args_dict.get("save_steps", 100)
        )

    # bf16 needs no loss scaling and is preferred on supporting GPUs.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if use_bf16:
        training_args_dict["bf16"] = True
        training_args_dict["fp16"] = False
        print("✅ Using bf16 precision")
    else:
        training_args_dict["fp16"] = True
        print("✅ Using fp16 precision")

    print(f"✅ Using {eval_param_name} parameter for evaluation")
    training_args = TrainingArguments(**training_args_dict)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )

    print("Trainer initialized!")
    print(f"Training samples: {len(tokenized_dataset['train'])}")
    print(f"Validation samples: {len(tokenized_dataset['validation'])}")

    # Sanity-check a few samples so shape problems fail fast, not mid-epoch.
    print("🔍 Validating data shapes...")
    train_sample = tokenized_dataset["train"][0]
    val_sample = tokenized_dataset["validation"][0]

    print(f"✅ Train sample - input_ids: {len(train_sample['input_ids'])}, labels: {len(train_sample['labels'])}")
    print(f"✅ Val sample - input_ids: {len(val_sample['input_ids'])}, labels: {len(val_sample['labels'])}")

    for i in range(min(3, len(tokenized_dataset['train']))):
        sample = tokenized_dataset['train'][i]
        if len(sample['input_ids']) != config['max_length']:
            print(f"⚠️ Warning: Sample {i} has inconsistent length: {len(sample['input_ids'])} != {config['max_length']}")
        if len(sample['input_ids']) != len(sample['labels']):
            print(f"⚠️ Warning: Sample {i} input_ids and labels length mismatch: {len(sample['input_ids'])} != {len(sample['labels'])}")

    print("✅ Data validation complete!")

    return trainer
|
|
|
|
|
|
|
|
def save_model_and_config(model, tokenizer, trainer, config):
    """Persist the trained adapter, tokenizer, and run metadata, then evaluate.

    Writes training_config.json and test_results.json into config['save_dir'].

    Args:
        model: unused directly (trainer holds it); kept for interface stability.
        tokenizer: saved alongside the model.
        trainer: the finished transformers.Trainer.
        config: training config dict (reads 'save_dir', 'model_name',
            'dataset_name', and optional lora_* / dataset_config keys).

    Returns:
        The metrics dict from trainer.evaluate() on the validation set.
    """
    print("Saving model...")

    # trainer.save_model creates save_dir if needed and writes the adapter.
    trainer.save_model(config['save_dir'])
    tokenizer.save_pretrained(config['save_dir'])

    # Mirror setup_lora's defaults so a config that omitted these keys does
    # not raise KeyError here (the original indexed them directly).
    config_data = {
        "base_model": config['model_name'],
        "dataset": config['dataset_name'],
        "dataset_config": config.get('dataset_config'),
        "training_config": config,
        "lora_config": {
            "r": config.get('lora_r', 16),
            "alpha": config.get('lora_alpha', 32),
            "dropout": config.get('lora_dropout', 0.1),
        },
        "training_date": datetime.now().isoformat(),
    }

    # default=str stringifies any non-JSON-serializable config values.
    with open(f"{config['save_dir']}/training_config.json", "w") as f:
        json.dump(config_data, f, indent=2, default=str)

    print(f"Model saved to {config['save_dir']}")

    print("Evaluating model on validation set...")
    test_results = trainer.evaluate()

    with open(f"{config['save_dir']}/test_results.json", "w") as f:
        json.dump(test_results, f, indent=2)

    print(f"Evaluation complete! Results saved to {config['save_dir']}/test_results.json")

    return test_results
|
|
|
|
|
|
|
|
def run_training(config, processed_dataset):
    """Run the end-to-end fine-tuning pipeline.

    Loads the base model, applies LoRA, tokenizes the dataset, trains, and
    saves the adapter plus evaluation metrics.

    Args:
        config: training config dict (see module-level test_config for keys).
        processed_dataset: DatasetDict with 'train' and 'validation' splits,
            each carrying a 'text' column.

    Returns:
        (model, tokenizer, trainer) after training completes.
    """
    print("🚀 Starting financial LLM fine-tuning...")
    print(f"Base model: {config['model_name']}")
    print(f"Dataset: {config['dataset_name']}")
    print(f"Training samples: {len(processed_dataset['train'])}")

    model, tokenizer = setup_model_and_tokenizer(config)
    model = setup_lora(model, config)
    tokenized_dataset = tokenize_dataset(processed_dataset, tokenizer, config)
    trainer = setup_training(model, tokenizer, tokenized_dataset, config)

    print("Starting training...")
    print(f"Training will run for {config['num_epochs']} epochs")
    print(f"Effective batch size: {config['train_batch_size'] * config['gradient_accumulation_steps']}")

    trainer.train()
    print("Training completed!")

    test_results = save_model_and_config(model, tokenizer, trainer, config)

    print("🎉 Fine-tuning complete! 🎉")
    print(f"✅ Model saved to: {config['save_dir']}")
    # Guard the formatting: the original applied :.4f to the 'N/A' string
    # fallback, raising ValueError whenever eval_loss was missing. Also
    # label the value correctly — this is the eval *loss*, not perplexity
    # (perplexity would be exp(loss)).
    eval_loss = test_results.get('eval_loss')
    if isinstance(eval_loss, (int, float)):
        print(f"✅ Final eval loss: {eval_loss:.4f}")
    else:
        print("✅ Final eval loss: N/A")

    return model, tokenizer, trainer
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke-test configuration: a small base model plus short-run settings
    # that exercise every knob the pipeline reads. No training is launched
    # here — this block only defines the config and confirms the module
    # imports cleanly; call run_training(test_config, dataset) to train.
    test_config = {
        "model_name": "microsoft/DialoGPT-medium",   # small model for quick tests
        "dataset_name": "Josephgflowers/Finance-Instruct-500k",
        "dataset_config": "default",
        "max_length": 512,                  # tokens per padded sample
        "train_batch_size": 2,
        "eval_batch_size": 2,
        "gradient_accumulation_steps": 8,   # effective batch = 2 * 8 = 16
        "learning_rate": 2e-4,
        "num_epochs": 1,
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "output_dir": "./test-financial-lora",   # Trainer checkpoints
        "save_dir": "./test-financial-final",    # final adapter + metadata
        "quantization": "8bit",
        "save_steps": 100,
        "eval_steps": 50,
        "logging_steps": 25,
        "gradient_checkpointing": True,
        "dataloader_num_workers": 2,
    }

    print("Testing training pipeline...")

    print("Training pipeline setup complete!")