""" |
|
|
Legal-BERT Training Pipeline - Learning-Based Risk Classification |
|
|
PHASE 1 IMPROVEMENTS: Focal Loss, Rebalanced weights, Class boosting, LR scheduling |
|
|
""" |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from torch.utils.data import Dataset, DataLoader |
|
|
from torch.optim.lr_scheduler import OneCycleLR |
|
|
import numpy as np |
|
|
from typing import Dict, List, Tuple, Any |
|
|
import os |
|
|
from sklearn.metrics import accuracy_score, classification_report, recall_score |
|
|
from sklearn.utils.class_weight import compute_class_weight |
|
|
import json |
|
|
import time |
|
|
|
|
|
from config import LegalBertConfig |
|
|
from model import HierarchicalLegalBERT, LegalBertTokenizer |
|
|
from risk_discovery import UnsupervisedRiskDiscovery, LDARiskDiscovery |
|
|
from data_loader import CUADDataLoader |
|
|
from focal_loss import FocalLoss, compute_class_weights |
|
|
from risk_postprocessing import merge_duplicate_topics, detect_duplicate_topics, validate_cluster_quality |
|
|
|
|
|


def collate_batch(batch):
    """
    Custom collate function to handle variable-length sequences in a batch.
    Pads all sequences to the maximum length in the batch.
    """
    max_len = max(item['input_ids'].size(0) for item in batch)

    input_ids_batch = []
    attention_mask_batch = []
    risk_labels_batch = []
    severity_scores_batch = []
    importance_scores_batch = []

    for item in batch:
        input_ids = item['input_ids']
        attention_mask = item['attention_mask']
        current_len = input_ids.size(0)

        # Right-pad shorter sequences with zeros (pad token id 0, attention mask 0).
        if current_len < max_len:
            padding_len = max_len - current_len

            input_ids = torch.cat([input_ids, torch.zeros(padding_len, dtype=torch.long)])
            attention_mask = torch.cat([attention_mask, torch.zeros(padding_len, dtype=torch.long)])

        input_ids_batch.append(input_ids)
        attention_mask_batch.append(attention_mask)
        risk_labels_batch.append(item['risk_label'])
        severity_scores_batch.append(item['severity_score'])
        importance_scores_batch.append(item['importance_score'])

    return {
        'input_ids': torch.stack(input_ids_batch),
        'attention_mask': torch.stack(attention_mask_batch),
        'risk_label': torch.stack(risk_labels_batch),
        'severity_score': torch.stack(severity_scores_batch),
        'importance_score': torch.stack(importance_scores_batch)
    }
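
# Usage note (illustrative): collate_batch is meant to be passed to a DataLoader
# as its collate_fn, e.g.
#   loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_batch)
# so that clauses tokenized to different lengths can be batched together.
# prepare_data() below wires this up with the configured batch size.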


class LegalClauseDataset(Dataset):
    """Dataset for legal clauses with discovered risk labels"""

    def __init__(self, clauses: List[str], risk_labels: List[int],
                 severity_scores: List[float], importance_scores: List[float],
                 tokenizer: LegalBertTokenizer, max_length: int = 512):
        self.clauses = clauses
        self.risk_labels = risk_labels
        self.severity_scores = severity_scores
        self.importance_scores = importance_scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.clauses)

    def __getitem__(self, idx):
        clause = self.clauses[idx]

        encoded = self.tokenizer.tokenize_clauses([clause], self.max_length)

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'risk_label': torch.tensor(self.risk_labels[idx], dtype=torch.long),
            'severity_score': torch.tensor(self.severity_scores[idx], dtype=torch.float),
            'importance_score': torch.tensor(self.importance_scores[idx], dtype=torch.float)
        }


class LegalBertTrainer:
    """
    Trainer for Legal-BERT with discovered risk patterns.
    NO hardcoded risk categories!
    """

    def __init__(self, config: LegalBertConfig):
        self.config = config
        self.device = torch.device(config.device)

        # Select the unsupervised risk-discovery method from the config.
        risk_method = config.risk_discovery_method.lower()

        if risk_method == 'lda':
            print("🎯 Using LDA (topic modeling) for risk discovery")
            self.risk_discovery = LDARiskDiscovery(
                n_clusters=config.risk_discovery_clusters,
                doc_topic_prior=config.lda_doc_topic_prior,
                topic_word_prior=config.lda_topic_word_prior,
                max_iter=config.lda_max_iter,
                max_features=config.lda_max_features,
                learning_method=config.lda_learning_method,
                random_state=42
            )
        elif risk_method == 'kmeans':
            print("🎯 Using K-Means for risk discovery")
            self.risk_discovery = UnsupervisedRiskDiscovery(
                n_clusters=config.risk_discovery_clusters,
                random_state=42
            )
        else:
            print(f"⚠️ Unknown risk discovery method '{risk_method}', defaulting to LDA")
            self.risk_discovery = LDARiskDiscovery(
                n_clusters=config.risk_discovery_clusters,
                doc_topic_prior=config.lda_doc_topic_prior,
                topic_word_prior=config.lda_topic_word_prior,
                max_iter=config.lda_max_iter,
                max_features=config.lda_max_features,
                learning_method=config.lda_learning_method,
                random_state=42
            )

        self.tokenizer = LegalBertTokenizer(config.bert_model_name)

        # Model, optimizer, and scheduler are created in setup_training().
        self.model = None
        self.optimizer = None
        self.scheduler = None

        self.training_history = {
            'train_loss': [],
            'val_loss': [],
            'train_acc': [],
            'val_acc': [],
            'per_class_recall': []
        }

        if config.use_focal_loss:
            print(f"🔥 Using Focal Loss for classification (gamma={config.focal_loss_gamma})")
            # Instantiated in prepare_data() once class weights are known.
            self.classification_loss = None
        else:
            print("⚠️ Using standard CrossEntropyLoss (not recommended)")
            self.classification_loss = nn.CrossEntropyLoss()

        self.regression_loss = nn.MSELoss()

        # Early-stopping bookkeeping.
        self.best_val_loss = float('inf')
        self.patience_counter = 0

    def prepare_data(self, data_path: str) -> Tuple[DataLoader, DataLoader, DataLoader]:
        """Load data and discover risk patterns"""
        print("📊 Preparing data with unsupervised risk discovery...")

        # Load the CUAD contracts and create train/val/test splits.
        data_loader = CUADDataLoader(data_path)
        df_clauses, contracts = data_loader.load_data()
        splits = data_loader.create_splits()

        train_clauses = splits['train']['clause_text'].tolist()

        # Discover risk patterns on the training split only.
        discovered_patterns = self.risk_discovery.discover_risk_patterns(train_clauses)

        print("\n🔍 Validating discovered risk patterns...")
        validation_report = validate_cluster_quality(discovered_patterns, min_cluster_size=150)

        if not validation_report['is_valid']:
            print("⚠️ Cluster quality issues detected:")
            for issue in validation_report['issues']:
                print(f" - {issue}")

        if validation_report['warnings']:
            for warning in validation_report['warnings']:
                print(f" ⚠️ {warning}")

        # Merge near-duplicate topics so each risk category is distinct.
        merge_rules = detect_duplicate_topics(discovered_patterns)

        if merge_rules:
            print(f"\n🔧 Merging {len(merge_rules)} duplicate topic groups...")
            discovered_patterns, original_labels = merge_duplicate_topics(
                discovered_patterns,
                self.risk_discovery.cluster_labels,
                merge_rules
            )

            self.risk_discovery.discovered_patterns = discovered_patterns
            self.risk_discovery.cluster_labels = original_labels
            self.risk_discovery.n_clusters = len(discovered_patterns)
            print(f"✅ Merged to {self.risk_discovery.n_clusters} distinct risk categories\n")

        train_risk_labels = self.risk_discovery.get_risk_labels(train_clauses)

        if self.config.use_focal_loss:
            print("\n📊 Computing class weights for Focal Loss...")
            class_weights = compute_class_weights(
                train_risk_labels,
                num_classes=self.risk_discovery.n_clusters,
                minority_boost=self.config.minority_class_boost
            )

            self.classification_loss = FocalLoss(
                alpha=class_weights,
                gamma=self.config.focal_loss_gamma,
                reduction='mean'
            )
            print(f"✅ Focal Loss initialized with γ={self.config.focal_loss_gamma}\n")
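
            # For reference, the standard focal loss formulation (Lin et al., 2017) is
            #   FL(p_t) = -alpha_t * (1 - p_t) ** gamma * log(p_t)
            # A larger gamma down-weights well-classified (easy) examples, and the
            # per-class alpha weights computed above push learning toward minority
            # risk categories. The exact behaviour here depends on the FocalLoss
            # implementation imported from focal_loss.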

        # Build a dataset and dataloader for each split using the discovered labels.
        datasets = {}
        dataloaders = {}

        for split_name, split_data in splits.items():
            clauses = split_data['clause_text'].tolist()

            risk_labels = self.risk_discovery.get_risk_labels(clauses)

            # Rule-based severity/importance targets (see _generate_synthetic_scores).
            severity_scores = self._generate_synthetic_scores(clauses, 'severity')
            importance_scores = self._generate_synthetic_scores(clauses, 'importance')

            dataset = LegalClauseDataset(
                clauses=clauses,
                risk_labels=risk_labels,
                severity_scores=severity_scores,
                importance_scores=importance_scores,
                tokenizer=self.tokenizer,
                max_length=self.config.max_sequence_length
            )

            datasets[split_name] = dataset

            # Only the training split is shuffled.
            shuffle = (split_name == 'train')
            dataloader = DataLoader(
                dataset,
                batch_size=self.config.batch_size,
                shuffle=shuffle,
                num_workers=0,
                collate_fn=collate_batch
            )
            dataloaders[split_name] = dataloader

        print("✅ Data preparation complete!")
        print(f"📊 Discovered {len(discovered_patterns)} risk patterns")

        return dataloaders['train'], dataloaders['val'], dataloaders['test']

    def _generate_synthetic_scores(self, clauses: List[str], score_type: str) -> List[float]:
        """
        Compute severity/importance scores from rule-based features extracted from each clause.
        Despite the method name, these are not random values: they are derived from
        the actual risk features of the clause text.
        """
        scores = []

        for clause in clauses:
            features = self.risk_discovery.extract_risk_features(clause)

            if score_type == 'severity':
                # Severity: weighted mix of risk intensity, obligation strength,
                # prohibition/liability term densities, and monetary references.
                score = (
                    features.get('risk_intensity', 0) * 30 +
                    features.get('obligation_strength', 0) * 20 +
                    features.get('prohibition_terms_density', 0) * 100 +
                    features.get('liability_terms_density', 0) * 100 +
                    min(features.get('monetary_terms_count', 0) * 0.5, 2)
                )
            else:
                # Importance: weighted mix of legal complexity, clause length,
                # conditional risk, obligation complexity, and temporal urgency.
                score = (
                    features.get('legal_complexity', 0) * 30 +
                    min(features.get('clause_length', 0) / 50, 1) * 20 +
                    features.get('conditional_risk_density', 0) * 100 +
                    features.get('obligation_terms_complexity', 0) * 100 +
                    features.get('temporal_urgency_density', 0) * 50
                )

            # Clamp to a 0-10 range.
            normalized_score = min(max(score, 0), 10)
            scores.append(normalized_score)

        return scores

    def setup_training(self, train_loader: DataLoader):
        """Initialize model, optimizer, and scheduler"""
        num_discovered_risks = self.risk_discovery.n_clusters

        print("🧠 Using Hierarchical BERT model (context-aware)")
        self.model = HierarchicalLegalBERT(
            config=self.config,
            num_discovered_risks=num_discovered_risks,
            hidden_dim=self.config.hierarchical_hidden_dim,
            num_lstm_layers=self.config.hierarchical_num_lstm_layers
        ).to(self.device)

        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay
        )

        if self.config.use_lr_scheduler:
            total_steps = len(train_loader) * self.config.num_epochs
            self.scheduler = OneCycleLR(
                self.optimizer,
                max_lr=self.config.learning_rate,
                total_steps=total_steps,
                pct_start=self.config.scheduler_pct_start,
                anneal_strategy='cos',
                div_factor=25.0,
                final_div_factor=10000.0
            )
            print(f"📈 OneCycleLR scheduler initialized (warmup={self.config.scheduler_pct_start*100:.0f}%)")
        else:
            self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer,
                T_max=len(train_loader) * self.config.num_epochs
            )
            print("⚠️ Using basic CosineAnnealingLR (not recommended)")
print(f"ποΈ Model initialized with {num_discovered_risks} discovered risk categories") |
|
|
|
|
|

    def compute_loss(self, outputs: Dict[str, torch.Tensor], batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Compute multi-task loss"""
        # Risk classification loss (Focal Loss or CrossEntropyLoss).
        classification_loss = self.classification_loss(
            outputs['risk_logits'],
            batch['risk_label']
        )

        # Severity regression loss (MSE).
        severity_loss = self.regression_loss(
            outputs['severity_score'],
            batch['severity_score']
        )

        # Importance regression loss (MSE).
        importance_loss = self.regression_loss(
            outputs['importance_score'],
            batch['importance_score']
        )

        # Weighted sum of the three task losses:
        #   total = w_cls * L_cls + w_sev * L_sev + w_imp * L_imp
        total_loss = (
            self.config.task_weights['classification'] * classification_loss +
            self.config.task_weights['severity'] * severity_loss +
            self.config.task_weights['importance'] * importance_loss
        )

        return {
            'total_loss': total_loss,
            'classification_loss': classification_loss,
            'severity_loss': severity_loss,
            'importance_loss': importance_loss
        }

    def train_epoch(self, train_loader: DataLoader, epoch: int) -> Tuple[float, float, Dict[str, float]]:
        """Train for one epoch"""
        self.model.train()
        total_loss = 0
        correct_predictions = 0
        total_samples = 0

        loss_components = {'classification': 0, 'severity': 0, 'importance': 0}

        for batch_idx, batch in enumerate(train_loader):
            # Move the batch to the target device.
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            risk_labels = batch['risk_label'].to(self.device)
            severity_scores = batch['severity_score'].to(self.device)
            importance_scores = batch['importance_score'].to(self.device)

            # Forward pass (single-clause mode of the hierarchical model).
            outputs = self.model.forward_single_clause(input_ids, attention_mask)

            batch_for_loss = {
                'risk_label': risk_labels,
                'severity_score': severity_scores,
                'importance_score': importance_scores
            }

            losses = self.compute_loss(outputs, batch_for_loss)

            # Backward pass with gradient clipping.
            self.optimizer.zero_grad()
            losses['total_loss'].backward()

            torch.nn.utils.clip_grad_norm_(
                self.model.parameters(),
                max_norm=self.config.gradient_clip_norm
            )

            self.optimizer.step()
            self.scheduler.step()

            # Accumulate metrics.
            total_loss += losses['total_loss'].item()

            predictions = torch.argmax(outputs['risk_logits'], dim=-1)
            correct_predictions += (predictions == risk_labels).sum().item()
            total_samples += risk_labels.size(0)

            loss_components['classification'] += losses['classification_loss'].item()
            loss_components['severity'] += losses['severity_loss'].item()
            loss_components['importance'] += losses['importance_loss'].item()

            if batch_idx % 50 == 0:
                print(f"  Batch {batch_idx}/{len(train_loader)}, Loss: {losses['total_loss'].item():.4f}")

        avg_loss = total_loss / len(train_loader)
        accuracy = correct_predictions / total_samples

        # Average the per-task losses over the epoch.
        for key in loss_components:
            loss_components[key] /= len(train_loader)

        return avg_loss, accuracy, loss_components

    def validate_epoch(self, val_loader: DataLoader) -> Tuple[float, float, np.ndarray]:
        """Validate for one epoch with per-class recall tracking"""
        self.model.eval()
        total_loss = 0
        correct_predictions = 0
        total_samples = 0

        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                risk_labels = batch['risk_label'].to(self.device)
                severity_scores = batch['severity_score'].to(self.device)
                importance_scores = batch['importance_score'].to(self.device)

                outputs = self.model.forward_single_clause(input_ids, attention_mask)

                batch_for_loss = {
                    'risk_label': risk_labels,
                    'severity_score': severity_scores,
                    'importance_score': importance_scores
                }

                losses = self.compute_loss(outputs, batch_for_loss)
                total_loss += losses['total_loss'].item()

                predictions = torch.argmax(outputs['risk_logits'], dim=-1)
                correct_predictions += (predictions == risk_labels).sum().item()
                total_samples += risk_labels.size(0)

                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(risk_labels.cpu().numpy())

        avg_loss = total_loss / len(val_loader)
        accuracy = correct_predictions / total_samples

        # recall_score with average=None returns one recall value per class, ordered
        # by class index; zero_division=0 silences warnings for classes that receive
        # no predictions.
        per_class_recall = recall_score(
            all_labels,
            all_predictions,
            average=None,
            zero_division=0
        )

        return avg_loss, accuracy, per_class_recall

    def train(self, train_loader: DataLoader, val_loader: DataLoader) -> Dict[str, List[float]]:
        """Complete training pipeline"""
        print("🚀 Starting Legal-BERT training...")
        print(f"Device: {self.device}")
        print(f"Epochs: {self.config.num_epochs}")
        print(f"Batch size: {self.config.batch_size}")

        self.setup_training(train_loader)

        total_start_time = time.time()
        epochs_run = 0

        for epoch in range(self.config.num_epochs):
            print(f"\n📅 Epoch {epoch+1}/{self.config.num_epochs}")

            epoch_start_time = time.time()

            # Train and validate.
            train_loss, train_acc, loss_components = self.train_epoch(train_loader, epoch)
            val_loss, val_acc, per_class_recall = self.validate_epoch(val_loader)

            epoch_time = time.time() - epoch_start_time
            epochs_run = epoch + 1

            # Record history.
            self.training_history['train_loss'].append(train_loss)
            self.training_history['val_loss'].append(val_loss)
            self.training_history['train_acc'].append(train_acc)
            self.training_history['val_acc'].append(val_acc)
            self.training_history['per_class_recall'].append(per_class_recall.tolist())

            # Epoch summary.
            print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
            print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
            print(f"  Loss Components - Class: {loss_components['classification']:.4f}, "
                  f"Sev: {loss_components['severity']:.4f}, Imp: {loss_components['importance']:.4f}")

            print("  Per-Class Recall:")
            critical_classes = [0, 5]  # Cluster indices flagged as high-priority risk categories.
            for cls_idx, recall in enumerate(per_class_recall):
                marker = " ⚠️ CRITICAL" if cls_idx in critical_classes else ""
                print(f"    Class {cls_idx}: {recall:.3f}{marker}")

            print(f"  ⏱️ Epoch Time: {epoch_time:.2f}s ({epoch_time/60:.2f} minutes)")

            # Early stopping on validation loss.
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.patience_counter = 0
                print(f"  ✅ New best validation loss: {val_loss:.4f}")
            else:
                self.patience_counter += 1
                print(f"  ⚠️ No improvement ({self.patience_counter}/{self.config.early_stopping_patience})")

            if self.patience_counter >= self.config.early_stopping_patience:
                print(f"\n🛑 Early stopping triggered after {epoch+1} epochs")
                break

            self.save_checkpoint(epoch)

        total_time = time.time() - total_start_time

        print("\n✅ Training complete!")
        print(f"⏱️ Total Training Time: {total_time:.2f}s ({total_time/60:.2f} minutes / {total_time/3600:.2f} hours)")
        print(f"⏱️ Average Time per Epoch: {total_time/max(epochs_run, 1):.2f}s")

        return self.training_history

    def save_checkpoint(self, epoch: int):
        """Save model checkpoint"""
        os.makedirs(self.config.checkpoint_dir, exist_ok=True)

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'training_history': self.training_history,
            'config': self.config,
            'discovered_patterns': self.risk_discovery.discovered_patterns
        }

        checkpoint_path = os.path.join(
            self.config.checkpoint_dir,
            f'legal_bert_epoch_{epoch+1}.pt'
        )

        torch.save(checkpoint, checkpoint_path)
        print(f"💾 Checkpoint saved: {checkpoint_path}")

    def load_checkpoint(self, checkpoint_path: str):
        """Load model checkpoint"""
        checkpoint = torch.load(checkpoint_path, map_location=self.device)

        # Rebuild the model with the number of risk categories stored in the checkpoint.
        num_discovered_risks = len(checkpoint['discovered_patterns'])
        self.model = HierarchicalLegalBERT(
            config=checkpoint['config'],
            num_discovered_risks=num_discovered_risks,
            hidden_dim=checkpoint['config'].hierarchical_hidden_dim,
            num_lstm_layers=checkpoint['config'].hierarchical_num_lstm_layers
        ).to(self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])

        # Restore training history and discovered risk patterns.
        self.training_history = checkpoint['training_history']
        self.risk_discovery.discovered_patterns = checkpoint['discovered_patterns']

        print(f"✅ Checkpoint loaded: {checkpoint_path}")

        return checkpoint['epoch']