ganeshkonapalli committed on
Commit
1746034
·
verified ·
1 Parent(s): 8f20bd2

Create train_utils.py

Browse files
Files changed (1) hide show
  1. train_utils.py +295 -0
train_utils.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # train_utils.py
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.optim import AdamW
6
+ from sklearn.metrics import classification_report
7
+ from sklearn.utils.class_weight import compute_class_weight
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ import pandas as pd
11
+ import os
12
+ import joblib
13
+
14
+ from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR
15
+
16
def get_class_weights(data_df, field, label_encoder):
    """
    Build per-class loss weights for a single label column.

    Every class that occurs in the data receives the weight
    ``n_samples / (n_observed_classes * class_count)``. Classes known to the
    encoder but absent from the data keep a neutral weight of 1.

    Args:
        data_df (pd.DataFrame): DataFrame holding the original (unencoded) labels.
        field (str): Name of the label column to compute weights for.
        label_encoder: Fitted encoder exposing ``classes_`` and ``transform``.

    Returns:
        torch.Tensor: Float tensor with one weight per encoder class.
    """
    raw_labels = data_df[field].values
    try:
        encoded = label_encoder.transform(raw_labels)
    except ValueError as e:
        # transform() raised (e.g. on labels the encoder was not fitted on):
        # report it, then retry with only the labels the encoder knows.
        print(f"Warning: {e}")
        print(f"Using only seen labels for class weights calculation")
        known_labels = set(label_encoder.classes_)
        encoded = label_encoder.transform(
            [lbl for lbl in raw_labels if lbl in known_labels]
        )

    # Integer codes are needed for the equality comparisons below.
    encoded = encoded.astype(int)
    num_classes = len(label_encoder.classes_)

    # Number of samples observed for each encoder class index.
    counts = np.array(
        [np.sum(encoded == class_idx) for class_idx in range(num_classes)],
        dtype=int,
    )

    # Start from weight 1 everywhere; only classes present in the data are re-weighted.
    weights = np.ones(num_classes)
    present = counts > 0
    if present.any():
        weights[present] = len(encoded) / (present.sum() * counts[present])

    return torch.tensor(weights, dtype=torch.float)
59
+
60
def initialize_criterions(data_df, label_encoders):
    """
    Create one class-weighted CrossEntropyLoss per label column.

    Args:
        data_df (pd.DataFrame): The original (unencoded) DataFrame, passed to
            `get_class_weights` to derive the weights.
        label_encoders (dict): Encoder objects keyed by label column name.

    Returns:
        dict: Maps each name in `LABEL_COLUMNS` to a `torch.nn.CrossEntropyLoss`
        whose weight tensor has been moved to `DEVICE`.
    """
    return {
        field: torch.nn.CrossEntropyLoss(
            weight=get_class_weights(data_df, field, label_encoders[field]).to(DEVICE)
        )
        for field in LABEL_COLUMNS
    }
77
+
78
def train_model(model, loader, optimizer, field_criterions, epoch):
    """
    Run one training epoch over `loader`.

    Args:
        model (torch.nn.Module): The model to train.
        loader (torch.utils.data.DataLoader): Yields batches of either
            (inputs, labels) or (inputs, metadata, labels).
        optimizer (torch.optim.Optimizer): Optimizer for the model parameters.
        field_criterions (dict): Loss function per label column name.
        epoch (int): Zero-based epoch index, shown as ``epoch + 1`` in the progress bar.

    Returns:
        float: Sum of per-batch losses divided by the number of batches.

    Raises:
        ValueError: If a batch has neither 2 nor 3 items.
    """
    model.train()
    progress = tqdm(loader, desc=f"Epoch {epoch + 1} Training")
    running_loss = 0

    for batch in progress:
        item_count = len(batch)
        if item_count == 2:
            # Text-only batch.
            inputs, labels = batch
        elif item_count == 3:
            # Text + metadata batch.
            inputs, metadata, labels = batch
        else:
            raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

        forward_args = [
            inputs['input_ids'].to(DEVICE),
            inputs['attention_mask'].to(DEVICE),
        ]
        if item_count == 3:
            forward_args.append(metadata.to(DEVICE))
        labels = labels.to(DEVICE)
        outputs = model(*forward_args)

        # One logits tensor per label column; column `head` of `labels` is its target.
        loss = sum(
            field_criterions[LABEL_COLUMNS[head]](logits, labels[:, head])
            for head, logits in enumerate(outputs)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress.set_postfix(loss=batch_loss)

    return running_loss / len(loader)
130
+
131
def evaluate_model(model, loader):
    """
    Evaluate `model` on `loader` and build a classification report per label column.

    Args:
        model (torch.nn.Module): The model to evaluate.
        loader (torch.utils.data.DataLoader): Yields batches of either
            (inputs, labels) or (inputs, metadata, labels).

    Returns:
        tuple: ``(reports, truths, predictions)`` where
            - reports (dict): classification report (dict format) per label column,
              or a zero-filled placeholder when the report raised ValueError.
            - truths (list): one list of true labels per label column.
            - predictions (list): one list of argmax predictions per label column.

    Raises:
        ValueError: If a batch has neither 2 nor 3 items.
    """
    model.eval()
    head_count = len(LABEL_COLUMNS)
    predictions = [[] for _ in range(head_count)]
    truths = [[] for _ in range(head_count)]

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluation"):
            item_count = len(batch)
            if item_count == 2:
                inputs, labels = batch
            elif item_count == 3:
                inputs, metadata, labels = batch
            else:
                raise ValueError("Unsupported batch format.")

            forward_args = [
                inputs['input_ids'].to(DEVICE),
                inputs['attention_mask'].to(DEVICE),
            ]
            if item_count == 3:
                forward_args.append(metadata.to(DEVICE))
            labels = labels.to(DEVICE)
            outputs = model(*forward_args)

            for head, logits in enumerate(outputs):
                # Predicted class = index of the largest logit.
                predictions[head].extend(torch.argmax(logits, dim=1).cpu().numpy())
                truths[head].extend(labels[:, head].cpu().numpy())

    reports = {}
    for col, y_true, y_pred in zip(LABEL_COLUMNS, truths, predictions):
        try:
            # zero_division=0 avoids warnings/NaNs for classes with no samples.
            reports[col] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        except ValueError:
            # Fall back to an all-zero summary so downstream code still finds the keys.
            print(f"Warning: Could not generate classification report for {col}. Skipping.")
            reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
    return reports, truths, predictions
185
+
186
def summarize_metrics(metrics):
    """
    Flatten per-field classification reports into a single DataFrame.

    Args:
        metrics (dict): Maps field name to a classification report dict, as
            returned by `evaluate_model`.

    Returns:
        pd.DataFrame: One row per field with columns Field, Precision, Recall,
        F1-Score, Accuracy and Support. Values missing from a report are 0.
    """
    rows = []
    for field, report in metrics.items():
        if 'weighted avg' in report:
            weighted = report['weighted avg']
            precision = weighted['precision']
            recall = weighted['recall']
            f1 = weighted['f1-score']
            support = weighted['support']
        else:
            # No weighted averages available (e.g. an empty report).
            precision = recall = f1 = support = 0

        # Accuracy is stored at the top level of the report.
        accuracy = report.get('accuracy', 0)

        rows.append({
            "Field": field,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "Accuracy": accuracy,
            "Support": support,
        })
    return pd.DataFrame(rows)
211
+
212
def save_model(model, model_name, save_format='pth'):
    """
    Persist a trained model under `MODEL_SAVE_DIR`.

    Args:
        model: The trained model. For 'pth' it must provide `state_dict()`
            (e.g. a torch.nn.Module); for 'pickle' the whole object is dumped.
        model_name (str): A descriptive name for the model (used for the filename).
        save_format (str): 'pth' saves the state dict to ``<model_name>_model.pth``
            via `torch.save`; 'pickle' dumps the object to ``<model_name>.pkl``
            via `joblib.dump`.

    Raises:
        ValueError: If `save_format` is neither 'pth' nor 'pickle'.
    """
    # Validate the format first so an invalid call has no filesystem side effects.
    if save_format == 'pth':
        filename = f"{model_name}_model.pth"
    elif save_format == 'pickle':
        filename = f"{model_name}.pkl"
    else:
        raise ValueError(f"Unsupported save format: {save_format}")

    # The save directory may not exist yet (e.g. first run); create it so the
    # write below does not fail on a missing parent directory.
    if MODEL_SAVE_DIR:
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    model_path = os.path.join(MODEL_SAVE_DIR, filename)
    if save_format == 'pth':
        torch.save(model.state_dict(), model_path)
    else:
        joblib.dump(model, model_path)

    print(f"Model saved to {model_path}")
231
+
232
def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
    """
    Load saved weights into a PyTorch model, or fall back to a fresh instance.

    Args:
        model (torch.nn.Module): An initialized model instance (architecture)
            that receives the saved state dict.
        model_name (str): The name of the model to load; the file looked up is
            ``<model_name>_model.pth`` inside `MODEL_SAVE_DIR`.
        model_class (class): The class of the model (e.g., BertMultiOutputModel),
            used only to build a new instance when no saved file exists.
        num_labels (list): List of number of classes for each label.
        metadata_dim (int): Dimensionality of metadata features, if applicable
            (default 0 for text-only).

    Returns:
        torch.nn.Module: A model on `DEVICE` in eval mode. If the saved file
        exists this is `model` with the loaded state dict; otherwise it is a
        newly initialized `model_class` instance (a warning is printed).
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
    if not os.path.exists(model_path):
        print(f"Warning: Model file not found at {model_path}. Returning a newly initialized model instance.")
        # Re-initialize the model if not found, to ensure it has the correct architecture
        if metadata_dim > 0:
            fresh_model = model_class(num_labels, metadata_dim=metadata_dim)
        else:
            fresh_model = model_class(num_labels)
        fresh_model = fresh_model.to(DEVICE)
        # Keep the fallback consistent with the loaded path: callers always get
        # a model in evaluation mode.
        fresh_model.eval()
        return fresh_model

    # NOTE(review): torch.load unpickles the file, so only load checkpoints from
    # trusted sources; consider weights_only=True on PyTorch versions that support it.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()  # Set to evaluation mode after loading
    print(f"Model loaded from {model_path}")
    return model
258
+
259
def predict_probabilities(model, loader):
    """
    Compute softmax probabilities for every output head of `model`.

    Labels contained in the batches are ignored; only the inputs (and metadata,
    when present) are fed to the model.

    Args:
        model (torch.nn.Module): The trained PyTorch model.
        loader (torch.utils.data.DataLoader): Yields batches of either
            (inputs, labels) or (inputs, metadata, labels).

    Returns:
        list: One inner list per label column; each inner list holds one numpy
        array of class probabilities per sample.

    Raises:
        ValueError: If a batch has neither 2 nor 3 items.
    """
    model.eval()
    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]

    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting Probabilities"):
            item_count = len(batch)
            if item_count == 2:
                inputs, _ = batch
            elif item_count == 3:
                inputs, metadata, _ = batch
            else:
                raise ValueError("Unsupported batch format.")

            forward_args = [
                inputs['input_ids'].to(DEVICE),
                inputs['attention_mask'].to(DEVICE),
            ]
            if item_count == 3:
                forward_args.append(metadata.to(DEVICE))
            outputs = model(*forward_args)

            for head, logits in enumerate(outputs):
                # Softmax over dim 1 turns each row of logits into probabilities.
                head_probs = torch.softmax(logits, dim=1).cpu().numpy()
                all_probabilities[head].extend(head_probs)
    return all_probabilities