Spaces:

point9
/

Roberta_model_Test

Sleeping

App Files Files Community

subbunanepalli committed on Jun 12, 2025

Commit

80abfe1

verified ·

1 Parent(s): 31f3076

Create train_utils.py

Browse files

Files changed (1) hide show

train_utils.py +210 -0

train_utils.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import torch
+import torch.nn as nn
+from torch.optim import AdamW
+from sklearn.metrics import classification_report
+from sklearn.utils.class_weight import compute_class_weight
+import numpy as np
+from tqdm import tqdm
+import pandas as pd
+import os
+import joblib
+from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR
+def get_class_weights(data_df, field, label_encoder):
+    """
+    Computes balanced class weights for a given target field.
+    These weights are used with RoBERTa model training to handle class imbalance.
+    """
+    y = data_df[field].values
+    try:
+        y_encoded = label_encoder.transform(y)
+    except ValueError as e:
+        print(f"Warning: {e}")
+        print("Using only seen labels for class weights calculation")
+        seen_labels = set(label_encoder.classes_)
+        y_filtered = [label for label in y if label in seen_labels]
+        y_encoded = label_encoder.transform(y_filtered)
+    y_encoded = y_encoded.astype(int)
+    n_classes = len(label_encoder.classes_)
+    class_counts = np.zeros(n_classes, dtype=int)
+    for i in range(n_classes):
+        class_counts[i] = np.sum(y_encoded == i)
+    total_samples = len(y_encoded)
+    class_weights = np.ones(n_classes)
+    seen_classes = class_counts > 0
+    if np.any(seen_classes):
+        class_weights[seen_classes] = total_samples / (np.sum(seen_classes) * class_counts[seen_classes])
+    return torch.tensor(class_weights, dtype=torch.float)
+def initialize_criterions(data_df, label_encoders):
+    """
+    Initializes loss functions with class weights for each label field for RoBERTa.
+    """
+    field_criterions = {}
+    for field in LABEL_COLUMNS:
+        weights = get_class_weights(data_df, field, label_encoders[field])
+        field_criterions[field] = torch.nn.CrossEntropyLoss(weight=weights.to(DEVICE))
+    return field_criterions
+def train_model(model, loader, optimizer, field_criterions, epoch):
+    """
+    Trains the RoBERTa-based model for one epoch.
+    """
+    model.train()
+    total_loss = 0
+    tqdm_loader = tqdm(loader, desc=f"RoBERTa Epoch {epoch + 1} Training")
+    for batch in tqdm_loader:
+        if len(batch) == 2:
+            inputs, labels = batch
+            input_ids = inputs['input_ids'].to(DEVICE)
+            attention_mask = inputs['attention_mask'].to(DEVICE)
+            labels = labels.to(DEVICE)
+            outputs = model(input_ids, attention_mask)
+        elif len(batch) == 3:
+            inputs, metadata, labels = batch
+            input_ids = inputs['input_ids'].to(DEVICE)
+            attention_mask = inputs['attention_mask'].to(DEVICE)
+            metadata = metadata.to(DEVICE)
+            labels = labels.to(DEVICE)
+            outputs = model(input_ids, attention_mask, metadata)
+        else:
+            raise ValueError("Unsupported batch format.")
+        loss = 0
+        for i, output_logits in enumerate(outputs):
+            loss += field_criterions[LABEL_COLUMNS[i]](output_logits, labels[:, i])
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+        tqdm_loader.set_postfix(loss=loss.item())
+    return total_loss / len(loader)
+def evaluate_model(model, loader):
+    """
+    Evaluates the RoBERTa model and returns classification reports and metrics.
+    """
+    model.eval()
+    predictions = [[] for _ in range(len(LABEL_COLUMNS))]
+    truths = [[] for _ in range(len(LABEL_COLUMNS))]
+    with torch.no_grad():
+        for batch in tqdm(loader, desc="RoBERTa Evaluation"):
+            if len(batch) == 2:
+                inputs, labels = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask)
+            elif len(batch) == 3:
+                inputs, metadata, labels = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                metadata = metadata.to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask, metadata)
+            else:
+                raise ValueError("Unsupported batch format.")
+            for i, output_logits in enumerate(outputs):
+                preds = torch.argmax(output_logits, dim=1).cpu().numpy()
+                predictions[i].extend(preds)
+                truths[i].extend(labels[:, i].cpu().numpy())
+    reports = {}
+    for i, col in enumerate(LABEL_COLUMNS):
+        try:
+            reports[col] = classification_report(truths[i], predictions[i], output_dict=True, zero_division=0)
+        except ValueError:
+            print(f"Warning: Classification report failed for {col}")
+            reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
+    return reports, truths, predictions
+def summarize_metrics(metrics):
+    """
+    Summarizes classification reports into a Pandas DataFrame (RoBERTa).
+    """
+    summary = []
+    for field, report in metrics.items():
+        precision = report['weighted avg']['precision'] if 'weighted avg' in report else 0
+        recall = report['weighted avg']['recall'] if 'weighted avg' in report else 0
+        f1 = report['weighted avg']['f1-score'] if 'weighted avg' in report else 0
+        support = report['weighted avg']['support'] if 'weighted avg' in report else 0
+        accuracy = report['accuracy'] if 'accuracy' in report else 0
+        summary.append({
+            "Field": field,
+            "Precision": precision,
+            "Recall": recall,
+            "F1-Score": f1,
+            "Accuracy": accuracy,
+            "Support": support
+        })
+    return pd.DataFrame(summary)
+def save_model(model, model_name, save_format='pth'):
+    """
+    Saves RoBERTa model weights.
+    """
+    if save_format == 'pth':
+        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
+        torch.save(model.state_dict(), model_path)
+    elif save_format == 'pickle':
+        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
+        joblib.dump(model, model_path)
+    else:
+        raise ValueError(f"Unsupported save format: {save_format}")
+    print(f"Model saved to {model_path}")
+def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
+    """
+    Loads a saved RoBERTa model from disk.
+    """
+    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
+    if not os.path.exists(model_path):
+        print(f"Warning: {model_path} not found. Returning a new model instance.")
+        if metadata_dim > 0:
+            return model_class(num_labels, metadata_dim=metadata_dim).to(DEVICE)
+        else:
+            return model_class(num_labels).to(DEVICE)
+    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+    model.to(DEVICE)
+    model.eval()
+    print(f"RoBERTa model loaded from {model_path}")
+    return model
+def predict_probabilities(model, loader):
+    """
+    Generates softmax prediction probabilities from a trained RoBERTa model.
+    """
+    model.eval()
+    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]
+    with torch.no_grad():
+        for batch in tqdm(loader, desc="RoBERTa Predicting Probabilities"):
+            if len(batch) == 2:
+                inputs, _ = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                outputs = model(input_ids, attention_mask)
+            elif len(batch) == 3:
+                inputs, metadata, _ = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                metadata = metadata.to(DEVICE)
+                outputs = model(input_ids, attention_mask, metadata)
+            else:
+                raise ValueError("Unsupported batch format.")
+            for i, out_logits in enumerate(outputs):
+                probs = torch.softmax(out_logits, dim=1).cpu().numpy()
+                all_probabilities[i].extend(probs)
+    return all_probabilities