# Source: logreg__ / train_utils.py
# Author: ganeshkonapalli
# Commit: "Create train_utils.py" (89e4a53, verified)
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import joblib
from config import LABEL_COLUMNS, MODEL_SAVE_DIR
def train_logreg_models(X, y, label_encoders, model_class):
    """
    Fits an independent model for every entry of LABEL_COLUMNS.

    Args:
        X (array-like): Feature matrix shared by all models.
        y (DataFrame): Targets; indexed by each name in LABEL_COLUMNS.
        label_encoders (dict): Accepted for interface compatibility; this
            function does not read it.
        model_class: Zero-argument callable producing an object with a
            ``fit(X, y)`` method.
    Returns:
        dict: Fitted model objects keyed by label column name.
    """
    fitted = {}
    for col in LABEL_COLUMNS:
        print(f"Training Logistic Regression model for {col}...")
        # A fresh estimator per label so no state is shared between targets.
        estimator = model_class()
        estimator.fit(X, y[col])
        fitted[col] = estimator
    return fitted
def evaluate_logreg_models(models, X_val, y_val, label_encoders):
    """
    Scores each per-label model against the validation set.

    Args:
        models (dict): Fitted models keyed by label column name.
        X_val (array-like): Validation features passed to ``predict``.
        y_val (DataFrame): Validation targets; indexed by each name in
            LABEL_COLUMNS.
        label_encoders (dict): Accepted for interface compatibility; this
            function does not read it.
    Returns:
        tuple: ``(reports, truths, predictions)`` where ``reports`` maps a
        label name to its classification-report dict, and ``truths`` /
        ``predictions`` are lists (in LABEL_COLUMNS order) of the true and
        predicted label lists.
    """
    reports, truths, predictions = {}, [], []
    for col in LABEL_COLUMNS:
        classifier = models[col]
        expected = y_val[col]
        predicted = classifier.predict(X_val)

        truths.append(expected.tolist())
        predictions.append(predicted.tolist())

        # zero_division=0 is forwarded to classification_report as in the
        # original call.
        reports[col] = classification_report(
            expected,
            predicted,
            output_dict=True,
            zero_division=0,
        )
    return reports, truths, predictions
def summarize_metrics(metrics):
    """
    Flattens per-field classification reports into one summary table.

    Args:
        metrics (dict): Maps a field name to a report dictionary (e.g. the
            ``reports`` value returned by ``evaluate_logreg_models``). Each
            report must contain a 'weighted avg' entry; 'accuracy' is
            optional.
    Returns:
        DataFrame: One row per field with the columns Field, Precision,
        Recall, F1-Score, Accuracy and Support. Metrics missing from a
        report are reported as 0. An empty ``metrics`` dict yields an empty
        frame that still carries these columns.
    Raises:
        KeyError: If a report has no 'weighted avg' entry.
    """
    # Declared up front so the frame has a stable schema even with no rows;
    # without this, an empty input produced a column-less DataFrame.
    columns = ["Field", "Precision", "Recall", "F1-Score", "Accuracy", "Support"]

    rows = []
    for field, report in metrics.items():
        weighted = report['weighted avg']
        rows.append({
            "Field": field,
            "Precision": weighted.get('precision', 0),
            "Recall": weighted.get('recall', 0),
            "F1-Score": weighted.get('f1-score', 0),
            "Accuracy": report.get('accuracy', 0),
            "Support": weighted.get('support', 0),
        })
    return pd.DataFrame(rows, columns=columns)
def save_logreg_models(models, model_name):
    """
    Serializes the models dictionary to ``<MODEL_SAVE_DIR>/<model_name>.pkl``.

    Args:
        models (dict): Object to persist (the per-label models dictionary).
        model_name (str): File stem; ``.pkl`` is appended.
    Returns:
        None. The destination path is printed after a successful write.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

    # Create the destination directory first so a fresh checkout (where the
    # save directory does not exist yet) does not fail on the first save.
    parent_dir = os.path.dirname(model_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(models, model_path)
    print(f"Saved Logistic Regression models to {model_path}")
def load_logreg_models(model_name):
    """
    Reads back the object stored at ``<MODEL_SAVE_DIR>/<model_name>.pkl``.

    Args:
        model_name (str): File stem; ``.pkl`` is appended.
    Returns:
        The object returned by ``joblib.load`` for that path.
    Raises:
        FileNotFoundError: If no file exists at the computed path.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
    if os.path.exists(model_path):
        # NOTE(review): joblib.load is pickle-based per the joblib docs;
        # only load files from a trusted location.
        loaded = joblib.load(model_path)
        print(f"Loaded Logistic Regression models from {model_path}")
        return loaded
    raise FileNotFoundError(f"Model not found at {model_path}")
def predict_logreg_probabilities(models, X):
    """
    Collects ``predict_proba`` output from every per-label model.

    Args:
        models (dict): Fitted models keyed by label column name.
        X (array-like): Features passed to each model's ``predict_proba``.
    Returns:
        list: One ``predict_proba`` result per label, in LABEL_COLUMNS order.
    """
    return [models[col].predict_proba(X) for col in LABEL_COLUMNS]