import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm

from config import LABEL_COLUMNS, MODEL_SAVE_DIR


def train_logreg_models(X, y, label_encoders, model_class):
    """
    Trains one model per label column.

    A fresh ``model_class()`` instance (default constructor arguments) is
    fitted on ``X`` against ``y[col]`` for every ``col`` in ``LABEL_COLUMNS``.

    Args:
        X (array-like): Feature matrix (e.g., TF-IDF vectors).
        y (DataFrame): Target DataFrame containing all label columns.
        label_encoders (dict): Label encoders for each target. Not used by
            this function; kept so the signature stays stable for callers.
        model_class: Estimator class (e.g. LogisticRegression) that can be
            instantiated without arguments and exposes ``fit``.

    Returns:
        dict: Trained models keyed by label name.
    """
    models = {}
    for col in LABEL_COLUMNS:
        print(f"Training Logistic Regression model for {col}...")
        model = model_class()
        model.fit(X, y[col])
        models[col] = model
    return models


def evaluate_logreg_models(models, X_val, y_val, label_encoders):
    """
    Evaluates the per-label models on validation data.

    Args:
        models (dict): Dictionary of trained models per label.
        X_val (array-like): Validation features.
        y_val (DataFrame): Validation labels.
        label_encoders (dict): Encoders used for decoding. Not used by this
            function; kept so the signature stays stable for callers.

    Returns:
        tuple: (classification_reports, true_labels_list,
        predicted_labels_list). The reports are a dict keyed by label name;
        the two lists hold one list per label, in ``LABEL_COLUMNS`` order.
    """
    reports = {}
    truths = []
    predictions = []

    for col in LABEL_COLUMNS:
        y_true = y_val[col]
        y_pred = models[col].predict(X_val)

        truths.append(y_true.tolist())
        predictions.append(y_pred.tolist())

        # zero_division=0 scores classes with no predicted/true samples as 0
        # instead of emitting a warning.
        reports[col] = classification_report(
            y_true, y_pred, output_dict=True, zero_division=0
        )

    return reports, truths, predictions


def summarize_metrics(metrics):
    """
    Flattens per-label classification reports into one summary table.

    Args:
        metrics (dict): Maps a label name to a ``classification_report``
            dict (``output_dict=True``). Each report must contain a
            ``'weighted avg'`` entry; a missing one raises ``KeyError``.

    Returns:
        DataFrame: One row per label with the columns Field, Precision,
        Recall, F1-Score, Accuracy and Support. Metrics absent from a report
        default to 0. The columns are present even when ``metrics`` is empty.
    """
    columns = ["Field", "Precision", "Recall", "F1-Score", "Accuracy", "Support"]
    rows = []
    for field, report in metrics.items():
        weighted = report["weighted avg"]
        rows.append({
            "Field": field,
            "Precision": weighted.get("precision", 0),
            "Recall": weighted.get("recall", 0),
            "F1-Score": weighted.get("f1-score", 0),
            "Accuracy": report.get("accuracy", 0),
            "Support": weighted.get("support", 0),
        })
    # Explicit columns keep the schema stable for an empty ``metrics`` dict.
    return pd.DataFrame(rows, columns=columns)


def save_logreg_models(models, model_name):
    """
    Persists the models dictionary to ``MODEL_SAVE_DIR/<model_name>.pkl``.

    Args:
        models (dict): Trained models keyed by label name.
        model_name (str): File name (without extension) to save under.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

    # joblib.dump does not create missing parent directories, so make sure
    # the target directory exists before writing.
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    joblib.dump(models, model_path)
    print(f"Saved Logistic Regression models to {model_path}")


def load_logreg_models(model_name):
    """
    Loads a models dictionary from ``MODEL_SAVE_DIR/<model_name>.pkl``.

    Args:
        model_name (str): File name (without extension) to load.

    Returns:
        The object stored in the file (the models dict written by
        ``save_logreg_models``).

    Raises:
        FileNotFoundError: If the model file does not exist.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found at {model_path}")

    # NOTE(security): joblib.load unpickles arbitrary objects and can execute
    # code embedded in the file; only load model files from trusted sources.
    models = joblib.load(model_path)
    print(f"Loaded Logistic Regression models from {model_path}")
    return models


def predict_logreg_probabilities(models, X):
    """
    Returns probability distributions for each label.

    Args:
        models (dict): Trained models keyed by label name; each must expose
            ``predict_proba``.
        X (array-like): Feature matrix to score.

    Returns:
        list: One ``predict_proba`` result per label, in ``LABEL_COLUMNS``
        order.
    """
    return [models[col].predict_proba(X) for col in LABEL_COLUMNS]