Spaces:
Running
Running
| import os | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.metrics import classification_report | |
| from tqdm import tqdm | |
| import joblib | |
| from config import LABEL_COLUMNS, MODEL_SAVE_DIR | |
def train_logreg_models(X, y, label_encoders, model_class):
    """
    Fit one independent model for every entry in LABEL_COLUMNS.

    Args:
        X: Feature matrix, passed unchanged to each model's ``fit``.
        y: Object indexed by label name (``y[col]``) to obtain the
            target values for that label.
        label_encoders: Accepted for interface compatibility; this
            function does not read it.
        model_class: Zero-argument callable that returns a fresh,
            unfitted model exposing ``fit(X, target)``.

    Returns:
        dict: Fitted models keyed by label name.
    """
    fitted_by_label = {}
    for col in LABEL_COLUMNS:
        print(f"Training Logistic Regression model for {col}...")
        # A new instance per label, so no fitted state is shared
        # between the different targets.
        classifier = model_class()
        classifier.fit(X, y[col])
        fitted_by_label[col] = classifier
    return fitted_by_label
def evaluate_logreg_models(models, X_val, y_val, label_encoders):
    """
    Score each per-label model against held-out data.

    Args:
        models (dict): Fitted models keyed by label name; every entry of
            LABEL_COLUMNS must be present.
        X_val: Validation features, passed unchanged to ``predict``.
        y_val: Object indexed by label name (``y_val[col]``); each item
            must provide ``tolist()``.
        label_encoders: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        tuple: ``(reports, truths, predictions)`` where ``reports`` maps
        each label to its ``classification_report`` dict, and ``truths``
        / ``predictions`` are lists (in LABEL_COLUMNS order) holding the
        true and predicted values for each label as plain lists.
    """
    per_label_reports = {}
    true_labels, predicted_labels = [], []

    for col in LABEL_COLUMNS:
        classifier = models[col]
        expected = y_val[col]
        predicted = classifier.predict(X_val)

        # zero_division=0 reports 0 for classes that were never
        # predicted, rather than emitting warnings.
        per_label_reports[col] = classification_report(
            expected, predicted, output_dict=True, zero_division=0
        )

        true_labels.append(expected.tolist())
        predicted_labels.append(predicted.tolist())

    return per_label_reports, true_labels, predicted_labels
def summarize_metrics(metrics):
    """
    Flatten per-label classification reports into a summary table.

    Args:
        metrics (dict): Maps a field (label) name to a report dict shaped
            like ``classification_report(..., output_dict=True)``. Only
            the ``'weighted avg'`` entry and the ``'accuracy'`` value are
            read.

    Returns:
        pandas.DataFrame: One row per field with the columns ``Field``,
        ``Precision``, ``Recall``, ``F1-Score``, ``Accuracy`` and
        ``Support``. Any metric missing from a report is reported as 0.
        An empty ``metrics`` mapping yields an empty frame that still
        carries these columns.
    """
    columns = ["Field", "Precision", "Recall", "F1-Score", "Accuracy", "Support"]

    rows = []
    for field, report in metrics.items():
        # Fall back to an empty dict so a report without a weighted
        # average defaults to 0, like every other missing metric here,
        # instead of raising KeyError.
        weighted = report.get('weighted avg', {})
        rows.append({
            "Field": field,
            "Precision": weighted.get('precision', 0),
            "Recall": weighted.get('recall', 0),
            "F1-Score": weighted.get('f1-score', 0),
            "Accuracy": report.get('accuracy', 0),
            "Support": weighted.get('support', 0),
        })

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=columns)
def save_logreg_models(models, model_name):
    """
    Persist the per-label model dictionary to disk with joblib.

    The file is written to ``<MODEL_SAVE_DIR>/<model_name>.pkl``. The
    destination directory is created first if it does not exist, so a
    fresh checkout or container does not fail on the first save.

    Args:
        models (dict): Trained models keyed by label name.
        model_name (str): Base file name, without the ``.pkl`` extension.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

    # joblib.dump does not create missing parent directories.
    target_dir = os.path.dirname(model_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    joblib.dump(models, model_path)
    print(f"Saved Logistic Regression models to {model_path}")
def load_logreg_models(model_name):
    """
    Load a previously saved model dictionary from MODEL_SAVE_DIR.

    Args:
        model_name (str): Base file name, without the ``.pkl`` extension.

    Returns:
        The object stored in ``<MODEL_SAVE_DIR>/<model_name>.pkl``.

    Raises:
        FileNotFoundError: If no file exists at the expected path.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

    if os.path.exists(model_path):
        # NOTE(review): joblib.load unpickles the file; only load model
        # files from a trusted source.
        models = joblib.load(model_path)
        print(f"Loaded Logistic Regression models from {model_path}")
        return models

    raise FileNotFoundError(f"Model not found at {model_path}")
def predict_logreg_probabilities(models, X):
    """
    Collect ``predict_proba`` output from every per-label model.

    Args:
        models (dict): Fitted models keyed by label name; every entry of
            LABEL_COLUMNS must be present.
        X: Feature matrix, passed unchanged to each ``predict_proba``.

    Returns:
        list: One entry per label, in LABEL_COLUMNS order, each being
        that model's ``predict_proba(X)`` result.
    """
    return [models[col].predict_proba(X) for col in LABEL_COLUMNS]