File size: 3,279 Bytes
89e4a53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm
import joblib

from config import LABEL_COLUMNS, MODEL_SAVE_DIR


def train_logreg_models(X, y, label_encoders, model_class):
    """
    Fits one independent Logistic Regression classifier per label column.

    Args:
        X (array-like): Feature matrix (e.g., TF-IDF vectors).
        y (DataFrame): Targets; must contain every column in LABEL_COLUMNS.
        label_encoders (dict): Label encoders for each target (not used here;
            kept so all training helpers share the same signature).
        model_class: LogisticRegression class, instantiated with defaults.

    Returns:
        dict: Mapping of label name -> fitted model.
    """
    fitted = {}
    for label in LABEL_COLUMNS:
        print(f"Training Logistic Regression model for {label}...")
        estimator = model_class()
        estimator.fit(X, y[label])
        fitted[label] = estimator
    return fitted


def evaluate_logreg_models(models, X_val, y_val, label_encoders):
    """
    Scores each per-label Logistic Regression model on held-out data.

    Args:
        models (dict): Fitted model per label column.
        X_val (array-like): Validation feature matrix.
        y_val (DataFrame): Validation targets, one column per label.
        label_encoders (dict): Encoders used for decoding (not used here;
            kept so all evaluation helpers share the same signature).

    Returns:
        tuple: (reports dict keyed by label, list of true-label lists,
        list of predicted-label lists), lists ordered as LABEL_COLUMNS.
    """
    per_label_reports = {}
    all_true = []
    all_pred = []

    for label in LABEL_COLUMNS:
        gold = y_val[label]
        guess = models[label].predict(X_val)

        all_true.append(gold.tolist())
        all_pred.append(guess.tolist())

        per_label_reports[label] = classification_report(
            gold, guess, output_dict=True, zero_division=0
        )

    return per_label_reports, all_true, all_pred


def summarize_metrics(metrics):
    """
    Flattens per-label classification reports into a one-row-per-label table.

    Args:
        metrics (dict): Mapping of label name -> classification report dict,
            as produced by sklearn's classification_report(output_dict=True).

    Returns:
        pd.DataFrame: Columns Field, Precision, Recall, F1-Score, Accuracy,
        Support — one row per label, in the dict's iteration order. Any
        metric missing from a report is reported as 0.
    """
    rows = []
    for field, report in metrics.items():
        # 'weighted avg' can be absent from a degenerate report; fall back to
        # an empty dict so every metric defaults to 0 instead of raising
        # KeyError (matching the .get(..., 0) defense used per metric).
        weighted = report.get('weighted avg', {})
        rows.append({
            "Field": field,
            "Precision": weighted.get('precision', 0),
            "Recall": weighted.get('recall', 0),
            "F1-Score": weighted.get('f1-score', 0),
            "Accuracy": report.get('accuracy', 0),
            "Support": weighted.get('support', 0),
        })
    return pd.DataFrame(rows)


def save_logreg_models(models, model_name):
    """Persist the per-label model dict to MODEL_SAVE_DIR as <model_name>.pkl."""
    destination = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
    joblib.dump(models, destination)
    print(f"Saved Logistic Regression models to {destination}")


def load_logreg_models(model_name):
    """Load the per-label model dict saved under <model_name>.pkl.

    Raises:
        FileNotFoundError: If no pickle exists at the expected path.
    """
    source = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
    if not os.path.exists(source):
        raise FileNotFoundError(f"Model not found at {source}")
    loaded = joblib.load(source)
    print(f"Loaded Logistic Regression models from {source}")
    return loaded


def predict_logreg_probabilities(models, X):
    """
    Collects class-probability predictions from every per-label model.

    Args:
        models (dict): Fitted model per label column.
        X (array-like): Feature matrix to score.

    Returns:
        list: predict_proba output for each label, in LABEL_COLUMNS order.
    """
    return [models[label].predict_proba(X) for label in LABEL_COLUMNS]