File size: 5,072 Bytes
c551752
 
 
 
 
 
 
 
47e416f
 
c551752
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import joblib
import pandas as pd
import numpy as np
import os
from lime.lime_text import LimeTextExplainer
from .common import simple_text_clean, CLASS_NAMES 

# Serialized artifacts live in the sibling 'assets' directory, resolved
# relative to this module's own location.
ASSETS_DIR = os.path.join(os.path.dirname(__file__), '..', 'assets')
PREPROCESSOR_FILENAME = "multinomial_nb_email_preprocessor.joblib"
MODEL_FILENAME = "trained_multinomial_nb_model.joblib"
PREPROCESSOR_PATH = os.path.join(ASSETS_DIR, PREPROCESSOR_FILENAME)
MODEL_PATH = os.path.join(ASSETS_DIR, MODEL_FILENAME)

# All three handles start out unset so that importing this module never
# raises; the functions below check for None before using them.
nb_preprocessor = nb_model = lime_explainer_nb = None

try:
    # Loaded one by one: a failure part-way leaves the earlier handles set
    # and the later ones as None.
    nb_preprocessor = joblib.load(PREPROCESSOR_PATH)
    nb_model = joblib.load(MODEL_PATH)
    lime_explainer_nb = LimeTextExplainer(class_names=CLASS_NAMES)
except FileNotFoundError:
    print(f"FATAL ERROR (Naive Bayes): Could not find model ('{MODEL_PATH}') or nb_preprocessor ('{PREPROCESSOR_PATH}').")
    print("Ensure files are in 'app/assets/' and filenames are correct.")
except Exception as e:
    print(f"Error loading Multinomial NB model/preprocessor or initializing LIME: {e}")
else:
    print("Multinomial NB model, Preprocessor, and LIME Explainer loaded successfully.")

# Markers that delimit the three email fields inside one combined text sample.
_SUBJECT_MARKER = "subject: "
_SENDER_MARKER = " sender: "
_BODY_MARKER = " body: "


def _split_combined_text(combined_text):
    """Split one combined sample into raw ``(subject, sender, body)`` strings."""
    if _SENDER_MARKER in combined_text:
        subject_part, rest = combined_text.split(_SENDER_MARKER, 1)
        # partition() leaves the body empty when the body marker is missing,
        # so everything after the sender marker becomes the sender.
        sender_part, _, body_part = rest.partition(_BODY_MARKER)
        # The text before the sender marker is kept even when the subject
        # marker itself is missing (e.g. the word "subject" was masked out);
        # replace() is simply a no-op in that case.
        return (
            subject_part.replace(_SUBJECT_MARKER, "").strip(),
            sender_part.strip(),
            body_part.strip(),
        )

    has_subject = _SUBJECT_MARKER in combined_text
    if has_subject and _BODY_MARKER in combined_text:
        subject_part, body_part = combined_text.split(_BODY_MARKER, 1)
        return (
            subject_part.replace(_SUBJECT_MARKER, "").strip(),
            "",
            body_part.strip(),
        )
    if has_subject:
        return combined_text.replace(_SUBJECT_MARKER, "").strip(), "", ""
    # No usable marker at all: treat the whole sample as body text.
    return "", "", combined_text.strip()


def _uniform_probabilities(n_samples):
    """Return an ``(n_samples, n_classes)`` array of equal class probabilities."""
    n_classes = len(CLASS_NAMES)
    # np.full keeps the result 2-D even when n_samples is 0.
    return np.full((n_samples, n_classes), 1.0 / n_classes)


def model_predict_probability_for_lime(combined_texts):
    """Classifier function handed to LIME: combined texts -> class probabilities.

    Each element of ``combined_texts`` is expected to look like
    ``"subject: <s> sender: <d> body: <b>"``. Samples with missing markers
    are split on whichever markers remain; a sample without any marker is
    treated entirely as body text. Every field is passed through
    ``simple_text_clean`` before being vectorized.

    Args:
        combined_texts: Sequence of combined text samples.

    Returns:
        The result of ``nb_model.predict_proba`` for the samples. If the
        model or preprocessor is not loaded, or if transform/predict raises,
        a uniform probability array with one row per sample and one column
        per entry of ``CLASS_NAMES`` is returned instead.
    """
    if nb_preprocessor is None or nb_model is None:
        return _uniform_probabilities(len(combined_texts))

    subjects, senders, bodies = [], [], []
    for combined_text in combined_texts:
        raw_subject, raw_sender, raw_body = _split_combined_text(combined_text)
        subjects.append(simple_text_clean(raw_subject))
        senders.append(simple_text_clean(raw_sender))
        bodies.append(simple_text_clean(raw_body))

    data_for_lime = pd.DataFrame({
        'subject': subjects,
        'sender': senders,
        'body': bodies,
    })

    try:
        vectorized_input = nb_preprocessor.transform(data_for_lime)
        return nb_model.predict_proba(vectorized_input)
    except Exception as e:
        # Best effort: LIME still needs an array back, so log and fall back
        # to an uninformative distribution instead of aborting.
        print(f"Error in model_predict_probability_for_lime function during transform/predict: {e}")
        return _uniform_probabilities(len(combined_texts))

def get_prediction_and_explanation_nb(subject: str, sender: str, body: str):
    """Classify one email and explain the prediction with LIME.

    Args:
        subject: Email subject line.
        sender: Email sender.
        body: Email body text.

    Returns:
        A dict with the keys ``prediction`` (entry of ``CLASS_NAMES``),
        ``label`` (int), ``confidence`` (float probability of the predicted
        label) and ``explanation`` (list of ``(token, weight)`` pairs).
        If the model is not loaded or prediction fails, the dict instead
        carries an ``error`` message, ``prediction="Error"``, ``label=-1``,
        ``confidence=0.0`` and an empty ``explanation``. If only the LIME
        step fails, the prediction is still returned and ``explanation``
        holds a single placeholder entry.
    """
    import re  # local import: only needed for the marker-token filter below

    if nb_preprocessor is None or nb_model is None:
        return {"error": "Model/Preprocessor not loaded. Check server logs.", "prediction": "Error", "label": -1, "confidence": 0.0, "explanation": []}

    cleaned_subject = simple_text_clean(subject)
    cleaned_sender = simple_text_clean(sender)
    cleaned_body = simple_text_clean(body)

    input_df_for_model = pd.DataFrame({
        'subject': [cleaned_subject],
        'sender': [cleaned_sender],
        'body': [cleaned_body],
    })

    try:
        vectorized_input = nb_preprocessor.transform(input_df_for_model)
        prediction_label_int = nb_model.predict(vectorized_input)[0]
        probabilities = nb_model.predict_proba(vectorized_input)[0]

        # The predicted label is used directly as an index into both
        # CLASS_NAMES and the probability vector.
        predicted_class_name = CLASS_NAMES[prediction_label_int]
        confidence_score = probabilities[prediction_label_int]
    except Exception as e:
        return {"error": f"Prediction error: {e}", "prediction": "Error",
                "label": -1, "confidence": 0.0, "explanation": []}

    # Use the exact field markers that model_predict_probability_for_lime
    # parses, so perturbed samples are split back into subject/sender/body
    # instead of being treated entirely as body text.
    text_for_lime = f"subject: {cleaned_subject} sender: {cleaned_sender} body: {cleaned_body}"

    # The marker words become LIME tokens too. Any of them that does not
    # occur in the email itself is purely structural and is dropped from
    # the explanation returned to the caller.
    marker_tokens = {"subject", "sender", "body"}
    content_tokens = set(
        re.split(r"\W+", f"{cleaned_subject} {cleaned_sender} {cleaned_body}")
    )

    explanation_data = []
    try:
        # top_labels is deliberately not passed: when it is set, LIME ignores
        # `labels` and explains its own top label, which can differ from
        # prediction_label_int and make as_list(label=...) raise KeyError.
        exp = lime_explainer_nb.explain_instance(
            text_instance=text_for_lime,
            classifier_fn=model_predict_probability_for_lime,
            num_features=15,
            labels=(prediction_label_int,),
        )
        explanation_data = [
            (feature, weight)
            for feature, weight in exp.as_list(label=prediction_label_int)
            if feature not in marker_tokens or feature in content_tokens
        ]
        print(f"LIME Explanation (Top 3): {explanation_data[:3]}")
    except Exception as e:
        # The explanation is best effort; the prediction is still returned.
        print(f"LIME explanation error: {e}")
        explanation_data = [("LIME explanation error or N/A", 0.0)]

    return {
        "prediction": predicted_class_name,
        "label": int(prediction_label_int),
        "confidence": float(confidence_score),
        "explanation": explanation_data,
    }