LT360 committed on
Commit
b5c79a0
·
1 Parent(s): e166f59

Initial commit of multinomial-nb-phishing-email-detection-api

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ app/assets/*.joblib filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .nox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *.cover
46
+ *.log
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Environments
51
+ .env
52
+ .venv
53
+ env/
54
+ venv/
55
+ ENV/
56
+ env.bak/
57
+ venv.bak/
58
+ phishing_api_env/
59
+
60
+ # VS Code
61
+ .vscode/*
62
+ !.vscode/settings.json
63
+ !.vscode/tasks.json
64
+ !.vscode/launch.json
65
+ !.vscode/extensions.json
66
+ *.code-workspace
67
+
68
+ # Jupyter Notebook
69
+ .ipynb_checkpoints
70
+
71
+ # Personal files
72
+ secrets.py
73
+ local_settings.py
74
+
75
+ # OS generated files
76
+ .DS_Store
77
+ .DS_Store?
78
+ ._*
79
+ .Spotlight-V100
80
+ .Trashes
81
+ ehthumbs.db
82
+ Thumbs.db
.vscode/launch.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "name": "Python Debugger: FastAPI",
9
+ "type": "debugpy",
10
+ "request": "launch",
11
+ "module": "uvicorn",
12
+ "args": [
13
+ "app.main:app",
14
+ "--reload"
15
+ ],
16
+ "jinja": true
17
+ }
18
+ ]
19
+ }
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ # Install dependencies
8
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
9
+
10
+ COPY ./app /code/app
11
+
12
+ # Run the Uvicorn server when the container starts
13
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
app/__init__.py ADDED
File without changes
app/assets/email_preprocessor_20250506_203148.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:279f139d98042e89d2d46a30c37a0ea32e1aaddae7ae247920476474af43a26a
3
+ size 639092
app/assets/phishing_nb_model_20250506_203148.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5480ff6d4f84e518148e2c415164f50e25e1f1312733ed38717a8a36186b9497
3
+ size 544791
app/main.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from typing import List, Tuple, Optional
4
+ from .ml_logic import get_prediction_and_explanation # helper function from ml_logic
5
+
6
+ app = FastAPI(title="AI-Powered Phishing Email Detection System")
7
+
8
# Input data model: the JSON body accepted by POST /predict.
class EmailInput(BaseModel):
    # Header fields are optional and default to an empty string when omitted.
    subject: Optional[str] = ""
    sender: Optional[str] = ""
    # Message text; the only field without a default, so it is required.
    body: str
13
+
14
# Output data model: the JSON body returned by POST /predict.
class PredictionResponse(BaseModel):
    # Class name of the prediction; the endpoint sets "Error" on failure.
    prediction: str
    # Integer class label; the endpoint sets -1 on failure.
    label: int
    # Probability assigned to the predicted class; 0.0 on failure.
    confidence: float
    # (word, weight) pairs describing the explanation; empty on failure.
    explanation: List[Tuple[str, float]]
    # Failure description; stays None when the prediction succeeded.
    error: Optional[str] = None
21
+
22
+
23
@app.get("/")
async def root():
    # Landing endpoint: returns a short usage hint pointing clients at /predict.
    usage_hint = "AI-Powered Phishing Email Detection API. POST to /predict with 'subject', 'sender', 'body'."
    return {"message": usage_hint}
26
+
27
@app.post("/predict", response_model=PredictionResponse)
async def predict_email(email_input: EmailInput):
    # Classify one email and return the prediction with its explanation.
    # Failures are reported inside a normal response body (prediction="Error",
    # label=-1, error=<message>) rather than by raising.

    def _failure(message):
        # Uniform error payload shared by both failure paths below.
        return PredictionResponse(
            prediction="Error",
            label=-1,
            confidence=0.0,
            explanation=[],
            error=message,
        )

    try:
        # None is coerced to "" so the ML layer always receives strings for
        # the optional header fields.
        subject_text = email_input.subject or ""
        sender_text = email_input.sender or ""
        result = get_prediction_and_explanation(
            subject_text,
            sender_text,
            email_input.body,
        )
        failure_reason = result.get("error")
        if failure_reason:
            return _failure(failure_reason)
        return PredictionResponse(**result)
    except Exception as e:
        return _failure(f"API error: {str(e)}")
app/ml_logic.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ import pandas as pd
3
+ import re
4
+ from lime.lime_text import LimeTextExplainer
5
+ import numpy as np
6
+ import os
7
+
8
# Locations of the serialized preprocessor and model shipped next to this module.
ASSETS_DIR = os.path.join(os.path.dirname(__file__), 'assets')
PREPROCESSOR_FILENAME = "email_preprocessor_20250506_203148.joblib"
MODEL_FILENAME = "phishing_nb_model_20250506_203148.joblib"
PREPROCESSOR_PATH = os.path.join(ASSETS_DIR, PREPROCESSOR_FILENAME)
MODEL_PATH = os.path.join(ASSETS_DIR, MODEL_FILENAME)

# Load model and preprocessor at import time. Both names stay None when
# loading fails, which is the state the functions in this module check for.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted artifacts.
preprocessor = None
model = None
try:
    # Assigned together so a failure on either file leaves both as None.
    preprocessor, model = joblib.load(PREPROCESSOR_PATH), joblib.load(MODEL_PATH)
except FileNotFoundError:
    print(f"FATAL ERROR: Could not find model ('{MODEL_PATH}') or preprocessor ('{PREPROCESSOR_PATH}').")
    print("Ensure files are in 'app/assets/' and filenames are correct in ml_logic.py.")
except Exception as e:
    print(f"Error loading ML model/preprocessor: {e}")
else:
    print("ML Model and Preprocessor loaded successfully from ml_logic.")
29
+
30
def simple_text_clean(text):
    """Normalize raw text before it is handed to the preprocessor.

    The input is lowercased, every character that is not a lowercase ASCII
    letter, a digit, or whitespace is deleted (not replaced by a space), and
    runs of whitespace are collapsed to a single space with the ends trimmed.

    Returns an empty string for any non-string input.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()
39
+
40
# LIME setup for explainability.
# The list position is the integer class label: 0 -> Legitimate, 1 -> Phishing.
class_names = ['Legitimate', 'Phishing']
# Single explainer instance created at import time and reused for every request.
explainer = LimeTextExplainer(class_names=class_names)
43
+
44
def _split_combined_text(combined_text):
    """Split one marker-delimited string into raw (subject, sender, body) parts.

    Recognized markers are "subject: ", " sender: " and " body: ". Text with
    none of the markers is treated entirely as body.
    """
    subject_marker = "subject: "
    sender_marker = " sender: "
    body_marker = " body: "

    subject_part = sender_part = body_part = ""

    before_sender, sender_found, after_sender = combined_text.partition(sender_marker)
    if sender_found:
        if subject_marker in before_sender:
            subject_part = before_sender.replace(subject_marker, "").strip()
        # Everything after the sender marker is sender text up to the body
        # marker (or to the end of the string when there is no body marker).
        raw_sender, body_found, raw_body = after_sender.partition(body_marker)
        sender_part = raw_sender.strip()
        if body_found:
            body_part = raw_body.strip()
    elif subject_marker in combined_text:
        # No sender marker: subject runs up to the body marker if present,
        # otherwise the whole string (minus the marker) is the subject.
        raw_subject, body_found, raw_body = combined_text.partition(body_marker)
        subject_part = raw_subject.replace(subject_marker, "").strip()
        if body_found:
            body_part = raw_body.strip()
    else:
        body_part = combined_text.strip()

    return subject_part, sender_part, body_part


def model_predict_probability_for_lime(combined_texts):
    """Classifier callback handed to LIME.

    Each string in ``combined_texts`` is split into subject/sender/body,
    cleaned with ``simple_text_clean``, assembled into a DataFrame with the
    columns 'subject', 'sender' and 'body', transformed by the module-level
    ``preprocessor`` and scored with ``model.predict_proba``.

    Returns the ``predict_proba`` result. When the model or preprocessor is
    not loaded, or transform/predict raises, a 0.5/0.5 row per input text is
    returned instead so the caller never sees an exception from here.
    """
    if preprocessor is None or model is None:
        return np.array([[0.5, 0.5]] * len(combined_texts))

    subjects, senders, bodies = [], [], []
    for text in combined_texts:
        raw_subject, raw_sender, raw_body = _split_combined_text(text)
        subjects.append(simple_text_clean(raw_subject))
        senders.append(simple_text_clean(raw_sender))
        bodies.append(simple_text_clean(raw_body))

    data_for_lime = pd.DataFrame({
        'subject': subjects,
        'sender': senders,
        'body': bodies,
    })

    try:
        return model.predict_proba(preprocessor.transform(data_for_lime))
    except Exception as e:
        print(f"Error in model_predict_probability_for_lime function during transform/predict: {e}")
        return np.array([[0.5, 0.5]] * len(combined_texts))
98
+
99
def get_prediction_and_explanation(subject: str, sender: str, body: str):
    """Classify one email and explain the decision with LIME.

    Args:
        subject: Raw subject line.
        sender: Raw sender string.
        body: Raw message body.

    Returns:
        A dict with the keys "prediction" (class name), "label" (int),
        "confidence" (probability of the predicted class) and "explanation"
        (list of (word, weight) pairs from LIME). When the model is not
        loaded or prediction fails, the dict additionally carries "error"
        and uses prediction="Error", label=-1, confidence=0.0 and an empty
        explanation. A LIME failure alone does not fail the request; the
        explanation is replaced by a single placeholder entry.
    """
    if preprocessor is None or model is None:
        return {"error": "Model/Preprocessor not loaded. Check server logs.", "prediction": "Error", "label": -1, "confidence": 0.0, "explanation": []}

    cleaned_subject = simple_text_clean(subject)
    cleaned_sender = simple_text_clean(sender)
    cleaned_body = simple_text_clean(body)

    input_df_for_model = pd.DataFrame({
        'subject': [cleaned_subject],
        'sender': [cleaned_sender],
        'body': [cleaned_body]
    })

    try:
        vectorized_input = preprocessor.transform(input_df_for_model)
        # Convert once to a plain int: it is used as a list index, as the
        # LIME label key, and as a JSON-serializable response value.
        prediction_label_int = int(model.predict(vectorized_input)[0])
        probabilities = model.predict_proba(vectorized_input)[0]

        predicted_class_name = class_names[prediction_label_int]
        confidence_score = float(probabilities[prediction_label_int])
    except Exception as e:
        return {"error": f"Prediction error: {e}", "prediction": "Error",
                "label": -1, "confidence": 0.0, "explanation": []}

    # NOTE(review): model_predict_probability_for_lime parses "subject: " /
    # " sender: " / " body: " markers, but this " : "-joined string contains
    # none of them, so LIME scores every token as body text. Switching to the
    # marker format would add the marker words to LIME's perturbable
    # vocabulary, so the format is left as-is pending a design decision.
    text_for_lime = f"{cleaned_subject} : {cleaned_sender} : {cleaned_body}"

    explanation_data = []
    try:
        # top_labels is deliberately not passed: when it is set, LIME ignores
        # `labels` and explains whichever class the LIME-side classifier ranks
        # highest, which can differ from the model's prediction and then makes
        # as_list(label=...) raise KeyError.
        exp = explainer.explain_instance(
            text_instance=text_for_lime,
            classifier_fn=model_predict_probability_for_lime,
            num_features=15,
            labels=(prediction_label_int,)
        )
        explanation_data = exp.as_list(label=prediction_label_int)
        print(f"LIME Explanation (Top 3): {explanation_data[:3]}")
    except Exception as e:
        # Best effort: the prediction is still returned without an explanation.
        print(f"LIME explanation error: {e}")
        explanation_data = [("LIME explanation error or N/A", 0.0)]

    return {
        "prediction": predicted_class_name,
        "label": prediction_label_int,
        "confidence": confidence_score,
        "explanation": explanation_data
    }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ scikit-learn
4
+ pandas
5
+ joblib
6
+ scipy
7
+ numpy
8
+ lime
9
+ python-multipart
10
+ dill