Spaces:

lleratodev
/

multinomial-nb-phishing-email-detection-api

Sleeping

App Files Files Community

LT360 committed on May 6, 2025

Commit

b5c79a0

1 Parent(s): e166f59

Initial commit of multinomial-nb-phishing-email-detection-api

Browse files

Files changed (10) hide show

.gitattributes +1 -0
.gitignore +82 -0
.vscode/launch.json +19 -0
Dockerfile +13 -0
app/__init__.py +0 -0
app/assets/email_preprocessor_20250506_203148.joblib +3 -0
app/assets/phishing_nb_model_20250506_203148.joblib +3 -0
app/main.py +39 -0
app/ml_logic.py +146 -0
requirements.txt +10 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+app/assets/*.joblib filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,82 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.hypothesis/
+.pytest_cache/
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+phishing_api_env/
+# VS Code
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+# Jupyter Notebook
+.ipynb_checkpoints
+# Personal files
+secrets.py
+local_settings.py
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db

.vscode/launch.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: FastAPI",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "app.main:app",
+                "--reload"
+            ],
+            "jinja": true
+        }
+    ]
+}

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+# Base image: Python 3.9, slim variant.
+FROM python:3.9-slim
+# All following paths are relative to /code; the app is started from here as "app.main:app".
+WORKDIR /code
+# requirements.txt is copied on its own, before the application code, so the
+# dependency-install layer below can be reused when only ./app changes.
+COPY ./requirements.txt /code/requirements.txt
+# Install dependencies
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Application package, including the model artifacts under app/assets.
+COPY ./app /code/app
+# Run the Uvicorn server when the container starts
+# (binds to all interfaces on port 7860).
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

app/__init__.py ADDED Viewed

File without changes

app/assets/email_preprocessor_20250506_203148.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:279f139d98042e89d2d46a30c37a0ea32e1aaddae7ae247920476474af43a26a
+size 639092

app/assets/phishing_nb_model_20250506_203148.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5480ff6d4f84e518148e2c415164f50e25e1f1312733ed38717a8a36186b9497
+size 544791

app/main.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List, Tuple, Optional
+from .ml_logic import get_prediction_and_explanation # helper function from ml_logic
+# ASGI application object; Uvicorn is pointed at it as "app.main:app".
+app = FastAPI(title="AI-Powered Phishing Email Detection System")
+# Input data model: the JSON body accepted by POST /predict.
+class EmailInput(BaseModel):
+    # subject and sender may be omitted (default "") or sent as null;
+    # predict_email turns a null into "" before calling the model.
+    subject: Optional[str] = ""
+    sender: Optional[str] = ""
+    # The only required field.
+    body: str
+# Output data model returned by POST /predict, for successes and failures alike.
+class PredictionResponse(BaseModel):
+    # Predicted class name, or "Error" when the request could not be scored.
+    prediction: str
+    # Integer class label; -1 is used for errors.
+    label: int
+    # Probability the model assigned to the predicted class; 0.0 on error.
+    confidence: float
+    # (token, weight) pairs produced by the LIME explainer; empty when the
+    # prediction itself failed.
+    explanation: List[Tuple[str, float]]
+    # Human-readable failure description; None when the prediction succeeded.
+    error: Optional[str] = None
+@app.get("/")
+async def root():
+    return {"message": "AI-Powered Phishing Email Detection API. POST to /predict with 'subject', 'sender', 'body'."}
+@app.post("/predict", response_model=PredictionResponse)
+async def predict_email(email_input: EmailInput):
+    try:
+        result = get_prediction_and_explanation(
+            email_input.subject or "",
+            email_input.sender or "",
+            email_input.body
+        )
+        if "error" in result and result["error"]:
+             return PredictionResponse(prediction="Error", label=-1, confidence=0.0, explanation=[], error=result["error"])
+        return PredictionResponse(**result)
+    except Exception as e:
+        return PredictionResponse(prediction="Error", label=-1, confidence=0.0, explanation=[], error=f"API error: {str(e)}")

app/ml_logic.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import joblib
+import pandas as pd
+import re
+from lime.lime_text import LimeTextExplainer
+import numpy as np
+import os
+# Configure and setup model and preprocessor files
+ASSETS_DIR = os.path.join(os.path.dirname(__file__), 'assets')
+PREPROCESSOR_FILENAME = "email_preprocessor_20250506_203148.joblib"
+MODEL_FILENAME = "phishing_nb_model_20250506_203148.joblib"
+PREPROCESSOR_PATH = os.path.join(ASSETS_DIR, PREPROCESSOR_FILENAME)
+MODEL_PATH = os.path.join(ASSETS_DIR, MODEL_FILENAME)
+# Load model and preprocessor
+try:
+    preprocessor = joblib.load(PREPROCESSOR_PATH)
+    model = joblib.load(MODEL_PATH)
+    print("ML Model and Preprocessor loaded successfully from ml_logic.")
+except FileNotFoundError:
+    print(f"FATAL ERROR: Could not find model ('{MODEL_PATH}') or preprocessor ('{PREPROCESSOR_PATH}').")
+    print("Ensure files are in 'app/assets/' and filenames are correct in ml_logic.py.")
+    preprocessor = None
+    model = None
+except Exception as e:
+    print(f"Error loading ML model/preprocessor: {e}")
+    preprocessor = None
+    model = None
+# Text cleaning function, makes everything lowercase, removed non alpha-numeric characters and normalize white spaces
+def simple_text_clean(text):
+    if isinstance(text, str):
+        text = text.lower()
+        text = re.sub(r'[^a-z0-9\s]', '', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+    else:
+        text = ''
+    return text
+# LIME setup for explainability.
+# class_names is indexed with the model's integer label further down, so its
+# order has to match the label encoding.
+class_names = ['Legitimate', 'Phishing'] # 0: Legitimate, 1: Phishing
+explainer = LimeTextExplainer(class_names=class_names)
+def model_predict_probability_for_lime(combined_texts):
+    if preprocessor is None or model is None:
+        return np.array([[0.5, 0.5]] * len(combined_texts))
+    subjects = []
+    senders = []
+    bodies = []
+    for combined_text in combined_texts:
+        s_marker = "subject: "
+        d_marker = " sender: "
+        b_marker = " body: "
+        s_text, d_text, b_text = "", "", ""
+        if d_marker in combined_text:
+            s_text_part, rest = combined_text.split(d_marker, 1)
+            if s_marker in s_text_part:
+                s_text = s_text_part.replace(s_marker, "").strip()
+            if b_marker in rest:
+                d_text_part, b_text_part = rest.split(b_marker, 1)
+                d_text = d_text_part.strip()
+                b_text = b_text_part.strip()
+            else:
+                d_text = rest.strip()
+        else:
+             if s_marker in combined_text and b_marker in combined_text :
+                  s_text_part, b_text_part = combined_text.split(b_marker, 1)
+                  s_text = s_text_part.replace(s_marker, "").strip()
+                  b_text = b_text_part.strip()
+             elif s_marker in combined_text:
+                  s_text = combined_text.replace(s_marker,"").strip()
+             else:
+                  b_text = combined_text.strip()
+        subjects.append(simple_text_clean(s_text))
+        senders.append(simple_text_clean(d_text))
+        bodies.append(simple_text_clean(b_text))
+    data_for_lime = pd.DataFrame({
+        'subject': subjects,
+        'sender': senders,
+        'body': bodies
+    })
+    try:
+        vectorized_input = preprocessor.transform(data_for_lime)
+        probabilities = model.predict_proba(vectorized_input)
+        return probabilities
+    except Exception as e:
+        print(f"Error in model_predict_probability_for_lime function during transform/predict: {e}")
+        return np.array([[0.5, 0.5]] * len(combined_texts))
+def get_prediction_and_explanation(subject: str, sender: str, body: str):
+    if preprocessor is None or model is None:
+        return {"error": "Model/Preprocessor not loaded. Check server logs.", "prediction": "Error", "label": -1, "confidence": 0.0, "explanation": []}
+    cleaned_subject = simple_text_clean(subject)
+    cleaned_sender = simple_text_clean(sender)
+    cleaned_body = simple_text_clean(body)
+    input_df_for_model = pd.DataFrame({
+        'subject': [cleaned_subject],
+        'sender': [cleaned_sender],
+        'body': [cleaned_body]
+        })
+    try:
+        vectorized_input = preprocessor.transform(input_df_for_model)
+        prediction_label_int = model.predict(vectorized_input)[0]
+        probabilities = model.predict_proba(vectorized_input)[0]
+        predicted_class_name = class_names[prediction_label_int]
+        confidence_score = probabilities[prediction_label_int]
+    except Exception as e:
+        return {"error": f"Prediction error: {e}", "prediction": "Error",
+                "label": -1, "confidence": 0.0, "explanation": []}
+    text_for_lime = f"{cleaned_subject} : {cleaned_sender} : {cleaned_body}"
+    explanation_data = []
+    try:
+        exp = explainer.explain_instance(
+            text_instance=text_for_lime,
+            classifier_fn=model_predict_probability_for_lime,
+            num_features=15,
+            top_labels=1,
+            labels=(prediction_label_int,)
+        )
+        explanation_data = exp.as_list(label=prediction_label_int)
+        print(f"LIME Explanation (Top 3): {explanation_data[:3]}")
+    except Exception as e:
+        print(f"LIME explanation error: {e}")
+        explanation_data = [("LIME explanation error or N/A", 0.0)]
+    return {
+        "prediction": predicted_class_name,
+        "label": int(prediction_label_int),
+        "confidence": float(confidence_score),
+        "explanation": explanation_data
+    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi
+uvicorn[standard]
+scikit-learn
+pandas
+joblib
+scipy
+numpy
+lime
+python-multipart
+dill