Upload 7 files
Browse files- .gitattributes +1 -35
- .gitignore +12 -0
- README.md +1 -12
- api.py +37 -0
- streamlit_app.py +28 -0
- train.py +8 -0
- utils.py +41 -0
.gitattributes
CHANGED
|
@@ -1,35 +1 @@
|
|
| 1 |
-
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
model/classifier.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python cache
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
|
| 5 |
+
# Virtual environments
|
| 6 |
+
venv/
|
| 7 |
+
ENV/
|
| 8 |
+
.env
|
| 9 |
+
|
| 10 |
+
# OS files
|
| 11 |
+
.DS_Store
|
| 12 |
+
Thumbs.db
|
README.md
CHANGED
|
@@ -1,12 +1 @@
|
|
| 1 |
-
|
| 2 |
-
title: Email Classification And PII Masking System
|
| 3 |
-
emoji: 🏢
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: streamlit
|
| 7 |
-
sdk_version: 1.44.1
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
# Email-Classification-and-PII-Masking-System
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api.py
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from utils import mask_pii, unmask_pii
|
| 5 |
+
from models import load_model
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Create FastAPI app
|
| 9 |
+
# Application object that the route decorators below register against.
app = FastAPI()

# Load the trained classifier once, at import time, so every request reuses it.
model = load_model("model/classifier.pkl")
|
| 13 |
+
|
| 14 |
+
# Define request schema
|
| 15 |
+
class EmailRequest(BaseModel):
    # Request body schema: a single required string field.
    # email_body carries the raw email text submitted by the client.
    email_body: str
|
| 17 |
+
|
| 18 |
+
# Define the API endpoint
|
| 19 |
+
@app.post("/predict")
def classify_email(req: EmailRequest):
    """Mask PII in the submitted email, classify it, and return both results.

    The response echoes the original body alongside the masked text, the
    list of entities reported by ``mask_pii``, and the predicted category.
    """
    raw_body = req.email_body

    # Redact first so the classifier only ever sees the masked text.
    redacted_body, found_entities = mask_pii(raw_body)

    # predict() is called with a one-element batch; unwrap the single result.
    predictions = model.predict([redacted_body])

    return {
        "input_email_body": raw_body,
        "list_of_masked_entities": found_entities,
        "masked_email": redacted_body,
        "category_of_the_email": predictions[0],
    }
|
streamlit_app.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# streamlit_app.py
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import requests
|
| 4 |
+
|
| 5 |
+
# Endpoint of the FastAPI service this UI talks to.
API_URL = "http://127.0.0.1:8000/predict"
# Upper bound on how long the UI waits for the API before giving up; without
# a timeout, requests would block indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 10

st.title("📧 Email Classification and PII Masking System")

email_input = st.text_area("Enter your email text:")

if st.button("Classify Email"):
    if not email_input.strip():
        # Nothing but whitespace was entered; do not call the API.
        st.warning("Please enter email text!")
    else:
        try:
            # Send to FastAPI
            response = requests.post(
                API_URL,
                json={"email_body": email_input},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException as exc:
            # Connection refused, DNS failure, timeout, etc.: show a readable
            # message instead of letting the traceback reach the page.
            st.error(f"Could not reach the classification API: {exc}")
        else:
            if response.status_code == 200:
                result = response.json()

                st.subheader("🔎 Masked Email:")
                st.write(result['masked_email'])

                st.subheader("🔐 List of Masked Entities:")
                st.json(result['list_of_masked_entities'])

                st.subheader("📂 Predicted Category:")
                st.success(result['category_of_the_email'])
            else:
                st.error("Error from API.")
|
train.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# train.py
|
| 2 |
+
from models import train_and_save_model
|
| 3 |
+
|
| 4 |
+
if __name__ == "__main__":
    # Script entry point: train on the CSV dataset (first argument) and
    # write the resulting classifier to the pickle path (second argument).
    train_and_save_model(
        "data/combined_emails_with_natural_pii.csv",
        "model/classifier.pkl",
    )
|
utils.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils.py
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Compiled once at import time. These are raw strings, so each regex escape
# uses a SINGLE backslash (r"\d" is a digit; r"\\d" would be a literal
# backslash followed by "d"). Dict order is the tie-break priority when two
# patterns match exactly the same span.
_PII_PATTERNS = {
    "full_name": re.compile(
        r"(?:(?:Mr|Ms|Mrs|Dr)\.?\s)?[A-Z][a-z]+(?:\s[A-Z][a-z]+)+"
    ),
    "email": re.compile(r"[\w\.-]+@[\w\.-]+"),
    "phone_number": re.compile(r"(\+91[-\s]?)?[6-9]\d{9}"),
    "dob": re.compile(r"(\d{2}[/-]\d{2}[/-]\d{4})"),
    "aadhar_num": re.compile(r"\d{4}\s\d{4}\s\d{4}"),
    "credit_debit_no": re.compile(r"\d{4}[-\s]\d{4}[-\s]\d{4}[-\s]\d{4}"),
    "cvv_no": re.compile(r"\b\d{3}\b"),
    "expiry_no": re.compile(r"(0[1-9]|1[0-2])/\d{2}"),
}


def mask_pii(text):
    """Replace PII found in *text* with ``[classification]`` placeholders.

    Args:
        text: The raw text to scan.

    Returns:
        A ``(masked_text, entities)`` tuple. ``entities`` is a list of dicts,
        ordered by position in *text*, each with:
          - ``"position"``: ``[start, end]`` offsets into the ORIGINAL text,
          - ``"classification"``: the pattern name used as the placeholder,
          - ``"entity"``: the original substring that was masked.
        Every listed entity corresponds to exactly one placeholder in
        ``masked_text``.
    """
    # Gather every match of every pattern against the original text.
    # Sort key: earliest start first, then longest match, then pattern order.
    candidates = []
    for priority, (label, regex) in enumerate(_PII_PATTERNS.items()):
        for match in regex.finditer(text):
            start, end = match.span()
            candidates.append((start, start - end, priority, end, label))
    candidates.sort()

    entities = []
    pieces = []
    cursor = 0  # end offset of the last span already emitted
    for start, _neg_length, _priority, end, label in candidates:
        if start < cursor:
            # Overlaps a span that was already masked (e.g. the 12-digit
            # prefix of a card number, or the "MM/DD" head of a date).
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        entities.append({
            "position": [start, end],
            "classification": label,
            "entity": text[start:end],
        })
        cursor = end
    pieces.append(text[cursor:])

    # Rebuilding from slices masks the exact recorded span, rather than
    # whichever occurrence str.replace happens to find first.
    return "".join(pieces), entities
|
| 36 |
+
|
| 37 |
+
def unmask_pii(masked_text, entities):
    """Restore original values into *masked_text* using *entities*.

    Entities are processed in list order. For each one, the first remaining
    ``[classification]`` placeholder in the text is replaced with that
    entity's ``"entity"`` value; an entity with no remaining placeholder
    leaves the text unchanged.
    """
    restored = masked_text
    for record in entities:
        placeholder = f"[{record['classification']}]"
        original_value = record['entity']
        # Replace at most one placeholder per entity so that repeated
        # classifications are consumed one at a time, in order.
        restored = restored.replace(placeholder, original_value, 1)
    return restored
|