rajendradayma committed on
Commit
9bfe8cb
·
verified ·
1 Parent(s): 3fb5502

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -35
  2. .gitignore +12 -0
  3. README.md +1 -12
  4. api.py +37 -0
  5. streamlit_app.py +28 -0
  6. train.py +8 -0
  7. utils.py +41 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ model/classifier.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.pyc
4
+
5
+ # Virtual environments
6
+ venv/
7
+ ENV/
8
+ .env
9
+
10
+ # OS files
11
+ .DS_Store
12
+ Thumbs.db
README.md CHANGED
@@ -1,12 +1 @@
1
- ---
2
- title: Email Classification And PII Masking System
3
- emoji: 🏢
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.44.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Email-Classification-and-PII-Masking-System
 
 
 
 
 
 
 
 
 
 
 
api.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api.py
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from utils import mask_pii, unmask_pii
5
+ from models import load_model
6
+
7
+
8
# Location of the pickled classifier that is loaded when this module is imported.
MODEL_PATH = "model/classifier.pkl"

# FastAPI application object; the route decorators below register against it.
app = FastAPI()

# Load the trained model once at import time so every request reuses the same
# object instead of reading the pickle again.
model = load_model(MODEL_PATH)
13
+
14
# Request schema
class EmailRequest(BaseModel):
    """JSON body accepted by ``POST /predict``: the raw email text to process."""

    email_body: str
17
+
18
# API endpoint: mask PII, then classify the masked text.
@app.post("/predict")
def classify_email(req: EmailRequest):
    """Mask PII in the submitted email and predict its category.

    Args:
        req: Request payload carrying the raw email text in ``email_body``.

    Returns:
        A dict with the original text (``input_email_body``), the entities
        reported by ``mask_pii`` (``list_of_masked_entities``), the masked
        text (``masked_email``) and the model's prediction for the masked
        text (``category_of_the_email``).
    """
    raw_body = req.email_body

    # The classifier only ever sees the masked text, never the raw PII.
    redacted_body, found_entities = mask_pii(raw_body)

    # ``predict`` is given a one-element batch; take its single result.
    predicted = model.predict([redacted_body])[0]

    return {
        "input_email_body": raw_body,
        "list_of_masked_entities": found_entities,
        "masked_email": redacted_body,
        "category_of_the_email": predicted,
    }
streamlit_app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # streamlit_app.py
2
+ import streamlit as st
3
+ import requests
4
+
5
# Endpoint of the FastAPI service that performs masking and classification.
API_URL = "http://127.0.0.1:8000/predict"
# Upper bound (seconds) on how long the UI waits for the API before giving up;
# without a timeout ``requests`` can block indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

st.title("📧 Email Classification and PII Masking System")

email_input = st.text_area("Enter your email text:")

if st.button("Classify Email"):
    if not email_input.strip():
        st.warning("Please enter email text!")
    else:
        try:
            # Send the email text to the FastAPI backend.
            response = requests.post(
                API_URL,
                json={"email_body": email_input},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException as exc:
            # Connection refused, timeout, DNS failure, ...: show a readable
            # message instead of letting the traceback reach the page.
            st.error(f"Could not reach the API: {exc}")
        else:
            if response.status_code == 200:
                result = response.json()

                st.subheader("🔎 Masked Email:")
                st.write(result['masked_email'])

                st.subheader("🔐 List of Masked Entities:")
                st.json(result['list_of_masked_entities'])

                st.subheader("📂 Predicted Category:")
                st.success(result['category_of_the_email'])
            else:
                st.error("Error from API.")
train.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # train.py
2
+ from models import train_and_save_model
3
+
4
if __name__ == "__main__":
    # Train on the combined email dataset and write the fitted classifier to
    # the path that the API loads it from.
    train_and_save_model(
        "data/combined_emails_with_natural_pii.csv",
        "model/classifier.pkl",
    )
utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils.py
2
+ import re
3
+
4
# (label, compiled regex) pairs, in priority order for tie-breaking.
# NOTE: these are raw strings, so every regex escape uses a SINGLE backslash;
# a doubled backslash would match a literal "\" character instead.
_PII_PATTERNS = tuple(
    (label, re.compile(pattern))
    for label, pattern in (
        ("full_name", r"(?:(?:Mr|Ms|Mrs|Dr)\.?\s)?[A-Z][a-z]+(?:\s[A-Z][a-z]+)+"),
        ("email", r"[\w.-]+@[\w.-]+"),
        ("phone_number", r"(?:\+91[-\s]?)?[6-9]\d{9}"),
        ("dob", r"\d{2}[/-]\d{2}[/-]\d{4}"),
        ("aadhar_num", r"\d{4}\s\d{4}\s\d{4}"),
        ("credit_debit_no", r"\d{4}[-\s]\d{4}[-\s]\d{4}[-\s]\d{4}"),
        ("cvv_no", r"\b\d{3}\b"),
        ("expiry_no", r"(?:0[1-9]|1[0-2])/\d{2}"),
    )
)


def mask_pii(text):
    """Replace PII found in *text* with ``[classification]`` placeholders.

    Every pattern is matched against the original text. When matches
    overlap, the one that starts first wins; among matches starting at the
    same offset the longest wins (so a 16-digit card number beats the
    12-digit Aadhaar pattern matching its prefix, and a full date beats the
    ``MM/YY`` expiry pattern matching inside it). Remaining ties fall back
    to the pattern order above.

    The patterns are heuristics: for example, any standalone three-digit
    number is tagged ``cvv_no`` and any run of capitalised words is tagged
    ``full_name``.

    Args:
        text: The raw email text.

    Returns:
        A ``(masked_text, entities)`` tuple. ``entities`` is a list of dicts
        ordered by position, each with ``"position"`` (``[start, end]``
        offsets into the original *text*), ``"classification"`` (the pattern
        label) and ``"entity"`` (the original matched substring).
    """
    candidates = []
    for priority, (label, regex) in enumerate(_PII_PATTERNS):
        for match in regex.finditer(text):
            start, end = match.span()
            if start == end:
                continue
            # Sort key: earliest start, then longest match, then priority.
            candidates.append((start, start - end, priority, end, label))
    candidates.sort()

    entities = []
    pieces = []
    cursor = 0
    for start, _neg_length, _priority, end, label in candidates:
        if start < cursor:
            # Overlaps a span that has already been masked.
            continue
        # Splice by offset so exactly the matched span is replaced, even if
        # the same characters also occur earlier in the text.
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        entities.append({
            "position": [start, end],
            "classification": label,
            "entity": text[start:end],
        })
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), entities
36
+
37
def unmask_pii(masked_text, entities):
    """Restore original values into text produced by ``mask_pii``.

    For each classification, the k-th ``[classification]`` placeholder in
    *masked_text* is replaced by the k-th entity of that classification in
    *entities*. Placeholders with no remaining entity are left untouched.

    The substitution is done in a single left-to-right pass, so text that
    has already been restored is never scanned again; a restored value that
    happens to look like a placeholder cannot be overwritten by a later one.

    Args:
        masked_text: Text containing ``[classification]`` placeholders.
        entities: Iterable of dicts with at least ``"classification"`` and
            ``"entity"`` keys. An empty or ``None`` value returns
            *masked_text* unchanged.

    Returns:
        The text with placeholders replaced by their original values.
    """
    if not entities:
        return masked_text

    # Group the original values per classification, preserving list order.
    values_by_label = {}
    for ent in entities:
        values_by_label.setdefault(ent["classification"], []).append(ent["entity"])

    remaining = {label: iter(values) for label, values in values_by_label.items()}
    placeholder = re.compile(
        "|".join(re.escape(f"[{label}]") for label in values_by_label)
    )

    def restore(match):
        token = match.group()
        # token is "[label]"; fall back to the token itself once the
        # entities for that label are exhausted.
        return next(remaining[token[1:-1]], token)

    return placeholder.sub(restore, masked_text)