Upload 13 files
- .dockerignore +28 -0
- .gitattributes +3 -35
- .gitignore +137 -0
- Dockerfile +24 -0
- Procfile +1 -0
- app.py +126 -0
- classification_model.py +162 -0
- config.py +31 -0
- email_classifier.joblib +3 -0
- pii_masking.py +324 -0
- requirements.txt +21 -0
- test_pii_masking.py +235 -0
- utils.py +64 -0
.dockerignore
ADDED
@@ -0,0 +1,28 @@
# Git files
.git
.gitignore
.gitattributes

# Python virtual environments and cache
venv/
myenv/
*.pyc
__pycache__/
.env

# Large datasets not needed at runtime
combined_emails_with_natural_pii.csv

# Test files (if not needed in production image)
# test_*.py
# tests/

# IDE and OS specific files
.idea/
.vscode/
.DS_Store

# Local development artifacts
*.log
*.db
*.sqlite3
.gitattributes
CHANGED
@@ -1,35 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Auto detect text files and perform LF normalization
+* text=auto
+email_classifier.joblib filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,137 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
dist/
downloads/
eggs/
.eggs/
lib/
lib602/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/version info.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*_model
.cache
os.environ['KMP_DUPLICATE_LIB_OK']='True'
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot
*.pt

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# VS Code
.vscode/

# Hugging Face specific (if any, like model cache in project dir - though usually global)
# .huggingface/

# Dataset files (if large and not meant to be in git)
# *.csv
# *.jsonl

# Model files (if large)
# *.joblib
# *.pth
# *.bin
# *.onnx

# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

/myenv
/myenv/*

/memory-bank/*
Dockerfile
ADDED
@@ -0,0 +1,24 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy only the requirements file first to leverage Docker cache
COPY requirements.txt .

# Install dependencies
# The en_core_web_sm model might be installed via requirements.txt if specified with a URL.
# Running `python -m spacy download en_core_web_sm` ensures it's available.
RUN pip install --no-cache-dir -r requirements.txt && \
    python -m spacy download en_core_web_sm

# Copy the rest of the application code into the container
# This includes app.py, pii_masking.py, email_classifier.joblib, etc.
# Ensure .dockerignore is used to exclude unnecessary files.
COPY . .

# Hugging Face Spaces will use the Procfile to run the application.
# The Procfile should be: web: uvicorn app:app --host 0.0.0.0 --port $PORT
# The $PORT environment variable will be set by Hugging Face Spaces,
# based on the `app_port` in the README.md YAML (e.g., 7860).
Procfile
ADDED
@@ -0,0 +1 @@
web: uvicorn app:app --host=0.0.0.0 --port=$PORT
app.py
ADDED
@@ -0,0 +1,126 @@
import logging
from fastapi import FastAPI, HTTPException, APIRouter
from pydantic import BaseModel
import uvicorn

from config import CLASSIFICATION_MODEL_PATH, LOG_LEVEL, LOG_FORMAT
from pii_masking import mask_pii_details, nlp as spacy_nlp_model
from classification_model import classify_email_category, load_classification_model

# --- Logging Setup ---
logging.basicConfig(level=LOG_LEVEL, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

app = FastAPI(title="Email Classification and PII Masking API by nitinprajwal", version="1.0.0")

# --- API Router for v1 --- #
router_v1 = APIRouter(prefix="/api/v1")

# --- Pydantic Models for API --- #
class EmailInput(BaseModel):
    input_email_body: str

class MaskedEntity(BaseModel):
    position: list[int]
    classification: str
    entity: str

class ClassificationOutput(BaseModel):
    input_email_body: str
    list_of_masked_entities: list[MaskedEntity]
    masked_email: str
    category_of_the_email: str

# --- Load models --- #
logger.info("Loading PII NER model (spaCy) from pii_masking.py...")
pii_ner_model = spacy_nlp_model  # Loaded from pii_masking.py
if pii_ner_model:
    logger.info("PII NER model (spaCy) loaded successfully.")
else:
    logger.error("PII NER model (spaCy) failed to load. PII masking for NER entities will not be available.")

logger.info(f"Loading classification model from {CLASSIFICATION_MODEL_PATH}...")
classification_model = load_classification_model(CLASSIFICATION_MODEL_PATH)  # Uses path from config
if classification_model is None:
    logger.critical("Email classification model could not be loaded. Classification will not be available.")
else:
    logger.info("Classification model loaded successfully.")


# --- Health Check Endpoint ---
@router_v1.get("/health", tags=["Health"])
async def health_check():
    logger.info("Health check endpoint called.")
    services = {
        "pii_ner_model_status": "loaded" if pii_ner_model else "not_loaded",
        "classification_model_status": "loaded" if classification_model else "not_loaded"
    }
    if pii_ner_model and classification_model:
        logger.info("Health check: All services OK.")
        return {"status": "ok", "services": services}
    else:
        service_issues = []
        if not pii_ner_model:
            service_issues.append("PII NER model not loaded")
        if not classification_model:
            service_issues.append("Classification model not loaded")

        logger.warning(f"Health check: Issues detected - {', '.join(service_issues)}")
        # Return 503 if critical services are down
        raise HTTPException(
            status_code=503,
            detail={"status": "error", "message": "One or more critical services are unavailable.", "services": services}
        )

@router_v1.post("/classify", response_model=ClassificationOutput, tags=["Classification"])
async def classify_email_endpoint(email_input: EmailInput):
    logger.info(f"Received request for /classify. Email length: {len(email_input.input_email_body)}")
    if len(email_input.input_email_body) == 0:
        logger.warning("Received empty email body for /classify.")
        raise HTTPException(status_code=400, detail="Input email body cannot be empty.")
    original_email = email_input.input_email_body

    # 1. PII Masking
    if pii_ner_model is None:
        logger.warning("PII NER model (spaCy) not available at request time. Masking will be limited to regex-based detections.")

    logger.debug("Performing PII masking...")
    masked_email_text, pii_entities_raw = mask_pii_details(original_email, nlp_model=pii_ner_model)
    logger.debug(f"PII masking complete. Found {len(pii_entities_raw)} raw entities before output conversion.")

    # Convert pii_entities_raw (list of dicts) to list of MaskedEntity objects
    pii_entities_output = [
        MaskedEntity(
            position=entity['position'],
            classification=entity['classification'],
            entity=entity['entity']
        ) for entity in pii_entities_raw
    ]

    # 2. Classification
    if classification_model is None:
        logger.error("Classification model not available at request time. Returning error category.")
        category = "Error: Classifier not available"
        # If classification is critical, an HTTPException could be raised here.
    else:
        category = classify_email_category(masked_email_text, classification_model)


    # Original PII entities are available for potential future demasking.

    logger.info(f"Email classified as '{category}'. Total masked entities: {len(pii_entities_output)}. Returning response.")
    return ClassificationOutput(
        input_email_body=original_email,
        list_of_masked_entities=pii_entities_output,  # Use the converted list of Pydantic models
        masked_email=masked_email_text,
        category_of_the_email=category
    )

# Include the router in the main app instance
app.include_router(router_v1)

if __name__ == "__main__":
    # Note: Hugging Face Spaces will use its own command to run the app.
    # This is for local testing.
    logger.info("Starting Uvicorn server for local development on http://0.0.0.0:8000")
    uvicorn.run(app, host="0.0.0.0", port=8000)
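A minimal sketch of how the /api/v1/classify endpoint added above can be exercised once the server is running locally (for example via `python app.py`, which serves on port 8000). The URL, sample email text, and script name are illustrative assumptions, not part of the commit; only the Python standard library is used, so no extra dependency is required.

# sketch_classify_request.py - illustrative only; assumes the API is already running on localhost:8000
import json
from urllib import request

payload = json.dumps({
    "input_email_body": (
        "Subject: Password reset. Hi, I am John Doe (john.doe@example.com), "
        "please reset my account."
    )
}).encode("utf-8")

req = request.Request(
    "http://localhost:8000/api/v1/classify",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with request.urlopen(req) as resp:
    result = json.load(resp)

# The response follows ClassificationOutput: input_email_body, masked_email,
# list_of_masked_entities (each with position/classification/entity),
# and category_of_the_email.
print(result["category_of_the_email"])
print(result["masked_email"])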
classification_model.py
ADDED
@@ -0,0 +1,162 @@
# classification_model.py - Developed by nitinprajwal
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split  # Optional: for evaluating model
from sklearn.metrics import classification_report  # Optional: for evaluating model
import pandas as pd
import joblib
import os

# Assuming utils.py is in the same directory
from utils import load_data
# Import PII masking functionality
from pii_masking import mask_pii_details, nlp as spacy_nlp_model_for_training  # Use the loaded spaCy model

from config import CLASSIFICATION_MODEL_PATH
MODEL_FILENAME = CLASSIFICATION_MODEL_PATH
DEFAULT_DATASET_PATH = "combined_emails_with_natural_pii.csv"

def train_classification_model(data_path: str = DEFAULT_DATASET_PATH, model_save_path: str = MODEL_FILENAME):
    """
    Trains the email classification model and saves it.
    Uses 'email' column for text and 'type' for category.
    """
    print(f"Starting model training with dataset: {data_path}")
    df = load_data(data_path)

    if df is None:
        print("Failed to load data. Aborting training.")
        return False

    # Preprocessing: Fill NaN in 'email' (text content) and 'type' (labels)
    df['email'] = df['email'].fillna('')
    df['type'] = df['type'].fillna('Unknown')
    df.dropna(subset=['type'], inplace=True)  # Ensure labels are present

    if df.empty or df['email'].empty or df['type'].empty:
        print("Data is empty or lacks required 'email' or 'type' columns after preprocessing. Aborting training.")
        return False

    print("Applying PII masking to training data...")
    # Ensure the spaCy model is available for masking
    if spacy_nlp_model_for_training is None:
        print("Warning: spaCy model not loaded in pii_masking. Training will use regex-only masked data.")

    # Mask PII in the training data
    # This can be slow for large datasets; consider optimizations if needed
    masked_emails = []
    for i, email_text in enumerate(df['email']):
        if pd.isna(email_text):
            masked_emails.append("")  # Handle potential NaN after fillna('') if any slip through
            continue
        masked_text, _ = mask_pii_details(str(email_text), nlp_model=spacy_nlp_model_for_training)
        masked_emails.append(masked_text)
        if (i + 1) % 100 == 0:
            print(f"Masked {i+1}/{len(df['email'])} emails for training...")

    df['masked_email_for_training'] = masked_emails
    print("PII masking for training data complete.")

    X = df['masked_email_for_training']
    y = df['type']

    # Optional: Split data for evaluation (not strictly required by assignment but good practice)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create a pipeline: TF-IDF Vectorizer -> Multinomial Naive Bayes
    # You can experiment with other models like SVM, Logistic Regression, or even simple Transformers.
    model = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2, ngram_range=(1,2))),
        ('clf', MultinomialNB(alpha=0.1)),  # Alpha is a smoothing parameter for Naive Bayes
    ])

    print("Training the model...")
    # model.fit(X_train, y_train)  # If using train_test_split
    model.fit(X, y)  # Train on full dataset as per typical assignment flow unless evaluation is separate
    print("Model training complete.")

    # Optional: Evaluate the model
    # print("\nModel Evaluation on Test Set:")
    # predictions = model.predict(X_test)
    # print(classification_report(y_test, predictions))

    try:
        joblib.dump(model, CLASSIFICATION_MODEL_PATH)
        print(f"Model saved to {CLASSIFICATION_MODEL_PATH}")
        return True
    except Exception as e:
        print(f"Error saving model: {e}")
        return False

def load_classification_model(model_path: str = CLASSIFICATION_MODEL_PATH):
    """
    Loads the trained classification model.
    """
    if not os.path.exists(CLASSIFICATION_MODEL_PATH):
        print(f"Error: Model file not found at {CLASSIFICATION_MODEL_PATH}. Train the model first or ensure path is correct.")
        print(f"Attempting to train a new model with default dataset: {DEFAULT_DATASET_PATH}")
        success = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=CLASSIFICATION_MODEL_PATH)
        if not success:
            print("Failed to train a new model. Cannot load model.")
            return None
        # If training was successful, the model file should now exist.

    try:
        model = joblib.load(CLASSIFICATION_MODEL_PATH)
        print(f"Model loaded successfully from {CLASSIFICATION_MODEL_PATH}")
        return model
    except FileNotFoundError:
        # This case should be handled by the os.path.exists check and auto-train attempt now.
        print(f"Error: Model file not found at {CLASSIFICATION_MODEL_PATH} even after attempting to train.")
        return None
    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        return None

def classify_email_category(masked_email_text: str, model):
    """
    Classifies the masked email text into a category.
    """
    if model is None:
        print("Error: Classification model not loaded.")
        # Fallback category or raise an error, as per application requirements
        return "Error: Model not available"
    try:
        # The model expects a list or iterable of texts
        prediction = model.predict([masked_email_text])
        return prediction[0]
    except Exception as e:
        print(f"Error during classification: {e}")
        return "Error: Classification failed"

if __name__ == "__main__":
    print("Running classification_model.py script...")
    # Train the model using the provided dataset
    # This will save the model as 'email_classifier.joblib' in the root directory
    training_successful = train_classification_model(data_path=DEFAULT_DATASET_PATH, model_save_path=MODEL_FILENAME)

    if training_successful:
        print("\n--- Testing loaded model ---_model")
        # Load the just-trained model
        loaded_model = load_classification_model(MODEL_FILENAME)
        if loaded_model:
            sample_emails_for_testing = [
                ("Subject: Urgent - Server down! Our main application server is not responding. We need immediate assistance.", "Incident"),
                ("Subject: Password Reset Request. Hi, I forgot my password and need to reset it. My username is testuser.", "Request"),
                ("Subject: Inquiry about new billing plans. Could you please provide more information on your enterprise billing options?", "Request"),
                ("Subject: System Update Notification for 2023-01-15. We will be performing scheduled maintenance.", "Change"),
                ("Subject: Recurring login issue. I've been unable to login for the past three days, the error says 'invalid credentials' but I am sure they are correct.", "Problem"),
            ]
            print("\nClassifying sample emails:")
            for email_text, expected_category in sample_emails_for_testing:
                # For testing the endpoint, the API will handle masking.
                # For this direct model test, we should simulate that by masking first.
                print(f"\nOriginal sample for testing: {email_text[:60]}...")
                masked_sample_text, _ = mask_pii_details(email_text, nlp_model=spacy_nlp_model_for_training)  # Use the same nlp model
                print(f"Masked sample for testing: {masked_sample_text[:60]}...")
                category = classify_email_category(masked_sample_text, loaded_model)
                print(f"-> Predicted: {category} (Expected: {expected_category})")
    else:
        print("Model training failed. Cannot proceed with testing.")
config.py
ADDED
@@ -0,0 +1,31 @@
# config.py - Developed by nitinprajwal
"""
Configuration settings for the Email Classification and PII Masking application.

This file centralizes configuration parameters such as file paths, model locations,
and logging settings to make the application more maintainable and configurable.
All paths are constructed dynamically based on the project's root directory.
"""

import os

# Project Root Directory
# Dynamically determines the absolute path to the project's root directory.
# This ensures that file paths are correctly resolved regardless of where the
# application is run from.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))

# Model Paths
# Path to the pre-trained email classification model file.
# The model is expected to be a .joblib file located in the project root.
CLASSIFICATION_MODEL_PATH = os.path.join(PROJECT_ROOT, "email_classifier.joblib")

# Logging Configuration
# Defines the minimum severity level for log messages to be recorded.
# Common levels: DEBUG, INFO, WARNING, ERROR, CRITICAL.
LOG_LEVEL = "INFO"

# Defines the format string for log messages.
# This format includes timestamp, logger name, log level, and the message itself.
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
email_classifier.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75090801821b65a186d4d5924dcbf08f75838c99cb02af527ff34cf6688ea731
size 11067809
pii_masking.py
ADDED
@@ -0,0 +1,324 @@
"""
Module for PII (Personally Identifiable Information) masking and demasking.

This module provides functionalities to:
1. Mask PII entities in a given text using regular expressions and spaCy's NER.
2. Conceptually demask PII (though the primary API output relies on returning the original text).

PII entities targeted include:
- Email addresses
- Phone numbers
- Credit/Debit card numbers
- CVV numbers
- Card expiry dates
- Aadhar card numbers
- Dates of birth (DOB)
- Full names (primarily via NER)

PEP8 compliant and includes detailed comments.
"""
import re
import spacy

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("spaCy model 'en_core_web_sm' not found. Please run: python -m spacy download en_core_web_sm")
    # Depending on the environment, you might want to exit or raise an error here
    # For Hugging Face Spaces, the model should be downloaded during setup if specified.
    nlp = None  # Fallback: spaCy features will be unavailable.
    # In a production system, this might warrant an error or specific handling.


# --- PII Regex Patterns --- #
# Note: These patterns are foundational. For production-grade accuracy and to minimize
# false positives/negatives (critical for test case coverage), they would require
# extensive testing and refinement. Some patterns (e.g., for CVV) are broad and
# might benefit from contextual validation not implemented here.
PII_PATTERNS = {
    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
    "phone_number": r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b",  # Basic US-like
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b|\b\d{16}\b",  # Visa, MC, Amex (simple)
    "cvv_no": r"\b\d{3,4}\b",  # CVV. Broad pattern; could match other 3-4 digit numbers.
    # Contextual filtering (e.g., proximity to card numbers) would improve accuracy.
    "expiry_no": r"\b(0[1-9]|1[0-2])\/(\d{2}|\d{4})\b",  # MM/YY or MM/YYYY
    "aadhar_num": r"\b(?:Aadhar[:\s]*)?(\d{4}(?:[\s\-]?\d{4}){2})\b",  # Optional "Aadhar: " prefix, captures only numbers after prefix
    "dob": r"\b(?:(?:(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](\d{4}))|(?:(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[-. ,]*(0[1-9]|[12][0-9]|3[01])(?:st|nd|rd|th)?[-. ,]*(\d{4}))|(?:(0[1-9]|[12][0-9]|3[01])[-. ,]*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[-. ,]*(\d{4})))",  # DD-MM-YYYY, Month D, YYYY, D Month, YYYY
    # Full name is best handled by NER.
}


# Defines the placeholder strings to be used for masking each PII entity type.
# These align with the `list_of_masked_entities.classification` values.
ENTITY_MAP = {
    "full_name": "[full_name]",
    "email": "[email]",
    "phone_number": "[phone_number]",
    "dob": "[dob]",
    "aadhar_num": "[aadhar_num]",
    "credit_debit_no": "[credit_debit_no]",
    "cvv_no": "[cvv_no]",
    "expiry_no": "[expiry_no]",
}

def mask_pii_details(text: str, nlp_model=None) -> tuple[str, list[dict]]:
    """
    Masks PII in the input text using a combination of regex patterns and spaCy NER.

    The process involves:
    1. Identifying PII candidates using predefined regex patterns.
    2. Identifying PII candidates (especially names and potentially dates) using spaCy's NER.
    3. Collecting all unique detections, including their start/end positions, original value, and type.
    4. Sorting these detections by their start position to ensure correct masking order.
    5. Iteratively replacing detected PII in the text with predefined placeholders,
       adjusting for changes in string length caused by masking.

    Args:
        text (str): The input string containing potential PII.
        nlp_model (spacy.language.Language, optional): An initialized spaCy language model.
            If None, NER-based detection will be skipped.
            Defaults to None.

    Returns:
        tuple[str, list[dict]]:
            - masked_text (str): The text with PII entities replaced by placeholders.
            - found_entities (list[dict]): A list of dictionaries, where each dictionary
              represents a detected PII entity and contains:
                - "position" (list[int, int]): Start and end indices in the original text.
                - "classification" (str): The type of PII (e.g., "email", "full_name").
                - "entity" (str): The original detected PII value.
    """
    masked_text = text
    found_entities = []

    # Sort entities by start position to handle replacements correctly if overlaps occur (though ideally they shouldn't for distinct entities)
    # This list will store all PII detections (from regex and NER)
    # before they are sorted and applied for masking. Each item is a dictionary.
    detections_to_mask = []

    # 1. Regex-based masking
    for entity_type, pattern in PII_PATTERNS.items():
        for match in re.finditer(pattern, text):
            start, end = match.span()
            original_value = match.group(0)
            # All detections are based on the original 'text'.
            # Sorting later handles overlaps based on start position.
            detections_to_mask.append({
                "position": [start, end],
                "classification": entity_type,
                "entity": original_value
            })

    # 2. NER-based masking (e.g., for names, and potentially refining other entities like DOB)
    # spaCy NER helps identify entities that are harder to catch with regex alone (e.g., names).
    # It can also identify dates, which are then heuristically checked if they might be a DOB.
    if nlp_model:  # Use the passed nlp_model, which is 'nlp' loaded globally in this module
        doc = nlp_model(text)
        for ent in doc.ents:
            # print(f"spaCy entity: {ent.text}, label: {ent.label_}, start: {ent.start_char}, end: {ent.end_char}")  # Debugging
            entity_text = ent.text
            entity_label = ent.label_
            start_char, end_char = ent.start_char, ent.end_char

            classification_type = None
            if entity_label == "PERSON":
                classification_type = "full_name"
            elif entity_label == "DATE":
                # Basic check for DOB-like patterns, spaCy's DATE is broad
                # This is a heuristic. spaCy's DATE entity is broad.
                # More sophisticated logic (e.g., pattern matching on the date string itself,
                # or contextual analysis) would be needed for higher accuracy in identifying DOBs
                # versus other types of dates. For this implementation, we make a basic check.
                if len(entity_text) > 5:  # Arbitrary length to avoid very short dates
                    classification_type = "dob"
            # Add other mappings if spaCy identifies relevant entities directly
            # e.g., ORG, GPE, etc. if they were part of PII (they are not in this problem)

            if classification_type:
                # Check for overlaps with regex: regex might be more specific for certain patterns
                # For simplicity, we add all NER findings. Refinement could prioritize.
                detections_to_mask.append({
                    "position": [start_char, end_char],
                    "classification": classification_type,
                    "entity": entity_text
                })


    # --- Resolve Overlaps and Finalize Detections ---
    # 1. Filter out CVV matches that are substrings of other longer numeric matches (Card, Aadhar, Phone)
    potential_numeric_spans = set()
    for det in detections_to_mask:
        if det['classification'] in ['credit_debit_no', 'aadhar_num', 'phone_number']:
            potential_numeric_spans.add((det['position'][0], det['position'][1]))

    filtered_detections = []
    for det in detections_to_mask:
        if det['classification'] == 'cvv_no':
            is_substring = False
            for p_start, p_end in potential_numeric_spans:
                # if CVV is within a larger number and is not the whole number itself
                if det['position'][0] >= p_start and det['position'][1] <= p_end and (det['position'][0] > p_start or det['position'][1] < p_end):
                    is_substring = True
                    break
            if not is_substring:
                filtered_detections.append(det)
        elif det['classification'] == 'expiry_no' and det['entity'].count('/') == 0:  # Basic sanity for MM/YY
            # If spaCy DATE was too broad and picked up a year as expiry_no, and it's not MM/YY like
            # This is a heuristic. Example: '1990' from DOB was previously misclassified as expiry by a loose regex.
            # The current DOB regex is better, so this might be less needed for expiry_no.
            # However, if a 'DATE' from NER is misclassified as 'dob' and is just a year, this could be an issue.
            # For now, let's assume the regex for expiry_no `MM/YY` is specific enough.
            pass  # No, let's keep it simple, if it matches expiry_no regex, it's expiry_no.
            filtered_detections.append(det)
        else:
            filtered_detections.append(det)
    detections_to_mask = filtered_detections

    # Remove duplicates: If regex and NER (or multiple regex patterns)
    # identify the exact same entity (same span, text, and classification),
    # keep only one instance.
    unique_detections_set = set()
    temp_detections = []
    for det in detections_to_mask:
        # Create a hashable representation for checking uniqueness.
        # Position is a list, so convert to tuple.
        detection_tuple = (tuple(det['position']), det['classification'], det['entity'])
        if detection_tuple not in unique_detections_set:
            unique_detections_set.add(detection_tuple)
            temp_detections.append(det)
    detections_to_mask = temp_detections

    # Sort detections: Primarily by start position (ascending).
    # For entities starting at the same position, prioritize the longer one (descending end position).
    # This helps in correctly masking nested or overlapping entities (e.g., mask "123 Main St" before "Main St").
    detections_to_mask.sort(key=lambda x: (x['position'][0], -x['position'][1]))

    # 3. Masking the text
    # Iterate through sorted detections and replace them in the text.
    # An offset is maintained to adjust for changes in string length due to masking.
    offset = 0
    for detection in detections_to_mask:
        orig_start, orig_end = detection['position']
        entity_type = detection['classification']
        mask_placeholder = ENTITY_MAP.get(entity_type, f"[{entity_type}]")  # Fallback if type not in map

        # Adjust start and end positions based on cumulative offset from previous replacements
        start_offset = orig_start + offset
        end_offset = orig_end + offset

        # Replace the detected PII with its corresponding mask placeholder
        masked_text = masked_text[:start_offset] + mask_placeholder + masked_text[end_offset:]

        # Update the offset for subsequent replacements
        offset += len(mask_placeholder) - (orig_end - orig_start)

        # Store the original entity details for the output list
        # (position refers to original text, not the masked one)
        found_entities.append({
            "position": [orig_start, orig_end],
            "classification": entity_type,
            "entity": detection['entity']
        })

    return masked_text, found_entities

def demask_pii(masked_text: str, pii_entities: list[dict]) -> str:
    """
    Conceptually restores PII to a masked text string.

    NOTE: This function is largely a conceptual placeholder. The primary API output
    specification includes the original `input_email_body`, which serves as the
    'demasked' version. Direct reconstruction of a demasked string from `masked_text`
    and `pii_entities` is complex (due to variable lengths of placeholders vs. original
    text, potential overlaps, and mapping placeholders back to specific entities if
    multiple same placeholders exist) and is not strictly required for the specified API output.

    If this function were to be fully implemented for robust string demasking, it would
    require a sophisticated approach to map placeholder instances in the `masked_text`
    back to their corresponding original `entity` values from `pii_entities`,
    likely using their positions and types, and then performing replacements carefully.

    Args:
        masked_text (str): The text string where PII has been replaced by placeholders.
        pii_entities (list[dict]): A list of dictionaries, where each dictionary
            describes a masked PII entity, including its original
            value and type (as returned by `mask_pii_details`).

    Returns:
        str: The conceptual demasked text. In this placeholder implementation,
            it might return the `masked_text` itself or a simple message,
            as full demasking is not implemented.
    """
    # Given the API output spec, direct demasking of a string might not be what's evaluated.
    # The 'input_email_body' serves as the 'demasked' version.
    # If we had to reconstruct, we would iterate through pii_entities (sorted reverse by position)
    # and replace placeholders. This is tricky due to length changes.

    # Example (conceptual, might not perfectly work with all overlaps or length changes):
    # temp_text = masked_text
    # for entity_info in sorted(pii_entities, key=lambda x: masked_text.find(ENTITY_MAP[x['classification']]), reverse=True):
    #     mask_placeholder = ENTITY_MAP[entity_info['classification']]
    #     # This find might be problematic if multiple same placeholders exist.
    #     # A more robust way would be to use the positions from masking carefully.
    #     # For this assignment, the original email is returned, so direct demasking of the string is not strictly needed for the output.
    #     # However, if it were, one would need a robust way to map masked placeholders back to original values using their positions.
    #     # Example: iterate pii_entities (sorted by start position of the MASK in the MASKED text)
    #     # and replace. This is non-trivial if mask labels vary in length or original content had similar patterns.
    #
    # # placeholder_positions = []
    # # for entity_detail in pii_entities:
    # #     mask_tag = ENTITY_MAP[entity_detail['classification']]
    # #     for match in re.finditer(re.escape(mask_tag), masked_text):
    # #         placeholder_positions.append({'info': entity_detail, 'mask_pos': match.span()})
    # # placeholder_positions.sort(key=lambda x: x['mask_pos'][0], reverse=True)
    #
    # # demasked_str_list = list(masked_text)
    # # for item in placeholder_positions:
    # #     # This simple replacement assumes one-to-one mapping and unique placeholders or first-match logic
    # #     # A truly robust system would need to track original vs. masked spans more carefully.
    # #     start, end = item['mask_pos']
    # #     demasked_str_list[start:end] = list(item['info']['entity'])
    # # return "".join(demasked_str_list)

    # As per the API specification, the original 'input_email_body' is returned alongside
    # the 'masked_email' and 'list_of_masked_entities'.
    # Therefore, reconstructing the demasked string here is not required for the final output.
    # This function remains a conceptual placeholder if direct string demasking were needed elsewhere.
    return masked_text  # Or perhaps raise NotImplementedError, or return a concept string.
    # Returning masked_text for now if called, though its utility is limited.


# Example Usage (for testing)
if __name__ == "__main__":
    sample_email = "Hello, my name is John Doe, and my email is johndoe@example.com. Call me at 123-456-7890. My card is 1234-5678-9012-3456, CVV 123, expires 12/25."

    # To use spaCy, you'd pass the nlp object:
    # nlp = spacy.load("en_core_web_sm")
    # masked_version, entities = mask_pii_details(sample_email, nlp_model=nlp)

    # Use the globally loaded nlp model if available
    if nlp:
        print("\n--- Masking with spaCy NER model ---")
        masked_version, entities = mask_pii_details(sample_email, nlp_model=nlp)
    else:
        print("\n--- Masking without spaCy NER model (spaCy model not loaded) ---")
        masked_version, entities = mask_pii_details(sample_email, nlp_model=None)


    print("Original:", sample_email)
    print("Masked:", masked_version)
    print("Entities Found:")
    for entity in entities:
        print(entity)

    # Demasking example (conceptual)
    # if entities:  # Check if any PII was found and masked
    #     # This assumes the API returns the original email, so direct demasking might not be needed.
    #     # reconstructed_email = demask_pii(masked_version, entities)
    #     # print("Reconstructed (Conceptual):", reconstructed_email)
    #     print("Original email (serves as demasked as per API spec):", sample_email)

    # Conceptual demasking call (its output is not a true demasked string here)
    # conceptual_demasked = demask_pii(masked_version, entities)
    # print("\nConceptual Demasked Output (from demask_pii function):", conceptual_demasked)
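A short illustrative sketch of the mask_pii_details return format defined above. The sample text, masked output, and offsets hinted at in the comments are assumptions for illustration only; actual detections depend on the regex patterns and on the spaCy model being loaded.

# sketch_mask_usage.py - illustrative only
from pii_masking import mask_pii_details, nlp

text = "Please reach me at jane.roe@example.org or 987-654-3210."
masked, entities = mask_pii_details(text, nlp_model=nlp)

print(masked)
# e.g. "Please reach me at [email] or [phone_number]."
for ent in entities:
    # each dict carries "position" (start/end indices in the original text),
    # "classification" (e.g. "email", "phone_number") and "entity" (the original value)
    print(ent)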
requirements.txt
ADDED
@@ -0,0 +1,21 @@
# Web Framework
fastapi==0.110.0
uvicorn[standard]==0.29.0

# Data Handling
pandas==1.5.3

# Machine Learning (for email classification model)
scikit-learn==1.3.2

# NLP
# spaCy for Named Entity Recognition (NER)
# Version 3.8.0 is chosen for compatibility with en_core_web_sm-3.8.0 model.
spacy==3.8.0

# NLTK (potentially for tokenization or other non-LLM NLP tasks)
nltk==3.8.1

# spaCy English Model (small) - Pinned to a specific version from GitHub releases
# This is en_core_web_sm version 3.8.0. It requires a compatible spaCy version (e.g., spaCy 3.8.x).
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
test_pii_masking.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# test_pii_masking.py
|
| 2 |
+
"""
|
| 3 |
+
Unit tests for the PII (Personally Identifiable Information) masking functionality.
|
| 4 |
+
|
| 5 |
+
This module contains a suite of tests using the `unittest` framework to verify
|
| 6 |
+
the correctness of the `mask_pii_details` function from the `pii_masking` module.
|
| 7 |
+
It covers various PII types, edge cases, NER integration, and overlap resolution.
|
| 8 |
+
|
| 9 |
+
Test cases are designed to ensure:
|
| 10 |
+
- Accurate detection and masking of individual PII types.
|
| 11 |
+
- Correct handling of text with no PII.
|
| 12 |
+
- Robustness in complex scenarios with multiple PII types and potential overlaps.
|
| 13 |
+
- Proper functioning of NER-based PII detection (e.g., full names).
|
| 14 |
+
- Adherence to expected output formats for masked text and entity lists.
|
| 15 |
+
"""
|
| 16 |
+
import unittest
|
| 17 |
+
from pii_masking import mask_pii_details, nlp as spacy_nlp_model, PII_PATTERNS, ENTITY_MAP
|
| 18 |
+
|
| 19 |
+
class TestPiiMasking(unittest.TestCase):
|
| 20 |
+
"""
|
| 21 |
+
Test suite for PII masking functionalities.
|
| 22 |
+
|
| 23 |
+
This class defines individual test methods for different PII types and scenarios.
|
| 24 |
+
It utilizes a helper assertion method `assertMasking` to streamline test validation.
|
| 25 |
+
The `setUp` method ensures the spaCy NLP model is available for tests requiring NER.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def setUp(self):
|
| 29 |
+
"""Set up test environment before each test method.
|
| 30 |
+
|
| 31 |
+
Initializes `self.nlp_model` with the globally loaded spaCy model.
|
| 32 |
+
Prints a warning if the spaCy model is not available, as NER-dependent
|
| 33 |
+
tests might be affected.
|
| 34 |
+
"""
|
| 35 |
+
self.nlp_model = spacy_nlp_model
|
| 36 |
+
if not self.nlp_model:
|
| 37 |
+
# This warning helps in diagnosing test failures if the spaCy model isn't loaded.
|
| 38 |
+
print("Warning: spaCy model ('en_core_web_sm') not loaded. "
|
| 39 |
+
"NER-dependent tests might behave differently or be skipped.")
|
| 40 |
+
|
| 41 |
+
def assertMasking(self, text: str, expected_masked_text: str, expected_entities_details: list[dict]):
|
| 42 |
+
"""
|
| 43 |
+
Helper method to perform PII masking and assert the results.
|
| 44 |
+
|
| 45 |
+
Calls `mask_pii_details` with the provided text and compares the output
|
| 46 |
+
(masked text and list of found entities) against the expected values.
|
| 47 |
+
|
| 48 |
+
Args:
|
| 49 |
+
text (str): The input text to be masked.
|
| 50 |
+
expected_masked_text (str): The expected string after PII masking.
|
| 51 |
+
expected_entities_details (list[dict]): A list of dictionaries, where each
|
| 52 |
+
dictionary represents an expected PII entity with its 'position',
|
| 53 |
+
'classification', and 'entity' (original value).
|
| 54 |
+
"""
|
| 55 |
+
masked_text, found_entities = mask_pii_details(text, nlp_model=self.nlp_model)
|
| 56 |
+
self.assertEqual(masked_text, expected_masked_text)
|
| 57 |
+
|
| 58 |
+
# Compare entities - sort both by position for consistent comparison
|
| 59 |
+
# And convert found_entities to a comparable format (list of dicts without 'entity' if not needed for simple check)
|
| 60 |
+
# For a more robust check, compare all fields including 'entity' and 'classification'
|
| 61 |
+
sorted_found = sorted([{"position": e['position'], "classification": e['classification'], "entity": e['entity']} for e in found_entities], key=lambda x: x['position'][0])
|
| 62 |
+
sorted_expected = sorted(expected_entities_details, key=lambda x: x['position'][0])
|
| 63 |
+
|
| 64 |
+
self.assertEqual(len(sorted_found), len(sorted_expected), msg=f"Mismatch in number of entities found. Got {len(sorted_found)}, expected {len(sorted_expected)} Found: {sorted_found}, Expected: {sorted_expected}")
|
| 65 |
+
for f, e in zip(sorted_found, sorted_expected):
|
| 66 |
+
self.assertDictEqual(f, e, msg=f"Entity mismatch. Got {f}, expected {e}")
|
| 67 |
+
|
| 68 |
+
def test_mask_email_address(self):
|
| 69 |
+
"""Test masking of a standard email address."""
|
| 70 |
+
text = "Contact me at test.email@example.com."
|
| 71 |
+
expected_masked = "Contact me at [email]."
|
| 72 |
+
expected_entities = [
|
| 73 |
+
{"position": [14, 36], "classification": "email", "entity": "test.email@example.com"}
|
| 74 |
+
]
|
| 75 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 76 |
+
|
| 77 |
+
def test_mask_phone_number(self):
|
| 78 |
+
"""Test masking of a standard US-like phone number."""
|
| 79 |
+
text = "My phone is 123-456-7890."
|
| 80 |
+
expected_masked = "My phone is [phone_number]."
|
| 81 |
+
expected_entities = [
|
| 82 |
+
{"position": [12, 24], "classification": "phone_number", "entity": "123-456-7890"}
|
| 83 |
+
]
|
| 84 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 85 |
+
|
| 86 |
+
def test_mask_credit_card(self):
|
| 87 |
+
"""Test masking of a credit card number with hyphens."""
|
| 88 |
+
text = "Card: 4000-1111-2222-3333 end."
|
| 89 |
+
expected_masked = "Card: [credit_debit_no] end."
|
| 90 |
+
expected_entities = [
|
| 91 |
+
{"position": [6, 25], "classification": "credit_debit_no", "entity": "4000-1111-2222-3333"}
|
| 92 |
+
]
|
| 93 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 94 |
+
|
| 95 |
+
def test_mask_cvv(self):
|
| 96 |
+
"""Test masking of a standalone CVV number."""
|
| 97 |
+
text = "CVV is 123."
|
| 98 |
+
expected_masked = "CVV is [cvv_no]."
|
| 99 |
+
expected_entities = [
|
| 100 |
+
{"position": [7, 10], "classification": "cvv_no", "entity": "123"}
|
| 101 |
+
]
|
| 102 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 103 |
+
|
| 104 |
+
def test_mask_expiry_date(self):
|
| 105 |
+
"""Test masking of a card expiry date (MM/YY format)."""
|
| 106 |
+
text = "Expires 03/25."
|
| 107 |
+
expected_masked = "Expires [expiry_no]."
|
| 108 |
+
expected_entities = [
|
| 109 |
+
{"position": [8, 13], "classification": "expiry_no", "entity": "03/25"}
|
| 110 |
+
]
|
| 111 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 112 |
+
|
| 113 |
+
def test_mask_aadhar_number(self):
|
| 114 |
+
"""Test masking of an Aadhar number, including the 'Aadhar: ' prefix if present."""
|
| 115 |
+
text = "Aadhar: 1234 5678 9012."
|
| 116 |
+
# The regex for Aadhar includes the optional "Aadhar: " prefix.
|
| 117 |
+
# The entire matched string "Aadhar: 1234 5678 9012" is replaced, leaving the trailing period.
|
| 118 |
+
expected_masked = "[aadhar_num]."
|
| 119 |
+
expected_entities = [
|
| 120 |
+
{"position": [0, 22], "classification": "aadhar_num", "entity": "Aadhar: 1234 5678 9012"}
|
| 121 |
+
]
|
| 122 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 123 |
+
|
| 124 |
+
def test_mask_dob_regex(self):
|
| 125 |
+
"""Test masking of a Date of Birth using regex (DD/MM/YYYY format)."""
|
| 126 |
+
# Test regex-based DOB detection
|
| 127 |
+
text = "Born on 01/02/1990."
|
| 128 |
+
expected_masked = "Born on [dob]."
|
| 129 |
+
expected_entities = [
|
| 130 |
+
{"position": [8, 18], "classification": "dob", "entity": "01/02/1990"}
|
| 131 |
+
]
|
| 132 |
+
self.assertMasking(text, expected_masked, expected_entities)
|
| 133 |
+
|
    def test_mask_full_name_ner(self):
        """Test masking of a full name using spaCy NER (PERSON entity)."""
        if not self.nlp_model:
            self.skipTest("spaCy model not loaded, skipping NER test.")
        text = "My name is John Doe."
        expected_masked = "My name is [full_name]."
        expected_entities = [
            {"position": [11, 19], "classification": "full_name", "entity": "John Doe"}
        ]
        self.assertMasking(text, expected_masked, expected_entities)

    def test_mask_dob_ner_and_regex_preference(self):
        """
        Test masking of a Date of Birth where both NER (as DATE) and regex might detect it.
        Checks that the overlap resolution handles this scenario correctly.
        The expected behavior depends on the sorting logic in `mask_pii_details`
        (e.g., preference for longer matches or specific types if defined).
        """
        # spaCy may pick up 'Jan 1st, 2000' as a DATE entity, and our regex may match it too.
        # The overlap resolution (sorting by start position, then by reverse end position)
        # should keep only one span for it.
        if not self.nlp_model:
            self.skipTest("spaCy model not loaded, skipping NER test.")
        text = "Her birthday is Jan 1st, 2000."
        expected_masked = "Her birthday is [dob]."
        # The entity details depend on whether NER or regex wins and how specific the match is.
        # Here we assume the `dob` regex is specific enough that the overlap resolution
        # prefers it, or that NER's span is identical.
        expected_entities = [
            {"position": [16, 29], "classification": "dob", "entity": "Jan 1st, 2000"}
        ]
        self.assertMasking(text, expected_masked, expected_entities)

    def test_no_pii(self):
        """Test text containing no PII; it should remain unchanged with no entities found."""
        text = "This is a normal sentence without any PII."
        self.assertMasking(text, text, [])

    def test_multiple_pii_types_and_overlap_resolution(self):
        """
        Test a complex string with multiple PII types.
        This also implicitly tests the overlap resolution logic where entities might be
        adjacent or nested (though the current examples are mostly adjacent).
        Ensures all specified PII types are correctly identified and masked.
        """
        text = "Alice Wonderland (alice.wonder@example.com, born 01/02/1990) called from 987-654-3210 with card 4500-1234-5678-9012 (exp 12/26, CVV 321) and Aadhar 1111 2222 3333."
        expected_masked = "[full_name] ([email], born [dob]) called from [phone_number] with card [credit_debit_no] (exp [expiry_no], CVV [cvv_no]) and [aadhar_num]."
        expected_entities = [
            {"position": [0, 16], "classification": "full_name", "entity": "Alice Wonderland"},
            {"position": [18, 42], "classification": "email", "entity": "alice.wonder@example.com"},
            {"position": [49, 59], "classification": "dob", "entity": "01/02/1990"},
            {"position": [73, 85], "classification": "phone_number", "entity": "987-654-3210"},
            {"position": [96, 115], "classification": "credit_debit_no", "entity": "4500-1234-5678-9012"},
            {"position": [121, 126], "classification": "expiry_no", "entity": "12/26"},
            {"position": [132, 135], "classification": "cvv_no", "entity": "321"},
            {"position": [144, 166], "classification": "aadhar_num", "entity": "Aadhar 1111 2222 3333"}
        ]
        self.assertMasking(text, expected_masked, expected_entities)

    def test_cvv_not_part_of_card(self):
        """
        Test that a CVV is masked correctly when it is separate from a card number.
        This also checks that the CVV pattern does not mistakenly mask part of a card number
        when the card number itself is detected (overlap resolution prefers the longer match).
        """
        text = "My card is 4500123456789012 and the separate CVV is 123."
        expected_masked = "My card is [credit_debit_no] and the separate CVV is [cvv_no]."
        expected_entities = [
            {"position": [11, 27], "classification": "credit_debit_no", "entity": "4500123456789012"},
            {"position": [52, 55], "classification": "cvv_no", "entity": "123"}
        ]
        self.assertMasking(text, expected_masked, expected_entities)

    def test_complex_text_with_potential_false_positives(self):
        r"""
        Test text containing numbers that might resemble PII but are not, or are ambiguous.
        Specifically, this tests the behavior of the broad CVV regex (\b\d{3,4}\b),
        which may flag any 3- or 4-digit number as a CVV if no other context or
        more specific PII pattern (like a credit card) overlaps and takes precedence.
        """
        # This test highlights potential false positives from the CVV regex.
        # Numbers like '678' (a reference number) and '123' (part of a sentence) are masked
        # as '[cvv_no]' because they are 3-digit numbers and no other, more specific PII
        # pattern (like a credit card number) covers them at these positions.
        # The number '12345' (Order ID) is not masked, as five digits exceed the CVV pattern.
        # This behavior is expected given the current regex and overlap resolution; for higher
        # accuracy in a production system, CVV detection would need more context.
        text = "Order ID is 12345, reference 678. My card is not 123. It is 4444-5555-6666-7777. My actual CVV: 987."
        expected_masked = "Order ID is 12345, reference [cvv_no]. My card is not [cvv_no]. It is [credit_debit_no]. My actual CVV: [cvv_no]."
        expected_entities = [
            # '678' is identified as 'cvv_no' due to the broad regex and lack of overlap with a more specific PII.
            {"position": [29, 32], "classification": "cvv_no", "entity": "678"},
            # '123' is identified as 'cvv_no' for the same reason.
            {"position": [49, 52], "classification": "cvv_no", "entity": "123"},
            {"position": [60, 79], "classification": "credit_debit_no", "entity": "4444-5555-6666-7777"},
            {"position": [96, 99], "classification": "cvv_no", "entity": "987"}  # The actual CVV.
        ]
        self.assertMasking(text, expected_masked, expected_entities)

if __name__ == '__main__':
    # This allows running the tests directly from the command line,
    # e.g., `python test_pii_masking.py`.
    # The `argv` and `exit=False` arguments are a common pattern for running unittest
    # in environments like Jupyter notebooks, or when you want to inspect results
    # without the script exiting immediately.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
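Several of the tests above (test_mask_dob_ner_and_regex_preference, test_cvv_not_part_of_card, test_complex_text_with_potential_false_positives) rely on the overlap-resolution behaviour their comments describe: candidate spans are ordered by start position and, for equal starts, by reverse end position, so the longest match starting at a given point wins and any shorter span it covers (such as a bare 3-4 digit "CVV" inside a card number) is dropped. The sketch below illustrates that idea in isolation; it is not the code from pii_masking.py (which defines mask_pii_details and is not shown in this section), and the helper name resolve_overlaps and the (start, end, label) tuple format are illustrative assumptions only.

# Illustrative sketch only: an assumed stand-alone version of the overlap
# resolution strategy the tests describe. Not the actual pii_masking.py code.
def resolve_overlaps(candidates):
    """Keep non-overlapping spans, preferring the longest match at each start."""
    # Sort by start ascending, then by end descending (longer spans first).
    ordered = sorted(candidates, key=lambda c: (c[0], -c[1]))
    kept = []
    last_end = -1
    for start, end, label in ordered:
        if start >= last_end:  # No overlap with an already-kept span.
            kept.append((start, end, label))
            last_end = end
        # Otherwise the span falls inside an earlier, longer match and is dropped,
        # e.g. a 4-digit "cvv_no" candidate inside a credit card number.
    return kept

# Example: the card number wins over the 4-digit CVV candidate inside it.
spans = [(11, 27, "credit_debit_no"), (11, 15, "cvv_no"), (53, 56, "cvv_no")]
print(resolve_overlaps(spans))  # [(11, 27, 'credit_debit_no'), (53, 56, 'cvv_no')]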
utils.py
ADDED
@@ -0,0 +1,64 @@
"""
Utility functions for the Email Classification and PII Masking application.

This module provides common helper functions that can be used across different
parts of the project, such as data loading, preprocessing, or other shared tasks.
It aims to promote code reusability and organization.
"""
import pandas as pd


def load_data(file_path: str) -> pd.DataFrame | None:
    """
    Loads data from a specified CSV file into a pandas DataFrame.

    Args:
        file_path (str): The absolute or relative path to the CSV file.

    Returns:
        pd.DataFrame | None: A pandas DataFrame containing the loaded data if successful,
            with the 'email' and 'type' columns validated.
            Returns None if any error occurs during loading or validation
            (e.g., file not found, empty file, missing required columns).

    Note:
        Prints an error message to the console if loading fails or if the
        required columns ('email', 'type') are not found in the CSV; no
        exception is propagated to the caller.
    """
    try:
        df = pd.read_csv(file_path)
        # Basic validation: ensure the essential 'email' and 'type' columns are present.
        # These columns are critical for training the email classifier and processing emails.
        if 'email' not in df.columns or 'type' not in df.columns:
            print(f"Error: CSV file at {file_path} must contain 'email' and 'type' columns.")
            return None
        print(f"Successfully loaded data from {file_path}. DataFrame shape: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"Error: The data file was not found at the specified path: {file_path}")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: The data file at {file_path} is empty and cannot be processed.")
        return None
    except Exception as e:  # Catch other pandas or general exceptions during file loading.
        print(f"An unexpected error occurred while loading data from {file_path}: {e}")
        return None


if __name__ == "__main__":
    # This block serves as an example of how to use the functions in this module.
    # It only executes when the script is run directly (e.g., `python utils.py`),
    # not when `utils.py` is imported by another module.

    # --- Example: Loading email data --- #
    # Ensure the CSV file 'combined_emails_with_natural_pii.csv' exists in the project's
    # root directory, or update DATASET_PATH to the correct location for this example to run.
    # This dataset is assumed to be for demonstration or initial model training preparation.
    DATASET_PATH = 'combined_emails_with_natural_pii.csv'
    email_data = load_data(DATASET_PATH)
    if email_data is not None:
        print(f"Successfully loaded {len(email_data)} emails for example usage.")
        print("First 5 rows:")
        print(email_data.head())
        print("\nEmail categories distribution:")
        print(email_data['type'].value_counts())
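The load_data docstring above spells out three failure modes (missing file, empty file, missing 'email'/'type' columns), each of which returns None rather than raising. The snippet below is a minimal sketch of how those paths could be exercised with throwaway CSV files; it is not part of the uploaded files, the file names are hypothetical, and it assumes it is run from the project root so that `from utils import load_data` resolves.

# Illustrative sketch only: quick checks of load_data's validation paths.
import os
import tempfile

from utils import load_data

with tempfile.TemporaryDirectory() as tmp:
    good_csv = os.path.join(tmp, "emails.csv")
    bad_csv = os.path.join(tmp, "no_type_column.csv")

    # A well-formed file with both required columns.
    with open(good_csv, "w", encoding="utf-8") as f:
        f.write("email,type\nMeeting at 10am please confirm,Request\n")
    # A file missing the 'type' column.
    with open(bad_csv, "w", encoding="utf-8") as f:
        f.write("email\nMeeting at 10am please confirm\n")

    assert load_data(good_csv) is not None                       # valid file -> DataFrame
    assert load_data(bad_csv) is None                            # missing 'type' column -> None
    assert load_data(os.path.join(tmp, "missing.csv")) is None   # no such file -> None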