Spaces:
Sleeping
Sleeping
Aman Garg
commited on
Email Classification API
Browse files- Dockerfile +36 -0
- README.md +127 -6
- label_encoder.pkl +3 -0
- main.py +93 -0
- mlp_model.pth +3 -0
- models.py +77 -0
- pca.pkl +3 -0
- requirements.txt +10 -0
- test.py +61 -0
- utils.py +141 -0
Dockerfile
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Prevent Python from writing .pyc files to disc and enable unbuffered output.
# key=value form: the space-separated ENV syntax is legacy/deprecated.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Set the working directory
WORKDIR /app

# Install git (required by some HF models) and basic system tools.
# Remove the apt package lists afterwards so the layer stays small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy requirements first so dependency installation is cached
# independently of application-source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download NER model so the container starts without network access
RUN python -c "from transformers import AutoTokenizer, AutoModelForTokenClassification; \
model = AutoModelForTokenClassification.from_pretrained('Davlan/bert-base-multilingual-cased-ner-hrl'); \
tokenizer = AutoTokenizer.from_pretrained('Davlan/bert-base-multilingual-cased-ner-hrl'); \
model.save_pretrained('./model'); tokenizer.save_pretrained('./model')"

# Pre-download SentenceTransformer model
RUN python -c "from sentence_transformers import SentenceTransformer; \
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2'); \
model.save('./sbert_model')"

# Copy app code into container
COPY . .

# Expose port
EXPOSE 8000

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,131 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
|
|
|
|
|
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Email Classification and PII Masking API
|
| 3 |
+
emoji: 📧
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
sdk_version: "latest"
|
| 8 |
+
app_file: main.py
|
| 9 |
pinned: false
|
| 10 |
+
models:
|
| 11 |
+
- Davlan/bert-base-multilingual-cased-ner-hrl
|
| 12 |
+
- sentence-transformers/paraphrase-multilingual-mpnet-base-v2
|
| 13 |
---
|
| 14 |
|
| 15 |
+
# Email Classification and PII Masking API
|
| 16 |
+
|
| 17 |
+
This FastAPI application provides an API for classifying emails and masking Personally Identifiable Information (PII) in text.
|
| 18 |
+
|
| 19 |
+
## Features
|
| 20 |
+
|
| 21 |
+
- PII Detection and Masking
|
| 22 |
+
- Full names
|
| 23 |
+
- Email addresses
|
| 24 |
+
- Phone numbers
|
| 25 |
+
- Dates of birth
|
| 26 |
+
- Aadhar numbers
|
| 27 |
+
- Credit/Debit card numbers
|
| 28 |
+
- CVV numbers
|
| 29 |
+
- Card expiry dates
|
| 30 |
+
- Email Classification using MLP model
|
| 31 |
+
- Multilingual support using BERT-based models
|
| 32 |
+
|
| 33 |
+
## Setup
|
| 34 |
+
|
| 35 |
+
1. Create a virtual environment:
|
| 36 |
+
```bash
|
| 37 |
+
python -m venv venv
|
| 38 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
2. Install dependencies:
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
pip install -r requirements.txt
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
3. Download required model files:
|
| 48 |
+
|
| 49 |
+
- `label_encoder.pkl`
|
| 50 |
+
- `pca.pkl`
|
| 51 |
+
- `mlp_model.pth`
|
| 52 |
+
Place these files in the same directory as `main.py`.
|
| 53 |
+
|
| 54 |
+
## Usage
|
| 55 |
+
|
| 56 |
+
1. Start the FastAPI server:
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
uvicorn main:app --reload --host 0.0.0.0 --port 8000
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
**Note for Hugging Face Spaces:** We explicitly bind to `0.0.0.0` and port `8000`, matching the port exposed by the Dockerfile and used in `main.py`.
|
| 63 |
+
|
| 64 |
+
2. The API will be available at the Space's URL (e.g., `https://your-username-your-space-name.hf.space`).
|
| 65 |
+
|
| 66 |
+
3. API Endpoints:
|
| 67 |
+
|
| 68 |
+
- **POST `/classify`**: Classify and mask PII in email text
|
| 69 |
+
- **Input:** JSON with `input_email_body` field
|
| 70 |
+
```json
|
| 71 |
+
{
|
| 72 |
+
"input_email_body": "Hello, my name is John Doe and my email is john.doe@example.com. Please help with my billing issue."
|
| 73 |
+
}
|
| 74 |
+
```
|
| 75 |
+
- **Output:** JSON with masked text, detected entities, and classification
|
| 76 |
+
```json
|
| 77 |
+
{
|
| 78 |
+
"input_email_body": "Hello, my name is John Doe and my email is john.doe@example.com. Please help with my billing issue.",
|
| 79 |
+
"list_of_masked_entities": [
|
| 80 |
+
{
|
| 81 |
+
"position": [
|
| 82 |
+
16,
|
| 83 |
+
24
|
| 84 |
+
],
|
| 85 |
+
"classification": "full_name",
|
| 86 |
+
"entity": "John Doe"
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"position": [
|
| 90 |
+
39,
|
| 91 |
+
60
|
| 92 |
+
],
|
| 93 |
+
"classification": "email",
|
| 94 |
+
"entity": "john.doe@example.com"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"masked_email": "Hello, my name is [full_name] and my email is [email]. I'm having trouble with my account.",
|
| 98 |
+
"category_of_the_email": "Billing Issues"
|
| 99 |
+
}
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
## API Documentation
|
| 103 |
+
|
| 104 |
+
Once the server is running, FastAPI's interactive documentation is available at `/docs` (Swagger UI) and `/redoc` (ReDoc) on the Space's URL; you can also send POST requests to `/classify` directly.
|
| 105 |
+
|
| 106 |
+
## Project Structure
|
| 107 |
+
|
| 108 |
+
```
|
| 109 |
+
.
|
| 110 |
+
├── README.md
|
| 111 |
+
├── requirements.txt
|
| 112 |
+
├── main.py # FastAPI application entry point
|
| 113 |
+
├── models.py # ML model definitions and training logic
|
| 114 |
+
├── utils.py # Utility functions for text processing
|
| 115 |
+
├── label_encoder.pkl # Label encoder for classification
|
| 116 |
+
├── pca.pkl # PCA model for dimensionality reduction
|
| 117 |
+
└── mlp_model.pth # Trained MLP model weights
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
## Deployment on Hugging Face Spaces
|
| 121 |
+
|
| 122 |
+
To deploy this application on Hugging Face Spaces:
|
| 123 |
+
|
| 124 |
+
1. **Create a new Space** on [https://huggingface.co/spaces](https://huggingface.co/spaces).
|
| 125 |
+
2. Choose a **Space name**, select a **license**, and for **Space Hardware**, the "Free" tier should be sufficient for this type of API.
|
| 126 |
+
3. Crucially, under **SDK**, select **"Docker"** — a FastAPI server needs a running container; the "Static" SDK only serves static files and cannot run uvicorn.
|
| 127 |
+
4. In your Space's settings, link your **GitHub repository** containing these files.
|
| 128 |
+
5. Hugging Face Spaces will automatically detect the `requirements.txt` and install the dependencies.
|
| 129 |
+
6. The Space then builds the included `Dockerfile`, which installs the dependencies, pre-downloads the NER and SentenceTransformer models, and starts the server with `uvicorn main:app --host 0.0.0.0 --port 8000`.
|
| 130 |
+
|
| 131 |
+
Ensure all your model files (`label_encoder.pkl`, `pca.pkl`, `mlp_model.pth`) are present in your repository at the root level or in the same directory as `main.py`.
|
label_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e8103306a7eee71cc11a4a770b52d57c8e033925b07f81f7c3b85c26e2c4d6a
|
| 3 |
+
size 283
|
main.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import re
from typing import Any, Dict, List

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from models import ModelManager
from utils import mask_pii

app = FastAPI()

# Initialize model manager
# Models are loaded eagerly at import time so the first request does not pay
# the load cost; any failure aborts startup with an explicit RuntimeError
# instead of surfacing later as a confusing per-request error.
model_manager = ModelManager()
try:
    model_manager.load_models()
except Exception as e:
    raise RuntimeError(f"Error loading models: {e}")
|
| 20 |
+
|
| 21 |
+
# Helper class for marking lists that need compact JSON representation
|
| 22 |
+
# Helper class for marking lists that need compact JSON representation
class CompactListWrapper:
    """Tags a list so the custom JSON encoder renders it on a single line."""

    def __init__(self, data_list):
        # Hold a reference to the wrapped list; the encoder reads .data
        # and serializes it without any whitespace.
        self.data = data_list
|
| 25 |
+
|
| 26 |
+
# Custom JSON Encoder (used by CustomFormattedJSONResponse)
|
| 27 |
+
# Custom JSON Encoder (used by CustomFormattedJSONResponse)
class CustomJsonEncoder(json.JSONEncoder):
    """Encoder that serializes CompactListWrapper values as quoted
    placeholder strings; the response class later strips the quotes and
    markers, leaving the compact (single-line) list inline."""

    def default(self, o):
        if not isinstance(o, CompactListWrapper):
            # Defer to the base class for anything we don't handle.
            return super().default(o)
        compact = json.dumps(o.data, separators=(',',':'))
        return f"__COMPACT_LIST_PLACEHOLDER__{compact}__END_PLACEHOLDER__"
|
| 32 |
+
|
| 33 |
+
# Custom JSONResponse class for specific formatting
|
| 34 |
+
# Custom JSONResponse class for specific formatting
class CustomFormattedJSONResponse(JSONResponse):
    """JSONResponse that pretty-prints the payload (indent=2) while keeping
    CompactListWrapper values on a single line.

    Two-pass scheme: CustomJsonEncoder serializes each wrapped list as a
    quoted placeholder string, then a regex removes the surrounding quotes
    and placeholder markers so the compact list appears inline.
    """

    def render(self, content: Any) -> bytes:
        # content is the dictionary passed to the response instance
        json_string_with_placeholders = json.dumps(
            content,
            indent=2,
            cls=CustomJsonEncoder  # Our encoder that inserts placeholders
        )

        # Replace the quoted placeholders with their unquoted compact list content
        # NOTE(review): this round-trip assumes the wrapped lists serialize to
        # JSON with no characters that json.dumps would escape (plain ints are
        # fine; strings containing quotes/backslashes would corrupt the output).
        # Confirm callers only wrap numeric 'position' lists.
        final_json_string = re.sub(
            r'"__COMPACT_LIST_PLACEHOLDER__(.*?)__END_PLACEHOLDER__"',
            r'\1',
            json_string_with_placeholders
        )

        return final_json_string.encode("utf-8")
|
| 51 |
+
|
| 52 |
+
class EmailInput(BaseModel):
    """Request schema for /classify: the raw email text to be masked and
    classified."""
    input_email_body: str  # full plain-text body of the email
|
| 54 |
+
|
| 55 |
+
@app.post("/classify")
|
| 56 |
+
async def classify_email(email_input: EmailInput):
|
| 57 |
+
try:
|
| 58 |
+
# Mask PII in the email
|
| 59 |
+
masked_email_str, masked_entities_list_of_dicts = mask_pii(
|
| 60 |
+
email_input.input_email_body,
|
| 61 |
+
model_manager.ner_pipeline
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
# Classify the masked email
|
| 65 |
+
predicted_category_str = model_manager.predict(masked_email_str)
|
| 66 |
+
|
| 67 |
+
# Prepare data, wrapping 'position' lists in CompactListWrapper
|
| 68 |
+
processed_masked_entities = []
|
| 69 |
+
for entity_dict in masked_entities_list_of_dicts:
|
| 70 |
+
# Create a new dict to avoid modifying original from mask_pii if it's reused
|
| 71 |
+
processed_entity = entity_dict.copy()
|
| 72 |
+
if "position" in processed_entity and isinstance(processed_entity["position"], list):
|
| 73 |
+
processed_entity["position"] = CompactListWrapper(processed_entity["position"])
|
| 74 |
+
processed_masked_entities.append(processed_entity)
|
| 75 |
+
|
| 76 |
+
response_data = {
|
| 77 |
+
"input_email_body": email_input.input_email_body,
|
| 78 |
+
"list_of_masked_entities": processed_masked_entities,
|
| 79 |
+
"masked_email": masked_email_str,
|
| 80 |
+
"category_of_the_email": predicted_category_str
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
# Use the custom response class
|
| 84 |
+
return CustomFormattedJSONResponse(content=response_data)
|
| 85 |
+
except Exception as e:
|
| 86 |
+
# It's good practice to log the actual exception for debugging on the server
|
| 87 |
+
# import traceback
|
| 88 |
+
# print(f"Error in classify_email: {str(e)}\n{traceback.format_exc()}")
|
| 89 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
|
| 92 |
+
import uvicorn
|
| 93 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
mlp_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:294544a99fec249f58b8258a5868d941dfb9e47d7ff44700965f0b1ce107b22d
|
| 3 |
+
size 923932
|
models.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
|
| 7 |
+
TokenClassificationPipeline)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MLPClassifier(nn.Module):
    """Feed-forward classifier over reduced sentence embeddings.

    Architecture: input -> 256 -> 128 -> num_classes, with ReLU activations
    and 0.3 dropout after each hidden layer. Outputs raw logits.
    """

    def __init__(self, input_dim, num_classes):
        super(MLPClassifier, self).__init__()
        hidden = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        ]
        self.model = nn.Sequential(*hidden)

    def forward(self, x):
        # Returns logits; apply argmax/softmax downstream.
        return self.model(x)
|
| 25 |
+
|
| 26 |
+
class ModelManager:
    """Owns every ML artifact used by the API: the NER pipeline for PII
    detection and the embedding + PCA + MLP stack for classification."""

    def __init__(self):
        self.ner_model = None
        self.ner_tokenizer = None
        self.ner_pipeline = None
        self.classification_model = None
        self.label_encoder = None
        self.pca_model = None
        self.mlp_model = None

    def load_models(self):
        """Load all models from local paths.

        Expects ./model (NER weights for
        Davlan/bert-base-multilingual-cased-ner-hrl), ./sbert_model, and
        label_encoder.pkl / pca.pkl / mlp_model.pth in the working directory
        (the Dockerfile pre-downloads the HF models at build time).
        """
        # NER model: loaded from the locally saved directory, so no network
        # access is needed at startup. (Fix: removed the unused
        # ner_model_name local — the hub name is only relevant at build time.)
        self.ner_tokenizer = AutoTokenizer.from_pretrained("./model")
        self.ner_model = AutoModelForTokenClassification.from_pretrained("./model")
        self.ner_pipeline = TokenClassificationPipeline(
            model=self.ner_model.to('cpu'),
            tokenizer=self.ner_tokenizer,
            device=-1,  # force CPU inference
            aggregation_strategy="simple"  # merge word pieces into whole entities
        )

        # Sentence-embedding model used as the classification feature extractor.
        self.classification_model = SentenceTransformer('./sbert_model')

        # NOTE: unpickling assumes these are trusted artifacts shipped with
        # the repository — never load untrusted pickles.
        with open("label_encoder.pkl", "rb") as f:
            self.label_encoder = pickle.load(f)

        with open("pca.pkl", "rb") as f:
            self.pca_model = pickle.load(f)

        # Rebuild the MLP with dimensions derived from the fitted
        # encoder/PCA, then restore the trained weights.
        model_state_dict = torch.load("mlp_model.pth", map_location=torch.device('cpu'))
        num_classes = len(self.label_encoder.classes_)
        input_dim = self.pca_model.n_components_

        self.mlp_model = MLPClassifier(input_dim, num_classes)
        self.mlp_model.load_state_dict(model_state_dict)
        self.mlp_model.eval()  # disable dropout for deterministic inference

    def predict(self, text):
        """Return the predicted category label for *text*.

        Pipeline: sentence embedding -> PCA reduction -> MLP logits ->
        argmax -> inverse label-encoder lookup.
        """
        # Get embeddings and reduce dimensions
        email_embedding = self.classification_model.encode([text])
        email_reduced = self.pca_model.transform(email_embedding)
        email_tensor = torch.tensor(email_reduced, dtype=torch.float32)

        # Make prediction
        with torch.no_grad():
            output = self.mlp_model(email_tensor)
            predicted_class_index = torch.argmax(output, dim=1).item()
            predicted_category = self.label_encoder.inverse_transform([predicted_class_index])[0]

        return predicted_category
|
pca.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8dd0cc734e8c0ef88a277f3f948c687afae9184bef195adbe608d1bb553a2ed3
|
| 3 |
+
size 2372321
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
transformers==4.51.3
|
| 4 |
+
torch==2.5.0
|
| 5 |
+
pandas==2.2.2
|
| 6 |
+
scikit-learn==1.6.1
|
| 7 |
+
sentence-transformers==4.1.0
|
| 8 |
+
pydantic==2.11.4
|
| 9 |
+
tqdm==4.67.1
|
| 10 |
+
numpy==2.0.2
|
test.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
|
| 5 |
+
# Replace with the actual URL where your FastAPI app is running locally
|
| 6 |
+
LOCAL_API_URL = "http://127.0.0.1:8000/classify"
|
| 7 |
+
|
| 8 |
+
def test_classify_endpoint(email_body):
    """
    Sends a POST request to the /classify endpoint of the local FastAPI app.

    Args:
        email_body (str): The email content to be classified.

    Returns:
        dict: The JSON response from the API, or None if an error occurred.
    """
    headers = {"Content-Type": "application/json"}
    payload = {"input_email_body": email_body}

    try:
        response = requests.post(LOCAL_API_URL, headers=headers, json=payload)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    # Fix: HTTPError (and requests' JSONDecodeError) subclass
    # RequestException, so the original ordering — RequestException first —
    # made the specific handlers unreachable. Narrowest clauses come first.
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON response: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to the API: {e}")
        return None
|
| 34 |
+
|
| 35 |
+
if __name__ == "__main__":
|
| 36 |
+
# Example email bodies to test
|
| 37 |
+
test_emails = [
|
| 38 |
+
"Hello, my name is Alice Smith and my email is alice.smith@example.com. I'm having trouble with my account.",
|
| 39 |
+
"Urgent: My credit card number is 1234-5678-9012-3456 and the expiry is 03/27. I was overcharged.",
|
| 40 |
+
"Subject: Network down - Office B1 floor. Please investigate.",
|
| 41 |
+
"Request for new software installation on my laptop.",
|
| 42 |
+
"Regarding invoice INV-2023-10-01. The total seems incorrect.",
|
| 43 |
+
"My date of birth is 01/15/1990 and my phone number is 987-654-3210.",
|
| 44 |
+
"Is there a problem with the server?",
|
| 45 |
+
"I need to change my registered address.",
|
| 46 |
+
"Subject: Unplanned system outage affecting database access.",
|
| 47 |
+
"Can I get access to the premium features?"
|
| 48 |
+
]
|
| 49 |
+
|
| 50 |
+
print("Testing the /classify endpoint on localhost:")
|
| 51 |
+
for i, email in enumerate(test_emails):
|
| 52 |
+
print(f"\n--- Test Email {i+1} ---")
|
| 53 |
+
print(f"Input Email Body: {email}")
|
| 54 |
+
response_data = test_classify_endpoint(email)
|
| 55 |
+
if response_data:
|
| 56 |
+
print("API Response:")
|
| 57 |
+
print(json.dumps(response_data, indent=4))
|
| 58 |
+
else:
|
| 59 |
+
print("Failed to get a valid response from the API.")
|
| 60 |
+
|
| 61 |
+
print("\nTesting complete.")
|
utils.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Dict, List, Tuple
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def mask_full_name(text: str, ner_pipeline) -> Tuple[str, List[Dict]]:
    """
    Mask full names in text using NER model.

    Args:
        text (str): Input text
        ner_pipeline: NER pipeline for name detection

    Returns:
        Tuple[str, List[Dict]]: Masked text and list of masked entities
    """
    detections = ner_pipeline(text)
    found = []
    # Process matches right-to-left so earlier character offsets stay valid
    # while the text is being rewritten.
    for detection in sorted(detections, key=lambda d: d['start'], reverse=True):
        if detection['entity_group'] not in ('PER', 'Person', 'full_name'):
            continue
        start, end = detection['start'], detection['end']
        found.append({
            "position": [start, end],
            "classification": "full_name",
            "entity": text[start:end],
        })
        text = f"{text[:start]}[full_name]{text[end:]}"
    return text, found
|
| 29 |
+
|
| 30 |
+
def mask_with_regex(text: str) -> Tuple[str, List[Dict]]:
    """
    Mask PII using regex patterns.

    Args:
        text (str): Input text

    Returns:
        Tuple[str, List[Dict]]: Masked text and list of masked entities
    """
    masked_entities: List[Dict] = []

    # (label, pattern) pairs applied in this order. Order matters: longer
    # digit runs (cards, Aadhar) are masked before the generic 3-digit CVV
    # pattern can falsely match inside them.
    patterns = [
        ("email", r'\b[\w.-]+?@\w+?\.\w+?\b'),
        ("phone_number", r'\b(?:(?:\+|0)91[\s.-]?)?\d{10}(?!\d)\b'),
        ("dob", r'\b\d{2}[-/]\d{2}[-/]\d{4}\b|\b\d{4}[-/]\d{2}[-/]\d{2}\b'),
        ("credit_debit_no", r'\b(?:\d[ -]*?){13,19}\b'),
        ("aadhar_num", r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'),
        ("cvv_no", r'\b\d{3}\b'),
        ("expiry_no", r'\b(0[1-9]|1[0-2])\/?([0-9]{2}|[0-9]{4})\b'),
    ]
    for label, pattern in patterns:
        text = _mask_matches(text, pattern, label, masked_entities)
    return text, masked_entities


def _mask_matches(text: str, pattern: str, label: str,
                  masked_entities: List[Dict]) -> str:
    """Replace every match of *pattern* in *text* with '[label]', appending
    each original value and its span to *masked_entities* (mutated in place).

    Matches are applied right-to-left so earlier spans remain valid while
    the surrounding text changes length. Returns the rewritten text.
    """
    for match in reversed(list(re.finditer(pattern, text))):
        start, end = match.span()
        masked_entities.append({
            "position": [start, end],
            "classification": label,
            "entity": text[start:end],
        })
        text = text[:start] + f'[{label}]' + text[end:]
    return text
|
| 127 |
+
|
| 128 |
+
def mask_pii(text: str, ner_pipeline) -> Tuple[str, List[Dict]]:
    """
    Mask all PII in text using both NER and regex patterns.

    Args:
        text (str): Input text
        ner_pipeline: NER pipeline for name detection

    Returns:
        Tuple[str, List[Dict]]: Masked text and list of all masked entities
    """
    # Names first (NER), then pattern-based PII on the already-masked text;
    # NER-detected entities precede regex-detected ones in the result.
    masked, name_entities = mask_full_name(text, ner_pipeline)
    masked, pattern_entities = mask_with_regex(masked)
    return masked, name_entities + pattern_entities
|