Spaces:
Sleeping
Sleeping
Commit
·
16f0e1e
1
Parent(s):
6a8c14d
Initial commit: FastAPI app, model, utils, Dockerfile
Browse files- Dockerfile +19 -0
- main.py +29 -0
- models.py +15 -0
- requirements.txt +11 -0
- utils.py +74 -0
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# 1) Set working directory
WORKDIR /app

# 2) Copy only the dependency manifest first, so the install layers below
#    stay cached until requirements.txt itself changes (editing application
#    code no longer forces a full reinstall)
COPY requirements.txt /app/requirements.txt

# 3) Install requirements
RUN pip install --no-cache-dir -r requirements.txt

# 4) Download spaCy model
#    NOTE(review): requirements.txt also lists an en_core_web_sm wheel, so
#    this step may be redundant - confirm before removing either one.
RUN python -m spacy download en_core_web_sm

# 5) Copy the application code into the container
COPY . /app

# 6) Expose port 7860 (the port Uvicorn is told to bind to in CMD below)
EXPOSE 7860

# 7) Start Uvicorn with the FastAPI app object `app` from main.py
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
main.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from utils import mask_pii, demask_pii
|
| 4 |
+
from models import load_model, classify_email
|
| 5 |
+
|
| 6 |
+
# ASGI application object; the Uvicorn command in the Dockerfile refers to it
# as "main:app".
app = FastAPI()
# Load the classifier once, at import time, so every request handled by this
# process reuses the same model object.
model = load_model()
|
| 8 |
+
|
| 9 |
+
# Request body for the POST /classify endpoint.
class EmailRequest(BaseModel):
    # Raw email text; it is passed to mask_pii and echoed back in the response.
    input_email_body: str
|
| 11 |
+
|
| 12 |
+
@app.post("/classify")
def classify(request: EmailRequest):
    # Pipeline: redact PII -> classify the redacted text -> rebuild the
    # original text from the redacted text plus the recorded entities.
    body = request.input_email_body

    redacted, found = mask_pii(body)
    label = classify_email(model, redacted)
    restored = demask_pii(redacted, found)

    response = {
        "input_email_body": body,
        "list_of_masked_entities": found,
        "masked_email": redacted,
        "category_of_the_email": label,
        "demasked_email": restored,
    }
    return response
|
models.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import hf_hub_download
|
| 2 |
+
from joblib import load
|
| 3 |
+
from sklearn.pipeline import Pipeline
|
| 4 |
+
|
| 5 |
+
import os
# NOTE(review): this flag is assigned after `huggingface_hub` has already been
# imported above. The library appears to read it when it is first imported, so
# this assignment may have no effect in this process - confirm. Also confirm
# that the `hf_transfer` package is installed before relying on it; it is not
# listed in requirements.txt.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # Add at top
# Hub repository and file name of the serialized classifier fetched by load_model().
MODEL_REPO = "adrian7305/email-classifier"
MODEL_FILE = "model.joblib"
|
| 9 |
+
|
| 10 |
+
def load_model() -> Pipeline:
    """Fetch ``MODEL_FILE`` from ``MODEL_REPO`` on the Hub and deserialize it.

    Returns:
        Whatever object joblib reconstructs from the downloaded file
        (annotated as a scikit-learn ``Pipeline``).

    NOTE(security): joblib files are pickle-based, so loading one can execute
    code embedded in it. Only point ``MODEL_REPO`` at a repository you trust.
    """
    return load(hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO))
|
| 13 |
+
|
| 14 |
+
def classify_email(model: Pipeline, email: str) -> str:
    """Return the model's prediction for a single email, converted to ``str``.

    Args:
        model: Object exposing ``predict``; it is called with a one-element list.
        email: Text to classify.
    """
    batch = [email]
    predictions = model.predict(batch)
    first_prediction = predictions[0]
    return str(first_prediction)
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.109.0
|
| 2 |
+
uvicorn==0.27.0
|
| 3 |
+
python-multipart==0.0.9
|
| 4 |
+
spacy==3.7.4
|
| 5 |
+
scikit-learn==1.6.1
|
| 6 |
+
joblib==1.3.2
|
| 7 |
+
regex==2023.12.25
|
| 8 |
+
pandas==2.1.4
|
| 9 |
+
huggingface-hub==0.20.3
|
| 10 |
+
python-dotenv==1.0.0
|
| 11 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
|
utils.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import spacy
|
| 3 |
+
from typing import Tuple, List, Dict
|
| 4 |
+
|
| 5 |
+
# spaCy pipeline, loaded once at import time; mask_pii() reads the PERSON
# entities it produces.
nlp = spacy.load("en_core_web_sm")

# (label, regex) pairs that mask_pii() tries in exactly this order. A span
# already claimed by an earlier pattern is skipped by later ones, so the
# broadest patterns are listed last.
PATTERN_ORDER = [
    # 13-19 digits, each optionally followed by spaces or hyphens.
    ("credit_debit_no", r"\b(?:\d[ -]*?){13,19}\b"),
    # Three groups of four digits, optionally separated by whitespace or a hyphen.
    ("aadhar_num", r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"),
    # Optional "+91" or "0" prefix, then ten digits starting with 6-9, with an
    # optional separator after the fifth digit. No word boundaries are used.
    ("phone_number", r"(?:(?:\+91|0)[-\s]?)?[6-9]\d{4}[-\s]?\d{5}"),
    # local-part@domain.tld with a TLD of at least two letters.
    ("email", r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
    # Three fields separated by "/" or "-": first 1-12, second 1-31, then a
    # 4- or 2-digit year (i.e. the first field is constrained like a month).
    ("dob", r"\b(?:0?[1-9]|1[0-2])[\/-](?:0?[1-9]|[12][0-9]|3[01])[\/-](?:\d{4}|\d{2})\b"),
    # 01-12, an optional "/" or "-", then two or four digits.
    ("expiry_no", r"\b(?:0[1-9]|1[0-2])[\/-]?(?:\d{2}|\d{4})\b"),
    # Any standalone 3- or 4-digit number; very broad, hence tried last.
    ("cvv_no", r"\b\d{3,4}\b"),
]
|
| 16 |
+
|
| 17 |
+
def mask_pii(text: str) -> Tuple[str, List[Dict]]:
    """Replace detected PII in ``text`` with ``[label]`` placeholders.

    The regexes in ``PATTERN_ORDER`` are applied first, in list order, then
    spaCy ``PERSON`` entities (recorded as ``full_name``). A candidate that
    overlaps a span accepted earlier is dropped.

    Args:
        text: The original, unmasked text.

    Returns:
        ``(masked_text, entities)``. Each entity is a dict with ``position``
        (``[start, end]`` offsets into the original ``text``),
        ``classification`` (the label) and ``entity`` (the original
        substring). The list is ordered by start offset, highest first.
    """
    found: List[Dict] = []
    claimed: List[Tuple[int, int]] = []

    def is_free(lo: int, hi: int) -> bool:
        # Two half-open spans are disjoint when one ends at or before the
        # point where the other begins.
        return all(hi <= taken_lo or lo >= taken_hi for taken_lo, taken_hi in claimed)

    def claim(lo: int, hi: int, label: str, value: str) -> None:
        # Record the candidate only if nothing accepted so far overlaps it.
        if not is_free(lo, hi):
            return
        found.append({
            "position": [lo, hi],
            "classification": label,
            "entity": value,
        })
        claimed.append((lo, hi))

    # Pass 1: regex-based detectors, in priority order.
    for label, regex in PATTERN_ORDER:
        for hit in re.finditer(regex, text):
            lo, hi = hit.span()
            claim(lo, hi, label, text[lo:hi])

    # Pass 2: person names from the spaCy pipeline.
    for span in nlp(text).ents:
        if span.label_ != "PERSON":
            continue
        claim(span.start_char, span.end_char, "full_name", span.text)

    # Substitute from the end of the string backwards so the offsets of the
    # entities still to be processed remain valid.
    found.sort(key=lambda item: item["position"][0], reverse=True)
    redacted = text
    for item in found:
        lo, hi = item["position"]
        redacted = redacted[:lo] + f"[{item['classification']}]" + redacted[hi:]

    return redacted, found
|
| 59 |
+
def demask_pii(masked_text: str, entities: List[Dict]) -> str:
    """Restore the original substrings in text produced by masking.

    Args:
        masked_text: Text in which each entity has been replaced by a
            ``[classification]`` placeholder.
        entities: Dicts with ``position`` (``[start, end]`` offsets into the
            ORIGINAL, unmasked text), ``classification`` and ``entity`` (the
            original substring). Any order is accepted; the list is not
            modified.

    Returns:
        ``masked_text`` with every placeholder replaced by its ``entity``.

    Raises:
        ValueError: If a placeholder is not found where ``position`` says it
            should be, i.e. ``entities`` does not describe ``masked_text``.
    """
    pieces: List[str] = []
    cursor = 0  # first index of masked_text not yet copied to the output
    shift = 0   # (offset in masked_text) - (offset in original text) at cursor

    # Walk left to right: every placeholder before the current one has moved
    # it by (placeholder length - original span length), which `shift` tracks.
    for ent in sorted(entities, key=lambda e: e["position"][0]):
        start, end = ent["position"]
        placeholder = f"[{ent['classification']}]"
        masked_start = start + shift

        # Refuse to splice at a location that does not hold the expected
        # placeholder; doing so would silently corrupt the text.
        if masked_start < cursor or not masked_text.startswith(placeholder, masked_start):
            raise ValueError(
                f"placeholder {placeholder} not found at offset {masked_start} "
                f"of masked text (original position {start})"
            )

        pieces.append(masked_text[cursor:masked_start])
        pieces.append(ent["entity"])
        cursor = masked_start + len(placeholder)
        shift += len(placeholder) - (end - start)

    pieces.append(masked_text[cursor:])
    return "".join(pieces)
|