adrian7305 committed on
Commit
16f0e1e
·
1 Parent(s): 6a8c14d

Initial commit: FastAPI app, model, utils, Dockerfile

Browse files
Files changed (5) hide show
  1. Dockerfile +19 -0
  2. main.py +29 -0
  3. models.py +15 -0
  4. requirements.txt +11 -0
  5. utils.py +74 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# 1) Set working directory
WORKDIR /app

# 2) Copy only the dependency manifest first, so the (slow) install layers
#    below stay cached until requirements.txt itself changes rather than
#    being rebuilt on every source edit.
COPY requirements.txt /app/requirements.txt

# 3) Install requirements
RUN pip install --no-cache-dir -r requirements.txt

# 4) Download spaCy model
#    NOTE(review): requirements.txt also installs an en_core_web_sm wheel
#    directly by URL; confirm whether this extra download is still needed.
RUN python -m spacy download en_core_web_sm

# 5) Copy the application source into the container
COPY . /app

# 6) Expose port 7860. This matches the --port passed to Uvicorn below; it is
#    not a FastAPI/Uvicorn default (Uvicorn defaults to 8000).
#    NOTE(review): presumably chosen because Hugging Face Docker Spaces
#    default to app_port 7860 -- confirm.
EXPOSE 7860

# 7) Start Uvicorn with your FastAPI app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from utils import mask_pii, demask_pii
4
+ from models import load_model, classify_email
5
+
6
# ASGI application object; the /classify route below is registered on it.
app = FastAPI()
# Classifier is loaded once, at import time, and shared by every request.
model = load_model()


class EmailRequest(BaseModel):
    """Request body accepted by POST /classify."""

    # Raw email text; it is PII-masked before being classified.
    input_email_body: str
11
+
12
@app.post("/classify")
def classify(request: EmailRequest):
    """Mask PII in the submitted email, classify it, and report the results.

    The classifier only ever sees the masked text. The response echoes the
    original body, lists the entities that were masked, and includes both the
    masked text and a de-masked reconstruction of it.
    """
    raw_body = request.input_email_body

    # Replace detected PII with "[<classification>]" placeholders.
    redacted, found_entities = mask_pii(raw_body)

    # Predict the category from the redacted text only.
    label = classify_email(model, redacted)

    # Rebuild the original text from the placeholders and recorded entities.
    restored = demask_pii(redacted, found_entities)

    response = {
        "input_email_body": raw_body,
        "list_of_masked_entities": found_entities,
        "masked_email": redacted,
        "category_of_the_email": label,
        "demasked_email": restored
    }
    return response
models.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import importlib.util
import os

# huggingface_hub evaluates HF_HUB_ENABLE_HF_TRANSFER when it is imported, so
# the flag has to be in the environment *before* the import below; setting it
# afterwards has no effect on this process. Only opt in when the optional
# `hf_transfer` package is actually installed, because huggingface_hub refuses
# to download when the flag is on and the package is missing.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download
from joblib import load
from sklearn.pipeline import Pipeline

# Hub repository and file name of the serialized classifier.
MODEL_REPO = "adrian7305/email-classifier"
MODEL_FILE = "model.joblib"
9
+
10
def load_model() -> Pipeline:
    """Download the serialized classifier from the Hub and deserialize it.

    Returns:
        The object stored in ``MODEL_FILE`` of ``MODEL_REPO``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # this is only safe while MODEL_REPO is a trusted repository.
    return load(hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO))
13
+
14
def classify_email(model: Pipeline, email: str) -> str:
    """Return the model's predicted label for a single email, as a string."""
    # predict() works on batches, so wrap the email in a one-item list and
    # take the only prediction back out.
    predictions = model.predict([email])
    first_label = predictions[0]
    return str(first_label)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.109.0
2
+ uvicorn==0.27.0
3
+ python-multipart==0.0.9
4
+ spacy==3.7.4
5
+ scikit-learn==1.6.1
6
+ joblib==1.3.2
7
+ regex==2023.12.25
8
+ pandas==2.1.4
9
+ huggingface-hub==0.20.3
10
+ python-dotenv==1.0.0
11
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
utils.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import spacy
3
+ from typing import Tuple, List, Dict
4
+
5
# spaCy pipeline loaded once at import time; mask_pii uses its PERSON entities.
nlp = spacy.load("en_core_web_sm")

# (classification, regex) pairs, tried in this order by mask_pii. Order
# matters: a span claimed by an earlier pattern is skipped by later ones, so
# the more specific patterns come before the broad ones.
PATTERN_ORDER = [
    # 13-19 digits, each optionally followed by spaces/hyphens.
    ("credit_debit_no", r"\b(?:\d[ -]*?){13,19}\b"),
    # 12 digits in 4-4-4 groups, optionally separated by whitespace or "-".
    ("aadhar_num", r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"),
    # Optional "+91"/"0" prefix, then 10 digits starting with 6-9 (an optional
    # separator after the fifth digit). Note: no word boundaries.
    ("phone_number", r"(?:(?:\+91|0)[-\s]?)?[6-9]\d{4}[-\s]?\d{5}"),
    # local-part@domain.tld with a TLD of at least two letters.
    ("email", r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
    # Month-first date: M[M] / D[D] / YYYY-or-YY, separated by "/" or "-".
    ("dob", r"\b(?:0?[1-9]|1[0-2])[\/-](?:0?[1-9]|[12][0-9]|3[01])[\/-](?:\d{4}|\d{2})\b"),
    # Two-digit month 01-12, optional "/" or "-", then a 2- or 4-digit year.
    ("expiry_no", r"\b(?:0[1-9]|1[0-2])[\/-]?(?:\d{2}|\d{4})\b"),
    # Any standalone 3- or 4-digit number not already claimed above.
    ("cvv_no", r"\b\d{3,4}\b"),
]
16
+
17
def mask_pii(text: str) -> Tuple[str, List[Dict]]:
    """Replace detected PII in ``text`` with ``[<classification>]`` placeholders.

    Regex patterns from ``PATTERN_ORDER`` are applied first, in order, and
    spaCy ``PERSON`` entities afterwards (as ``full_name``). A candidate span
    is skipped if it overlaps any span that was already accepted.

    Returns:
        A ``(masked_text, entities)`` tuple. Each entity is a dict with
        ``position`` (``[start, end]`` offsets into the original ``text``),
        ``classification`` and ``entity`` (the original substring). The list
        is ordered by descending start offset.
    """
    claimed: List[Tuple[int, int]] = []
    found: List[Dict] = []

    def is_free(lo: int, hi: int) -> bool:
        # Free means the half-open range [lo, hi) touches no accepted span.
        return all(hi <= taken_lo or lo >= taken_hi
                   for taken_lo, taken_hi in claimed)

    def accept(lo: int, hi: int, label: str, value: str) -> None:
        found.append({
            "position": [lo, hi],
            "classification": label,
            "entity": value
        })
        claimed.append((lo, hi))

    # Pass 1: regex-based detectors, most specific first.
    for label, pattern in PATTERN_ORDER:
        for hit in re.finditer(pattern, text):
            lo, hi = hit.span()
            if is_free(lo, hi):
                accept(lo, hi, label, text[lo:hi])

    # Pass 2: person names from the spaCy NER pipeline.
    for span in nlp(text).ents:
        if span.label_ != "PERSON":
            continue
        lo, hi = span.start_char, span.end_char
        if is_free(lo, hi):
            accept(lo, hi, "full_name", span.text)

    # Substitute from the end of the text backwards so that offsets of the
    # spans still to be replaced are not shifted by earlier substitutions.
    found.sort(key=lambda item: item["position"][0], reverse=True)

    redacted = text
    for item in found:
        lo, hi = item["position"]
        redacted = f"{redacted[:lo]}[{item['classification']}]{redacted[hi:]}"

    return redacted, found
59
def demask_pii(masked_text: str, entities: List[Dict]) -> str:
    """Undo masking by putting each original substring back in place.

    ``entities`` entries carry ``position`` (``[start, end]`` offsets in the
    original, unmasked text), ``classification`` and ``entity``. Entries are
    processed left to right: once everything before a placeholder has been
    restored, that placeholder begins at its recorded original ``start``.
    The input list is not modified.
    """
    def start_offset(item: Dict) -> int:
        return item["position"][0]

    restored = masked_text
    for item in sorted(entities, key=start_offset):
        begin, _ = item["position"]
        marker_len = len(f"[{item['classification']}]")
        head = restored[:begin]
        tail = restored[begin + marker_len:]
        # Swap the placeholder that starts at `begin` for the original text.
        restored = head + item["entity"] + tail
    return restored