nandini2455508's picture
Upload utils.py
9ad7d4c verified
# -*- coding: utf-8 -*-
"""utils.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1A-dtpeFsj10i7nsKsMjRA1sb8O-2Cccd
"""
import re
import spacy
# Load the spaCy "en_core_web_sm" English pipeline once, at import time.
# detect_full_name() calls this shared instance to run named-entity recognition.
nlp = spacy.load("en_core_web_sm")
# Regex patterns for PII detection, keyed by the classification label that
# mask_pii() reports and uses as the "[label]" placeholder.
# Insertion order is significant to callers that iterate this dict.
PII_PATTERNS = {
    # local-part@domain.tld
    "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
    # 10-digit number starting with 6-9, optionally prefixed with "+91".
    # A leading \b cannot match between whitespace and "+", which made the
    # optional prefix unreachable and rejected "+91XXXXXXXXXX" outright;
    # (?<!\w) expresses "not glued to a preceding word character" instead.
    "phone_number": r"(?<!\w)(?:\+91[-\s]?)?[6-9]\d{9}\b",
    # DD-MM-YYYY / DD/MM/YYYY or YYYY-MM-DD / YYYY/MM/DD (digits not range-checked)
    "dob": r"\b(?:\d{2}[-/]\d{2}[-/]\d{4}|\d{4}[-/]\d{2}[-/]\d{2})\b",
    # 12 digits in groups of four, optionally separated by "-" or whitespace
    "aadhar_num": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    # 16 digits in groups of four, optionally separated by "-" or whitespace
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    # NOTE: matches ANY standalone 3-digit number, not only CVVs.
    "cvv_no": r"\b\d{3}\b",
    # MM/YY or MMYY with month 01-12
    "expiry_no": r"\b(0[1-9]|1[0-2])\/?([0-9]{2})\b",
}
def detect_full_name(text):
    """Find person names in ``text`` via spaCy named-entity recognition.

    Returns a list of ``(start_char, end_char, entity_text)`` tuples, one per
    entity whose label is ``"PERSON"``, in the order spaCy reports them.
    """
    return [
        (entity.start_char, entity.end_char, entity.text)
        for entity in nlp(text).ents
        if entity.label_ == "PERSON"
    ]
def mask_pii(text):
    """Mask PII in ``text`` and report what was masked.

    Person names (via ``detect_full_name``) and every pattern in
    ``PII_PATTERNS`` are detected against the *original* text, so all
    reported positions share one coordinate system. When candidate spans
    overlap, the leftmost one is kept; if several start at the same offset
    the longest is kept (e.g. a 16-digit card number wins over the 12-digit
    Aadhaar prefix it contains). Each kept span is replaced by
    ``[<classification>]``.

    Args:
        text: The input string to scan.

    Returns:
        A ``(masked_text, entity_list)`` tuple. ``entity_list`` is ordered by
        position and contains one dict per masked span with keys:
          - ``"position"``: ``[start, end]`` character offsets into ``text``
            (end exclusive),
          - ``"classification"``: ``"full_name"`` or a ``PII_PATTERNS`` key,
          - ``"entity"``: the original matched substring.
    """
    # Gather every candidate as (start, end, classification, value).
    candidates = [
        (start, end, "full_name", value)
        for start, end, value in detect_full_name(text)
    ]
    for ent_type, pattern in PII_PATTERNS.items():
        candidates.extend(
            (match.start(), match.end(), ent_type, match.group())
            for match in re.finditer(pattern, text)
        )

    # Leftmost first, then longest first. The sort is stable, so exact ties
    # keep detection order (names, then PII_PATTERNS order).
    candidates.sort(key=lambda cand: (cand[0], cand[0] - cand[1]))

    entity_list = []
    pieces = []
    cursor = 0  # end of the last accepted span in the original text
    for start, end, classification, value in candidates:
        if start < cursor or start == end:
            # Overlaps an already-accepted span (or is empty): replacing it
            # too would corrupt the output, so drop it.
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{classification}]")
        entity_list.append({
            "position": [start, end],
            "classification": classification,
            "entity": value,
        })
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), entity_list