Spaces:
Sleeping
Sleeping
File size: 2,098 Bytes
16f0e1e efb707f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import re
import spacy
from typing import Tuple, List, Dict
nlp = spacy.load("en_core_web_sm")
PATTERN_ORDER = [
("credit_debit_no", r"\b(?:\d[ -]*?){13,19}\b"),
("aadhar_num", r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"),
("phone_number", r"(?:(?:\+91|0)[-\s]?)?[6-9]\d{4}[-\s]?\d{5}"),
("email", r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
("dob", r"\b(?:0?[1-9]|1[0-2])[\/-](?:0?[1-9]|[12][0-9]|3[01])[\/-](?:\d{4}|\d{2})\b"),
("expiry_no", r"\b(?:0[1-9]|1[0-2])[\/-]?(?:\d{2}|\d{4})\b"),
("cvv_no", r"\b\d{3,4}\b"),
]
def mask_pii(text: str) -> Tuple[str, List[Dict]]:
entities: List[Dict] = []
occupied_spans: List[Tuple[int, int]] = []
masked_text = text
def overlaps_existing(start: int, end: int) -> bool:
for os_, oe_ in occupied_spans:
if not (end <= os_ or start >= oe_):
return True
return False
for pii_type, pattern in PATTERN_ORDER:
for match in re.finditer(pattern, text):
start, end = match.span()
if not overlaps_existing(start, end):
entities.append({
"position": [start, end],
"classification": pii_type,
"entity": text[start:end]
})
occupied_spans.append((start, end))
doc = nlp(text)
for ent in doc.ents:
if ent.label_ == "PERSON":
start, end = ent.start_char, ent.end_char
if not overlaps_existing(start, end):
entities.append({
"position": [start, end],
"classification": "full_name",
"entity": ent.text
})
occupied_spans.append((start, end))
entities.sort(key=lambda x: x["position"][0], reverse=True)
for entity in entities:
start, end = entity["position"]
placeholder = f"[{entity['classification']}]"
masked_text = masked_text[:start] + placeholder + masked_text[end:]
return masked_text, entities |