File size: 1,312 Bytes
9bfe8cb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | # utils.py
import re
def mask_pii(text):
    """Replace PII found in *text* with ``[classification]`` placeholders.

    Every pattern is searched against the original text. When candidate
    matches overlap, the one that starts first is kept; ties go to the
    longer match and then to the pattern listed first. Each kept match is
    masked at its exact span, so an identical substring elsewhere in the
    text is never masked by accident.

    Args:
        text: The string to scan.

    Returns:
        A ``(masked_text, entities)`` tuple. ``entities`` is ordered by
        position in *text*; each item is a dict with the keys
        ``"position"`` (``[start, end]`` offsets into the original text),
        ``"classification"`` (the pattern label) and ``"entity"`` (the
        matched substring).
    """
    # Raw strings: a single backslash is already the regex escape.
    patterns = (
        ("full_name", r"(?:(?:Mr|Ms|Mrs|Dr)\.?\s)?[A-Z][a-z]+(?:\s[A-Z][a-z]+)+"),
        ("email", r"[\w.-]+@[\w.-]+"),
        ("phone_number", r"(\+91[-\s]?)?[6-9]\d{9}"),
        ("dob", r"(\d{2}[/-]\d{2}[/-]\d{4})"),
        ("aadhar_num", r"\d{4}\s\d{4}\s\d{4}"),
        ("credit_debit_no", r"\d{4}[-\s]\d{4}[-\s]\d{4}[-\s]\d{4}"),
        ("cvv_no", r"\b\d{3}\b"),
        ("expiry_no", r"(0[1-9]|1[0-2])/\d{2}"),
    )

    # Gather every candidate first so overlaps can be resolved globally.
    candidates = []
    for priority, (label, pattern) in enumerate(patterns):
        for match in re.finditer(pattern, text):
            start, end = match.span()
            # Sort key: earliest start, then longest, then pattern order.
            candidates.append((start, start - end, priority, end, label))
    candidates.sort()

    entities = []
    pieces = []
    cursor = 0
    for start, _neg_length, _priority, end, label in candidates:
        if start < cursor:
            # Overlaps a span that has already been masked.
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        entities.append({
            "position": [start, end],
            "classification": label,
            "entity": text[start:end],
        })
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), entities
def unmask_pii(masked_text, entities):
    """Put the original values back in place of ``[classification]`` placeholders.

    Placeholders are filled in a single left-to-right pass: the n-th
    placeholder of a given classification receives the n-th entity of that
    classification in *entities*. A placeholder with no remaining entity is
    left unchanged, and surplus entities are ignored. Restored text is not
    scanned again, so an entity value that happens to look like a
    placeholder is kept as-is.

    Args:
        masked_text: Text containing ``[classification]`` placeholders.
        entities: Iterable of dicts with ``"classification"`` and
            ``"entity"`` keys.

    Returns:
        The text with placeholders replaced by their entity values.
    """
    # Queue the values per placeholder, preserving the order given.
    queued = {}
    for ent in entities:
        queued.setdefault(f"[{ent['classification']}]", []).append(ent['entity'])
    if not queued:
        return masked_text

    remaining = {token: iter(values) for token, values in queued.items()}
    # Longest tokens first so no placeholder can shadow a longer one.
    tokens = sorted(queued, key=len, reverse=True)
    placeholder = re.compile("|".join(re.escape(token) for token in tokens))

    def restore(match):
        token = match.group(0)
        # Fall back to the placeholder itself once its queue is exhausted.
        return next(remaining[token], token)

    return placeholder.sub(restore, masked_text)
|