# -*- coding: utf-8 -*-
"""utils.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1A-dtpeFsj10i7nsKsMjRA1sb8O-2Cccd
"""

import re

import spacy

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# PII patterns using Regex.
# Several of these can match inside one another (e.g. "aadhar_num" matches the
# first 12 digits of a separated 16-digit card number, "expiry_no" matches the
# leading "MM/DD" of a date, and "cvv_no" matches any standalone 3-digit
# number), so mask_pii() resolves overlapping matches before masking.
PII_PATTERNS = {
    "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
    "phone_number": r"\b(?:\+91[-\s]?)?[6-9]\d{9}\b",
    "dob": r"\b(?:\d{2}[-/]\d{2}[-/]\d{4}|\d{4}[-/]\d{2}[-/]\d{2})\b",
    "aadhar_num": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    "cvv_no": r"\b\d{3}\b",
    "expiry_no": r"\b(0[1-9]|1[0-2])\/?([0-9]{2})\b",
}


def detect_full_name(text):
    """Detect full names using spaCy's named entity recognition.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(start_char, end_char, name)`` tuples, one for every
        entity that the loaded spaCy pipeline labels ``PERSON``. Offsets are
        character indices into ``text``.
    """
    doc = nlp(text)
    return [
        (ent.start_char, ent.end_char, ent.text)
        for ent in doc.ents
        if ent.label_ == "PERSON"
    ]


def _select_non_overlapping(candidates):
    """Drop candidates whose span overlaps a higher-priority candidate.

    Longer spans win; for equal lengths the candidate detected earlier wins.
    The surviving candidates are returned in their original detection order.
    """

    def priority(index):
        start, end = candidates[index]["position"]
        return (start - end, index)  # longest span first, then detection order

    accepted_spans = []
    accepted_indices = []
    for index in sorted(range(len(candidates)), key=priority):
        start, end = candidates[index]["position"]
        # Half-open spans [start, end) are disjoint when one ends before the
        # other begins.
        if all(end <= s or start >= e for s, e in accepted_spans):
            accepted_spans.append((start, end))
            accepted_indices.append(index)
    return [candidates[index] for index in sorted(accepted_indices)]


def _apply_masks(text, entities):
    """Return ``text`` with every entity span replaced by ``[classification]``.

    ``entities`` must be non-overlapping and positioned relative to ``text``.
    """
    pieces = []
    cursor = 0
    for ent in sorted(entities, key=lambda e: e["position"][0]):
        start, end = ent["position"]
        pieces.append(text[cursor:start])
        pieces.append(f"[{ent['classification']}]")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def mask_pii(text):
    """Mask PII in the input text.

    Person names are detected with spaCy (see ``detect_full_name``) and the
    remaining entity types with the regular expressions in ``PII_PATTERNS``.
    All detection runs on the original ``text`` so that every reported
    position refers to the same string, and each character of the input is
    masked at most once: when candidate spans overlap, the longest one is
    kept (ties go to the candidate detected first, names before regex
    patterns, patterns in ``PII_PATTERNS`` order).

    Args:
        text: The string to mask.

    Returns:
        A ``(masked_text, entity_list)`` tuple where

        - ``masked_text`` is ``text`` with each kept entity replaced by
          ``"[<classification>]"``;
        - ``entity_list`` is a list of dicts with keys ``"position"``
          (``[start, end]`` character offsets into the original ``text``),
          ``"classification"`` (``"full_name"`` or a ``PII_PATTERNS`` key) and
          ``"entity"`` (the matched substring), in detection order.
    """
    # First: detect names using SpaCy
    candidates = [
        {
            "position": [start, end],
            "classification": "full_name",
            "entity": value,
        }
        for start, end, value in detect_full_name(text)
    ]

    # Second: regex-based detection, also on the original text so offsets
    # share one coordinate system with the name offsets.
    for ent_type, pattern in PII_PATTERNS.items():
        for match in re.finditer(pattern, text):
            candidates.append({
                "position": [match.start(), match.end()],
                "classification": ent_type,
                "entity": match.group(),
            })

    # Resolve overlaps, then substitute every placeholder in a single pass so
    # no replacement can shift the offsets of another.
    entity_list = _select_non_overlapping(candidates)
    masked_text = _apply_masks(text, entity_list)
    return masked_text, entity_list