| | |
| | """utils.ipynb |
| | |
| | Automatically generated by Colab. |
| | |
| | Original file is located at |
| | https://colab.research.google.com/drive/1A-dtpeFsj10i7nsKsMjRA1sb8O-2Cccd |
| | """ |
| |
|
| |
|
| |
|
| | import re |
| | import spacy |
| |
|
| | |
# Load the "en_core_web_sm" spaCy pipeline once at import time so that
# detect_full_name() can reuse the same object on every call.
nlp = spacy.load("en_core_web_sm")
| |
|
| | |
# Regular expressions for structured PII, keyed by the classification label
# that mask_pii() reports and uses as the "[label]" placeholder.
# Insertion order matters: mask_pii() iterates this dict in order.
PII_PATTERNS = {
    # local-part @ domain . suffix
    "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
    # Ten digits starting with 6-9, optionally preceded by "+91" and one
    # dash/space. The word boundary is only required when the prefix is
    # absent: "\b" can never match between whitespace (or start of string)
    # and "+", so a leading "\b\+91" would silently drop the prefix.
    "phone_number": r"(?:\+91[-\s]?|\b)[6-9]\d{9}\b",
    # 2-2-4 or 4-2-2 digit groups separated by "-" or "/".
    "dob": r"\b(?:\d{2}[-/]\d{2}[-/]\d{4}|\d{4}[-/]\d{2}[-/]\d{2})\b",
    # Twelve digits in three groups of four, optional dash/space separators.
    "aadhar_num": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    # Sixteen digits in four groups of four, optional dash/space separators.
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    # Any standalone three-digit number.
    "cvv_no": r"\b\d{3}\b",
    # Month 01-12, optional "/", then a two-digit year.
    "expiry_no": r"\b(0[1-9]|1[0-2])\/?([0-9]{2})\b",
}
| |
|
| |
|
def detect_full_name(text):
    """Return the spans that spaCy's NER labels as PERSON in ``text``.

    Each item is a ``(start_char, end_char, entity_text)`` tuple, listed in
    the order the entities appear in ``doc.ents``.
    """
    return [
        (span.start_char, span.end_char, span.text)
        for span in nlp(text).ents
        if span.label_ == "PERSON"
    ]
| |
|
| |
|
def mask_pii(text):
    """Mask PII in ``text`` and report what was found.

    Person names come from ``detect_full_name`` and structured identifiers
    from the regexes in ``PII_PATTERNS``. Every detector runs against the
    original ``text``, so all reported offsets share one coordinate system.

    When two detections overlap (for example a 16-digit card number also
    contains a 12-digit Aadhaar-shaped prefix), the longest span wins; ties
    are broken by detection order (names first, then ``PII_PATTERNS`` order).

    Returns:
        A ``(masked_text, entity_list)`` tuple. ``masked_text`` is ``text``
        with each kept span replaced by ``"[<classification>]"``.
        ``entity_list`` holds one dict per kept span, sorted by start offset,
        with keys ``"position"`` (``[start, end]`` offsets into the original
        ``text``), ``"classification"`` and ``"entity"`` (the matched text).
    """
    # 1. Collect every candidate on the untouched input. Detecting on a
    #    partially masked string would shift offsets and make the later
    #    replacement slice the wrong characters.
    candidates = [
        {"position": [start, end], "classification": "full_name", "entity": value}
        for start, end, value in detect_full_name(text)
    ]
    for ent_type, pattern in PII_PATTERNS.items():
        candidates.extend(
            {
                "position": [match.start(), match.end()],
                "classification": ent_type,
                "entity": match.group(),
            }
            for match in re.finditer(pattern, text)
            if match.end() > match.start()  # ignore empty matches
        )

    # 2. Resolve overlaps: longest span first. sorted() is stable, so equal
    #    lengths keep their detection order.
    entity_list = []
    for cand in sorted(candidates, key=lambda c: c["position"][0] - c["position"][1]):
        start, end = cand["position"]
        overlaps = any(
            start < kept["position"][1] and kept["position"][0] < end
            for kept in entity_list
        )
        if not overlaps:
            entity_list.append(cand)
    entity_list.sort(key=lambda ent: ent["position"][0])

    # 3. Rebuild the string in one left-to-right pass; the kept spans are
    #    disjoint and ordered, so no offset bookkeeping is needed.
    pieces = []
    cursor = 0
    for ent in entity_list:
        start, end = ent["position"]
        pieces.append(text[cursor:start])
        pieces.append(f"[{ent['classification']}]")
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces), entity_list
| |
|
| |
|