File size: 2,495 Bytes
9ad7d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""utils.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1A-dtpeFsj10i7nsKsMjRA1sb8O-2Cccd
"""



import re
import spacy

# Load the spaCy "en_core_web_sm" pipeline once, at import time, so every call
# to detect_full_name() reuses the same object for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

# Regex patterns for structured PII, keyed by the classification label that
# mask_pii() reports for each match and uses to build its "[label]" placeholder.
# Values are kept as plain pattern strings so they can be passed straight to
# the `re` functions.
PII_PATTERNS = {
    "email": r"\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b",
    # Ten digits starting with 6-9, optionally prefixed by "+91". The prefix
    # is its own alternative instead of sitting behind a leading \b: "+" is
    # not a word character, so \b before it only matched when a letter or
    # digit directly preceded the "+", which left "+91" unmasked (or the whole
    # number undetected when written as "+919876543210").
    "phone_number": r"(?:\+91[-\s]?|\b)[6-9]\d{9}\b",
    "dob": r"\b(?:\d{2}[-/]\d{2}[-/]\d{4}|\d{4}[-/]\d{2}[-/]\d{2})\b",
    # Three groups of four digits. The lookbehind/lookahead reject a match
    # that is merely the leading or trailing 12 digits of a longer run of
    # 4-digit groups, so a 16-digit card number is not also reported as an
    # Aadhaar number.
    "aadhar_num": r"(?<!\b\d{4}[-\s])\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b(?![-\s]?\d{4}\b)",
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    # NOTE(review): matches any standalone 3-digit number, not only CVVs.
    "cvv_no": r"\b\d{3}\b",
    # MM/YY or MMYY with a month of 01-12.
    "expiry_no": r"\b(0[1-9]|1[0-2])\/?([0-9]{2})\b"
}


def detect_full_name(text):
    """Return a ``(start_char, end_char, text)`` tuple for every entity that
    the spaCy pipeline labels ``PERSON`` in *text*, in the order spaCy
    reports them."""
    return [
        (span.start_char, span.end_char, span.text)
        for span in nlp(text).ents
        if span.label_ == "PERSON"
    ]


def mask_pii(text):
    """
    Mask PII in the input text.

    Person names are found with ``detect_full_name`` and all other PII types
    with the regexes in ``PII_PATTERNS``. Every detector runs against the
    original ``text``, so all reported positions index into the caller's
    string, and each character is masked at most once.

    When two candidates overlap, the longer span wins; on equal length a name
    beats a regex match, and regex matches keep ``PII_PATTERNS`` order. This
    stops, for example, the leading digits of a card number from also being
    spliced in as a separate entity and garbling the output.

    Args:
        text: The raw input string.

    Returns:
        A ``(masked_text, entity_list)`` tuple:
        - ``masked_text``: ``text`` with each kept entity replaced by
          ``[<classification>]``.
        - ``entity_list``: one dict per masked entity, ordered by start
          offset, with keys ``"position"`` (``[start, end]`` offsets into
          ``text``, end exclusive), ``"classification"`` and ``"entity"``
          (the original substring).
    """
    # Collect every candidate as (start, end, classification, value), names
    # first so they win length ties in the stable sort below.
    candidates = [
        (start, end, "full_name", value)
        for start, end, value in detect_full_name(text)
    ]
    candidates.extend(
        (match.start(), match.end(), ent_type, match.group())
        for ent_type, pattern in PII_PATTERNS.items()
        for match in re.finditer(pattern, text)
    )

    # Longest span first; sorted()/list.sort() stay stable with reverse=True,
    # so equal-length candidates keep the priority order built above.
    candidates.sort(key=lambda cand: cand[1] - cand[0], reverse=True)

    # Greedily keep candidates that do not overlap anything already kept.
    kept = []
    for cand in candidates:
        start, end = cand[0], cand[1]
        if start == end:
            continue  # nothing to mask
        if all(end <= k[0] or start >= k[1] for k in kept):
            kept.append(cand)
    kept.sort(key=lambda cand: cand[0])

    # Build the masked string in a single left-to-right pass; since the kept
    # spans are disjoint and sorted, offsets never need adjusting.
    pieces = []
    entity_list = []
    cursor = 0
    for start, end, classification, value in kept:
        pieces.append(text[cursor:start])
        pieces.append(f"[{classification}]")
        cursor = end
        entity_list.append({
            "position": [start, end],
            "classification": classification,
            "entity": value
        })
    pieces.append(text[cursor:])

    return "".join(pieces), entity_list