File size: 4,741 Bytes
6db4426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
from typing import Dict, List, Tuple


def mask_full_name(text: str, ner_pipeline) -> Tuple[str, List[Dict]]:
    """
    Replace person names found by an NER pipeline with ``[full_name]``.

    The pipeline is called once with ``text`` and must return an iterable of
    dicts carrying ``'start'``, ``'end'`` and ``'entity_group'`` keys. Only
    entities whose group is ``'PER'``, ``'Person'`` or ``'full_name'`` are
    masked; everything else is left in place.

    Args:
        text (str): Input text
        ner_pipeline: Callable returning the detected entities for ``text``

    Returns:
        Tuple[str, List[Dict]]: Masked text and the masked entities, listed
        from the right-most match to the left-most one. Each entity holds
        ``position`` (``[start, end]`` as reported by the pipeline),
        ``classification`` and the original ``entity`` substring.
    """
    person_labels = ('PER', 'Person', 'full_name')
    detections = ner_pipeline(text)

    # Walk from the end of the text backwards so that a replacement never
    # shifts the offsets of detections that are still waiting to be handled
    # (assumes the pipeline does not return overlapping spans).
    right_to_left = sorted(detections, key=lambda d: d['start'], reverse=True)

    masked_entities = []
    for detection in right_to_left:
        if detection['entity_group'] not in person_labels:
            continue
        begin = detection['start']
        finish = detection['end']
        masked_entities.append({
            "position": [begin, finish],
            "classification": "full_name",
            "entity": text[begin:finish]
        })
        text = ''.join((text[:begin], '[full_name]', text[finish:]))
    return text, masked_entities

# Ordered (classification, pattern) pairs. The order is significant: a span
# claimed by an earlier pattern is hidden from every later one, so the more
# specific formats (e-mail, phone, date, card, Aadhaar) must run before the
# very broad 3-digit CVV and MM/YY expiry patterns.
_REGEX_PII_PATTERNS = (
    # Greedy local part (including '+') and one or more dot-separated domain
    # labels, so multi-level and hyphenated domains are masked in full.
    ("email", re.compile(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b')),
    # Optional +91 / 091 prefix followed by exactly ten digits. A plain \b
    # cannot match in front of '+' after whitespace or at the start of the
    # text, hence the additional "not preceded by a word character" branch.
    ("phone_number", re.compile(
        r'(?:\b|(?<!\w))(?:(?:\+|0)91[\s.-]?)?\d{10}(?!\d)\b')),
    # DD-MM-YYYY / DD/MM/YYYY or YYYY-MM-DD / YYYY/MM/DD shaped dates.
    ("dob", re.compile(
        r'\b\d{2}[-/]\d{2}[-/]\d{4}\b|\b\d{4}[-/]\d{2}[-/]\d{2}\b')),
    # 13 to 19 digits, optionally separated by spaces or dashes.
    ("credit_debit_no", re.compile(r'\b(?:\d[ -]*?){13,19}\b')),
    # Twelve digits in three groups of four, optionally separated.
    ("aadhar_num", re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b')),
    # Any stand-alone three-digit number.
    ("cvv_no", re.compile(r'\b\d{3}\b')),
    # MM/YY, MM/YYYY, MMYY or MMYYYY with a month of 01-12.
    ("expiry_no", re.compile(r'\b(0[1-9]|1[0-2])\/?([0-9]{2}|[0-9]{4})\b')),
)

# Stand-in for already-masked characters: not a word character, digit,
# whitespace or punctuation used by any pattern above, so it can neither be
# matched nor bridged by a later pattern.
_MASKED_FILLER = "\x00"


def mask_with_regex(text: str) -> Tuple[str, List[Dict]]:
    """
    Mask PII using regex patterns.

    The patterns are applied one after another (e-mail, phone number, date of
    birth, card number, Aadhaar number, CVV, card expiry). Text claimed by an
    earlier pattern is not visible to later ones. Each match is replaced by a
    ``[classification]`` placeholder in the returned text.

    Args:
        text (str): Input text

    Returns:
        Tuple[str, List[Dict]]: Masked text and list of masked entities.
        Entities are grouped by pattern in the order above and, within one
        pattern, listed from the right-most match to the left-most one. For
        every entity ``position`` is the ``[start, end]`` span in the *input*
        ``text``, so ``text[start:end] == entity`` always holds.
    """
    masked_entities = []
    claimed_spans = []

    # Matching runs on a same-length working copy in which claimed spans are
    # blanked out. Keeping the length constant means every match offset is a
    # valid offset into the caller's text, regardless of how many
    # replacements earlier patterns have produced.
    working = text
    for classification, pattern in _REGEX_PII_PATTERNS:
        for match in reversed(list(pattern.finditer(working))):
            start, end = match.span()
            masked_entities.append({
                "position": [start, end],
                "classification": classification,
                "entity": text[start:end]
            })
            claimed_spans.append((start, end, classification))
            working = (
                working[:start]
                + _MASKED_FILLER * (end - start)
                + working[end:]
            )

    # Claimed spans never overlap, so the masked text can be assembled in a
    # single left-to-right pass over the original text.
    pieces = []
    cursor = 0
    for start, end, classification in sorted(claimed_spans):
        pieces.append(text[cursor:start])
        pieces.append(f'[{classification}]')
        cursor = end
    pieces.append(text[cursor:])

    return ''.join(pieces), masked_entities

def mask_pii(text: str, ner_pipeline) -> Tuple[str, List[Dict]]:
    """
    Run the NER name masking followed by the regex masking on ``text``.

    The regex pass receives the output of the name pass, not the original
    input, so the positions it reports are not offsets into the original
    ``text``.

    Args:
        text (str): Input text
        ner_pipeline: NER pipeline handed through to ``mask_full_name``

    Returns:
        Tuple[str, List[Dict]]: Fully masked text and the combined entity
        list, with the name entities first and the regex entities after them
    """
    name_masked, name_entities = mask_full_name(text, ner_pipeline)
    fully_masked, pattern_entities = mask_with_regex(name_masked)
    combined_entities = name_entities + pattern_entities
    return fully_masked, combined_entities