File size: 3,744 Bytes
507fdc6
 
 
03c6783
 
507fdc6
 
 
 
 
 
 
 
03c6783
 
 
 
 
 
 
 
507fdc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03c6783
0e13165
03c6783
507fdc6
03c6783
 
 
 
 
 
 
 
 
 
507fdc6
03c6783
507fdc6
 
0e13165
 
 
507fdc6
 
 
03c6783
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import re
import spacy
from typing import Dict, Any, List
from langdetect import detect
from deep_translator import GoogleTranslator

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def mask_pii(text: str) -> Dict[str, Any]:
    """
    Enhanced PII masking with JSON output format
    """
    lang = detect(text)
    if lang == 'en':
            #return text
            masked_text = text
    else:
            # Translate to English
            translated = GoogleTranslator(source=lang, target='en').translate(text)
            masked_text = translated
    entities = []
    
    def mask_and_record(pattern, label, group=0):
        nonlocal masked_text, entities
        for match in reversed(list(re.finditer(pattern, masked_text))):
            start, end = match.span(group)
            original = match.group(group)
            
            # Skip if already masked or overlaps
            if any(e['position'][0] <= start < e['position'][1] for e in entities):
                continue
            
            masked_text = masked_text[:start] + f"[{label}]" + masked_text[end:]
            entities.append({
                "position": [start, end],
                "classification": label,
                "entity": original
            })

    # Specific patterns first
    mask_and_record(r'\b(\d{4}[ -]?\d{4}[ -]?\d{4})\b', 'aadhar_num')
    mask_and_record(r'\b((?:\d[ -]*?){15,18}\d)\b', 'credit_debit_no')
    mask_and_record(r'(?:CVV|CVC|Security Code)[: ]*(\d{3,4})\b', 'cvv_no', 1)
    mask_and_record(r'\b((0[1-9]|1[0-2])[/-](\d{2}|\d{4}))\b', 'expiry_no', 1)

    dob_patterns = [
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b',
        r'\b(\d{4}[/-]\d{1,2}[/-]\d{1,2})\b',
        r'\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4})\b'
    ]
    for pattern in dob_patterns:
        mask_and_record(pattern, 'dob', 1)

    mask_and_record(r'(\+?\d{1,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}', 'phone_number')
    mask_and_record(r'(\b[\w.-]+@[\w.-]+\.\w+\b)', 'email')

    # spaCy for full names
    doc = nlp(masked_text)
    for ent in reversed(doc.ents):
        if ent.label_ == "PERSON":
            if any(e['position'][0] <= ent.start_char < e['position'][1] for e in entities):
                continue
            masked_text = masked_text[:ent.start_char] + "[full_name]" + masked_text[ent.end_char:]
            entities.append({
                "position": [ent.start_char, ent.end_char],
                "classification": "full_name",
                "entity": ent.text
            })

    # Optional: Set category based on simple rule or ML model
    category = "Problem"
    eng_mask = masked_text


    # if lang == 'en':
    #      masked_text = masked_text
    # else:
    #      masked_text = GoogleTranslator(source='en', target=lang).translate(masked_text)
    text2 = text
    for ent in entities:
        entity_value = ent['entity']
        classification = ent['classification']
        text = text.replace(entity_value, f"[{classification}]")
    masked_text = text
    return {
        "input_email_body": text2,
        "list_of_masked_entities": sorted(entities, key=lambda x: x["position"][0]),
        "masked_email": masked_text,
        "category_of_the_email": category,
        "English_masked": eng_mask

    }


text = "Subject: Unvorhergesehener Absturz der Datenanalyse-Plattform\n\nDie Datenanalyse-Plattform brach unerwartet ab, da die Speicheroberfläche zu gering war My name is Sophia Rossi.. Ich habe versucht, Laravel 8 und meinen MacBook Pro neu zu starten, aber das Problem behält sich bei. Ich benötige Ihre Unterstützung, um diesen Fehler zu beheben. You can reach me at janesmith@company.com."
print(mask_pii(text))