File size: 2,061 Bytes
01e9350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from datetime import datetime
from bson import ObjectId
import re
import copy
import random
import string

# Matches an e-mail address; group 1 captures the domain part after the '@'
# (fake_email reuses that group when building the replacement).
EMAIL_REGEX = re.compile(r'[A-Za-z0-9._%+-]+@([A-Za-z0-9.-]+\.[A-Za-z]{2,})')
# Case-insensitive. Matches text that starts with http://, https://, ftp://
# or "www.", followed by a dotted host name and any trailing non-whitespace.
URL_REGEX   = re.compile(r'(?i)\b((?:https?://|ftp://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+[^\s]*)')

# Splits phone-like text into nine alternating groups:
# separator, digits, separator, digits, separator, digits, separator,
# optional digits, separator. At least three digit groups must match.
# NOTE(review): a single run of digits can be split across the digit groups
# by backtracking ("123" -> "1", "2", "3"), so any text containing three or
# more digits matches, not only phone numbers - confirm this is intended.
PHONE_PARTS_REGEX = re.compile(r'(\D*)(\d+)(\D*)(\d+)(\D*)(\d+)(\D*)(\d*)(\D*)')


def rand_name():
    """Return a random placeholder user name.

    The result is one first name picked from a small fixed list, followed
    by three random decimal digits (for example ``'emma042'``).
    """
    first_names = ['john', 'jane', 'alex', 'mike', 'sara', 'chris', 'emma', 'liam']
    prefix = random.choice(first_names)
    suffix = ''.join(random.choices(string.digits, k=3))
    return prefix + suffix

def fake_email(match):
    """Build a replacement e-mail address for a regex match.

    Intended as a ``re.sub`` callback: the local part is replaced by a
    random name from ``rand_name()`` while the domain captured in group 1
    of ``match`` is kept as-is.
    """
    kept_domain = match.group(1)
    return '@'.join((rand_name(), kept_domain))

def fake_url(match):
    """Return the fixed placeholder URL for any matched URL.

    Intended as a ``re.sub`` callback; ``match`` is accepted to satisfy
    that calling convention but its content is ignored.
    """
    placeholder = 'https://example.com'
    return placeholder

def fake_phone(match):
    """Replace the digit runs of a phone-like match with random digits.

    Every captured group made up solely of decimal digits is swapped for a
    random digit string of the same length. Every other group (spaces,
    dashes, parentheses, '+', ...) is kept exactly as it was matched, so
    the original formatting survives.

    Example: '+1 555-123-4567' → '+1 987-654-3210'

    Args:
        match: A regex match object whose groups alternate between
            separator text and digit runs.

    Returns:
        The concatenation of all groups with the digit groups randomized.
    """
    pieces = []
    for group in match.groups():
        # str.isdecimal() accepts the same characters as the regex \d class.
        # str.isdigit() would additionally accept characters such as
        # superscript digits, which \D captures as separators; those must
        # be preserved, not replaced by a digit.
        if group and group.isdecimal():
            pieces.append(''.join(random.choices(string.digits, k=len(group))))
        else:
            # Optional groups that did not participate are None.
            pieces.append(group or '')
    return ''.join(pieces)



def anonymize_value(val):
    """Anonymize URLs, e-mail addresses and phone-like digits in one value.

    Only ``str`` and ``int`` values are processed; anything else
    (including ``bool``, ``None``, floats and other objects) is returned
    unchanged.

    The value is converted to text and passed through three substitutions
    in order: URLs are replaced via ``fake_url``, e-mail addresses via
    ``fake_email``, and text matching ``PHONE_PARTS_REGEX`` via
    ``fake_phone``.

    Args:
        val: The scalar value to anonymize.

    Returns:
        The anonymized value. A ``str`` input yields a ``str``. An ``int``
        input yields an ``int`` whenever the processed text still parses
        as an integer, otherwise the processed text itself.
    """
    # bool is a subclass of int; without this guard True/False would be
    # stringified and returned as 'True'/'False'.
    if isinstance(val, bool) or not isinstance(val, (str, int)):
        return val

    s = str(val)

    # re.sub returns its input unchanged when nothing matches, so a
    # separate search() beforehand is unnecessary.
    # 1. URLs
    s = URL_REGEX.sub(fake_url, s)
    # 2. Emails
    s = EMAIL_REGEX.sub(fake_email, s)
    # 3. Phones – preserve format
    s = PHONE_PARTS_REGEX.sub(fake_phone, s)

    if isinstance(val, int):
        # int() instead of str.isdigit() so that negative numbers (leading
        # '-') are converted back to int as well.
        # Randomized leading zeros are dropped by int(), which can shorten
        # the number.
        try:
            return int(s)
        except ValueError:
            return s

    return s


def anonymize_deep(obj):
    """Recursively anonymize the leaf values of nested dicts and lists.

    Dicts and lists are rebuilt as new containers; dict keys are kept
    as-is and only values are processed. Any other object is handed to
    ``anonymize_value``.
    """
    if isinstance(obj, list):
        return list(map(anonymize_deep, obj))
    if isinstance(obj, dict):
        rebuilt = {}
        for key, value in obj.items():
            rebuilt[key] = anonymize_deep(value)
        return rebuilt
    return anonymize_value(obj)



# Example usage: fake_data = anonymize_deep(copy.deepcopy(input_data))