File size: 2,061 Bytes
01e9350 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | from datetime import datetime
from bson import ObjectId
import re
import copy
import random
import string
# Matches an e-mail address. Group 1 captures the domain (everything after
# the '@'), which must end in a dot followed by at least two ASCII letters.
EMAIL_REGEX = re.compile(r'[A-Za-z0-9._%+-]+@([A-Za-z0-9.-]+\.[A-Za-z]{2,})')
# Case-insensitive match for a URL that starts with http://, https://, ftp://
# or a bare "www." prefix, followed by a dotted host name and then any run of
# non-whitespace characters.
URL_REGEX = re.compile(r'(?i)\b((?:https?://|ftp://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+[^\s]*)')
# Splits text into alternating non-digit / digit runs (nine groups, starting
# and ending with a non-digit run). Three digit runs are mandatory, the fourth
# is optional.
# NOTE(review): because a digit run may be a single digit, this matches ANY
# text containing three or more digits (dates, ids, ...), not only phone
# numbers - confirm that this is intended.
PHONE_PARTS_REGEX = re.compile(r'(\D*)(\d+)(\D*)(\d+)(\D*)(\d+)(\D*)(\d*)(\D*)')
def rand_name():
    """Return a random placeholder user name.

    The result is one of eight fixed lowercase first names followed by
    three random decimal digits, e.g. ``'emma042'``.
    """
    first_names = ('john', 'jane', 'alex', 'mike', 'sara', 'chris', 'emma', 'liam')
    base = random.choice(first_names)
    suffix = ''.join(random.choices(string.digits, k=3))
    return base + suffix
def fake_email(match):
    """Build a replacement e-mail address for a regex match.

    The local part is a freshly generated random name; the domain is taken
    unchanged from capture group 1 of ``match``.
    """
    local_part = rand_name()
    return '{}@{}'.format(local_part, match.group(1))
def fake_url(match):
    """Return the fixed placeholder URL used for every matched URL.

    ``match`` is accepted so the function has the signature of a
    substitution callback; its content is ignored.
    """
    placeholder = 'https://example.com'
    return placeholder
def fake_phone(match):
    """Rebuild the matched text with every digit run randomized.

    Each capture group of ``match`` is emitted in order. Groups made up
    solely of digits are replaced by the same number of random digits; all
    other groups (spaces, dashes, '+', ...) are copied through untouched,
    so the punctuation and overall length are preserved.

    Example: '+1 555-123-4567' -> '+1 987-654-3210'
    """
    def _scramble(part):
        # Empty or missing groups contribute nothing to the output.
        if part and part.isdigit():
            return ''.join(random.choices(string.digits, k=len(part)))
        return part or ''

    return ''.join(_scramble(part) for part in match.groups())
def anonymize_value(val):
    """Anonymize a single scalar value.

    Strings and integers are scanned, in this order, for URLs, e-mail
    addresses and digit runs matched by ``PHONE_PARTS_REGEX``; every match
    is replaced with fake data. Any other type is returned unchanged.

    Args:
        val: The value to anonymize.

    Returns:
        ``val`` itself when it is neither ``str`` nor ``int`` (booleans are
        passed through as well), an ``int`` when ``val`` was an ``int``,
        otherwise the anonymized ``str``. Randomized integers may have fewer
        digits than the input because leading zeros vanish in ``int()``.
    """
    # bool is a subclass of int; without this guard True/False would be
    # stringified and returned as the text 'True'/'False'.
    if isinstance(val, bool) or not isinstance(val, (str, int)):
        return val

    text = str(val)
    # sub() returns the input unchanged when nothing matches, so no
    # preliminary search() is needed.
    text = URL_REGEX.sub(fake_url, text)
    text = EMAIL_REGEX.sub(fake_email, text)
    text = PHONE_PARTS_REGEX.sub(fake_phone, text)

    if isinstance(val, int):
        # int() also accepts a leading '-', so negative integers keep their
        # type instead of coming back as strings.
        try:
            return int(text)
        except ValueError:
            return text
    return text
def anonymize_deep(obj):
    """Recursively anonymize a nested structure of dicts and lists.

    Lists and dicts are rebuilt as new containers with every element or
    value processed recursively; dict keys are kept as they are. Anything
    else is handed to ``anonymize_value``.
    """
    if isinstance(obj, list):
        return list(map(anonymize_deep, obj))
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = anonymize_deep(value)
        return cleaned
    return anonymize_value(obj)
# Example usage:
# fake_data = anonymize_deep(copy.deepcopy(input_data))
|