"""Replace e-mail addresses, URLs and phone-like digit sequences with fake data.

The entry point is :func:`anonymize_deep`, which walks nested dicts/lists and
rewrites every ``str``/``int`` leaf with :func:`anonymize_value`.
"""
from datetime import datetime
import copy
import random
import re
import string

try:
    from bson import ObjectId
except ImportError:
    # Nothing in this module references ObjectId, so a missing bson/pymongo
    # install should not make the anonymizer itself unimportable.
    ObjectId = None

# Group 1 captures the domain so the fake address keeps the original domain.
EMAIL_REGEX = re.compile(r'[A-Za-z0-9._%+-]+@([A-Za-z0-9.-]+\.[A-Za-z]{2,})')
URL_REGEX = re.compile(r'(?i)\b((?:https?://|ftp://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+[^\s]*)')
# Kept unchanged for backward compatibility; fake_phone() still accepts its
# matches. anonymize_value() no longer uses it directly because it captures at
# most four digit runs per match (later digits could survive) and its leading
# ``\D*`` makes searching long digit-free text quadratic.
PHONE_PARTS_REGEX = re.compile(r'(\D*)(\d+)(\D*)(\d+)(\D*)(\d+)(\D*)(\d*)(\D*)')

# A maximal run of digits; each run is replaced by random digits of equal length.
_DIGIT_RUN_REGEX = re.compile(r'\d+')
# True for exactly the strings PHONE_PARTS_REGEX.search() accepts (three or
# more digits anywhere in the text), but found in linear time.
_THREE_DIGITS_REGEX = re.compile(r'\d\D*\d\D*\d')


def rand_name():
    """Return a random first name followed by three random digits, e.g. 'jane042'."""
    name = random.choice(['john', 'jane', 'alex', 'mike', 'sara', 'chris', 'emma', 'liam'])
    num = ''.join(random.choices(string.digits, k=3))
    return f'{name}{num}'


def fake_email(match):
    """Build a fake address for an EMAIL_REGEX match, keeping the original domain."""
    domain = match.group(1)
    return f'{rand_name()}@{domain}'


def fake_url(match):
    """Return the fixed placeholder URL for any URL_REGEX match."""
    return 'https://example.com'


def _fake_digit_run(match):
    """Return random digits with the same length as the matched digit run."""
    return ''.join(random.choices(string.digits, k=len(match.group(0))))


def _scramble_digits(text):
    """Replace every digit in *text* with a random digit, leaving all else intact."""
    return _DIGIT_RUN_REGEX.sub(_fake_digit_run, text)


def fake_phone(match):
    """
    Preserves exact punctuation, only replaces digits.
    Example: '+1 555-123-4567' → '+1 987-654-3210'

    Works on the whole matched text rather than on individual capture groups,
    so it can be used as a ``re.sub`` callback with any pattern.
    """
    return _scramble_digits(match.group(0))


def anonymize_value(val):
    """Anonymize a single leaf value.

    * Anything that is not a ``str`` or ``int`` is returned untouched. That
      includes ``bool``: it is a subclass of ``int`` and would otherwise come
      back as the string ``'True'``/``'False'``.
    * ``str``: URLs become ``https://example.com``, e-mail local parts are
      replaced (domain kept), then, if the text holds three or more digits,
      every digit is replaced by a random one.
    * ``int``: if it has three or more digits, its digits are randomized. The
      result is always an ``int`` and the sign is preserved. Leading zeros in
      the random digits may shorten the number.

    NOTE(review): the digit rule fires on any value with three or more digits,
    not only on phone numbers, so dates, versions and large counters are
    scrambled as well. That matches the original trigger; confirm it is the
    intended scope.
    """
    if isinstance(val, bool) or not isinstance(val, (str, int)):
        return val

    is_int = isinstance(val, int)
    if is_int:
        # format(..., 'd') always yields plain decimal digits, even for int
        # subclasses whose str() is something else.
        s = format(val, 'd')
    else:
        s = str(val)
        # 1. URLs
        s = URL_REGEX.sub(fake_url, s)
        # 2. Emails
        s = EMAIL_REGEX.sub(fake_email, s)

    # 3. Phones – preserve format, replace *all* digits so none leak through.
    if _THREE_DIGITS_REGEX.search(s):
        s = _scramble_digits(s)

    # An int stays an int; s is still "-?digits" here, so int() cannot fail.
    return int(s) if is_int else s


def anonymize_deep(obj):
    """Return a copy of *obj* with every leaf passed through anonymize_value.

    Dicts and lists are rebuilt recursively (as plain ``dict``/``list``); dict
    keys are kept as they are. Any other value, including tuples, sets and
    other containers, is handed to anonymize_value as a leaf and is therefore
    not descended into.
    """
    if isinstance(obj, dict):
        return {k: anonymize_deep(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [anonymize_deep(v) for v in obj]
    return anonymize_value(obj)


# Example usage:
# fake_data = anonymize_deep(copy.deepcopy(input_data))