# rocketship_data / all_combine_code /tools_ /fake_data_generator.py
# vicky4s4s's picture
# Upload 76 files
# 01e9350 verified
from datetime import datetime
from bson import ObjectId
import re
import copy
import random
import string
# Matches an e-mail address. Group 1 captures the domain (everything after
# the '@'), which fake_email() reuses so only the local part is replaced.
EMAIL_REGEX = re.compile(r'[A-Za-z0-9._%+-]+@([A-Za-z0-9.-]+\.[A-Za-z]{2,})')
# Matches, case-insensitively, a URL that starts with http://, https://,
# ftp:// or "www.", followed by a dotted host name and any trailing
# non-whitespace characters (path, query string, ...).
URL_REGEX = re.compile(r'(?i)\b((?:https?://|ftp://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+[^\s]*)')
# Splits text into alternating non-digit / digit groups: odd-numbered groups
# are separators (possibly empty), even-numbered groups are digit runs.
# Three digit runs are required; the fourth is optional.
# NOTE(review): \D* accepts any non-digit text, so this matches any stretch of
# a string containing at least three digit runs, not only phone numbers.
PHONE_PARTS_REGEX = re.compile(r'(\D*)(\d+)(\D*)(\d+)(\D*)(\d+)(\D*)(\d*)(\D*)')
def rand_name():
    """Return a random lowercase first name followed by three random digits.

    The result (for example ``'emma042'``) is used as the local part of
    generated e-mail addresses.
    """
    first_names = ['john', 'jane', 'alex', 'mike', 'sara', 'chris', 'emma', 'liam']
    picked = random.choice(first_names)
    suffix_digits = random.choices(string.digits, k=3)
    return picked + ''.join(suffix_digits)
def fake_email(match):
    """Build a replacement e-mail address that keeps the original domain.

    Intended as an ``re.sub`` callback: group 1 of ``match`` is taken as the
    domain and a random local part from ``rand_name()`` is put in front of it.
    """
    kept_domain = match.group(1)
    return '@'.join((rand_name(), kept_domain))
def fake_url(match):
    """Return the fixed placeholder URL substituted for every matched URL.

    ``match`` is accepted only so the function can serve as an ``re.sub``
    callback; its contents are ignored.
    """
    placeholder = 'https://example.com'
    return placeholder
def fake_phone(match):
    """Replace each digit run of a match with random digits of equal length.

    Intended as an ``re.sub`` callback for a pattern whose groups alternate
    between separator text and digit runs. Separator groups are copied
    through unchanged, so the punctuation and overall length are preserved.

    Example (the replacement digits are random, shown for illustration):
    '+1 555-123-4567' → '+1 987-654-3210'

    Args:
        match: Match object whose groups are either digit runs or
            separator text (groups that did not participate are treated
            as empty).

    Returns:
        The groups joined back together with every digit run replaced by
        random ASCII digits.
    """
    rebuilt = []
    for part in match.groups():
        # str.isdecimal() accepts exactly the characters the regex class \d
        # matches. str.isdigit() is broader (e.g. '²', '①'), so a separator
        # made only of such characters would wrongly be overwritten.
        if part and part.isdecimal():
            rebuilt.append(''.join(random.choices(string.digits, k=len(part))))
        else:
            rebuilt.append(part or '')
    return ''.join(rebuilt)
def anonymize_value(val):
    """Anonymize a single scalar value.

    Strings and integers are scanned, in this order, for URLs, e-mail
    addresses and phone-like digit sequences, and every hit is replaced via
    ``fake_url``, ``fake_email`` and ``fake_phone`` respectively. Any other
    type (including ``bool`` and ``None``) is returned unchanged.

    Args:
        val: The value to anonymize.

    Returns:
        The anonymized value. Integer input is returned as ``int`` whenever
        the processed text still parses as one; string input is returned as
        ``str``; everything else is returned as-is.
    """
    # bool is a subclass of int; without this guard True/False would be
    # stringified and handed back as 'True'/'False'.
    if isinstance(val, bool) or not isinstance(val, (str, int)):
        return val

    text = str(val)
    # re.sub returns the input unchanged when nothing matches, so no
    # separate search() is needed before each substitution.
    text = URL_REGEX.sub(fake_url, text)
    text = EMAIL_REGEX.sub(fake_email, text)
    # NOTE(review): the phone pattern runs last and matches any three digit
    # runs, so it also rewrites digits in the text produced by the steps above.
    text = PHONE_PARTS_REGEX.sub(fake_phone, text)

    if isinstance(val, int):
        # int() also handles a leading '-', so negative integers keep their
        # numeric type instead of coming back as strings.
        try:
            return int(text)
        except ValueError:
            return text
    return text
def anonymize_deep(obj):
    """Recursively anonymize every value inside nested dicts and lists.

    Dicts and lists are rebuilt as new containers; dict keys are kept as
    they are. Anything that is neither a dict nor a list is passed to
    ``anonymize_value``.
    """
    if isinstance(obj, list):
        return list(map(anonymize_deep, obj))
    if not isinstance(obj, dict):
        return anonymize_value(obj)
    cleaned = {}
    for key, child in obj.items():
        cleaned[key] = anonymize_deep(child)
    return cleaned
# Example usage: fake_data = anonymize_deep(copy.deepcopy(input_data))