Spaces:

neuronslabs
/

comfyCausalAI

Sleeping

App Files Files Community

comfyCausalAI / data_generator.py

rknl

updated

2130e8d verified over 1 year ago

raw

history blame contribute delete

10.7 kB

	import pandas as pd
	import numpy as np
	from faker import Faker
	from datetime import datetime, timedelta, date
	import random

	def generate_synthetic_data(num_customers=1000, seed=42):
	    """
	    Generate synthetic customer data for e-commerce analysis.

	    Builds one record per customer with demographics, contact details,
	    purchase history and shopping preferences. Names, cities, dates and
	    phone digits come from Faker's Ukrainian locale; the numeric attributes
	    are drawn from NumPy distributions whose parameters depend on one
	    another (e.g. order counts depend on age, regional tech adoption,
	    gender and preferred language).

	    Faker, NumPy's global generator and a private ``random.Random`` are all
	    seeded with ``seed`` so that results are reproducible. Seeding Faker
	    and NumPy resets their shared random state as a side effect.

	    Args:
	        num_customers (int): The number of customer records to generate
	            (default: 1000).
	        seed (int): Seed applied to every random source (default: 42).

	    Returns:
	        pandas.DataFrame: A DataFrame with one row per customer and the
	        columns customer_id, name, email, age, gender, region, city,
	        registration_date, phone_number, preferred_language,
	        newsletter_subscription, preferred_payment_method, loyalty_level,
	        main_browsing_device, product_categories_of_interest,
	        average_order_value, total_orders and last_order_date
	        (``None`` for customers without orders).
	    """
	    # Set up Faker for Ukrainian locale
	    fake = Faker('uk_UA')
	    Faker.seed(seed)
	    np.random.seed(seed)
	    # Phone prefixes and e-mail domains use the stdlib generator; a private
	    # seeded instance keeps them reproducible without touching the
	    # process-wide ``random`` state.
	    rng = random.Random(seed)

	    # Define constants
	    START_DATE = date(2019, 1, 1)
	    END_DATE = date(2024, 7, 31)
	    EMAIL_DOMAINS = ['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com']
	    PAYMENT_METHODS = ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal']
	    DEVICES = ['Web', 'Mobile', 'App']
	    ALL_CATEGORIES = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']

	    # Helper functions
	    def clamp01(value):
	        """Clamp a value to the closed interval [0, 1]."""
	        return max(0, min(1, value))

	    def normalize(weights):
	        """Scale non-negative weights so that they sum to 1."""
	        total = sum(weights)
	        return [w / total for w in weights]

	    def generate_phone_number():
	        """Generate a realistic Ukrainian phone number."""
	        return f"+380{rng.randint(50, 99)}{fake.msisdn()[6:]}"

	    def generate_email(name):
	        """Generate an email address based on the customer's name."""
	        username = name.lower().replace(' ', '.').replace('\'', '')
	        domain = rng.choice(EMAIL_DOMAINS)
	        return f"{username}@{domain}"

	    # Define regions and their characteristics
	    REGIONS = {
	        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
	        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
	        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
	        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
	        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
	        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
	        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
	        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
	        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
	        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5},
	    }
	    region_names = list(REGIONS)

	    # Generate initial customer data
	    data = []
	    for i in range(num_customers):
	        customer_id = f"C{i + 1:06d}"

	        # Region and City
	        region = np.random.choice(region_names)
	        region_info = REGIONS[region].copy()  # Copy so per-customer noise never leaks into REGIONS
	        is_urban = np.random.random() < region_info['urbanization']
	        city = fake.city()
	        if not is_urban:
	            city = f"смт {city}"

	        # Age (dependent on region), with extra noise, limited to 18..80
	        age = int(np.random.normal(region_info['avg_age'], 10))
	        age_noise = np.random.normal(0, 2)  # Add noise with mean 0 and std dev 2
	        age = max(18, min(80, int(age + age_noise)))
	        age_factor = (age - 40) / 40  # Signed distance from age 40, reused by the formulas below

	        # Add noise to urbanization and tech adoption
	        urbanization_noise = np.random.normal(0, 0.05)
	        tech_adoption_noise = np.random.normal(0, 0.05)
	        region_info['urbanization'] = clamp01(region_info['urbanization'] + urbanization_noise)
	        region_info['tech_adoption'] = clamp01(region_info['tech_adoption'] + tech_adoption_noise)
	        urbanization = region_info['urbanization']
	        tech_adoption = region_info['tech_adoption']

	        # Gender (slight dependency on age and region)
	        gender_prob = 0.49 + 0.02 * age_factor  # Slight increase in male probability with age
	        gender_prob += 0.02 * (urbanization - 0.7) / 0.3  # Slight increase in urban areas
	        gender = np.random.choice(
	            ['Male', 'Female', 'Other'],
	            p=[gender_prob, 1 - gender_prob - 0.01, 0.01]
	        )

	        # Preferred Language (dependent on age and region)
	        ukrainian_prob = 0.8 - 0.2 * age_factor  # Younger people more likely to prefer Ukrainian
	        ukrainian_prob += 0.1 * (1 - urbanization)  # Rural areas more likely to prefer Ukrainian
	        ukrainian_prob = clamp01(ukrainian_prob)
	        preferred_language = np.random.choice(
	            ['Ukrainian', 'Russian'],
	            p=[ukrainian_prob, 1 - ukrainian_prob]
	        )

	        # Registration date
	        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

	        # Determine if the customer is active (has made orders)
	        is_active = np.random.random() < 0.6  # 60% chance of being an active customer

	        if is_active:
	            # Total orders (dependent on various factors). Every factor is
	            # multiplied in; the tech adoption line previously lacked both
	            # the "*=" and the "*" and raised a TypeError at runtime.
	            base_orders = np.random.poisson(5)
	            order_multiplier = 1 + 0.2 * age_factor  # Age factor
	            order_multiplier *= 1 + 0.1 * (tech_adoption - 0.6) / 0.2  # Tech adoption factor
	            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
	            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
	            total_orders = max(1, int(base_orders * order_multiplier))  # At least 1 order for active customers

	            # Add noise to total orders
	            total_orders_noise = np.random.poisson(2)
	            total_orders = max(1, total_orders + total_orders_noise)

	            # Average order value (same multiplicative scheme as above)
	            base_aov = np.random.gamma(shape=5, scale=100)
	            aov_multiplier = 1 + 0.3 * age_factor  # Age factor
	            aov_multiplier *= 1 + 0.2 * (urbanization - 0.7) / 0.3  # Urbanization factor
	            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
	            average_order_value = base_aov * aov_multiplier

	            # Add noise to average order value
	            aov_noise = np.random.normal(0, average_order_value * 0.1)  # 10% noise
	            average_order_value = max(0, average_order_value + aov_noise)

	            # Last order date (never earlier than the registration date)
	            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
	        else:
	            total_orders = 0
	            average_order_value = 0
	            last_order_date = None

	        # Loyalty level based on total orders
	        loyalty_level = min(5, max(1, int(total_orders / 2)))

	        # Add some randomness to loyalty level
	        loyalty_noise = np.random.randint(-1, 2)  # -1, 0, or 1
	        loyalty_level = max(1, min(5, loyalty_level + loyalty_noise))

	        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
	        newsletter_prob = 0.5 + 0.1 * loyalty_level / 5 - 0.2 * age_factor + 0.2 * tech_adoption
	        newsletter_noise = np.random.normal(0, 0.1)
	        newsletter_prob = clamp01(newsletter_prob + newsletter_noise)
	        newsletter_subscription = np.random.random() < newsletter_prob

	        # Preferred payment method (dependent on age and urbanization)
	        payment_probs = [
	            0.5 - 0.2 * age_factor + 0.2 * urbanization,  # Credit Card
	            0.3 + 0.2 * age_factor - 0.2 * urbanization,  # Cash on Delivery
	            0.15,  # Bank Transfer
	            0.05 + 0.1 * tech_adoption,  # PayPal
	        ]
	        payment_probs = normalize([clamp01(p) for p in payment_probs])
	        preferred_payment_method = np.random.choice(PAYMENT_METHODS, p=payment_probs)

	        # Add some inconsistency to preferred payment method
	        if np.random.random() < 0.1:  # 10% chance of inconsistent preference
	            preferred_payment_method = np.random.choice(PAYMENT_METHODS)

	        # Main browsing device (dependent on age and tech adoption)
	        device_probs = [
	            0.4 + 0.3 * age_factor - 0.2 * tech_adoption,  # Web
	            0.4 - 0.2 * age_factor + 0.1 * tech_adoption,  # Mobile
	            0.2 - 0.1 * age_factor + 0.1 * tech_adoption,  # App
	        ]
	        device_probs = normalize([clamp01(p) for p in device_probs])

	        # Add noise to main browsing device probabilities
	        device_noise = np.random.normal(0, 0.05, size=3)
	        device_probs = normalize([clamp01(p + n) for p, n in zip(device_probs, device_noise)])

	        main_browsing_device = np.random.choice(DEVICES, p=device_probs)

	        # Product categories (dependent on age, gender, and browsing device)
	        category_probs = [0.2] * len(ALL_CATEGORIES)
	        if age < 30:
	            category_probs[2] += 0.1  # Increase Computers
	            category_probs[3] += 0.1  # Increase Smartphones
	        elif age > 60:
	            category_probs[1] += 0.1  # Increase Home Appliances
	            category_probs[4] += 0.1  # Increase TV & Audio
	        if gender == 'Male':
	            category_probs[0] += 0.05  # Slight increase in Electronics
	            category_probs[2] += 0.05  # Slight increase in Computers
	        if main_browsing_device == 'Mobile':
	            category_probs[3] += 0.1  # Increase Smartphones
	        category_probs = normalize(category_probs)
	        num_categories = np.random.randint(1, 4)  # 1, 2, or 3 categories
	        product_categories = np.random.choice(
	            ALL_CATEGORIES, size=num_categories, replace=False, p=category_probs
	        )

	        # Draw the name once so that the e-mail address is derived from the
	        # same name that is stored on the record.
	        name = fake.name()

	        data.append({
	            'customer_id': customer_id,
	            'name': name,
	            'email': generate_email(name),
	            'age': age,
	            'gender': gender,
	            'region': region,
	            'city': city,
	            'registration_date': registration_date,
	            'phone_number': generate_phone_number(),
	            'preferred_language': preferred_language,
	            'newsletter_subscription': newsletter_subscription,
	            'preferred_payment_method': preferred_payment_method,
	            'loyalty_level': loyalty_level,
	            'main_browsing_device': main_browsing_device,
	            'product_categories_of_interest': ', '.join(product_categories),
	            'average_order_value': round(average_order_value, 2),
	            'total_orders': total_orders,
	            'last_order_date': last_order_date,
	        })

	    # Create DataFrame
	    return pd.DataFrame(data)

	if __name__ == "__main__":
	    # Script entry point: generate the default-sized dataset and print its
	    # first rows as a quick visual check.
	    print(generate_synthetic_data().head())