Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from faker import Faker | |
| from datetime import datetime, timedelta, date | |
| import random | |
def generate_synthetic_data(num_customers: int = 1000, seed: int = 42) -> pd.DataFrame:
    """
    Generate synthetic customer data for e-commerce analysis.

    This function creates a dataset of customers with various attributes such as
    demographics, purchase history, and preferences. It uses the Faker library to
    generate realistic-looking data for Ukrainian customers.

    Attributes are drawn sequentially so that later ones can depend on earlier
    ones (e.g. age depends on region, order volume depends on age, gender,
    language and the region's tech adoption).

    Note:
        Every call reseeds Faker's shared random generator and NumPy's global
        random generator with ``seed``. The standard-library ``random`` module's
        global state is left untouched; a private ``random.Random(seed)`` is
        used instead.

    Args:
        num_customers (int): The number of customer records to generate (default: 1000)
        seed (int): Seed used for all random sources so that repeated calls with
            the same arguments produce the same data (default: 42)

    Returns:
        pandas.DataFrame: A DataFrame containing the generated customer data, one
        row per customer, with the columns ``customer_id``, ``name``, ``email``,
        ``age``, ``gender``, ``region``, ``city``, ``registration_date``,
        ``phone_number``, ``preferred_language``, ``newsletter_subscription``,
        ``preferred_payment_method``, ``loyalty_level``, ``main_browsing_device``,
        ``product_categories_of_interest``, ``average_order_value``,
        ``total_orders`` and ``last_order_date``. Inactive customers have
        ``total_orders == 0``, ``average_order_value == 0`` and
        ``last_order_date`` set to ``None``.
    """
    # Set up Faker for Ukrainian locale
    fake = Faker('uk_UA')
    Faker.seed(seed)
    np.random.seed(seed)
    # Neither call above seeds the standard-library `random` module, so the
    # helpers below draw from a dedicated, seeded generator to stay reproducible.
    rng = random.Random(seed)

    # Define constants
    START_DATE = date(2019, 1, 1)
    END_DATE = date(2024, 7, 31)
    EMAIL_DOMAINS = ['gmail.com', 'ukr.net', 'i.ua', 'meta.ua', 'yahoo.com']
    GENDERS = ['Male', 'Female', 'Other']
    LANGUAGES = ['Ukrainian', 'Russian']
    PAYMENT_METHODS = ['Credit Card', 'Cash on Delivery', 'Bank Transfer', 'PayPal']
    DEVICES = ['Web', 'Mobile', 'App']
    CATEGORIES = ['Electronics', 'Home Appliances', 'Computers', 'Smartphones', 'TV & Audio']

    # Define regions and their characteristics
    REGIONS = {
        'Київська': {'avg_age': 40, 'urbanization': 0.8, 'tech_adoption': 0.7},
        'Львівська': {'avg_age': 38, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Харківська': {'avg_age': 42, 'urbanization': 0.8, 'tech_adoption': 0.65},
        'Одеська': {'avg_age': 41, 'urbanization': 0.7, 'tech_adoption': 0.6},
        'Дніпропетровська': {'avg_age': 43, 'urbanization': 0.75, 'tech_adoption': 0.6},
        'Запорізька': {'avg_age': 44, 'urbanization': 0.7, 'tech_adoption': 0.55},
        'Вінницька': {'avg_age': 42, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Полтавська': {'avg_age': 43, 'urbanization': 0.65, 'tech_adoption': 0.55},
        'Чернігівська': {'avg_age': 45, 'urbanization': 0.6, 'tech_adoption': 0.5},
        'Сумська': {'avg_age': 44, 'urbanization': 0.65, 'tech_adoption': 0.5},
    }
    region_names = list(REGIONS)

    # Helper functions
    def clamp_unit(value):
        """Clamp a value to the closed interval [0, 1]."""
        return max(0, min(1, value))

    def normalize(weights):
        """Scale non-negative weights so that they sum to 1."""
        total = sum(weights)
        return [w / total for w in weights]

    def generate_phone_number():
        """Generate a realistic Ukrainian phone number."""
        # "+380", a two-digit prefix in 50..99, then the tail of a Faker MSISDN.
        return f"+380{rng.randint(50, 99)}{fake.msisdn()[6:]}"

    def generate_email(name):
        """Generate an email address based on the customer's name."""
        username = name.lower().replace(' ', '.').replace('\'', '')
        domain = rng.choice(EMAIL_DOMAINS)
        return f"{username}@{domain}"

    # Generate initial customer data
    data = []
    for i in range(num_customers):
        customer_id = f"C{str(i + 1).zfill(6)}"

        # Region and City
        region = np.random.choice(region_names)
        profile = REGIONS[region]
        # The urban/rural draw uses the region's base urbanization (before noise).
        is_urban = np.random.random() < profile['urbanization']
        city = fake.city()
        if not is_urban:
            # Non-urban customers get the "смт" prefix in front of the place name.
            city = f"смт {city}"

        # Age (dependent on region)
        age = int(np.random.normal(profile['avg_age'], 10))
        age_noise = np.random.normal(0, 2)  # Add noise with mean 0 and std dev 2
        age = max(18, min(80, int(age + age_noise)))

        # Per-customer noisy copies of the regional characteristics; REGIONS
        # itself is never modified.
        urbanization_noise = np.random.normal(0, 0.05)
        tech_adoption_noise = np.random.normal(0, 0.05)
        urbanization = clamp_unit(profile['urbanization'] + urbanization_noise)
        tech_adoption = clamp_unit(profile['tech_adoption'] + tech_adoption_noise)

        # Gender (slight dependency on age and region)
        gender_prob = 0.49 + 0.02 * (age - 40) / 40  # Slight increase in male probability with age
        gender_prob += 0.02 * (urbanization - 0.7) / 0.3  # Slight increase in urban areas
        gender = np.random.choice(GENDERS, p=[gender_prob, 1 - gender_prob - 0.01, 0.01])

        # Preferred Language (dependent on age and region)
        ukrainian_prob = 0.8 - 0.2 * (age - 40) / 40  # Younger people more likely to prefer Ukrainian
        ukrainian_prob += 0.1 * (1 - urbanization)  # Rural areas more likely to prefer Ukrainian
        ukrainian_prob = clamp_unit(ukrainian_prob)
        preferred_language = np.random.choice(LANGUAGES, p=[ukrainian_prob, 1 - ukrainian_prob])

        # Registration date
        registration_date = fake.date_between(start_date=START_DATE, end_date=END_DATE)

        # Determine if the customer is active (has made orders)
        is_active = np.random.random() < 0.6  # 60% chance of being an active customer
        if is_active:
            # Total orders and average order value (dependent on various factors)
            base_orders = np.random.poisson(5)
            order_multiplier = 1 + 0.2 * (age - 40) / 40  # Age factor
            order_multiplier *= 1 + 0.1 * (tech_adoption - 0.6) / 0.2  # Tech adoption factor
            order_multiplier *= 1.1 if gender == 'Female' else 0.9  # Gender factor
            order_multiplier *= 1.1 if preferred_language == 'Ukrainian' else 0.9  # Language factor
            # Ensure at least 1 order for active customers
            total_orders = max(1, int(base_orders * order_multiplier))

            # Add noise to total orders
            total_orders_noise = np.random.poisson(2)
            total_orders = max(1, total_orders + total_orders_noise)

            base_aov = np.random.gamma(shape=5, scale=100)
            aov_multiplier = 1 + 0.3 * (age - 40) / 40  # Age factor
            aov_multiplier *= 1 + 0.2 * (urbanization - 0.7) / 0.3  # Urbanization factor
            aov_multiplier *= 1.1 if gender == 'Male' else 0.9  # Gender factor
            average_order_value = base_aov * aov_multiplier

            # Add noise to average order value
            aov_noise = np.random.normal(0, average_order_value * 0.1)  # 10% noise
            average_order_value = max(0, average_order_value + aov_noise)

            # Last order date (never earlier than the registration date)
            last_order_date = fake.date_between(start_date=registration_date, end_date=END_DATE)
        else:
            total_orders = 0
            average_order_value = 0
            last_order_date = None

        # Loyalty level based on total orders, kept within 1..5
        loyalty_level = min(5, max(1, int(total_orders / 2)))

        # Add some randomness to loyalty level
        loyalty_noise = np.random.randint(-1, 2)  # -1, 0, or 1
        loyalty_level = max(1, min(5, loyalty_level + loyalty_noise))

        # Newsletter subscription (dependent on age, loyalty, and tech adoption)
        newsletter_prob = (
            0.5
            + 0.1 * loyalty_level / 5
            - 0.2 * (age - 40) / 40
            + 0.2 * tech_adoption
        )
        newsletter_noise = np.random.normal(0, 0.1)
        newsletter_prob = clamp_unit(newsletter_prob + newsletter_noise)
        newsletter_subscription = np.random.random() < newsletter_prob

        # Preferred payment method (dependent on age and urbanization)
        payment_probs = [
            0.5 - 0.2 * (age - 40) / 40 + 0.2 * urbanization,  # Credit Card
            0.3 + 0.2 * (age - 40) / 40 - 0.2 * urbanization,  # Cash on Delivery
            0.15,  # Bank Transfer
            0.05 + 0.1 * tech_adoption,  # PayPal
        ]
        payment_probs = normalize([clamp_unit(p) for p in payment_probs])
        preferred_payment_method = np.random.choice(PAYMENT_METHODS, p=payment_probs)

        # Add some inconsistency to preferred payment method
        if np.random.random() < 0.1:  # 10% chance of inconsistent preference
            preferred_payment_method = np.random.choice(PAYMENT_METHODS)

        # Main browsing device (dependent on age and tech adoption)
        device_probs = [
            0.4 + 0.3 * (age - 40) / 40 - 0.2 * tech_adoption,  # Web
            0.4 - 0.2 * (age - 40) / 40 + 0.1 * tech_adoption,  # Mobile
            0.2 - 0.1 * (age - 40) / 40 + 0.1 * tech_adoption,  # App
        ]
        device_probs = normalize([clamp_unit(p) for p in device_probs])

        # Add noise to main browsing device probabilities, then re-normalize
        device_noise = np.random.normal(0, 0.05, size=3)
        device_probs = normalize(
            [clamp_unit(p + n) for p, n in zip(device_probs, device_noise)]
        )
        main_browsing_device = np.random.choice(DEVICES, p=device_probs)

        # Product categories (dependent on age, gender, and browsing device)
        category_probs = [0.2] * len(CATEGORIES)
        if age < 30:
            category_probs[2] += 0.1  # Increase Computers
            category_probs[3] += 0.1  # Increase Smartphones
        elif age > 60:
            category_probs[1] += 0.1  # Increase Home Appliances
            category_probs[4] += 0.1  # Increase TV & Audio
        if gender == 'Male':
            category_probs[0] += 0.05  # Slight increase in Electronics
            category_probs[2] += 0.05  # Slight increase in Computers
        if main_browsing_device == 'Mobile':
            category_probs[3] += 0.1  # Increase Smartphones
        category_probs = normalize(category_probs)

        num_categories = np.random.randint(1, 4)  # 1, 2, or 3 distinct categories
        product_categories = np.random.choice(
            CATEGORIES, size=num_categories, replace=False, p=category_probs
        )

        # Draw the name once so that the email address is derived from the same
        # name that is stored on the record.
        name = fake.name()
        email = generate_email(name)

        data.append({
            'customer_id': customer_id,
            'name': name,
            'email': email,
            'age': age,
            'gender': gender,
            'region': region,
            'city': city,
            'registration_date': registration_date,
            'phone_number': generate_phone_number(),
            'preferred_language': preferred_language,
            'newsletter_subscription': newsletter_subscription,
            'preferred_payment_method': preferred_payment_method,
            'loyalty_level': loyalty_level,
            'main_browsing_device': main_browsing_device,
            'product_categories_of_interest': ', '.join(product_categories),
            'average_order_value': round(average_order_value, 2),
            'total_orders': total_orders,
            'last_order_date': last_order_date,
        })

    # Create DataFrame
    return pd.DataFrame(data)
if __name__ == "__main__":
    # Script entry point: build a dataset with the default settings and show
    # its leading rows as a quick sanity check.
    customers = generate_synthetic_data()
    print(customers.head())