Spaces:
Sleeping
Sleeping
File size: 2,405 Bytes
"""
Data Generation Utilities
========================
Utility functions for generating sample datasets.
"""
import pandas as pd
import numpy as np
def _min_max_normalize(values):
    """Rescale *values* linearly onto [0, 1], tolerating degenerate input."""
    if values.size == 0:
        # Nothing to scale; min()/max() would raise on an empty array.
        return np.zeros(values.shape, dtype=float)
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # All values are identical (e.g. a single customer). Dividing by the
        # zero span would produce NaN, so return the midpoint instead; the
        # income and age effects below are both zero at 0.5.
        return np.full(values.shape, 0.5)
    return (values - lowest) / span


def create_sample_mall_customers(n_customers: int = 200, random_seed: int = 42) -> pd.DataFrame:
    """
    Create a realistic sample Mall Customers dataset.

    Parameters:
    -----------
    n_customers : int, default=200
        Number of customers to generate. ``0`` yields an empty DataFrame
        with the usual columns.
    random_seed : int, default=42
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        Generated customer dataset with the columns ``CustomerID``,
        ``Gender``, ``Age``, ``Annual Income (k$)`` and
        ``Spending Score (1-100)``.

    Notes:
    ------
    The function seeds NumPy's global random state via ``np.random.seed``,
    so it affects later ``np.random`` calls made by the caller.
    """
    np.random.seed(random_seed)

    customer_ids = range(1, n_customers + 1)

    # Gender distribution (approximately 56% Female, 44% Male)
    genders = np.random.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

    # Age distribution (mean ~39, std ~14), truncated to whole years and
    # limited to the 18-70 range
    ages = np.random.normal(38.85, 13.97, n_customers).astype(int)
    ages = np.clip(ages, 18, 70)

    # Income distribution in k$ (mean ~61k, std ~26k), limited to 15-137
    annual_incomes = np.random.normal(60.56, 26.26, n_customers)
    annual_incomes = np.clip(annual_incomes, 15, 137)

    # Spending score: random baseline plus income, age and gender effects
    base_spending = np.random.normal(50, 25, n_customers)

    # Income correlation: centred on zero, spanning -15..+15 before weighting
    income_effect = (_min_max_normalize(annual_incomes) - 0.5) * 30

    # Age effect: the youngest 30% of the age range get +10, the oldest 30% -5
    age_normalized = _min_max_normalize(ages)
    age_effect = np.where(age_normalized < 0.3, 10,
                          np.where(age_normalized > 0.7, -5, 0))

    # Gender effect (slight difference in spending patterns)
    gender_effect = np.where(genders == 'Female', 3, -3)

    # The extra noise term is drawn last so the random stream order matches
    # the order of the draws above.
    spending_scores = (base_spending +
                       income_effect * 0.6 +
                       age_effect +
                       gender_effect +
                       np.random.normal(0, 10, n_customers))
    spending_scores = np.clip(spending_scores, 1, 100)

    # Create DataFrame
    sample_data = pd.DataFrame({
        'CustomerID': customer_ids,
        'Gender': genders,
        'Age': ages,
        'Annual Income (k$)': annual_incomes.round().astype(int),
        'Spending Score (1-100)': spending_scores.round().astype(int)
    })

    return sample_data
|