Spaces:
Sleeping
Sleeping
| """ | |
| Data Generation Utilities | |
| ======================== | |
| Utility functions for generating sample datasets. | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
def _min_max_normalize(values, neutral=0.5):
    """Linearly rescale *values* onto [0, 1]; degenerate input is handled safely.

    An empty array is returned unchanged (as float), and an array whose
    elements are all equal maps to ``neutral`` instead of dividing by zero.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    low = values.min()
    span = values.max() - low
    if span == 0:
        # 0/0 would yield NaN, which later becomes a garbage integer on cast.
        return np.full(values.shape, neutral, dtype=float)
    return (values - low) / span


def create_sample_mall_customers(n_customers=200, random_seed=42):
    """
    Create a realistic sample Mall Customers dataset.

    Note: this calls ``np.random.seed(random_seed)``, i.e. it reseeds NumPy's
    global random state as a side effect.

    Parameters:
    -----------
    n_customers : int, default=200
        Number of customers to generate. ``0`` yields an empty DataFrame
        with the usual columns.
    random_seed : int, default=42
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        Generated customer dataset with columns ``CustomerID``, ``Gender``,
        ``Age``, ``Annual Income (k$)`` and ``Spending Score (1-100)``.

    Raises:
    -------
    ValueError
        If ``n_customers`` is negative.
    """
    if n_customers < 0:
        raise ValueError("n_customers must be non-negative")

    np.random.seed(random_seed)
    customer_ids = range(1, n_customers + 1)

    # Gender distribution (approximately 56% Female, 44% Male)
    genders = np.random.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

    # Age distribution (mean ~39, std ~14); astype(int) truncates toward zero,
    # then values are limited to the 18-70 range.
    ages = np.random.normal(38.85, 13.97, n_customers).astype(int)
    ages = np.clip(ages, 18, 70)

    # Income in thousands of dollars (mean ~61, std ~26), limited to 15-137.
    annual_incomes = np.random.normal(60.56, 26.26, n_customers)
    annual_incomes = np.clip(annual_incomes, 15, 137)

    # Spending score: random base plus income, age and gender adjustments.
    base_spending = np.random.normal(50, 25, n_customers)
    # Drawn last so the sequence of random draws (and therefore the output
    # for a given seed) is the same as before.
    noise = np.random.normal(0, 10, n_customers)

    # Income correlation: centred so a mid-range income contributes nothing.
    income_normalized = _min_max_normalize(annual_incomes)
    income_effect = (income_normalized - 0.5) * 30

    # Age effect: youngest 30% of the age range gets +10, oldest 30% gets -5.
    age_normalized = _min_max_normalize(ages)
    age_effect = np.where(age_normalized < 0.3, 10,
                          np.where(age_normalized > 0.7, -5, 0))

    # Gender effect (slight difference in spending patterns)
    gender_effect = np.where(genders == 'Female', 3, -3)

    spending_scores = (base_spending +
                       income_effect * 0.6 +
                       age_effect +
                       gender_effect +
                       noise)
    spending_scores = np.clip(spending_scores, 1, 100)

    # Create DataFrame; income and score are rounded to whole numbers.
    sample_data = pd.DataFrame({
        'CustomerID': customer_ids,
        'Gender': genders,
        'Age': ages,
        'Annual Income (k$)': annual_incomes.round().astype(int),
        'Spending Score (1-100)': spending_scores.round().astype(int),
    })
    return sample_data