File size: 2,405 Bytes
5e2aaa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Data Generation Utilities
========================

Utility functions for generating sample datasets.
"""

import pandas as pd
import numpy as np

def _min_max_normalize(values):
    """
    Scale ``values`` linearly to the [0, 1] range.

    Empty or constant input has no spread to scale by, so a neutral 0.5 is
    returned for every element instead of dividing by zero (which would
    produce NaN and poison everything computed from it).
    """
    if values.size == 0:
        return np.full(values.shape, 0.5)

    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.full(values.shape, 0.5)
    return (values - lowest) / spread


def create_sample_mall_customers(n_customers=200, random_seed=42):
    """
    Create a realistic sample Mall Customers dataset.

    Parameters:
    -----------
    n_customers : int, default=200
        Number of customers to generate. ``0`` yields an empty DataFrame
        that still has all five columns.
    random_seed : int, default=42
        Random seed for reproducibility

    Returns:
    --------
    pd.DataFrame
        Generated customer dataset with the columns ``CustomerID``,
        ``Gender``, ``Age``, ``Annual Income (k$)`` and
        ``Spending Score (1-100)``.

    Raises:
    -------
    ValueError
        If ``n_customers`` is negative.

    Notes:
    ------
    The function seeds NumPy's global random state via ``np.random.seed``,
    so random numbers drawn by the caller afterwards are affected as well.
    """
    if n_customers < 0:
        raise ValueError(
            f"n_customers must be non-negative, got {n_customers}"
        )

    # NOTE: the order of the random draws below determines the generated
    # values for a given seed; do not reorder them.
    np.random.seed(random_seed)

    customer_ids = range(1, n_customers + 1)

    # Gender distribution (approximately 56% Female, 44% Male)
    genders = np.random.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

    # Age distribution (mean ~39, std ~14), truncated to whole years and
    # limited to the 18-70 range
    ages = np.random.normal(38.85, 13.97, n_customers).astype(int)
    ages = np.clip(ages, 18, 70)

    # Income distribution in k$ (mean ~61, std ~26), limited to 15-137
    annual_incomes = np.random.normal(60.56, 26.26, n_customers)
    annual_incomes = np.clip(annual_incomes, 15, 137)

    # Baseline spending score before any demographic adjustments
    base_spending = np.random.normal(50, 25, n_customers)

    # Income correlation: centred on zero, spanning -15..+15 before weighting
    income_normalized = _min_max_normalize(annual_incomes)
    income_effect = (income_normalized - 0.5) * 30

    # Age effect: the youngest 30% of the observed age range get a bonus,
    # the oldest 30% a small penalty, everyone else is unaffected
    age_normalized = _min_max_normalize(ages)
    age_effect = np.where(age_normalized < 0.3, 10,
                          np.where(age_normalized > 0.7, -5, 0))

    # Gender effect (slight difference in spending patterns)
    gender_effect = np.where(genders == 'Female', 3, -3)

    # Independent noise so the score is not a deterministic function of
    # the other columns
    noise = np.random.normal(0, 10, n_customers)

    spending_scores = (base_spending +
                       income_effect * 0.6 +
                       age_effect +
                       gender_effect +
                       noise)
    spending_scores = np.clip(spending_scores, 1, 100)

    # Create DataFrame
    sample_data = pd.DataFrame({
        'CustomerID': customer_ids,
        'Gender': genders,
        'Age': ages,
        'Annual Income (k$)': annual_incomes.round().astype(int),
        'Spending Score (1-100)': spending_scores.round().astype(int)
    })

    return sample_data