Customer-Segmentation

Sleeping

Customer-Segmentation / utils /data_generator.py

Mahmoud Adel

Clean Hugging Face deployment

5e2aaa0 about 1 month ago

2.41 kB

	"""
	Data Generation Utilities
	========================

	Utility functions for generating sample datasets.
	"""

	import pandas as pd
	import numpy as np

	def create_sample_mall_customers(n_customers=200, random_seed=42):
	"""
	Create a realistic sample Mall Customers dataset.

	Parameters:
	-----------
	n_customers : int, default=200
	Number of customers to generate
	random_seed : int, default=42
	Random seed for reproducibility

	Returns:
	--------
	pd.DataFrame
	Generated customer dataset
	"""
	np.random.seed(random_seed)

	customer_ids = range(1, n_customers + 1)

	# Gender distribution (approximately 56% Female, 44% Male)
	genders = np.random.choice(['Male', 'Female'], n_customers, p=[0.44, 0.56])

	# Age distribution (mean ~39, std ~14)
	ages = np.random.normal(38.85, 13.97, n_customers).astype(int)
	ages = np.clip(ages, 18, 70)

	# Create realistic income distribution (mean ~61k, std ~26k)
	annual_incomes = np.random.normal(60.56, 26.26, n_customers)
	annual_incomes = np.clip(annual_incomes, 15, 137)

	# Create spending scores with realistic patterns
	base_spending = np.random.normal(50, 25, n_customers)

	# Add some income correlation
	income_normalized = (annual_incomes - annual_incomes.min()) / (annual_incomes.max() - annual_incomes.min())
	income_effect = (income_normalized - 0.5) * 30

	# Add age effect (younger people might spend more)
	age_normalized = (ages - ages.min()) / (ages.max() - ages.min())
	age_effect = np.where(age_normalized < 0.3, 10,
	np.where(age_normalized > 0.7, -5, 0))

	# Gender effect (slight difference in spending patterns)
	gender_effect = np.where(genders == 'Female', 3, -3)

	spending_scores = (base_spending +
	income_effect * 0.6 +
	age_effect +
	gender_effect +
	np.random.normal(0, 10, n_customers))
	spending_scores = np.clip(spending_scores, 1, 100)

	# Create DataFrame
	sample_data = pd.DataFrame({
	'CustomerID': customer_ids,
	'Gender': genders,
	'Age': ages,
	'Annual Income (k$)': annual_incomes.round().astype(int),
	'Spending Score (1-100)': spending_scores.round().astype(int)
	})

	return sample_data