E-commerce-demand-prediction-system / generate_dataset.py

Upload 17 files

7f90ea0 verified 4 days ago

4.03 kB

	"""
	Generate Synthetic E-commerce Sales Dataset

	This script creates a realistic synthetic dataset for demand prediction.
	The dataset includes temporal patterns, seasonality, and realistic relationships
	between features and sales quantity.
	"""

	from datetime import datetime, timedelta
	from pathlib import Path

	import numpy as np
	import pandas as pd

	# Configuration
	RANDOM_SEED = 42  # Fixed seed so repeated runs produce the same dataset
	NUM_PRODUCTS = 50
	START_DATE = datetime(2020, 1, 1)
	END_DATE = datetime(2023, 12, 31)
	CATEGORIES = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books',
	              'Toys', 'Beauty', 'Automotive', 'Food & Beverages', 'Health']

	# Reference price per category; each product's base price is drawn within
	# +/-20% of this value.
	CATEGORY_BASE_PRICES = {
	    'Electronics': 500,
	    'Clothing': 50,
	    'Home & Garden': 100,
	    'Sports': 150,
	    'Books': 20,
	    'Toys': 30,
	    'Beauty': 40,
	    'Automotive': 300,
	    'Food & Beverages': 25,
	    'Health': 60,
	}

	# Possible daily discounts (percent) and the probability of each one.
	DISCOUNT_LEVELS = [0, 5, 10, 15, 20, 25, 30]
	DISCOUNT_PROBABILITIES = [0.4, 0.2, 0.15, 0.1, 0.08, 0.05, 0.02]

	# Column order of the generated dataset / CSV file.
	COLUMNS = ['product_id', 'date', 'price', 'discount', 'category',
	           'sales_quantity']

	OUTPUT_PATH = 'data/sales.csv'


	def _seasonality_multiplier(month):
	    """Return the demand multiplier for a calendar month (1-12)."""
	    if month in (11, 12):  # Holiday season
	        return 1.5
	    if month in (1, 2):  # Post-holiday slump
	        return 0.7
	    if month in (6, 7, 8):  # Summer
	        return 1.2
	    return 1.0


	def generate_sales_data(num_products=NUM_PRODUCTS, start_date=START_DATE,
	                        end_date=END_DATE, seed=RANDOM_SEED):
	    """Build the synthetic daily sales dataset.

	    One row is produced per product per day in the inclusive range
	    ``start_date``..``end_date``. Each product gets a random category, a
	    base price within +/-20% of its category reference price, and a base
	    demand level; each day then applies weekend, seasonality, discount,
	    price and noise multipliers to that base demand.

	    Args:
	        num_products: Number of products; ids run from 1 to ``num_products``.
	        start_date: First day of the range (passed to ``pd.date_range``).
	        end_date: Last day of the range, inclusive.
	        seed: Seed for NumPy's global random state and for the final
	            shuffle, so the result is reproducible.

	    Returns:
	        A shuffled ``pandas.DataFrame`` with the columns listed in
	        ``COLUMNS``. ``date`` is a ``YYYY-MM-DD`` string.
	    """
	    np.random.seed(seed)
	    date_range = pd.date_range(start=start_date, end=end_date, freq='D')

	    data = []
	    for product_id in range(1, num_products + 1):
	        # Assign category randomly
	        category = np.random.choice(CATEGORIES)

	        # Base price varies by category
	        base_price = CATEGORY_BASE_PRICES[category] * (0.8 + np.random.random() * 0.4)

	        # Base demand varies by product: draw it once per product, not once
	        # per day, so every product keeps a stable demand level and the
	        # other effects remain visible in the data.
	        base_demand = np.random.randint(10, 100)

	        for date in date_range:
	            # Day of week effect (weekends have higher sales)
	            weekend_multiplier = 1.3 if date.weekday() >= 5 else 1.0

	            # Monthly seasonality (higher in Nov-Dec, lower in Jan-Feb)
	            seasonality = _seasonality_multiplier(date.month)

	            # Random discount (0-30%)
	            discount = np.random.choice(DISCOUNT_LEVELS, p=DISCOUNT_PROBABILITIES)

	            # Price with discount
	            price = base_price * (1 - discount / 100)

	            # Higher discount -> higher sales
	            discount_effect = 1 + (discount / 100) * 0.5

	            # Lower price -> higher sales (inverse relationship)
	            price_effect = 1 / (1 + (price / 1000) * 0.1)

	            # Add some randomness
	            noise = np.random.normal(1, 0.15)

	            # Truncate to a whole number of units and never go negative
	            sales_quantity = max(0, int(
	                base_demand *
	                weekend_multiplier *
	                seasonality *
	                discount_effect *
	                price_effect *
	                noise
	            ))

	            data.append({
	                'product_id': product_id,
	                'date': date.strftime('%Y-%m-%d'),
	                'price': round(price, 2),
	                'discount': discount,
	                'category': category,
	                'sales_quantity': sales_quantity,
	            })

	    # Passing the column list keeps the schema even when no rows exist.
	    df = pd.DataFrame(data, columns=COLUMNS)

	    # Shuffle the data
	    return df.sample(frac=1, random_state=seed).reset_index(drop=True)


	def main(output_path=OUTPUT_PATH):
	    """Generate the dataset, write it to ``output_path`` as CSV, print a summary."""
	    df = generate_sales_data()

	    # Create the target directory first; to_csv does not create it.
	    output_file = Path(output_path)
	    output_file.parent.mkdir(parents=True, exist_ok=True)
	    df.to_csv(output_file, index=False)

	    print("Dataset generated successfully!")
	    print(f"Total records: {len(df)}")
	    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
	    print(f"Number of products: {df['product_id'].nunique()}")
	    print(f"Categories: {df['category'].nunique()}")
	    print(f"\nDataset saved to: {output_path}")
	    print("\nFirst few rows:")
	    print(df.head(10))
	    print("\nDataset statistics:")
	    print(df.describe())


	if __name__ == "__main__":
	    main()