"""
Generate Synthetic E-commerce Sales Dataset

This script creates a realistic synthetic dataset for demand prediction.
The dataset includes temporal patterns, seasonality, and realistic relationships
between features and sales quantity.
"""

from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd


# Seed shared by the NumPy generator and the final row shuffle so that every
# run of the script produces the same file.
RANDOM_SEED = 42

NUM_PRODUCTS = 50
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2023, 12, 31)
CATEGORIES = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books',
              'Toys', 'Beauty', 'Automotive', 'Food & Beverages', 'Health']

# Reference price per category. Each product's own base price is drawn
# uniformly from 80% up to (but excluding) 120% of this value.
CATEGORY_BASE_PRICES = {
    'Electronics': 500,
    'Clothing': 50,
    'Home & Garden': 100,
    'Sports': 150,
    'Books': 20,
    'Toys': 30,
    'Beauty': 40,
    'Automotive': 300,
    'Food & Beverages': 25,
    'Health': 60,
}

# Daily discount percentages and the probability of drawing each one.
DISCOUNT_LEVELS = [0, 5, 10, 15, 20, 25, 30]
DISCOUNT_PROBABILITIES = [0.4, 0.2, 0.15, 0.1, 0.08, 0.05, 0.02]

OUTPUT_PATH = 'data/sales.csv'

COLUMNS = ['product_id', 'date', 'price', 'discount', 'category',
           'sales_quantity']


def _seasonality_factor(month):
    """Return the demand multiplier applied to a calendar month (1-12)."""
    if month in (11, 12):
        return 1.5
    if month in (1, 2):
        return 0.7
    if month in (6, 7, 8):
        return 1.2
    return 1.0


def generate_sales_data(num_products=NUM_PRODUCTS, start_date=START_DATE,
                        end_date=END_DATE, seed=RANDOM_SEED):
    """Build one synthetic sales record per product per day.

    Args:
        num_products: Number of products; ids run from 1 to ``num_products``.
        start_date: First day of the simulated period (inclusive).
        end_date: Last day of the simulated period (inclusive).
        seed: Value passed to ``np.random.seed`` before any draw is made.

    Returns:
        A DataFrame with the columns ``product_id``, ``date`` (formatted as
        ``YYYY-MM-DD``), ``price``, ``discount``, ``category`` and
        ``sales_quantity``, ordered by product and then by date.
    """
    np.random.seed(seed)

    # The date string, weekend multiplier and seasonality depend only on the
    # day, so they are computed once instead of once per product. No random
    # numbers are consumed here, which keeps the random stream unchanged.
    calendar = [
        (
            day.strftime('%Y-%m-%d'),
            1.3 if day.weekday() >= 5 else 1.0,
            _seasonality_factor(day.month),
        )
        for day in pd.date_range(start=start_date, end=end_date, freq='D')
    ]

    records = []
    for product_id in range(1, num_products + 1):
        category = np.random.choice(CATEGORIES)
        base_price = CATEGORY_BASE_PRICES[category] * (0.8 + np.random.random() * 0.4)

        for date_label, weekend_multiplier, seasonality in calendar:
            # The order of the random draws (discount, base demand, noise) is
            # significant: changing it changes the generated dataset.
            discount = np.random.choice(DISCOUNT_LEVELS, p=DISCOUNT_PROBABILITIES)
            price = base_price * (1 - discount / 100)
            base_demand = np.random.randint(10, 100)

            # Demand grows with the discount and shrinks as the price rises.
            discount_effect = 1 + (discount / 100) * 0.5
            price_effect = 1 / (1 + (price / 1000) * 0.1)
            noise = np.random.normal(1, 0.15)

            sales_quantity = int(
                base_demand
                * weekend_multiplier
                * seasonality
                * discount_effect
                * price_effect
                * noise
            )

            records.append({
                'product_id': product_id,
                'date': date_label,
                'price': round(price, 2),
                'discount': discount,
                'category': category,
                # The noise term can in principle be negative; clamp at zero.
                'sales_quantity': max(0, sales_quantity),
            })

    # Passing the column list keeps the schema stable even with zero records.
    return pd.DataFrame(records, columns=COLUMNS)


def save_dataset(df, output_path):
    """Write ``df`` to ``output_path`` as CSV, creating missing directories.

    Args:
        df: DataFrame to write; the index is not written.
        output_path: Destination file path (``str`` or ``Path``).
    """
    # to_csv does not create the target directory, so create it up front.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)


def main():
    """Generate the dataset, shuffle it, save it and print a short summary."""
    df = generate_sales_data()
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    save_dataset(df, OUTPUT_PATH)

    print("Dataset generated successfully!")
    print(f"Total records: {len(df)}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"Number of products: {df['product_id'].nunique()}")
    print(f"Categories: {df['category'].nunique()}")
    print(f"\nDataset saved to: {OUTPUT_PATH}")
    print("\nFirst few rows:")
    print(df.head(10))
    print("\nDataset statistics:")
    print(df.describe())


if __name__ == "__main__":
    main()