""" Generate Synthetic E-commerce Sales Dataset This script creates a realistic synthetic dataset for demand prediction. The dataset includes temporal patterns, seasonality, and realistic relationships between features and sales quantity. """ import pandas as pd import numpy as np from datetime import datetime, timedelta # Set random seed for reproducibility np.random.seed(42) # Configuration NUM_PRODUCTS = 50 START_DATE = datetime(2020, 1, 1) END_DATE = datetime(2023, 12, 31) CATEGORIES = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Toys', 'Beauty', 'Automotive', 'Food & Beverages', 'Health'] # Generate date range date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='D') num_days = len(date_range) # Initialize lists to store data data = [] # Generate data for each product for product_id in range(1, NUM_PRODUCTS + 1): # Assign category randomly category = np.random.choice(CATEGORIES) # Base price varies by category category_base_prices = { 'Electronics': 500, 'Clothing': 50, 'Home & Garden': 100, 'Sports': 150, 'Books': 20, 'Toys': 30, 'Beauty': 40, 'Automotive': 300, 'Food & Beverages': 25, 'Health': 60 } base_price = category_base_prices[category] * (0.8 + np.random.random() * 0.4) # Generate daily records for date in date_range: # Day of week effect (weekends have higher sales) day_of_week = date.weekday() weekend_multiplier = 1.3 if day_of_week >= 5 else 1.0 # Monthly seasonality (higher sales in Nov-Dec, lower in Jan-Feb) month = date.month if month in [11, 12]: # Holiday season seasonality = 1.5 elif month in [1, 2]: # Post-holiday slump seasonality = 0.7 elif month in [6, 7, 8]: # Summer seasonality = 1.2 else: seasonality = 1.0 # Random discount (0-30%) discount = np.random.choice([0, 5, 10, 15, 20, 25, 30], p=[0.4, 0.2, 0.15, 0.1, 0.08, 0.05, 0.02]) # Price with discount price = base_price * (1 - discount / 100) # Base demand varies by product base_demand = np.random.randint(10, 100) # Calculate sales quantity with multiple factors # Higher discount -> higher sales discount_effect = 1 + (discount / 100) * 0.5 # Lower price -> higher sales (inverse relationship) price_effect = 1 / (1 + (price / 1000) * 0.1) # Add some randomness noise = np.random.normal(1, 0.15) # Calculate final sales quantity sales_quantity = int( base_demand * weekend_multiplier * seasonality * discount_effect * price_effect * noise ) # Ensure non-negative sales_quantity = max(0, sales_quantity) data.append({ 'product_id': product_id, 'date': date.strftime('%Y-%m-%d'), 'price': round(price, 2), 'discount': discount, 'category': category, 'sales_quantity': sales_quantity }) # Create DataFrame df = pd.DataFrame(data) # Shuffle the data df = df.sample(frac=1, random_state=42).reset_index(drop=True) # Save to CSV output_path = 'data/sales.csv' df.to_csv(output_path, index=False) print(f"Dataset generated successfully!") print(f"Total records: {len(df)}") print(f"Date range: {df['date'].min()} to {df['date'].max()}") print(f"Number of products: {df['product_id'].nunique()}") print(f"Categories: {df['category'].nunique()}") print(f"\nDataset saved to: {output_path}") print(f"\nFirst few rows:") print(df.head(10)) print(f"\nDataset statistics:") print(df.describe())