# File size: 4,029 Bytes

# 7f90ea0

"""

Generate Synthetic E-commerce Sales Dataset



This script creates a realistic synthetic dataset for demand prediction.

The dataset includes temporal patterns, seasonality, and realistic relationships

between features and sales quantity.

"""

from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Seed NumPy's global RNG so repeated runs yield the same dataset
np.random.seed(42)

# --- Configuration -----------------------------------------------------------
# Number of distinct products to simulate (ids run from 1 to NUM_PRODUCTS)
NUM_PRODUCTS = 50

# First and last calendar day covered by the dataset (both inclusive)
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2023, 12, 31)

# Product categories a product can be assigned to
CATEGORIES = [
    'Electronics',
    'Clothing',
    'Home & Garden',
    'Sports',
    'Books',
    'Toys',
    'Beauty',
    'Automotive',
    'Food & Beverages',
    'Health',
]

# One timestamp per day between START_DATE and END_DATE
date_range = pd.date_range(START_DATE, END_DATE, freq='D')
num_days = date_range.size

# Accumulates one dict per (product, day) record
data = []

# Reference price of each category before per-product variation.
# Built once here because it is identical for every product.
category_base_prices = {
    'Electronics': 500,
    'Clothing': 50,
    'Home & Garden': 100,
    'Sports': 150,
    'Books': 20,
    'Toys': 30,
    'Beauty': 40,
    'Automotive': 300,
    'Food & Beverages': 25,
    'Health': 60,
}

# Possible discount percentages and the probability of each on any given day
discount_levels = [0, 5, 10, 15, 20, 25, 30]
discount_probabilities = [0.4, 0.2, 0.15, 0.1, 0.08, 0.05, 0.02]

# Month -> seasonal demand multiplier; months not listed here use 1.0
seasonality_by_month = {
    11: 1.5, 12: 1.5,          # Holiday season
    1: 0.7, 2: 0.7,            # Post-holiday slump
    6: 1.2, 7: 1.2, 8: 1.2,    # Summer
}

# Calendar effects depend only on the date, not on the product, so they are
# computed once instead of once per product.
calendar_effects = [
    (
        day.strftime('%Y-%m-%d'),
        1.3 if day.weekday() >= 5 else 1.0,  # Saturday/Sunday sell more
        seasonality_by_month.get(day.month, 1.0),
    )
    for day in date_range
]

# Generate data for each product
for product_id in range(1, NUM_PRODUCTS + 1):
    # Assign category randomly
    category = np.random.choice(CATEGORIES)

    # Product price: the category's reference price scaled by a random
    # factor in [0.8, 1.2)
    base_price = category_base_prices[category] * (0.8 + np.random.random() * 0.4)

    # Base demand varies by product: draw it ONCE per product (10..99) so each
    # product keeps a stable demand level across all of its daily records.
    # Drawing it per day would erase any product-level signal and drown the
    # calendar/discount effects in uniform noise.
    base_demand = np.random.randint(10, 100)

    # Generate daily records
    for date_str, weekend_multiplier, seasonality in calendar_effects:
        # Random discount (0-30%), skewed towards no/low discount
        discount = np.random.choice(discount_levels, p=discount_probabilities)

        # Price with discount
        price = base_price * (1 - discount / 100)

        # Higher discount -> higher sales (up to +15% at a 30% discount)
        discount_effect = 1 + (discount / 100) * 0.5

        # Lower price -> higher sales (inverse relationship)
        price_effect = 1 / (1 + (price / 1000) * 0.1)

        # Multiplicative noise centred on 1 with standard deviation 0.15
        noise = np.random.normal(1, 0.15)

        # Combine all factors; int() truncates towards zero and max() guards
        # against a negative quantity if the noise draw is negative.
        sales_quantity = max(
            0,
            int(
                base_demand *
                weekend_multiplier *
                seasonality *
                discount_effect *
                price_effect *
                noise
            ),
        )

        data.append({
            'product_id': product_id,
            'date': date_str,
            'price': round(price, 2),
            'discount': discount,
            'category': category,
            'sales_quantity': sales_quantity,
        })

# Build a DataFrame from the generated records
df = pd.DataFrame(data)

# Shuffle the rows (fixed random_state keeps the order reproducible)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to CSV. Create the target directory first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
output_path = 'data/sales.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Print a short summary of what was written
print("Dataset generated successfully!")
print(f"Total records: {len(df)}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Number of products: {df['product_id'].nunique()}")
print(f"Categories: {df['category'].nunique()}")
print(f"\nDataset saved to: {output_path}")
print("\nFirst few rows:")
print(df.head(10))
print("\nDataset statistics:")
print(df.describe())