vikaswebdev's picture
Upload 17 files
7f90ea0 verified
"""
Generate Synthetic E-commerce Sales Dataset
This script creates a realistic synthetic dataset for demand prediction.
The dataset includes temporal patterns, seasonality, and realistic relationships
between features and sales quantity.
"""
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
# Seed NumPy's global RNG so every run produces the same dataset
np.random.seed(42)

# --- Configuration -------------------------------------------------------
NUM_PRODUCTS = 50
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2023, 12, 31)
CATEGORIES = [
    'Electronics',
    'Clothing',
    'Home & Garden',
    'Sports',
    'Books',
    'Toys',
    'Beauty',
    'Automotive',
    'Food & Beverages',
    'Health',
]

# One timestamp per calendar day, both endpoints included
date_range = pd.date_range(START_DATE, END_DATE, freq='D')
num_days = date_range.size
# Accumulates one record (dict) per product per day
data = []

# Reference price per category. Each product's base price is drawn within
# +/-20% of its category's value. Built once because it never changes.
category_base_prices = {
    'Electronics': 500,
    'Clothing': 50,
    'Home & Garden': 100,
    'Sports': 150,
    'Books': 20,
    'Toys': 30,
    'Beauty': 40,
    'Automotive': 300,
    'Food & Beverages': 25,
    'Health': 60,
}

# Possible discount levels (percent) and the probability of each on a given day
discount_levels = [0, 5, 10, 15, 20, 25, 30]
discount_probs = [0.4, 0.2, 0.15, 0.1, 0.08, 0.05, 0.02]

# Calendar effects depend only on the date, not on the product, so compute
# them once up front instead of repeating the work for every product.
calendar = []
for date in date_range:
    # Day of week effect (weekday() >= 5 means Saturday/Sunday: higher sales)
    weekend_multiplier = 1.3 if date.weekday() >= 5 else 1.0
    # Monthly seasonality (higher sales in Nov-Dec, lower in Jan-Feb)
    month = date.month
    if month in (11, 12):  # Holiday season
        seasonality = 1.5
    elif month in (1, 2):  # Post-holiday slump
        seasonality = 0.7
    elif month in (6, 7, 8):  # Summer
        seasonality = 1.2
    else:
        seasonality = 1.0
    calendar.append((date.strftime('%Y-%m-%d'), weekend_multiplier, seasonality))

# Generate data for each product
for product_id in range(1, NUM_PRODUCTS + 1):
    # Assign category randomly
    category = np.random.choice(CATEGORIES)
    # Base price varies by category, scaled by a random factor in [0.8, 1.2)
    base_price = category_base_prices[category] * (0.8 + np.random.random() * 0.4)
    # Base demand varies by product: draw it once per product so each product
    # keeps a stable demand level across all of its daily records. Drawing it
    # per day would turn it into pure noise unrelated to the product.
    base_demand = np.random.randint(10, 100)

    # Generate daily records
    for date_str, weekend_multiplier, seasonality in calendar:
        # Random discount (0-30%)
        discount = np.random.choice(discount_levels, p=discount_probs)
        # Price with discount
        price = base_price * (1 - discount / 100)
        # Higher discount -> higher sales
        discount_effect = 1 + (discount / 100) * 0.5
        # Lower price -> higher sales (inverse relationship)
        price_effect = 1 / (1 + (price / 1000) * 0.1)
        # Multiplicative noise centred on 1
        noise = np.random.normal(1, 0.15)
        # Combine all factors; int() truncates toward zero
        sales_quantity = int(
            base_demand
            * weekend_multiplier
            * seasonality
            * discount_effect
            * price_effect
            * noise
        )
        # Ensure non-negative (the noise factor can occasionally be negative)
        sales_quantity = max(0, sales_quantity)
        data.append({
            'product_id': product_id,
            'date': date_str,
            'price': round(price, 2),
            'discount': discount,
            'category': category,
            'sales_quantity': sales_quantity,
        })
# Create DataFrame from the accumulated per-day records
df = pd.DataFrame(data)
# Shuffle the rows; the fixed random_state keeps the order reproducible
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Save to CSV
output_path = 'data/sales.csv'
# to_csv does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Summary of what was written
print("Dataset generated successfully!")
print(f"Total records: {len(df)}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Number of products: {df['product_id'].nunique()}")
print(f"Categories: {df['category'].nunique()}")
print(f"\nDataset saved to: {output_path}")
print("\nFirst few rows:")
print(df.head(10))
print("\nDataset statistics:")
print(df.describe())