File size: 4,029 Bytes
7f90ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""

Generate Synthetic E-commerce Sales Dataset



This script creates a realistic synthetic dataset for demand prediction.

The dataset includes temporal patterns, seasonality, and realistic relationships

between features and sales quantity.

"""

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Seed used for every random draw and for the final shuffle (reproducibility)
RANDOM_SEED = 42

# Configuration
NUM_PRODUCTS = 50
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2023, 12, 31)
CATEGORIES = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books',
              'Toys', 'Beauty', 'Automotive', 'Food & Beverages', 'Health']

# Reference price per category; each product's own base price is drawn
# within +/-20% of this value.
CATEGORY_BASE_PRICES = {
    'Electronics': 500,
    'Clothing': 50,
    'Home & Garden': 100,
    'Sports': 150,
    'Books': 20,
    'Toys': 30,
    'Beauty': 40,
    'Automotive': 300,
    'Food & Beverages': 25,
    'Health': 60,
}

# Possible daily discounts (percent) and the probability of each one
DISCOUNT_LEVELS = [0, 5, 10, 15, 20, 25, 30]
DISCOUNT_PROBABILITIES = [0.4, 0.2, 0.15, 0.1, 0.08, 0.05, 0.02]

# Column order of the generated dataset
COLUMNS = ['product_id', 'date', 'price', 'discount', 'category', 'sales_quantity']

OUTPUT_PATH = 'data/sales.csv'


def _weekend_multiplier(date):
    """Return the day-of-week demand factor: 1.3 on Saturday/Sunday, else 1.0."""
    return 1.3 if date.weekday() >= 5 else 1.0


def _seasonality_multiplier(date):
    """Return the monthly demand factor for *date*."""
    month = date.month
    if month in (11, 12):  # Holiday season
        return 1.5
    if month in (1, 2):  # Post-holiday slump
        return 0.7
    if month in (6, 7, 8):  # Summer
        return 1.2
    return 1.0


def generate_sales_data(num_products=NUM_PRODUCTS, start_date=START_DATE,
                        end_date=END_DATE, seed=RANDOM_SEED):
    """Build the synthetic daily sales dataset.

    One row is produced per product per calendar day in
    ``[start_date, end_date]`` (both inclusive).

    Args:
        num_products: Number of products; ids run from 1 to ``num_products``.
        start_date: First day of the generated range.
        end_date: Last day of the generated range.
        seed: Seed for the random draws and for the final row shuffle.

    Returns:
        A shuffled ``pandas.DataFrame`` with the columns listed in ``COLUMNS``.
    """
    # A local generator keeps the run reproducible without touching the
    # global NumPy random state.
    rng = np.random.RandomState(seed)

    # Calendar effects depend only on the date, so compute them once instead
    # of once per product.
    calendar = [
        (day.strftime('%Y-%m-%d'), _weekend_multiplier(day), _seasonality_multiplier(day))
        for day in pd.date_range(start=start_date, end=end_date, freq='D')
    ]

    records = []
    for product_id in range(1, num_products + 1):
        # Assign category randomly
        category = str(rng.choice(CATEGORIES))

        # Base price varies by category (+/-20% around the category reference)
        base_price = CATEGORY_BASE_PRICES[category] * (0.8 + rng.random_sample() * 0.4)

        # Base demand varies by product: drawn once here, not per day, so the
        # daily rows of a product share a stable demand level and the calendar,
        # discount and price effects remain visible in the data.
        base_demand = rng.randint(10, 100)

        for date_label, weekend_multiplier, seasonality in calendar:
            # Random discount (0-30%)
            discount = int(rng.choice(DISCOUNT_LEVELS, p=DISCOUNT_PROBABILITIES))

            # Price with discount
            price = base_price * (1 - discount / 100)

            # Higher discount -> higher sales
            discount_effect = 1 + (discount / 100) * 0.5

            # Lower price -> higher sales (inverse relationship)
            price_effect = 1 / (1 + (price / 1000) * 0.1)

            # Multiplicative noise centred on 1
            noise = rng.normal(1, 0.15)

            sales_quantity = int(
                base_demand *
                weekend_multiplier *
                seasonality *
                discount_effect *
                price_effect *
                noise
            )

            records.append({
                'product_id': product_id,
                'date': date_label,
                'price': round(price, 2),
                'discount': discount,
                'category': category,
                # The noise factor can be negative; clamp to a valid quantity
                'sales_quantity': max(0, sales_quantity),
            })

    frame = pd.DataFrame(records, columns=COLUMNS)

    # Shuffle the data
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True)


def save_dataset(df, output_path=OUTPUT_PATH):
    """Write *df* to *output_path* as CSV, creating the parent directory if needed."""
    directory = os.path.dirname(output_path)
    if directory:  # empty when output_path has no directory component
        os.makedirs(directory, exist_ok=True)
    df.to_csv(output_path, index=False)


def main():
    """Generate the dataset, save it to ``OUTPUT_PATH`` and print a summary."""
    df = generate_sales_data()
    save_dataset(df, OUTPUT_PATH)

    print("Dataset generated successfully!")
    print(f"Total records: {len(df)}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"Number of products: {df['product_id'].nunique()}")
    print(f"Categories: {df['category'].nunique()}")
    print(f"\nDataset saved to: {OUTPUT_PATH}")
    print("\nFirst few rows:")
    print(df.head(10))
    print("\nDataset statistics:")
    print(df.describe())


if __name__ == "__main__":
    main()