|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import json
|
|
|
import random
|
|
|
from faker import Faker
|
|
|
from datetime import datetime, timedelta
|
|
|
import os
|
|
|
|
|
|
# Shared Faker instance used by every generator in this module.
fake = Faker()

# Fixed seeds so repeated runs produce the same random draws.  The generators
# below take most of their values from the stdlib ``random`` module, which has
# its own global state that neither ``Faker.seed`` nor ``np.random.seed``
# touches, so it has to be seeded explicitly as well.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Output directory: a ``data`` folder next to the directory containing this
# file (i.e. <parent of this file's directory>/data).  Created on import if
# it does not exist yet.
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
os.makedirs(DATA_DIR, exist_ok=True)
|
|
|
|
|
|
def generate_sales_data(num_records=1000):
    """Generate synthetic sales transactions and save them as ``sales_data.csv``.

    Every record picks a random product from a fixed catalogue, a quantity
    between 1 and 5, a unit price drawn uniformly from that product's price
    range, and a timestamp inside the 365 days that precede the call.  The
    rows are written to ``DATA_DIR/sales_data.csv`` without the DataFrame
    index.

    Args:
        num_records: Number of transactions to generate.

    Returns:
        None.  The result is the CSV file written to ``DATA_DIR``.
    """
    print("Generating Sales Data...")

    # Product catalogue: category plus the (min, max) unit-price range.
    products = {
        'Laptop': {'category': 'Electronics', 'price_range': (800, 2500)},
        'Smartphone': {'category': 'Electronics', 'price_range': (400, 1200)},
        'Headphones': {'category': 'Accessories', 'price_range': (50, 300)},
        'Monitor': {'category': 'Electronics', 'price_range': (150, 600)},
        'Mouse': {'category': 'Accessories', 'price_range': (20, 100)},
        'Keyboard': {'category': 'Accessories', 'price_range': (30, 150)},
        'Desk Chair': {'category': 'Furniture', 'price_range': (100, 500)},
        'Coffee Maker': {'category': 'Appliances', 'price_range': (40, 200)},
    }

    # Loop-invariant choice pools, built once instead of once per record.
    product_names = list(products)
    regions = ['North', 'South', 'East', 'West']
    payment_methods = ['Credit Card', 'PayPal', 'Debit Card', 'Bank Transfer']

    data = []
    start_date = datetime.now() - timedelta(days=365)

    for _ in range(num_records):
        product_name = random.choice(product_names)
        info = products[product_name]

        # The day offset stops at 364: together with up to 23 extra hours the
        # largest offset is 364d 23h, which keeps every timestamp in the past.
        # (An offset of 365 days plus some hours would land in the future.)
        date = start_date + timedelta(
            days=random.randint(0, 364),
            hours=random.randint(0, 23),
        )
        quantity = random.randint(1, 5)
        price_unit = round(random.uniform(*info['price_range']), 2)
        total_price = quantity * price_unit

        data.append({
            'TransactionID': fake.uuid4(),
            'Date': date.strftime('%Y-%m-%d %H:%M:%S'),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': product_name,
            'Category': info['category'],
            'Quantity': quantity,
            'UnitPrice': price_unit,
            # Rounded again because the float product can carry extra digits.
            'TotalPrice': round(total_price, 2),
            'Region': random.choice(regions),
            'PaymentMethod': random.choice(payment_methods),
        })

    df = pd.DataFrame(data)
    df.to_csv(os.path.join(DATA_DIR, 'sales_data.csv'), index=False)
    print(f"Saved {num_records} sales records.")
|
|
|
|
|
|
def generate_web_logs(num_records=500):
    """Generate synthetic web access-log entries and save them as ``web_logs.json``.

    Each entry gets a timestamp at a random whole-minute offset inside the
    seven days before the call, a Faker-generated IPv4 address and user
    agent, a random request method and endpoint, a weighted-random status
    code, and a response time between 20 and 1500 ms.  The entries are
    dumped as an indented JSON array to ``DATA_DIR/web_logs.json``.

    Args:
        num_records: Number of log entries to generate.

    Returns:
        None.  The result is the JSON file written to ``DATA_DIR``.
    """
    print("Generating Web Logs...")

    window_start = datetime.now() - timedelta(days=7)
    window_minutes = 7 * 24 * 60

    methods = ['GET', 'POST']
    paths = ['/home', '/products', '/cart', '/checkout', '/login', '/support', '/search']
    # 200 appears three times in the population, so its effective weight is
    # the sum of the first three weights.
    codes = [200, 200, 200, 201, 302, 404, 500]
    code_weights = [60, 10, 5, 5, 10, 8, 2]

    def make_entry():
        """Build one log entry; values are drawn in the order the keys appear."""
        moment = window_start + timedelta(minutes=random.randint(0, window_minutes))
        return {
            'timestamp': moment.isoformat(),
            'ip_address': fake.ipv4(),
            'request_method': random.choice(methods),
            'endpoint': random.choice(paths),
            'status_code': random.choices(codes, weights=code_weights)[0],
            'user_agent': fake.user_agent(),
            'response_time_ms': random.randint(20, 1500),
        }

    entries = [make_entry() for _ in range(num_records)]

    target = os.path.join(DATA_DIR, 'web_logs.json')
    with open(target, 'w') as sink:
        json.dump(entries, sink, indent=4)

    print(f"Saved {num_records} web logs.")
|
|
|
|
|
|
def generate_reviews(num_records=300):
    """Generate synthetic customer reviews and save them as ``customer_reviews.csv``.

    For each review a product and a sentiment (positive, negative or neutral)
    are chosen at random.  Positive reviews get a rating of 4-5 and a positive
    template, negative reviews a rating of 1-2 and a negative template, and
    neutral reviews a rating of 3 with a fixed sentence.  Rows are written to
    ``DATA_DIR/customer_reviews.csv`` without the DataFrame index.

    Args:
        num_records: Number of reviews to generate.

    Returns:
        None.  The result is the CSV file written to ``DATA_DIR``.
    """
    print("Generating Customer Reviews...")

    catalogue = ['Laptop', 'Smartphone', 'Headphones', 'Monitor', 'Mouse', 'Keyboard', 'Desk Chair', 'Coffee Maker']
    moods = ['positive', 'negative', 'neutral']

    # Per-sentiment profile: (lowest rating, highest rating, text templates).
    # Neutral is absent because it uses a fixed rating and sentence.
    profiles = {
        'positive': (4, 5, [
            "I love this {product}! It works great.",
            "Excellent quality, highly recommended.",
            "Good value for money. The {product} exceeded my expectations.",
            "Fast shipping and great service.",
            "The {product} is amazing, 5 stars!",
        ]),
        'negative': (1, 2, [
            "Terrible experience with this {product}.",
            "Broken on arrival. Very disappointed.",
            "Not worth the money. Analyzing the {product} shows poor build quality.",
            "Customer service was unhelpful.",
            "The {product} stopped working after two days.",
        ]),
    }

    def make_review():
        """Build one review row; rating is drawn before the template."""
        item = random.choice(catalogue)
        mood = random.choice(moods)

        profile = profiles.get(mood)
        if profile is None:
            stars = 3
            body = f"The {item} is okay, but could be better."
        else:
            lowest, highest, templates = profile
            stars = random.randint(lowest, highest)
            body = random.choice(templates).format(product=item)

        return {
            'ReviewID': fake.uuid4(),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': item,
            'Rating': stars,
            'ReviewText': body,
            'Date': fake.date_this_year().strftime('%Y-%m-%d'),
        }

    rows = [make_review() for _ in range(num_records)]

    frame = pd.DataFrame(rows)
    frame.to_csv(os.path.join(DATA_DIR, 'customer_reviews.csv'), index=False)
    print(f"Saved {num_records} reviews.")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run each generator in turn with its record count when executed as a script.
    for generator, record_count in (
        (generate_sales_data, 2000),
        (generate_web_logs, 1000),
        (generate_reviews, 500),
    ):
        generator(record_count)
|
|
|
|