"""Generate synthetic sales, web-log, and customer-review datasets.

Running this module as a script writes three files into ``DATA_DIR``:
``sales_data.csv``, ``web_logs.json`` and ``customer_reviews.csv``.
"""
import json
import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker

fake = Faker()

# Seed every random source used below. Most values come from the stdlib
# ``random`` module, which has its own global state and is not covered by
# seeding Faker or NumPy, so it is seeded explicitly.
# NOTE: timestamps are still anchored to ``datetime.now()`` (and
# ``fake.date_this_year()`` depends on the current date), so date columns
# differ between runs made at different times.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Output directory: "<parent of this file's directory>/data".
DATA_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data'
)
os.makedirs(DATA_DIR, exist_ok=True)

# Product catalog shared by the sales and review generators so both draw
# from the same set of product names.
PRODUCT_CATALOG = {
    'Laptop': {'category': 'Electronics', 'price_range': (800, 2500)},
    'Smartphone': {'category': 'Electronics', 'price_range': (400, 1200)},
    'Headphones': {'category': 'Accessories', 'price_range': (50, 300)},
    'Monitor': {'category': 'Electronics', 'price_range': (150, 600)},
    'Mouse': {'category': 'Accessories', 'price_range': (20, 100)},
    'Keyboard': {'category': 'Accessories', 'price_range': (30, 150)},
    'Desk Chair': {'category': 'Furniture', 'price_range': (100, 500)},
    'Coffee Maker': {'category': 'Appliances', 'price_range': (40, 200)},
}

# Explicit column orders so the CSV header is written even for zero records.
SALES_COLUMNS = [
    'TransactionID', 'Date', 'CustomerID', 'Product', 'Category',
    'Quantity', 'UnitPrice', 'TotalPrice', 'Region', 'PaymentMethod',
]
REVIEW_COLUMNS = [
    'ReviewID', 'CustomerID', 'Product', 'Rating', 'ReviewText', 'Date',
]


def generate_sales_data(num_records: int = 1000) -> None:
    """Write ``num_records`` synthetic sales rows to ``sales_data.csv``.

    Each row picks a random product from ``PRODUCT_CATALOG``, a unit price
    inside that product's price range, a quantity of 1-5, and a timestamp
    within the 365 days preceding the moment the function is called.

    Args:
        num_records: Number of rows to generate.
    """
    print("Generating Sales Data...")

    # Hoisted out of the loop: these lists never change between iterations.
    product_names = list(PRODUCT_CATALOG)
    regions = ['North', 'South', 'East', 'West']
    payment_methods = ['Credit Card', 'PayPal', 'Debit Card', 'Bank Transfer']

    data = []
    start_date = datetime.now() - timedelta(days=365)

    for _ in range(num_records):
        product_name = random.choice(product_names)
        info = PRODUCT_CATALOG[product_name]

        # Day offset is capped at 364 so that the extra 0-23 hours can never
        # push the timestamp past "now" (start_date + 365 days).
        date = start_date + timedelta(
            days=random.randint(0, 364),
            hours=random.randint(0, 23),
        )

        quantity = random.randint(1, 5)
        price_unit = round(random.uniform(*info['price_range']), 2)
        total_price = quantity * price_unit

        data.append({
            'TransactionID': fake.uuid4(),
            'Date': date.strftime('%Y-%m-%d %H:%M:%S'),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': product_name,
            'Category': info['category'],
            'Quantity': quantity,
            'UnitPrice': price_unit,
            # Rounded again because the float product can carry more than
            # two decimal places.
            'TotalPrice': round(total_price, 2),
            'Region': random.choice(regions),
            'PaymentMethod': random.choice(payment_methods),
        })

    df = pd.DataFrame(data, columns=SALES_COLUMNS)
    df.to_csv(os.path.join(DATA_DIR, 'sales_data.csv'), index=False)
    print(f"Saved {num_records} sales records.")


def generate_web_logs(num_records: int = 500) -> None:
    """Write ``num_records`` synthetic web-log entries to ``web_logs.json``.

    Timestamps fall within the 7 days preceding the moment the function is
    called, at minute granularity.

    Args:
        num_records: Number of log entries to generate.
    """
    print("Generating Web Logs...")

    logs = []
    start_date = datetime.now() - timedelta(days=7)
    endpoints = ['/home', '/products', '/cart', '/checkout', '/login', '/support', '/search']
    # 200 is listed three times, so its combined weight is 60 + 10 + 5 = 75
    # out of 100; the remaining codes get 5, 10, 8 and 2 respectively.
    status_codes = [200, 200, 200, 201, 302, 404, 500]
    status_weights = [60, 10, 5, 5, 10, 8, 2]
    request_methods = ['GET', 'POST']
    minutes_in_window = 7 * 24 * 60

    for _ in range(num_records):
        timestamp = start_date + timedelta(minutes=random.randint(0, minutes_in_window))
        logs.append({
            'timestamp': timestamp.isoformat(),
            'ip_address': fake.ipv4(),
            'request_method': random.choice(request_methods),
            'endpoint': random.choice(endpoints),
            'status_code': random.choices(status_codes, weights=status_weights)[0],
            'user_agent': fake.user_agent(),
            'response_time_ms': random.randint(20, 1500),
        })

    with open(os.path.join(DATA_DIR, 'web_logs.json'), 'w', encoding='utf-8') as f:
        json.dump(logs, f, indent=4)
    print(f"Saved {num_records} web logs.")


def generate_reviews(num_records: int = 300) -> None:
    """Write ``num_records`` synthetic reviews to ``customer_reviews.csv``.

    Each review gets a sentiment chosen uniformly from positive, negative
    and neutral. Positive reviews are rated 4-5, negative ones 1-2, and
    neutral ones exactly 3.

    Args:
        num_records: Number of reviews to generate.
    """
    print("Generating Customer Reviews...")

    reviews = []

    # Simple templates for negative/positive reviews. Templates without a
    # ``{product}`` placeholder are left unchanged by ``str.format``.
    positive_templates = [
        "I love this {product}! It works great.",
        "Excellent quality, highly recommended.",
        "Good value for money. The {product} exceeded my expectations.",
        "Fast shipping and great service.",
        "The {product} is amazing, 5 stars!"
    ]

    negative_templates = [
        "Terrible experience with this {product}.",
        "Broken on arrival. Very disappointed.",
        "Not worth the money. Analyzing the {product} shows poor build quality.",
        "Customer service was unhelpful.",
        "The {product} stopped working after two days."
    ]

    # Same product names, in the same order, as the sales catalog.
    products = list(PRODUCT_CATALOG)
    sentiments = ['positive', 'negative', 'neutral']

    for _ in range(num_records):
        product = random.choice(products)
        sentiment = random.choice(sentiments)

        if sentiment == 'positive':
            rating = random.randint(4, 5)
            text = random.choice(positive_templates).format(product=product)
        elif sentiment == 'negative':
            rating = random.randint(1, 2)
            text = random.choice(negative_templates).format(product=product)
        else:
            rating = 3
            text = f"The {product} is okay, but could be better."

        reviews.append({
            'ReviewID': fake.uuid4(),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': product,
            'Rating': rating,
            'ReviewText': text,
            'Date': fake.date_this_year().strftime('%Y-%m-%d'),
        })

    df = pd.DataFrame(reviews, columns=REVIEW_COLUMNS)
    df.to_csv(os.path.join(DATA_DIR, 'customer_reviews.csv'), index=False)
    print(f"Saved {num_records} reviews.")


if __name__ == "__main__":
    generate_sales_data(2000)
    generate_web_logs(1000)
    generate_reviews(500)