File size: 5,439 Bytes
8f69dec | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | import pandas as pd
import numpy as np
import json
import random
from faker import Faker
from datetime import datetime, timedelta
import os
# One shared Faker instance supplies the UUIDs, customer ids, IP addresses,
# user agents and review dates used by the generators in this file.
fake = Faker()

# Seed every random source this script has in scope so repeated runs make the
# same random draws. The generators call the stdlib `random` module directly,
# and seeding Faker or NumPy does not seed that module's global generator, so
# it needs its own explicit seed.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Output directory: "<parent of this script's directory>/data". It is created
# at import time so the generators can write into it unconditionally.
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
os.makedirs(DATA_DIR, exist_ok=True)
def generate_sales_data(num_records=1000):
    """Generate synthetic sales transactions and write them to ``sales_data.csv``.

    Each record picks a random product from a fixed catalogue, a quantity
    between 1 and 5, a unit price drawn uniformly from the product's price
    range, and a timestamp inside the 365 days that end at the moment the
    function is called. The file is written into ``DATA_DIR``.

    Args:
        num_records: Number of transactions to generate. With 0 the CSV is
            still written and contains only the header row.
    """
    print("Generating Sales Data...")
    products = {
        'Laptop': {'category': 'Electronics', 'price_range': (800, 2500)},
        'Smartphone': {'category': 'Electronics', 'price_range': (400, 1200)},
        'Headphones': {'category': 'Accessories', 'price_range': (50, 300)},
        'Monitor': {'category': 'Electronics', 'price_range': (150, 600)},
        'Mouse': {'category': 'Accessories', 'price_range': (20, 100)},
        'Keyboard': {'category': 'Accessories', 'price_range': (30, 150)},
        'Desk Chair': {'category': 'Furniture', 'price_range': (100, 500)},
        'Coffee Maker': {'category': 'Appliances', 'price_range': (40, 200)}
    }
    # Loop-invariant choice pools, built once instead of once per record.
    product_names = list(products)
    regions = ['North', 'South', 'East', 'West']
    payment_methods = ['Credit Card', 'PayPal', 'Debit Card', 'Bank Transfer']
    # Explicit schema so the CSV keeps its header even when no rows are generated.
    columns = [
        'TransactionID', 'Date', 'CustomerID', 'Product', 'Category',
        'Quantity', 'UnitPrice', 'TotalPrice', 'Region', 'PaymentMethod',
    ]

    data = []
    start_date = datetime.now() - timedelta(days=365)
    for _ in range(num_records):
        product_name = random.choice(product_names)
        info = products[product_name]
        # The day offset stops at 364: together with up to 23 extra hours the
        # largest offset is 364d 23h, so no transaction lands after "now".
        date = start_date + timedelta(days=random.randint(0, 364), hours=random.randint(0, 23))
        quantity = random.randint(1, 5)
        price_unit = round(random.uniform(*info['price_range']), 2)
        total_price = quantity * price_unit
        data.append({
            'TransactionID': fake.uuid4(),
            'Date': date.strftime('%Y-%m-%d %H:%M:%S'),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': product_name,
            'Category': info['category'],
            'Quantity': quantity,
            'UnitPrice': price_unit,
            # Re-round: multiplying a 2-decimal float can add float noise.
            'TotalPrice': round(total_price, 2),
            'Region': random.choice(regions),
            'PaymentMethod': random.choice(payment_methods)
        })

    df = pd.DataFrame(data, columns=columns)
    df.to_csv(os.path.join(DATA_DIR, 'sales_data.csv'), index=False)
    print(f"Saved {num_records} sales records.")
def generate_web_logs(num_records=500):
    """Generate synthetic web-server log entries and write ``web_logs.json``.

    Every entry gets a timestamp inside the 7 days that end when the function
    is called, a random IPv4 address, request method, endpoint, weighted
    status code, user agent and response time. The entries are dumped as a
    JSON list into ``DATA_DIR``.

    Args:
        num_records: Number of log entries to generate.
    """
    print("Generating Web Logs...")
    window_start = datetime.now() - timedelta(days=7)
    minutes_in_window = 7 * 24 * 60
    methods = ['GET', 'POST']
    paths = ['/home', '/products', '/cart', '/checkout', '/login', '/support', '/search']
    # 200 appears three times; each occurrence carries its own weight below.
    codes = [200, 200, 200, 201, 302, 404, 500]
    code_weights = [60, 10, 5, 5, 10, 8, 2]

    # Dict entries are evaluated top to bottom, so the order of random draws
    # per entry is: timestamp, IP, method, endpoint, status, agent, latency.
    entries = [
        {
            'timestamp': (
                window_start + timedelta(minutes=random.randint(0, minutes_in_window))
            ).isoformat(),
            'ip_address': fake.ipv4(),
            'request_method': random.choice(methods),
            'endpoint': random.choice(paths),
            'status_code': random.choices(codes, weights=code_weights)[0],
            'user_agent': fake.user_agent(),
            'response_time_ms': random.randint(20, 1500),
        }
        for _ in range(num_records)
    ]

    target = os.path.join(DATA_DIR, 'web_logs.json')
    with open(target, 'w') as f:
        json.dump(entries, f, indent=4)
    print(f"Saved {num_records} web logs.")
def generate_reviews(num_records=300):
    """Generate synthetic customer reviews and write ``customer_reviews.csv``.

    Each review picks a random product and a sentiment. Positive reviews get
    a rating of 4-5 and a positive template, negative reviews a rating of 1-2
    and a negative template, and neutral reviews a rating of 3 with a fixed
    sentence. The file is written into ``DATA_DIR``.

    Args:
        num_records: Number of reviews to generate. With 0 the CSV is still
            written and contains only the header row.
    """
    print("Generating Customer Reviews...")
    reviews = []
    # Text templates per sentiment; "{product}" is filled with the product
    # name, and templates without the placeholder are used verbatim.
    positive_templates = [
        "I love this {product}! It works great.",
        "Excellent quality, highly recommended.",
        "Good value for money. The {product} exceeded my expectations.",
        "Fast shipping and great service.",
        "The {product} is amazing, 5 stars!"
    ]
    negative_templates = [
        "Terrible experience with this {product}.",
        "Broken on arrival. Very disappointed.",
        "Not worth the money. Analyzing the {product} shows poor build quality.",
        "Customer service was unhelpful.",
        "The {product} stopped working after two days."
    ]
    products = ['Laptop', 'Smartphone', 'Headphones', 'Monitor', 'Mouse', 'Keyboard', 'Desk Chair', 'Coffee Maker']
    sentiments = ['positive', 'negative', 'neutral']
    # Explicit schema so the CSV keeps its header even when no rows are generated.
    columns = ['ReviewID', 'CustomerID', 'Product', 'Rating', 'ReviewText', 'Date']

    for _ in range(num_records):
        product = random.choice(products)
        sentiment = random.choice(sentiments)
        if sentiment == 'positive':
            rating = random.randint(4, 5)
            text = random.choice(positive_templates).format(product=product)
        elif sentiment == 'negative':
            rating = random.randint(1, 2)
            text = random.choice(negative_templates).format(product=product)
        else:
            # Neutral reviews always use the middle rating and one fixed sentence.
            rating = 3
            text = f"The {product} is okay, but could be better."
        reviews.append({
            'ReviewID': fake.uuid4(),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': product,
            'Rating': rating,
            'ReviewText': text,
            'Date': fake.date_this_year().strftime('%Y-%m-%d')
        })

    df = pd.DataFrame(reviews, columns=columns)
    df.to_csv(os.path.join(DATA_DIR, 'customer_reviews.csv'), index=False)
    print(f"Saved {num_records} reviews.")
if __name__ == "__main__":
    # Run each generator in turn with the record count used for this dataset.
    for build_dataset, record_count in (
        (generate_sales_data, 2000),
        (generate_web_logs, 1000),
        (generate_reviews, 500),
    ):
        build_dataset(record_count)
|