# Hosting-page residue kept for provenance (not part of the program):
#   EdwardSamuel13's picture
#   Upload 14 files
#   8f69dec verified
import pandas as pd
import numpy as np
import json
import random
from faker import Faker
from datetime import datetime, timedelta
import os
# Shared Faker instance used by the generator functions in this file.
fake = Faker()

# Seed every random source so repeated runs make the same random choices.
# The stdlib ``random`` module supplies most of the generated values in this
# file, so it has to be seeded alongside Faker and NumPy.
# NOTE: timestamps are still derived from the current date/time, so absolute
# dates in the output differ between runs.
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Output directory: ``data`` inside the parent of this script's directory.
# It is created at import time if it does not exist yet.
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
os.makedirs(DATA_DIR, exist_ok=True)
def generate_sales_data(num_records=1000):
    """Generate synthetic sales transactions and save them as ``sales_data.csv``.

    Each record picks a random product from a fixed catalogue, a timestamp
    inside the 365 days before the current time, a quantity between 1 and 5
    and a unit price drawn uniformly from the product's price range.  A
    customer ID between 1001 and 1200, a region and a payment method are
    attached as well.  The rows are written to ``DATA_DIR/sales_data.csv``
    without the DataFrame index.

    Args:
        num_records: Number of transactions to generate.
    """
    print("Generating Sales Data...")
    products = {
        'Laptop': {'category': 'Electronics', 'price_range': (800, 2500)},
        'Smartphone': {'category': 'Electronics', 'price_range': (400, 1200)},
        'Headphones': {'category': 'Accessories', 'price_range': (50, 300)},
        'Monitor': {'category': 'Electronics', 'price_range': (150, 600)},
        'Mouse': {'category': 'Accessories', 'price_range': (20, 100)},
        'Keyboard': {'category': 'Accessories', 'price_range': (30, 150)},
        'Desk Chair': {'category': 'Furniture', 'price_range': (100, 500)},
        'Coffee Maker': {'category': 'Appliances', 'price_range': (40, 200)}
    }
    # The catalogue never changes inside the loop, so list its names once.
    product_names = list(products)
    data = []
    start_date = datetime.now() - timedelta(days=365)
    for _ in range(num_records):
        product_name = random.choice(product_names)
        info = products[product_name]
        # Day offsets stop at 364: together with up to 23 extra hours the
        # latest possible timestamp is one hour before "now", so no
        # transaction is ever dated in the future.
        date = start_date + timedelta(days=random.randint(0, 364), hours=random.randint(0, 23))
        quantity = random.randint(1, 5)
        price_unit = round(random.uniform(*info['price_range']), 2)
        total_price = quantity * price_unit
        data.append({
            'TransactionID': fake.uuid4(),
            'Date': date.strftime('%Y-%m-%d %H:%M:%S'),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': product_name,
            'Category': info['category'],
            'Quantity': quantity,
            'UnitPrice': price_unit,
            # Round again: multiplying a 2-decimal float can add float noise.
            'TotalPrice': round(total_price, 2),
            'Region': random.choice(['North', 'South', 'East', 'West']),
            'PaymentMethod': random.choice(['Credit Card', 'PayPal', 'Debit Card', 'Bank Transfer'])
        })
    df = pd.DataFrame(data)
    df.to_csv(os.path.join(DATA_DIR, 'sales_data.csv'), index=False)
    print(f"Saved {num_records} sales records.")
def generate_web_logs(num_records=500):
    """Create synthetic web-server log entries and dump them to ``web_logs.json``.

    Every entry carries an ISO-formatted timestamp taken from the seven days
    before the current time (offset chosen in whole minutes), a fake IPv4
    address, an HTTP method, an endpoint, a weighted-random status code, a
    fake user agent and a response time between 20 and 1500 ms.  The list is
    written to ``DATA_DIR/web_logs.json`` with an indent of 4.

    Args:
        num_records: Number of log entries to generate.
    """
    print("Generating Web Logs...")
    entries = []
    window_start = datetime.now() - timedelta(days=7)
    window_minutes = 7 * 24 * 60
    methods = ['GET', 'POST']
    paths = ['/home', '/products', '/cart', '/checkout', '/login', '/support', '/search']
    # Status codes and their relative weights are paired by position.
    codes = [200, 200, 200, 201, 302, 404, 500]
    code_weights = [60, 10, 5, 5, 10, 8, 2]

    def build_entry():
        # Draw the timestamp first, then fill the fields in a fixed order.
        offset = timedelta(minutes=random.randint(0, window_minutes))
        moment = window_start + offset
        return {
            'timestamp': moment.isoformat(),
            'ip_address': fake.ipv4(),
            'request_method': random.choice(methods),
            'endpoint': random.choice(paths),
            'status_code': random.choices(codes, weights=code_weights)[0],
            'user_agent': fake.user_agent(),
            'response_time_ms': random.randint(20, 1500)
        }

    for _ in range(num_records):
        entries.append(build_entry())

    target = os.path.join(DATA_DIR, 'web_logs.json')
    with open(target, 'w') as handle:
        json.dump(entries, handle, indent=4)
    print(f"Saved {num_records} web logs.")
def generate_reviews(num_records=300):
    """Produce synthetic customer reviews and save them as ``customer_reviews.csv``.

    For each review a product and a sentiment (positive, negative or neutral)
    are picked at random.  Positive reviews get a rating of 4-5 and a positive
    template, negative reviews a rating of 1-2 and a negative template, and
    neutral reviews a rating of 3 with a fixed sentence.  Rows are written to
    ``DATA_DIR/customer_reviews.csv`` without the DataFrame index.

    Args:
        num_records: Number of reviews to generate.
    """
    print("Generating Customer Reviews...")
    catalogue = ['Laptop', 'Smartphone', 'Headphones', 'Monitor', 'Mouse', 'Keyboard', 'Desk Chair', 'Coffee Maker']
    sentiments = ['positive', 'negative', 'neutral']
    # Review text templates; some mention the product, some do not.
    praise = [
        "I love this {product}! It works great.",
        "Excellent quality, highly recommended.",
        "Good value for money. The {product} exceeded my expectations.",
        "Fast shipping and great service.",
        "The {product} is amazing, 5 stars!"
    ]
    complaints = [
        "Terrible experience with this {product}.",
        "Broken on arrival. Very disappointed.",
        "Not worth the money. Analyzing the {product} shows poor build quality.",
        "Customer service was unhelpful.",
        "The {product} stopped working after two days."
    ]
    # sentiment -> (lowest rating, highest rating, templates); neutral is
    # absent on purpose and handled separately below.
    profiles = {
        'positive': (4, 5, praise),
        'negative': (1, 2, complaints),
    }

    rows = []
    for _ in range(num_records):
        item = random.choice(catalogue)
        mood = random.choice(sentiments)
        profile = profiles.get(mood)
        if profile is None:
            stars = 3
            body = f"The {item} is okay, but could be better."
        else:
            low, high, templates = profile
            stars = random.randint(low, high)
            body = random.choice(templates).format(product=item)
        rows.append({
            'ReviewID': fake.uuid4(),
            'CustomerID': fake.random_int(min=1001, max=1200),
            'Product': item,
            'Rating': stars,
            'ReviewText': body,
            'Date': fake.date_this_year().strftime('%Y-%m-%d')
        })

    target = os.path.join(DATA_DIR, 'customer_reviews.csv')
    pd.DataFrame(rows).to_csv(target, index=False)
    print(f"Saved {num_records} reviews.")
if __name__ == "__main__":
    # Run each generator with its record count, in this order.
    jobs = (
        (generate_sales_data, 2000),
        (generate_web_logs, 1000),
        (generate_reviews, 500),
    )
    for generator, count in jobs:
        generator(count)