| |
|
import random
import uuid
from pathlib import Path

import pandas as pd
from faker import Faker
|
| |
|
# Module-level Faker instance.
# NOTE(review): none of the functions visible in this file read `fake` --
# confirm whether other code depends on it before removing.
fake = Faker()
|
| |
|
def _retail_fields():
    """Return the extra fields shared by Retail and E-commerce customers."""
    return {
        "ProductCategory": random.choice(
            ["Electronics", "Fashion", "Home", "Sports", "Beauty"]
        ),
        "AvgRating": round(random.uniform(1, 5), 1),
        "CartAbandonRate": round(random.uniform(0, 0.5), 2),
    }


def _banking_fields():
    """Return the extra fields recorded for Banking customers."""
    return {
        "CreditScore": random.randint(300, 850),
        "NumTransactions": random.randint(10, 200),
    }


def _telecom_fields():
    """Return the extra fields recorded for Telecom customers."""
    return {
        "MonthlyDataGB": round(random.uniform(0.5, 50), 2),
        "CallMinutes": random.randint(100, 3000),
    }


def _travel_fields():
    """Return the extra fields recorded for Travel customers."""
    return {
        "TripsPerYear": random.randint(0, 15),
        "LoyaltyTier": random.choice(["Bronze", "Silver", "Gold", "Platinum"]),
    }


# Maps each sector to the builder of its sector-specific fields. The
# insertion order is also the order the sector is sampled from, so keep it
# stable to preserve output under a fixed random seed.
_SECTOR_FIELD_BUILDERS = {
    "Retail": _retail_fields,
    "E-commerce": _retail_fields,
    "Banking": _banking_fields,
    "Telecom": _telecom_fields,
    "Travel": _travel_fields,
}


def generate_customer_record():
    """Generate one synthetic customer record as a dict.

    Every record contains the common fields ``CustomerID``, ``Age``,
    ``Gender``, ``Income``, ``PurchaseFrequency``, ``AvgSpend``,
    ``ChurnRisk`` and ``Sector``, followed by fields that depend on the
    randomly chosen sector:

    * Retail / E-commerce: ``ProductCategory``, ``AvgRating``,
      ``CartAbandonRate``
    * Banking: ``CreditScore``, ``NumTransactions``
    * Telecom: ``MonthlyDataGB``, ``CallMinutes``
    * Travel: ``TripsPerYear``, ``LoyaltyTier``

    Values other than ``CustomerID`` are drawn from the module-level
    ``random`` generator; ``CustomerID`` is a ``uuid.uuid4()`` string.

    Returns:
        dict: The record, with keys in the order listed above.
    """
    # Dict values are evaluated top to bottom, so the sequence of random
    # draws is fixed by the order of the entries below.
    record = {
        "CustomerID": str(uuid.uuid4()),
        "Age": random.randint(18, 80),
        "Gender": random.choice(["Male", "Female", "Other"]),
        "Income": round(random.uniform(20000, 150000), 2),
        "PurchaseFrequency": random.randint(1, 100),
        "AvgSpend": round(random.uniform(10, 2000), 2),
        "ChurnRisk": random.choice(["Low", "Medium", "High"]),
    }

    # The sector is always one of the table's keys, so no fallback branch
    # is needed.
    sector = random.choice(list(_SECTOR_FIELD_BUILDERS))
    record["Sector"] = sector
    record.update(_SECTOR_FIELD_BUILDERS[sector]())
    return record
|
| |
|
def generate_dataset(num_records=30000, output_path=None):
    """Generate synthetic customer records and save them as a CSV file.

    Args:
        num_records: Number of records to generate.
        output_path: Destination CSV file (string or path-like). When
            omitted, the file is written to ``data/customers.csv`` relative
            to the current working directory.
    """
    # Build the default with pathlib so the separator is right on every OS;
    # a hard-coded "data\\customers.csv" is only a directory path on Windows.
    if output_path is None:
        output_path = Path("data") / "customers.csv"
    else:
        output_path = Path(output_path)

    records = [generate_customer_record() for _ in range(num_records)]
    df = pd.DataFrame(records)

    # DataFrame.to_csv does not create missing directories, so make sure
    # the target folder exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    # Report the path that was actually written.
    print(f" Generated {num_records} customer records and saved to {output_path}")
|
| |
|
# Script entry point: generate the dataset with the default arguments when
# this file is executed directly (not when it is imported).
if __name__ == "__main__":
    generate_dataset()
|
| |
|