File size: 11,547 Bytes
133a630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import pandas as pd
import numpy as np
import os

# Seed NumPy's global RNG so every np.random.* draw below follows a fixed sequence.
np.random.seed(42)

# Directory that contains this script; main() writes the generated CSV files here.
DATA_DIR = os.path.dirname(os.path.abspath(__file__))

# Top-level category name -> list of its subcategory names.
# The keys are also the pool from which users' preferred categories are drawn.
CATEGORIES = {
    "Electronics": ["Smartphones", "Laptops", "Headphones", "Tablets", "Cameras"],
    "Clothing": ["Men's", "Women's", "Kids'", "Accessories", "Footwear"],
    "Home & Kitchen": ["Furniture", "Appliances", "Cookware", "Decor", "Storage"],
    "Books": ["Fiction", "Non-Fiction", "Science", "Technology", "Self-Help"],
    "Sports": ["Fitness", "Outdoor", "Team Sports", "Cycling", "Swimming"],
    "Beauty": ["Skincare", "Makeup", "Haircare", "Fragrance", "Tools"],
    "Toys": ["Educational", "Action Figures", "Board Games", "Dolls", "Puzzles"],
    "Automotive": ["Car Care", "Interior", "Exterior", "Tools", "Electronics"],
}

# Top-level category name -> brands a product of that category can be assigned.
# Some brands (e.g. "Nike", "Adidas") appear under more than one category.
BRANDS_BY_CATEGORY = {
    "Electronics": ["Samsung", "Apple", "Sony", "LG", "Dell", "HP", "Bose", "Canon"],
    "Clothing": ["Nike", "Adidas", "Zara", "H&M", "Levi's", "Puma", "Under Armour"],
    "Home & Kitchen": ["IKEA", "KitchenAid", "Ninja", "Dyson", "OXO", "Cuisinart"],
    "Books": ["Penguin", "HarperCollins", "Random House", "Simon & Schuster", "Macmillan"],
    "Sports": ["Nike", "Adidas", "Wilson", "Spalding", "The North Face", "Columbia"],
    "Beauty": ["L'Oreal", "Maybelline", "Clinique", "Estee Lauder", "Neutrogena"],
    "Toys": ["LEGO", "Mattel", "Hasbro", "Fisher-Price", "Melissa & Doug"],
    "Automotive": ["Meguiar's", "Armor All", "WeatherTech", "Michelin", "Bosch"],
}

# Category name -> subcategory name -> list of product names.
# generate_products() takes names from the front of each list, in order.
PRODUCT_NAMES = {
    "Electronics": {
        "Smartphones": ["Galaxy S24", "iPhone 15 Pro", "Xperia 1 V", "Pixel 8 Pro", "LG G4"],
        "Laptops": ["ThinkPad X1", "MacBook Pro", "XPS 15", "Surface Laptop", "Spectre x360"],
        "Headphones": ["WH-1000XM5", "AirPods Pro", "QuietComfort 45", "Galaxy Buds", "Momentum 4"],
        "Tablets": ["iPad Pro", "Galaxy Tab S9", "Surface Pro", "Fire HD 10", "Lenovo Tab"],
        "Cameras": ["EOS R5", "Alpha A7 IV", "Z8", "X-T5", "Lumix S5"],
    },
    "Clothing": {
        "Men's": ["Classic Fit Jeans", "Cotton T-Shirt", "Blazer", "Chino Pants", "Hoodie"],
        "Women's": ["Summer Dress", "Yoga Pants", "Leather Jacket", "Silk Blouse", "Maxi Skirt"],
        "Kids'": ["Colorful Leggings", "Graphic Tee", "Denim Jacket", "Plaid Shirt", "Joggers"],
        "Accessories": ["Leather Belt", "Sunglasses", "Wool Scarf", "Baseball Cap", "Watch"],
        "Footwear": ["Running Shoes", "Sandals", "Boots", "Loafers", "Sneakers"],
    },
    "Home & Kitchen": {
        "Furniture": ["Sofa Set", "Coffee Table", "Bookshelf", "Desk Chair", "Bed Frame"],
        "Appliances": ["Air Fryer", "Blender", "Coffee Maker", "Microwave", "Toaster"],
        "Cookware": ["Non-Stick Pan", "Chef's Knife", "Cutting Board", "Pot Set", "Baking Sheet"],
        "Decor": ["Table Lamp", "Wall Art", "Vase", "Throw Pillow", "Candle Set"],
        "Storage": ["Plastic Bins", "Shelf Organizer", "Shoe Rack", "Closet System", "Drawer Divider"],
    },
    "Books": {
        "Fiction": ["The Silent Echo", "Midnight Sun", "Ocean's Memory", "The Last Garden", "Crimson Peak"],
        "Non-Fiction": ["Atomic Habits", "Sapiens", "Educated", "The Power of Now", "Outliers"],
        "Science": ["A Brief History of Time", "The Gene", "Cosmos", "The Selfish Gene", "Six Easy Pieces"],
        "Technology": ["Clean Code", "Design Patterns", "Introduction to Algorithms", "Structure and Interpretation", "The Pragmatic Programmer"],
        "Self-Help": ["The 7 Habits", "How to Win Friends", "Think and Grow Rich", "The Subtle Art", "Daring Greatly"],
    },
    "Sports": {
        "Fitness": ["Adjustable Dumbbells", "Yoga Mat", "Resistance Bands", "Jump Rope", "Foam Roller"],
        "Outdoor": ["Camping Tent", "Hiking Backpack", "Sleeping Bag", "Portable Stove", "Water Filter"],
        "Team Sports": ["Soccer Ball", "Basketball", "Volleyball", "Baseball Glove", "Football"],
        "Cycling": ["Mountain Bike", "Helmet", "Bike Pump", "Cycling Jersey", "Bike Lock"],
        "Swimming": ["Swim Goggles", "Kickboard", "Swim Cap", "Ear Plugs", "Waterproof Bag"],
    },
    "Beauty": {
        "Skincare": ["Moisturizer", "Face Serum", "Sunscreen", "Eye Cream", "Face Mask"],
        "Makeup": ["Foundation", "Lipstick", "Mascara", "Eyeshadow Palette", "Concealer"],
        "Haircare": ["Shampoo", "Conditioner", "Hair Oil", "Hair Dryer", "Straightener"],
        "Fragrance": ["Eau de Parfum", "Cologne", "Body Spray", "Perfume Oil", "Rollerball"],
        "Tools": ["Makeup Brushes", "Sponge Set", "Tweezers", "Mirror", "Travel Case"],
    },
    "Toys": {
        "Educational": ["Science Kit", "Building Blocks", "Math Puzzle", "Robot Kit", "Microscope Set"],
        "Action Figures": ["Superhero Figure", "Animal Set", "Dinosaur Set", "Space Explorer", "Fantasy Warrior"],
        "Board Games": ["Strategy Game", "Family Game", "Card Game", "Trivia Game", "Cooperative Game"],
        "Dolls": ["Fashion Doll", "Baby Doll", "Dollhouse", "Doll Clothes Set", "Puppet Set"],
        "Puzzles": ["1000 Piece Puzzle", "Floor Puzzle", "3D Puzzle", "Wooden Puzzle", "Brain Teaser"],
    },
    "Automotive": {
        "Car Care": ["Car Shampoo", "Wax Kit", "Microfiber Cloth", "Tire Cleaner", "Interior Wipes"],
        "Interior": ["Seat Covers", "Floor Mats", "Steering Wheel Cover", "Air Freshener", "Phone Mount"],
        "Exterior": ["Car Cover", "Mud Flaps", "Window Visors", "License Plate Frame", "Side Moldings"],
        "Tools": ["Jump Starter", "Tire Inflator", "Tool Kit", "Car Jack", "Emergency Kit"],
        "Electronics": ["Dash Cam", "GPS Navigator", "Bluetooth Adapter", "Backup Camera", "Car Charger"],
    },
}


def generate_products(n_per_category=8):
    """Build the synthetic product catalogue as a DataFrame.

    Walks every category/subcategory in CATEGORIES and emits one row per
    product name found in PRODUCT_NAMES, with a randomly drawn brand, price,
    average rating and review count.

    Args:
        n_per_category: Maximum number of products taken from each
            *subcategory* name list (the cap is applied per subcategory,
            not per top-level category).

    Returns:
        DataFrame with columns product_id (sequential from 1), name,
        category, subcategory, brand, price, avg_rating and num_reviews.
    """
    rows = []
    for category, subcategories in CATEGORIES.items():
        category_brands = BRANDS_BY_CATEGORY[category]
        names_by_subcategory = PRODUCT_NAMES[category]
        for subcategory in subcategories:
            candidates = names_by_subcategory.get(subcategory, ["Generic Item"])
            # max(..., 0) keeps a negative cap meaning "no products".
            for product_name in candidates[:max(n_per_category, 0)]:
                # The dict literal is evaluated top to bottom, so the random
                # draws happen in the order brand, price, rating, reviews.
                rows.append({
                    "product_id": len(rows) + 1,
                    "name": product_name,
                    "category": category,
                    "subcategory": subcategory,
                    "brand": np.random.choice(category_brands),
                    "price": round(np.random.uniform(9.99, 1499.99), 2),
                    "avg_rating": round(np.random.uniform(3.0, 5.0), 1),
                    "num_reviews": np.random.randint(10, 500),
                })
    return pd.DataFrame(rows)


def generate_users(n_users=200):
    """Build the synthetic user table as a DataFrame.

    Each user gets a random first name suffixed with the user id, an age in
    [18, 64], one to three distinct preferred categories, a budget range and
    zero to two distinct favorite brands.

    Args:
        n_users: Number of users to generate; ids run from 1 to n_users.

    Returns:
        DataFrame with columns user_id, name, age, preferred_categories,
        budget_min, budget_max and favorite_brands. The two list-valued
        fields are stored as comma-joined strings (empty string when a user
        has no favorite brands).
    """
    first_names = ["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace",
                   "Henry", "Ivy", "Jack", "Kate", "Leo", "Mia", "Noah", "Olivia",
                   "Paul", "Quinn", "Rachel", "Sam", "Tina", "Uma", "Victor", "Wendy",
                   "Xander", "Yara", "Zack", "Ava", "Ben", "Clara", "David"]
    category_names = list(CATEGORIES.keys())
    # Deduplicate brands shared between categories, then sort: iterating a set
    # of strings follows per-process hash randomization, which would make the
    # seeded draws below pick different brands on every run.
    all_brands = sorted({b for brands in BRANDS_BY_CATEGORY.values() for b in brands})

    users = []
    for uid in range(1, n_users + 1):
        # The order of the random draws is kept stable so seeded output
        # stays reproducible.
        name = f"{np.random.choice(first_names)}_{uid}"
        age = np.random.randint(18, 65)
        n_cats = np.random.randint(1, 4)
        preferred_categories = list(np.random.choice(category_names, n_cats, replace=False))
        budget_min = round(np.random.uniform(10, 200), 2)
        budget_max = round(budget_min + np.random.uniform(50, 1500), 2)
        n_brands = np.random.randint(0, 3)
        favorite_brands = list(np.random.choice(all_brands, n_brands, replace=False)) if n_brands > 0 else []
        users.append({
            "user_id": uid,
            "name": name,
            "age": age,
            "preferred_categories": ",".join(preferred_categories),
            "budget_min": budget_min,
            "budget_max": budget_max,
            "favorite_brands": ",".join(favorite_brands),
        })
    return pd.DataFrame(users)


def _split_csv_field(value):
    """Return the set of items in a comma-joined string; empty set for ''/NaN/None."""
    if isinstance(value, str) and value:
        return set(value.split(","))
    return set()


def generate_ratings(products_df, users_df, n_ratings=5000, sparsity_factor=0.05):
    """Sample distinct (user, product) pairs and synthesize a rating for each.

    Every rating starts at 3.0 and is then adjusted:
      * +U(0.5, 1.5) if the product's category is one of the user's
        preferred categories,
      * +U(0.3, 1.0) if the product's brand is one of the user's favorites,
      * +U(0.0, 0.5) if the price lies within the user's budget range,
        otherwise -U(0.0, 0.5),
      * plus Gaussian noise N(0, 0.5).
    The result is clamped to [1.0, 5.0] and rounded to one decimal.

    Args:
        products_df: DataFrame with product_id, category, brand and price
            columns.
        users_df: DataFrame with user_id, preferred_categories,
            favorite_brands (both comma-joined strings), budget_min and
            budget_max columns.
        n_ratings: Number of ratings wanted; capped at the number of
            possible (user, product) pairs.
        sparsity_factor: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns user_id, product_id and rating (empty, but
        with those columns, when there are no users or no products).
    """
    columns = ["user_id", "product_id", "rating"]
    n_users = len(users_df)
    n_products = len(products_df)
    total_pairs = n_users * n_products
    if total_pairs == 0:
        return pd.DataFrame(columns=columns)

    # Pull every column out once as plain lists, indexed by row position, so
    # the loop below does O(1) lookups instead of filtering users_df per
    # rating, and so ids need not be the contiguous range 1..n.
    user_ids = users_df["user_id"].tolist()
    user_cat_prefs = [_split_csv_field(v) for v in users_df["preferred_categories"]]
    user_brand_prefs = [_split_csv_field(v) for v in users_df["favorite_brands"]]
    budget_lows = users_df["budget_min"].tolist()
    budget_highs = users_df["budget_max"].tolist()

    product_ids = products_df["product_id"].tolist()
    product_cats = products_df["category"].tolist()
    product_brands = products_df["brand"].tolist()
    product_prices = products_df["price"].tolist()

    # A flat index i stands for the pair (user i // n_products,
    # product i % n_products), i.e. user-major order, without building the
    # full list of pairs.
    selected_indices = np.random.choice(total_pairs, min(n_ratings, total_pairs), replace=False)

    ratings = []
    for flat_index in selected_indices.tolist():
        user_pos, product_pos = divmod(flat_index, n_products)
        base = 3.0

        if product_cats[product_pos] in user_cat_prefs[user_pos]:
            base += np.random.uniform(0.5, 1.5)

        if product_brands[product_pos] in user_brand_prefs[user_pos]:
            base += np.random.uniform(0.3, 1.0)

        if budget_lows[user_pos] <= product_prices[product_pos] <= budget_highs[user_pos]:
            base += np.random.uniform(0.0, 0.5)
        else:
            base -= np.random.uniform(0.0, 0.5)

        noise = np.random.normal(0, 0.5)
        rating = round(min(5.0, max(1.0, base + noise)), 1)

        ratings.append({
            "user_id": user_ids[user_pos],
            "product_id": product_ids[product_pos],
            "rating": rating,
        })

    return pd.DataFrame(ratings, columns=columns)


def generate_interactions(products_df, users_df, rating_df, n_purchases=2000):
    """Turn a random sample of ratings into purchase interactions.

    Args:
        products_df: Unused; kept so existing callers keep working.
        users_df: Unused; kept so existing callers keep working.
        rating_df: DataFrame with at least user_id and product_id columns.
        n_purchases: Number of rated pairs to mark as purchased; capped at
            the number of rows in rating_df.

    Returns:
        DataFrame with columns user_id, product_id, purchased (always True)
        and quantity (random integer from 1 to 3).
    """
    columns = ["user_id", "product_id", "purchased", "quantity"]
    if len(rating_df) == 0:
        return pd.DataFrame(columns=columns)

    selected = rating_df.sample(min(n_purchases, len(rating_df)))
    # Read the id columns directly: iterrows() would coerce each row to one
    # common dtype, and the float rating column would turn the integer ids
    # into floats (written to CSV as e.g. "12.0").
    interactions = [
        {
            "user_id": uid,
            "product_id": pid,
            "purchased": True,
            "quantity": np.random.randint(1, 4),
        }
        for uid, pid in zip(selected["user_id"].tolist(), selected["product_id"].tolist())
    ]
    return pd.DataFrame(interactions, columns=columns)


def main():
    """Generate all four synthetic datasets and write each to a CSV in DATA_DIR.

    Produces products.csv, users.csv, ratings.csv and interactions.csv, in
    that order, printing a progress line before each step and a row count
    after it.
    """
    def _save(frame, filename):
        # index=False keeps the DataFrame's positional index out of the file.
        frame.to_csv(os.path.join(DATA_DIR, filename), index=False)

    print("Generating products...")
    product_table = generate_products(n_per_category=8)
    _save(product_table, "products.csv")
    print(f"  {len(product_table)} products generated")

    print("Generating users...")
    user_table = generate_users(n_users=200)
    _save(user_table, "users.csv")
    print(f"  {len(user_table)} users generated")

    print("Generating ratings...")
    rating_table = generate_ratings(product_table, user_table, n_ratings=8000)
    _save(rating_table, "ratings.csv")
    print(f"  {len(rating_table)} ratings generated")

    print("Generating interactions...")
    interaction_table = generate_interactions(product_table, user_table, rating_table)
    _save(interaction_table, "interactions.csv")
    print(f"  {len(interaction_table)} interactions generated")

    print("All data generated successfully!")


# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()