# File-viewer residue: revision 133a630, file size 11,547 bytes.
import pandas as pd
import numpy as np
import os
# Seed NumPy's global RNG; every generator below draws from np.random.
np.random.seed(42)
# Absolute path of the directory containing this script; main() writes its
# CSV files here.
DATA_DIR = os.path.dirname(os.path.abspath(__file__))
# Catalogue taxonomy: top-level category -> ordered list of its subcategories.
CATEGORIES = {
    "Electronics": ["Smartphones", "Laptops", "Headphones", "Tablets", "Cameras"],
    "Clothing": ["Men's", "Women's", "Kids'", "Accessories", "Footwear"],
    "Home & Kitchen": ["Furniture", "Appliances", "Cookware", "Decor", "Storage"],
    "Books": ["Fiction", "Non-Fiction", "Science", "Technology", "Self-Help"],
    "Sports": ["Fitness", "Outdoor", "Team Sports", "Cycling", "Swimming"],
    "Beauty": ["Skincare", "Makeup", "Haircare", "Fragrance", "Tools"],
    "Toys": ["Educational", "Action Figures", "Board Games", "Dolls", "Puzzles"],
    "Automotive": ["Car Care", "Interior", "Exterior", "Tools", "Electronics"],
}
# Brand pool per top-level category; product brands are drawn at random from
# the list matching the product's category.
BRANDS_BY_CATEGORY = {
    "Electronics": [
        "Samsung", "Apple", "Sony", "LG", "Dell", "HP", "Bose", "Canon",
    ],
    "Clothing": [
        "Nike", "Adidas", "Zara", "H&M", "Levi's", "Puma", "Under Armour",
    ],
    "Home & Kitchen": [
        "IKEA", "KitchenAid", "Ninja", "Dyson", "OXO", "Cuisinart",
    ],
    "Books": [
        "Penguin", "HarperCollins", "Random House", "Simon & Schuster", "Macmillan",
    ],
    "Sports": [
        "Nike", "Adidas", "Wilson", "Spalding", "The North Face", "Columbia",
    ],
    "Beauty": [
        "L'Oreal", "Maybelline", "Clinique", "Estee Lauder", "Neutrogena",
    ],
    "Toys": [
        "LEGO", "Mattel", "Hasbro", "Fisher-Price", "Melissa & Doug",
    ],
    "Automotive": [
        "Meguiar's", "Armor All", "WeatherTech", "Michelin", "Bosch",
    ],
}
# Product display names, keyed first by category and then by subcategory.
# Every subcategory lists five names, in the order they are emitted.
PRODUCT_NAMES = {
    "Electronics": {
        "Smartphones": [
            "Galaxy S24", "iPhone 15 Pro", "Xperia 1 V", "Pixel 8 Pro", "LG G4",
        ],
        "Laptops": [
            "ThinkPad X1", "MacBook Pro", "XPS 15", "Surface Laptop", "Spectre x360",
        ],
        "Headphones": [
            "WH-1000XM5", "AirPods Pro", "QuietComfort 45", "Galaxy Buds", "Momentum 4",
        ],
        "Tablets": [
            "iPad Pro", "Galaxy Tab S9", "Surface Pro", "Fire HD 10", "Lenovo Tab",
        ],
        "Cameras": [
            "EOS R5", "Alpha A7 IV", "Z8", "X-T5", "Lumix S5",
        ],
    },
    "Clothing": {
        "Men's": [
            "Classic Fit Jeans", "Cotton T-Shirt", "Blazer", "Chino Pants", "Hoodie",
        ],
        "Women's": [
            "Summer Dress", "Yoga Pants", "Leather Jacket", "Silk Blouse", "Maxi Skirt",
        ],
        "Kids'": [
            "Colorful Leggings", "Graphic Tee", "Denim Jacket", "Plaid Shirt", "Joggers",
        ],
        "Accessories": [
            "Leather Belt", "Sunglasses", "Wool Scarf", "Baseball Cap", "Watch",
        ],
        "Footwear": [
            "Running Shoes", "Sandals", "Boots", "Loafers", "Sneakers",
        ],
    },
    "Home & Kitchen": {
        "Furniture": [
            "Sofa Set", "Coffee Table", "Bookshelf", "Desk Chair", "Bed Frame",
        ],
        "Appliances": [
            "Air Fryer", "Blender", "Coffee Maker", "Microwave", "Toaster",
        ],
        "Cookware": [
            "Non-Stick Pan", "Chef's Knife", "Cutting Board", "Pot Set", "Baking Sheet",
        ],
        "Decor": [
            "Table Lamp", "Wall Art", "Vase", "Throw Pillow", "Candle Set",
        ],
        "Storage": [
            "Plastic Bins", "Shelf Organizer", "Shoe Rack", "Closet System", "Drawer Divider",
        ],
    },
    "Books": {
        "Fiction": [
            "The Silent Echo", "Midnight Sun", "Ocean's Memory", "The Last Garden", "Crimson Peak",
        ],
        "Non-Fiction": [
            "Atomic Habits", "Sapiens", "Educated", "The Power of Now", "Outliers",
        ],
        "Science": [
            "A Brief History of Time", "The Gene", "Cosmos", "The Selfish Gene", "Six Easy Pieces",
        ],
        "Technology": [
            "Clean Code",
            "Design Patterns",
            "Introduction to Algorithms",
            "Structure and Interpretation",
            "The Pragmatic Programmer",
        ],
        "Self-Help": [
            "The 7 Habits", "How to Win Friends", "Think and Grow Rich", "The Subtle Art", "Daring Greatly",
        ],
    },
    "Sports": {
        "Fitness": [
            "Adjustable Dumbbells", "Yoga Mat", "Resistance Bands", "Jump Rope", "Foam Roller",
        ],
        "Outdoor": [
            "Camping Tent", "Hiking Backpack", "Sleeping Bag", "Portable Stove", "Water Filter",
        ],
        "Team Sports": [
            "Soccer Ball", "Basketball", "Volleyball", "Baseball Glove", "Football",
        ],
        "Cycling": [
            "Mountain Bike", "Helmet", "Bike Pump", "Cycling Jersey", "Bike Lock",
        ],
        "Swimming": [
            "Swim Goggles", "Kickboard", "Swim Cap", "Ear Plugs", "Waterproof Bag",
        ],
    },
    "Beauty": {
        "Skincare": [
            "Moisturizer", "Face Serum", "Sunscreen", "Eye Cream", "Face Mask",
        ],
        "Makeup": [
            "Foundation", "Lipstick", "Mascara", "Eyeshadow Palette", "Concealer",
        ],
        "Haircare": [
            "Shampoo", "Conditioner", "Hair Oil", "Hair Dryer", "Straightener",
        ],
        "Fragrance": [
            "Eau de Parfum", "Cologne", "Body Spray", "Perfume Oil", "Rollerball",
        ],
        "Tools": [
            "Makeup Brushes", "Sponge Set", "Tweezers", "Mirror", "Travel Case",
        ],
    },
    "Toys": {
        "Educational": [
            "Science Kit", "Building Blocks", "Math Puzzle", "Robot Kit", "Microscope Set",
        ],
        "Action Figures": [
            "Superhero Figure", "Animal Set", "Dinosaur Set", "Space Explorer", "Fantasy Warrior",
        ],
        "Board Games": [
            "Strategy Game", "Family Game", "Card Game", "Trivia Game", "Cooperative Game",
        ],
        "Dolls": [
            "Fashion Doll", "Baby Doll", "Dollhouse", "Doll Clothes Set", "Puppet Set",
        ],
        "Puzzles": [
            "1000 Piece Puzzle", "Floor Puzzle", "3D Puzzle", "Wooden Puzzle", "Brain Teaser",
        ],
    },
    "Automotive": {
        "Car Care": [
            "Car Shampoo", "Wax Kit", "Microfiber Cloth", "Tire Cleaner", "Interior Wipes",
        ],
        "Interior": [
            "Seat Covers", "Floor Mats", "Steering Wheel Cover", "Air Freshener", "Phone Mount",
        ],
        "Exterior": [
            "Car Cover", "Mud Flaps", "Window Visors", "License Plate Frame", "Side Moldings",
        ],
        "Tools": [
            "Jump Starter", "Tire Inflator", "Tool Kit", "Car Jack", "Emergency Kit",
        ],
        "Electronics": [
            "Dash Cam", "GPS Navigator", "Bluetooth Adapter", "Backup Camera", "Car Charger",
        ],
    },
}
def generate_products(n_per_category=8):
    """Build the synthetic product catalogue.

    Walks every (category, subcategory) pair in ``CATEGORIES`` and emits one
    product per name listed in ``PRODUCT_NAMES`` for that subcategory, in
    listed order. Brand, price, average rating and review count are drawn
    from NumPy's global random state.

    Args:
        n_per_category: Upper bound on the number of products emitted per
            *subcategory*. The effective count is also capped by how many
            names the subcategory has.

    Returns:
        A DataFrame with columns ``product_id`` (sequential, starting at 1),
        ``name``, ``category``, ``subcategory``, ``brand``, ``price``,
        ``avg_rating`` and ``num_reviews``.
    """
    rows = []
    for category, subcategories in CATEGORIES.items():
        category_brands = BRANDS_BY_CATEGORY[category]
        names_by_subcategory = PRODUCT_NAMES[category]
        for subcategory in subcategories:
            candidates = names_by_subcategory.get(subcategory, ["Generic Item"])
            # Clamp at zero so a negative limit yields no products instead of
            # being interpreted as a from-the-end slice bound.
            limit = max(0, min(n_per_category, len(candidates)))
            for product_name in candidates[:limit]:
                # Dict values are evaluated top to bottom, so the random draws
                # happen in the order: brand, price, avg_rating, num_reviews.
                rows.append({
                    "product_id": len(rows) + 1,
                    "name": product_name,
                    "category": category,
                    "subcategory": subcategory,
                    "brand": np.random.choice(category_brands),
                    "price": round(np.random.uniform(9.99, 1499.99), 2),
                    "avg_rating": round(np.random.uniform(3.0, 5.0), 1),
                    "num_reviews": np.random.randint(10, 500),
                })
    return pd.DataFrame(rows)
def generate_users(n_users=200):
    """Build the synthetic user table.

    Each user gets a random first name suffixed with the user id, an age,
    one to three preferred categories, a budget range and up to two
    favourite brands. All values come from NumPy's global random state.

    Args:
        n_users: Number of users to create; ids run from 1 to ``n_users``.

    Returns:
        A DataFrame with columns ``user_id``, ``name``, ``age``,
        ``preferred_categories``, ``budget_min``, ``budget_max`` and
        ``favorite_brands``. The two list-valued columns are stored as
        comma-joined strings (an empty string when there are no entries).
    """
    first_names = [
        "Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace",
        "Henry", "Ivy", "Jack", "Kate", "Leo", "Mia", "Noah", "Olivia",
        "Paul", "Quinn", "Rachel", "Sam", "Tina", "Uma", "Victor", "Wendy",
        "Xander", "Yara", "Zack", "Ava", "Ben", "Clara", "David",
    ]
    category_names = list(CATEGORIES.keys())
    # De-duplicate brands shared between categories, then sort. Iteration
    # order of a set of strings depends on per-process hash randomization,
    # so without the sort the seeded RNG would pick different brands on
    # every run. Built once here because it does not change per user.
    all_brands = sorted(
        {brand for brands in BRANDS_BY_CATEGORY.values() for brand in brands}
    )

    users = []
    for uid in range(1, n_users + 1):
        name = np.random.choice(first_names) + f"_{uid}"
        # randint's upper bound is exclusive: ages 18..64.
        age = np.random.randint(18, 65)
        # 1..3 distinct preferred categories.
        n_cats = np.random.randint(1, 4)
        preferred_categories = list(
            np.random.choice(category_names, n_cats, replace=False)
        )
        budget_min = round(np.random.uniform(10, 200), 2)
        budget_max = round(budget_min + np.random.uniform(50, 1500), 2)
        # 0..2 distinct favourite brands.
        n_brands = np.random.randint(0, 3)
        if n_brands > 0:
            favorite_brands = list(
                np.random.choice(all_brands, n_brands, replace=False)
            )
        else:
            favorite_brands = []
        users.append({
            "user_id": uid,
            "name": name,
            "age": age,
            "preferred_categories": ",".join(preferred_categories),
            "budget_min": budget_min,
            "budget_max": budget_max,
            "favorite_brands": ",".join(favorite_brands),
        })
    return pd.DataFrame(users)
def _split_csv(value):
    """Return the set of comma-separated tokens in *value*.

    Blank strings and non-string cells (e.g. NaN after a CSV round trip)
    yield an empty set.
    """
    if isinstance(value, str) and value:
        return set(value.split(","))
    return set()


def generate_ratings(products_df, users_df, n_ratings=5000, sparsity_factor=0.05):
    """Sample distinct (user, product) pairs and synthesise a rating for each.

    A rating starts at 3.0 and is nudged up when the product's category is
    among the user's preferred categories, up again when its brand is a
    favourite, and up or down depending on whether the price falls inside
    the user's budget range. Gaussian noise is added and the result is
    clamped to [1.0, 5.0] and rounded to one decimal.

    Args:
        products_df: Frame with ``product_id``, ``category``, ``brand`` and
            ``price`` columns.
        users_df: Frame with ``user_id``, ``preferred_categories``,
            ``favorite_brands``, ``budget_min`` and ``budget_max`` columns.
            Ids are expected to be unique.
        n_ratings: Number of pairs to rate; capped at the number of possible
            (user, product) pairs.
        sparsity_factor: Currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        A DataFrame with columns ``user_id``, ``product_id`` and ``rating``
        (empty, but with those columns, when either input frame is empty).
    """
    columns = ["user_id", "product_id", "rating"]
    if len(users_df) == 0 or len(products_df) == 0:
        return pd.DataFrame(columns=columns)

    # Use the ids actually present in the frames instead of assuming 1..n.
    user_ids = users_df["user_id"].tolist()
    product_ids = products_df["product_id"].tolist()
    n_products = len(product_ids)
    n_pairs = len(user_ids) * n_products

    # Per-user lookups, built once so the rating loop never scans the frame.
    user_cat_prefs = {}
    user_brand_prefs = {}
    user_budget = {}
    user_columns = zip(
        user_ids,
        users_df["preferred_categories"].tolist(),
        users_df["favorite_brands"].tolist(),
        users_df["budget_min"].tolist(),
        users_df["budget_max"].tolist(),
    )
    for uid, cats, brands, budget_min, budget_max in user_columns:
        user_cat_prefs[uid] = _split_csv(cats)
        user_brand_prefs[uid] = _split_csv(brands)
        user_budget[uid] = (budget_min, budget_max)

    prod_cat = dict(zip(product_ids, products_df["category"].tolist()))
    prod_brand = dict(zip(product_ids, products_df["brand"].tolist()))
    prod_price = dict(zip(product_ids, products_df["price"].tolist()))

    # Sample flat indices into the conceptual user-major grid of all pairs;
    # index i corresponds to (user i // n_products, product i % n_products).
    selected_indices = np.random.choice(
        n_pairs, min(n_ratings, n_pairs), replace=False
    )

    ratings = []
    for flat_index in selected_indices:
        user_pos, product_pos = divmod(int(flat_index), n_products)
        uid = user_ids[user_pos]
        pid = product_ids[product_pos]

        base = 3.0
        if prod_cat[pid] in user_cat_prefs[uid]:
            base += np.random.uniform(0.5, 1.5)
        if prod_brand[pid] in user_brand_prefs[uid]:
            base += np.random.uniform(0.3, 1.0)
        budget_min, budget_max = user_budget[uid]
        if budget_min <= prod_price[pid] <= budget_max:
            base += np.random.uniform(0.0, 0.5)
        else:
            base -= np.random.uniform(0.0, 0.5)
        noise = np.random.normal(0, 0.5)
        rating = round(min(5.0, max(1.0, base + noise)), 1)
        ratings.append({
            "user_id": uid,
            "product_id": pid,
            "rating": rating,
        })
    return pd.DataFrame(ratings, columns=columns)
def generate_interactions(products_df, users_df, rating_df, n_purchases=2000):
    """Turn a random subset of ratings into purchase records.

    Samples up to ``n_purchases`` rows from ``rating_df`` and emits one
    purchase per sampled row with a random quantity of 1 to 3.

    Args:
        products_df: Unused; kept for interface compatibility.
        users_df: Unused; kept for interface compatibility.
        rating_df: Frame with at least ``user_id`` and ``product_id`` columns.
        n_purchases: Maximum number of purchase rows to produce.

    Returns:
        A DataFrame with columns ``user_id``, ``product_id``, ``purchased``
        (always True) and ``quantity``. The id columns keep the values found
        in ``rating_df``.
    """
    columns = ["user_id", "product_id", "purchased", "quantity"]
    if len(rating_df) == 0:
        return pd.DataFrame(columns=columns)

    selected = rating_df.sample(min(n_purchases, len(rating_df)))
    # Read the id columns directly. Going row by row with iterrows() would
    # coerce each row to a single dtype, turning integer ids into floats
    # whenever the frame also holds a float column such as the rating.
    id_pairs = zip(selected["user_id"].tolist(), selected["product_id"].tolist())
    interactions = [
        {
            "user_id": uid,
            "product_id": pid,
            "purchased": True,
            "quantity": np.random.randint(1, 4),
        }
        for uid, pid in id_pairs
    ]
    return pd.DataFrame(interactions, columns=columns)
def main():
    """Generate all four datasets and write each to a CSV file in DATA_DIR.

    Runs the generators in dependency order (products, users, ratings,
    interactions), saving each frame without its index and printing the
    number of rows produced.
    """

    def _save(frame, filename):
        # Write one frame next to this script, omitting the index column.
        frame.to_csv(os.path.join(DATA_DIR, filename), index=False)

    print("Generating products...")
    products = generate_products(n_per_category=8)
    _save(products, "products.csv")
    print(f" {len(products)} products generated")

    print("Generating users...")
    users = generate_users(n_users=200)
    _save(users, "users.csv")
    print(f" {len(users)} users generated")

    print("Generating ratings...")
    ratings = generate_ratings(products, users, n_ratings=8000)
    _save(ratings, "ratings.csv")
    print(f" {len(ratings)} ratings generated")

    print("Generating interactions...")
    interactions = generate_interactions(products, users, ratings)
    _save(interactions, "interactions.csv")
    print(f" {len(interactions)} interactions generated")

    print("All data generated successfully!")
# Generate the CSV files only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()
|