# TasteEngine / data /generate_data.py
# Abdallah4z's picture
# Initial implementation of TasteEngine: multi-approach recommender system with Flask web UI, evaluation suite, and explanation engine
# 133a630
import pandas as pd
import numpy as np
import os
# Seed NumPy's global RNG at import time so every np.random.* call in this
# module draws from the same fixed stream.
np.random.seed(42)
# Directory that contains this script; main() writes its CSV files here.
DATA_DIR = os.path.dirname(os.path.abspath(__file__))
# Top-level category -> list of its subcategories (five per category).
# generate_products() walks this mapping in insertion order, and
# generate_users() samples preferred categories from its keys.
# Subcategory names are not unique across categories: "Tools" appears under
# both "Beauty" and "Automotive", and "Electronics" is both a top-level
# category and an "Automotive" subcategory.
CATEGORIES = {
    "Electronics": ["Smartphones", "Laptops", "Headphones", "Tablets", "Cameras"],
    "Clothing": ["Men's", "Women's", "Kids'", "Accessories", "Footwear"],
    "Home & Kitchen": ["Furniture", "Appliances", "Cookware", "Decor", "Storage"],
    "Books": ["Fiction", "Non-Fiction", "Science", "Technology", "Self-Help"],
    "Sports": ["Fitness", "Outdoor", "Team Sports", "Cycling", "Swimming"],
    "Beauty": ["Skincare", "Makeup", "Haircare", "Fragrance", "Tools"],
    "Toys": ["Educational", "Action Figures", "Board Games", "Dolls", "Puzzles"],
    "Automotive": ["Car Care", "Interior", "Exterior", "Tools", "Electronics"],
}
# Top-level category -> pool of brand names for that category.
# generate_products() draws one brand at random from the pool of the product's
# category (independently of the product name), and generate_users() draws
# favorite brands from the union of all pools.
# Some brands belong to more than one pool ("Nike" and "Adidas" are listed
# under both "Clothing" and "Sports").
BRANDS_BY_CATEGORY = {
    "Electronics": ["Samsung", "Apple", "Sony", "LG", "Dell", "HP", "Bose", "Canon"],
    "Clothing": ["Nike", "Adidas", "Zara", "H&M", "Levi's", "Puma", "Under Armour"],
    "Home & Kitchen": ["IKEA", "KitchenAid", "Ninja", "Dyson", "OXO", "Cuisinart"],
    "Books": ["Penguin", "HarperCollins", "Random House", "Simon & Schuster", "Macmillan"],
    "Sports": ["Nike", "Adidas", "Wilson", "Spalding", "The North Face", "Columbia"],
    "Beauty": ["L'Oreal", "Maybelline", "Clinique", "Estee Lauder", "Neutrogena"],
    "Toys": ["LEGO", "Mattel", "Hasbro", "Fisher-Price", "Melissa & Doug"],
    "Automotive": ["Meguiar's", "Armor All", "WeatherTech", "Michelin", "Bosch"],
}
# Top-level category -> subcategory -> list of product names.
# The outer and inner keys mirror CATEGORIES. Every subcategory lists exactly
# five names, so generate_products() can emit at most five products per
# subcategory regardless of how large its n_per_category argument is.
PRODUCT_NAMES = {
    "Electronics": {
        "Smartphones": ["Galaxy S24", "iPhone 15 Pro", "Xperia 1 V", "Pixel 8 Pro", "LG G4"],
        "Laptops": ["ThinkPad X1", "MacBook Pro", "XPS 15", "Surface Laptop", "Spectre x360"],
        "Headphones": ["WH-1000XM5", "AirPods Pro", "QuietComfort 45", "Galaxy Buds", "Momentum 4"],
        "Tablets": ["iPad Pro", "Galaxy Tab S9", "Surface Pro", "Fire HD 10", "Lenovo Tab"],
        "Cameras": ["EOS R5", "Alpha A7 IV", "Z8", "X-T5", "Lumix S5"],
    },
    "Clothing": {
        "Men's": ["Classic Fit Jeans", "Cotton T-Shirt", "Blazer", "Chino Pants", "Hoodie"],
        "Women's": ["Summer Dress", "Yoga Pants", "Leather Jacket", "Silk Blouse", "Maxi Skirt"],
        "Kids'": ["Colorful Leggings", "Graphic Tee", "Denim Jacket", "Plaid Shirt", "Joggers"],
        "Accessories": ["Leather Belt", "Sunglasses", "Wool Scarf", "Baseball Cap", "Watch"],
        "Footwear": ["Running Shoes", "Sandals", "Boots", "Loafers", "Sneakers"],
    },
    "Home & Kitchen": {
        "Furniture": ["Sofa Set", "Coffee Table", "Bookshelf", "Desk Chair", "Bed Frame"],
        "Appliances": ["Air Fryer", "Blender", "Coffee Maker", "Microwave", "Toaster"],
        "Cookware": ["Non-Stick Pan", "Chef's Knife", "Cutting Board", "Pot Set", "Baking Sheet"],
        "Decor": ["Table Lamp", "Wall Art", "Vase", "Throw Pillow", "Candle Set"],
        "Storage": ["Plastic Bins", "Shelf Organizer", "Shoe Rack", "Closet System", "Drawer Divider"],
    },
    "Books": {
        "Fiction": ["The Silent Echo", "Midnight Sun", "Ocean's Memory", "The Last Garden", "Crimson Peak"],
        "Non-Fiction": ["Atomic Habits", "Sapiens", "Educated", "The Power of Now", "Outliers"],
        "Science": ["A Brief History of Time", "The Gene", "Cosmos", "The Selfish Gene", "Six Easy Pieces"],
        "Technology": ["Clean Code", "Design Patterns", "Introduction to Algorithms", "Structure and Interpretation", "The Pragmatic Programmer"],
        "Self-Help": ["The 7 Habits", "How to Win Friends", "Think and Grow Rich", "The Subtle Art", "Daring Greatly"],
    },
    "Sports": {
        "Fitness": ["Adjustable Dumbbells", "Yoga Mat", "Resistance Bands", "Jump Rope", "Foam Roller"],
        "Outdoor": ["Camping Tent", "Hiking Backpack", "Sleeping Bag", "Portable Stove", "Water Filter"],
        "Team Sports": ["Soccer Ball", "Basketball", "Volleyball", "Baseball Glove", "Football"],
        "Cycling": ["Mountain Bike", "Helmet", "Bike Pump", "Cycling Jersey", "Bike Lock"],
        "Swimming": ["Swim Goggles", "Kickboard", "Swim Cap", "Ear Plugs", "Waterproof Bag"],
    },
    "Beauty": {
        "Skincare": ["Moisturizer", "Face Serum", "Sunscreen", "Eye Cream", "Face Mask"],
        "Makeup": ["Foundation", "Lipstick", "Mascara", "Eyeshadow Palette", "Concealer"],
        "Haircare": ["Shampoo", "Conditioner", "Hair Oil", "Hair Dryer", "Straightener"],
        "Fragrance": ["Eau de Parfum", "Cologne", "Body Spray", "Perfume Oil", "Rollerball"],
        "Tools": ["Makeup Brushes", "Sponge Set", "Tweezers", "Mirror", "Travel Case"],
    },
    "Toys": {
        "Educational": ["Science Kit", "Building Blocks", "Math Puzzle", "Robot Kit", "Microscope Set"],
        "Action Figures": ["Superhero Figure", "Animal Set", "Dinosaur Set", "Space Explorer", "Fantasy Warrior"],
        "Board Games": ["Strategy Game", "Family Game", "Card Game", "Trivia Game", "Cooperative Game"],
        "Dolls": ["Fashion Doll", "Baby Doll", "Dollhouse", "Doll Clothes Set", "Puppet Set"],
        "Puzzles": ["1000 Piece Puzzle", "Floor Puzzle", "3D Puzzle", "Wooden Puzzle", "Brain Teaser"],
    },
    "Automotive": {
        "Car Care": ["Car Shampoo", "Wax Kit", "Microfiber Cloth", "Tire Cleaner", "Interior Wipes"],
        "Interior": ["Seat Covers", "Floor Mats", "Steering Wheel Cover", "Air Freshener", "Phone Mount"],
        "Exterior": ["Car Cover", "Mud Flaps", "Window Visors", "License Plate Frame", "Side Moldings"],
        "Tools": ["Jump Starter", "Tire Inflator", "Tool Kit", "Car Jack", "Emergency Kit"],
        "Electronics": ["Dash Cam", "GPS Navigator", "Bluetooth Adapter", "Backup Camera", "Car Charger"],
    },
}
def generate_products(n_per_category=8):
    """Build the synthetic product catalogue as a DataFrame.

    Walks CATEGORIES in order and, for every subcategory, emits one row per
    product name taken from PRODUCT_NAMES. Product ids are assigned
    sequentially starting at 1 in emission order.

    Args:
        n_per_category: Upper bound on the number of products emitted for
            each *subcategory* (despite the name). The effective count is
            also limited by how many names the subcategory has.

    Returns:
        DataFrame with columns product_id, name, category, subcategory,
        brand, price, avg_rating and num_reviews.
    """
    rows = []
    for category, subcategories in CATEGORIES.items():
        brand_pool = BRANDS_BY_CATEGORY[category]
        names_by_subcategory = PRODUCT_NAMES[category]
        for subcategory in subcategories:
            candidates = names_by_subcategory.get(subcategory, ["Generic Item"])
            limit = min(n_per_category, len(candidates))
            # Clamp at zero so a negative limit yields no rows instead of
            # slicing from the end of the list.
            for product_name in candidates[:max(limit, 0)]:
                # The random draws happen in key order (brand, price,
                # avg_rating, num_reviews), which fixes the RNG sequence.
                rows.append({
                    "product_id": len(rows) + 1,
                    "name": product_name,
                    "category": category,
                    "subcategory": subcategory,
                    "brand": np.random.choice(brand_pool),
                    "price": round(np.random.uniform(9.99, 1499.99), 2),
                    "avg_rating": round(np.random.uniform(3.0, 5.0), 1),
                    "num_reviews": np.random.randint(10, 500),
                })
    return pd.DataFrame(rows)
def generate_users(n_users=200):
    """Build the synthetic user table as a DataFrame.

    Each user gets a name, an age in [18, 64], one to three preferred
    categories, a budget range, and zero to two favorite brands.

    Args:
        n_users: Number of users to generate; ids run from 1 to n_users.

    Returns:
        DataFrame with columns user_id, name, age, preferred_categories,
        budget_min, budget_max and favorite_brands. The two list-valued
        fields are stored as comma-joined strings (empty string when the
        user has no favorite brands).
    """
    columns = [
        "user_id", "name", "age", "preferred_categories",
        "budget_min", "budget_max", "favorite_brands",
    ]
    first_names = ["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace",
                   "Henry", "Ivy", "Jack", "Kate", "Leo", "Mia", "Noah", "Olivia",
                   "Paul", "Quinn", "Rachel", "Sam", "Tina", "Uma", "Victor", "Wendy",
                   "Xander", "Yara", "Zack", "Ava", "Ben", "Clara", "David"]
    category_names = list(CATEGORIES.keys())
    # Sort the de-duplicated brands: iteration order of a set of strings
    # varies between interpreter runs (hash randomization), which would make
    # the sampled brands irreproducible even with a fixed NumPy seed.
    # Computed once because it does not depend on the user being generated.
    all_brands = sorted({b for brands in BRANDS_BY_CATEGORY.values() for b in brands})

    users = []
    for uid in range(1, n_users + 1):
        name = np.random.choice(first_names) + f"_{uid}"
        age = np.random.randint(18, 65)  # upper bound is exclusive
        n_cats = np.random.randint(1, 4)
        preferred_categories = list(np.random.choice(category_names, n_cats, replace=False))
        budget_min = round(np.random.uniform(10, 200), 2)
        budget_max = round(budget_min + np.random.uniform(50, 1500), 2)
        n_brands = np.random.randint(0, 3)
        if n_brands > 0:
            favorite_brands = list(np.random.choice(all_brands, n_brands, replace=False))
        else:
            favorite_brands = []
        users.append({
            "user_id": uid,
            "name": name,
            "age": age,
            "preferred_categories": ",".join(preferred_categories),
            "budget_min": budget_min,
            "budget_max": budget_max,
            "favorite_brands": ",".join(favorite_brands),
        })
    # Passing columns keeps the schema even when no users were generated.
    return pd.DataFrame(users, columns=columns)
def _split_csv_field(value):
    """Return the comma-separated items of ``value`` as a set.

    Empty strings and non-string values (for example NaN after a CSV round
    trip) yield an empty set.
    """
    if isinstance(value, str) and value:
        return set(value.split(","))
    return set()


def generate_ratings(products_df, users_df, n_ratings=5000, sparsity_factor=0.05):
    """Generate synthetic user/product ratings biased by user preferences.

    A random subset of distinct (user, product) pairs is drawn without
    replacement. Each rating starts at 3.0 and is then adjusted:
      * +U(0.5, 1.5) if the product's category is one the user prefers,
      * +U(0.3, 1.0) if the product's brand is one of the user's favorites,
      * +U(0.0, 0.5) if the price lies inside the user's budget range,
        otherwise -U(0.0, 0.5),
      * plus Gaussian noise with standard deviation 0.5.
    The result is clipped to [1.0, 5.0] and rounded to one decimal.

    Args:
        products_df: DataFrame with columns product_id, category, brand, price.
        users_df: DataFrame with columns user_id, preferred_categories,
            favorite_brands (comma-joined strings), budget_min, budget_max.
        n_ratings: Maximum number of ratings; capped at the number of
            possible (user, product) pairs.
        sparsity_factor: Accepted for interface compatibility; not used.

    Returns:
        DataFrame with columns user_id, product_id and rating.
    """
    columns = ["user_id", "product_id", "rating"]

    # Use the ids actually present in the frames rather than assuming they
    # are the contiguous ranges 1..n.
    user_ids = users_df["user_id"].tolist()
    product_ids = products_df["product_id"].tolist()
    n_products = len(product_ids)
    n_pairs = len(user_ids) * n_products
    if n_pairs == 0:
        return pd.DataFrame(columns=columns)

    user_cat_prefs = {
        uid: _split_csv_field(raw)
        for uid, raw in zip(users_df["user_id"], users_df["preferred_categories"])
    }
    user_brand_prefs = {
        uid: _split_csv_field(raw)
        for uid, raw in zip(users_df["user_id"], users_df["favorite_brands"])
    }
    # Per-user (budget_min, budget_max), looked up in O(1) inside the loop.
    user_budget = dict(zip(
        users_df["user_id"],
        zip(users_df["budget_min"], users_df["budget_max"]),
    ))
    prod_cat = dict(zip(products_df["product_id"], products_df["category"]))
    prod_brand = dict(zip(products_df["product_id"], products_df["brand"]))
    prod_price = dict(zip(products_df["product_id"], products_df["price"]))

    # Sample flat indices into the implicit user-major (user, product) grid
    # instead of materializing every pair.
    selected_indices = np.random.choice(n_pairs, min(n_ratings, n_pairs), replace=False)

    ratings = []
    for flat_index in selected_indices:
        user_pos, product_pos = divmod(int(flat_index), n_products)
        uid = user_ids[user_pos]
        pid = product_ids[product_pos]

        # The order of the random draws below is significant: it determines
        # the values produced for a given seed.
        base = 3.0
        if prod_cat.get(pid, "") in user_cat_prefs[uid]:
            base += np.random.uniform(0.5, 1.5)
        if prod_brand.get(pid, "") in user_brand_prefs[uid]:
            base += np.random.uniform(0.3, 1.0)
        price = prod_price.get(pid, 50)
        budget_min, budget_max = user_budget[uid]
        if budget_min <= price <= budget_max:
            base += np.random.uniform(0.0, 0.5)
        else:
            base -= np.random.uniform(0.0, 0.5)
        noise = np.random.normal(0, 0.5)
        rating = round(min(5.0, max(1.0, base + noise)), 1)
        ratings.append({
            "user_id": uid,
            "product_id": pid,
            "rating": rating,
        })
    return pd.DataFrame(ratings, columns=columns)
def generate_interactions(products_df, users_df, rating_df, n_purchases=2000):
    """Derive synthetic purchase records from a random sample of ratings.

    Args:
        products_df: Accepted for interface compatibility; not used.
        users_df: Accepted for interface compatibility; not used.
        rating_df: DataFrame with at least the columns user_id and product_id.
        n_purchases: Maximum number of purchases; capped at len(rating_df).

    Returns:
        DataFrame with columns user_id, product_id, purchased (always True)
        and quantity (integer from 1 to 3). The id columns keep the values
        they have in rating_df.
    """
    columns = ["user_id", "product_id", "purchased", "quantity"]
    n_selected = min(n_purchases, len(rating_df))
    if n_selected == 0:
        return pd.DataFrame(columns=columns)

    selected = rating_df.sample(n_selected)
    # Read the id columns directly instead of using iterrows(): a row Series
    # that mixes integer ids with a float rating is upcast to float, which
    # would turn the ids into values such as 12.0.
    interactions = [
        {
            "user_id": uid,
            "product_id": pid,
            "purchased": True,
            "quantity": np.random.randint(1, 4),  # upper bound is exclusive
        }
        for uid, pid in zip(selected["user_id"].tolist(), selected["product_id"].tolist())
    ]
    return pd.DataFrame(interactions, columns=columns)
def main():
    """Generate all four datasets and write them as CSV files into DATA_DIR.

    The stages run in a fixed order (products, users, ratings, interactions)
    because each later stage consumes the frames produced before it and all
    of them draw from the same global NumPy random stream.
    """
    def _csv_path(filename):
        # Resolve an output file name next to this script.
        return os.path.join(DATA_DIR, filename)

    print("Generating products...")
    products = generate_products(n_per_category=8)
    products.to_csv(_csv_path("products.csv"), index=False)
    print(f" {len(products)} products generated")

    print("Generating users...")
    users = generate_users(n_users=200)
    users.to_csv(_csv_path("users.csv"), index=False)
    print(f" {len(users)} users generated")

    print("Generating ratings...")
    ratings = generate_ratings(products, users, n_ratings=8000)
    ratings.to_csv(_csv_path("ratings.csv"), index=False)
    print(f" {len(ratings)} ratings generated")

    print("Generating interactions...")
    interactions = generate_interactions(products, users, ratings)
    interactions.to_csv(_csv_path("interactions.csv"), index=False)
    print(f" {len(interactions)} interactions generated")

    print("All data generated successfully!")


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()