# Source page: Hugging Face Spaces - Abdallah4Zain / TasteEngine (Space status when captured: Sleeping)
# File size: 11,547 Bytes - 133a630

import pandas as pd
import numpy as np
import os

# Seed NumPy's global random state at import time so every np.random call
# below draws from the same starting sequence on each run.
np.random.seed(42)

# Absolute path of the directory containing this script; main() writes the
# generated CSV files here.
DATA_DIR = os.path.dirname(os.path.abspath(__file__))

# Top-level product categories mapped to their subcategories.
# generate_products() walks this mapping in order, and generate_users()
# samples preferred categories from its keys.
CATEGORIES = {
    "Electronics": ["Smartphones", "Laptops", "Headphones", "Tablets", "Cameras"],
    "Clothing": ["Men's", "Women's", "Kids'", "Accessories", "Footwear"],
    "Home & Kitchen": ["Furniture", "Appliances", "Cookware", "Decor", "Storage"],
    "Books": ["Fiction", "Non-Fiction", "Science", "Technology", "Self-Help"],
    "Sports": ["Fitness", "Outdoor", "Team Sports", "Cycling", "Swimming"],
    "Beauty": ["Skincare", "Makeup", "Haircare", "Fragrance", "Tools"],
    "Toys": ["Educational", "Action Figures", "Board Games", "Dolls", "Puzzles"],
    "Automotive": ["Car Care", "Interior", "Exterior", "Tools", "Electronics"],
}

# Candidate brand names per top-level category (same keys as CATEGORIES).
# A product's brand is drawn at random from its category's list; note that
# some brands (e.g. "Nike", "Adidas") appear under more than one category.
BRANDS_BY_CATEGORY = {
    "Electronics": ["Samsung", "Apple", "Sony", "LG", "Dell", "HP", "Bose", "Canon"],
    "Clothing": ["Nike", "Adidas", "Zara", "H&M", "Levi's", "Puma", "Under Armour"],
    "Home & Kitchen": ["IKEA", "KitchenAid", "Ninja", "Dyson", "OXO", "Cuisinart"],
    "Books": ["Penguin", "HarperCollins", "Random House", "Simon & Schuster", "Macmillan"],
    "Sports": ["Nike", "Adidas", "Wilson", "Spalding", "The North Face", "Columbia"],
    "Beauty": ["L'Oreal", "Maybelline", "Clinique", "Estee Lauder", "Neutrogena"],
    "Toys": ["LEGO", "Mattel", "Hasbro", "Fisher-Price", "Melissa & Doug"],
    "Automotive": ["Meguiar's", "Armor All", "WeatherTech", "Michelin", "Bosch"],
}

# Product display names keyed by category, then by subcategory.
# generate_products() takes names from each list in order; every list here
# holds five names, which caps how many products a subcategory can yield.
PRODUCT_NAMES = {
    "Electronics": {
        "Smartphones": ["Galaxy S24", "iPhone 15 Pro", "Xperia 1 V", "Pixel 8 Pro", "LG G4"],
        "Laptops": ["ThinkPad X1", "MacBook Pro", "XPS 15", "Surface Laptop", "Spectre x360"],
        "Headphones": ["WH-1000XM5", "AirPods Pro", "QuietComfort 45", "Galaxy Buds", "Momentum 4"],
        "Tablets": ["iPad Pro", "Galaxy Tab S9", "Surface Pro", "Fire HD 10", "Lenovo Tab"],
        "Cameras": ["EOS R5", "Alpha A7 IV", "Z8", "X-T5", "Lumix S5"],
    },
    "Clothing": {
        "Men's": ["Classic Fit Jeans", "Cotton T-Shirt", "Blazer", "Chino Pants", "Hoodie"],
        "Women's": ["Summer Dress", "Yoga Pants", "Leather Jacket", "Silk Blouse", "Maxi Skirt"],
        "Kids'": ["Colorful Leggings", "Graphic Tee", "Denim Jacket", "Plaid Shirt", "Joggers"],
        "Accessories": ["Leather Belt", "Sunglasses", "Wool Scarf", "Baseball Cap", "Watch"],
        "Footwear": ["Running Shoes", "Sandals", "Boots", "Loafers", "Sneakers"],
    },
    "Home & Kitchen": {
        "Furniture": ["Sofa Set", "Coffee Table", "Bookshelf", "Desk Chair", "Bed Frame"],
        "Appliances": ["Air Fryer", "Blender", "Coffee Maker", "Microwave", "Toaster"],
        "Cookware": ["Non-Stick Pan", "Chef's Knife", "Cutting Board", "Pot Set", "Baking Sheet"],
        "Decor": ["Table Lamp", "Wall Art", "Vase", "Throw Pillow", "Candle Set"],
        "Storage": ["Plastic Bins", "Shelf Organizer", "Shoe Rack", "Closet System", "Drawer Divider"],
    },
    "Books": {
        "Fiction": ["The Silent Echo", "Midnight Sun", "Ocean's Memory", "The Last Garden", "Crimson Peak"],
        "Non-Fiction": ["Atomic Habits", "Sapiens", "Educated", "The Power of Now", "Outliers"],
        "Science": ["A Brief History of Time", "The Gene", "Cosmos", "The Selfish Gene", "Six Easy Pieces"],
        "Technology": ["Clean Code", "Design Patterns", "Introduction to Algorithms", "Structure and Interpretation", "The Pragmatic Programmer"],
        "Self-Help": ["The 7 Habits", "How to Win Friends", "Think and Grow Rich", "The Subtle Art", "Daring Greatly"],
    },
    "Sports": {
        "Fitness": ["Adjustable Dumbbells", "Yoga Mat", "Resistance Bands", "Jump Rope", "Foam Roller"],
        "Outdoor": ["Camping Tent", "Hiking Backpack", "Sleeping Bag", "Portable Stove", "Water Filter"],
        "Team Sports": ["Soccer Ball", "Basketball", "Volleyball", "Baseball Glove", "Football"],
        "Cycling": ["Mountain Bike", "Helmet", "Bike Pump", "Cycling Jersey", "Bike Lock"],
        "Swimming": ["Swim Goggles", "Kickboard", "Swim Cap", "Ear Plugs", "Waterproof Bag"],
    },
    "Beauty": {
        "Skincare": ["Moisturizer", "Face Serum", "Sunscreen", "Eye Cream", "Face Mask"],
        "Makeup": ["Foundation", "Lipstick", "Mascara", "Eyeshadow Palette", "Concealer"],
        "Haircare": ["Shampoo", "Conditioner", "Hair Oil", "Hair Dryer", "Straightener"],
        "Fragrance": ["Eau de Parfum", "Cologne", "Body Spray", "Perfume Oil", "Rollerball"],
        "Tools": ["Makeup Brushes", "Sponge Set", "Tweezers", "Mirror", "Travel Case"],
    },
    "Toys": {
        "Educational": ["Science Kit", "Building Blocks", "Math Puzzle", "Robot Kit", "Microscope Set"],
        "Action Figures": ["Superhero Figure", "Animal Set", "Dinosaur Set", "Space Explorer", "Fantasy Warrior"],
        "Board Games": ["Strategy Game", "Family Game", "Card Game", "Trivia Game", "Cooperative Game"],
        "Dolls": ["Fashion Doll", "Baby Doll", "Dollhouse", "Doll Clothes Set", "Puppet Set"],
        "Puzzles": ["1000 Piece Puzzle", "Floor Puzzle", "3D Puzzle", "Wooden Puzzle", "Brain Teaser"],
    },
    "Automotive": {
        "Car Care": ["Car Shampoo", "Wax Kit", "Microfiber Cloth", "Tire Cleaner", "Interior Wipes"],
        "Interior": ["Seat Covers", "Floor Mats", "Steering Wheel Cover", "Air Freshener", "Phone Mount"],
        "Exterior": ["Car Cover", "Mud Flaps", "Window Visors", "License Plate Frame", "Side Moldings"],
        "Tools": ["Jump Starter", "Tire Inflator", "Tool Kit", "Car Jack", "Emergency Kit"],
        "Electronics": ["Dash Cam", "GPS Navigator", "Bluetooth Adapter", "Backup Camera", "Car Charger"],
    },
}


def generate_products(n_per_category=8):
    """Build the synthetic product catalogue as a DataFrame.

    For every (category, subcategory) pair in CATEGORIES, up to
    ``n_per_category`` products are created, limited by how many names
    PRODUCT_NAMES lists for that subcategory (so the cap applies per
    subcategory, despite the parameter's name). Brand, price, average rating
    and review count are drawn from NumPy's global random state.

    Args:
        n_per_category: Maximum number of products per subcategory.

    Returns:
        A DataFrame with columns ``product_id``, ``name``, ``category``,
        ``subcategory``, ``brand``, ``price``, ``avg_rating`` and
        ``num_reviews``. Product ids count up from 1 in generation order.
    """
    rows = []
    for category, subcategories in CATEGORIES.items():
        category_brands = BRANDS_BY_CATEGORY[category]
        names_by_subcategory = PRODUCT_NAMES[category]
        for subcategory in subcategories:
            candidates = names_by_subcategory.get(subcategory, ["Generic Item"])
            # Clamp at zero so a negative cap yields no products, exactly as
            # an empty range() would.
            limit = max(min(n_per_category, len(candidates)), 0)
            for product_name in candidates[:limit]:
                # Dict values are evaluated top to bottom, so the random
                # draws happen in the order brand, price, rating, reviews.
                rows.append({
                    "product_id": len(rows) + 1,
                    "name": product_name,
                    "category": category,
                    "subcategory": subcategory,
                    "brand": np.random.choice(category_brands),
                    "price": round(np.random.uniform(9.99, 1499.99), 2),
                    "avg_rating": round(np.random.uniform(3.0, 5.0), 1),
                    "num_reviews": np.random.randint(10, 500),
                })
    return pd.DataFrame(rows)


def generate_users(n_users=200):
    """Build a DataFrame of synthetic user profiles.

    Each user gets a name, an age, one to three preferred categories, a
    budget range, and zero to two favorite brands, all drawn from NumPy's
    global random state.

    Args:
        n_users: Number of users to create; user ids run from 1 to
            ``n_users`` inclusive.

    Returns:
        A DataFrame with columns ``user_id``, ``name``, ``age``,
        ``preferred_categories``, ``budget_min``, ``budget_max`` and
        ``favorite_brands``. The two preference columns are comma-joined
        strings (empty string when there are no favorite brands).
    """
    first_names = ["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace",
                   "Henry", "Ivy", "Jack", "Kate", "Leo", "Mia", "Noah", "Olivia",
                   "Paul", "Quinn", "Rachel", "Sam", "Tina", "Uma", "Victor", "Wendy",
                   "Xander", "Yara", "Zack", "Ava", "Ben", "Clara", "David"]
    category_names = list(CATEGORIES)
    # De-duplicate brands shared between categories, then sort: iteration
    # order of a set of strings varies between interpreter runs (string hashes
    # are randomized), which would make the sampled brands irreproducible even
    # with a fixed NumPy seed. Built once because it does not change per user.
    all_brands = sorted({b for brands in BRANDS_BY_CATEGORY.values() for b in brands})

    users = []
    for uid in range(1, n_users + 1):
        name = np.random.choice(first_names) + f"_{uid}"
        age = np.random.randint(18, 65)  # upper bound is exclusive: 18..64
        n_cats = np.random.randint(1, 4)  # 1..3 preferred categories
        preferred_categories = list(np.random.choice(category_names, n_cats, replace=False))
        budget_min = round(np.random.uniform(10, 200), 2)
        budget_max = round(budget_min + np.random.uniform(50, 1500), 2)
        n_brands = np.random.randint(0, 3)  # 0..2 favorite brands
        favorite_brands = list(np.random.choice(all_brands, n_brands, replace=False)) if n_brands > 0 else []
        users.append({
            "user_id": uid,
            "name": name,
            "age": age,
            "preferred_categories": ",".join(preferred_categories),
            "budget_min": budget_min,
            "budget_max": budget_max,
            "favorite_brands": ",".join(favorite_brands),
        })
    return pd.DataFrame(users)


def _split_preference_field(value):
    """Return the comma-separated entries of ``value`` as a set.

    Anything that is not a non-empty string (for example the NaN that pandas
    produces for an empty CSV cell) yields an empty set.
    """
    if isinstance(value, str) and value:
        return set(value.split(","))
    return set()


def generate_ratings(products_df, users_df, n_ratings=5000, sparsity_factor=0.05):
    """Generate synthetic user-product ratings biased by user preferences.

    Distinct (user, product) pairs are sampled without replacement. Each
    rating starts at 3.0, is raised when the product's category or brand is
    among the user's preferences, is nudged up or down depending on whether
    the price falls inside the user's budget, receives Gaussian noise, and is
    finally clamped to [1.0, 5.0] and rounded to one decimal.

    Args:
        products_df: DataFrame with ``product_id``, ``category``, ``brand``
            and ``price`` columns.
        users_df: DataFrame with ``user_id``, ``preferred_categories``,
            ``favorite_brands``, ``budget_min`` and ``budget_max`` columns.
            Ids are assumed to be unique.
        n_ratings: Maximum number of ratings; capped at the number of
            possible (user, product) pairs.
        sparsity_factor: Unused; accepted only for backward compatibility.

    Returns:
        A DataFrame with columns ``user_id``, ``product_id`` and ``rating``.
        It is empty (but keeps those columns) when either input has no rows.
    """
    columns = ["user_id", "product_id", "rating"]

    # Use the ids actually present in the frames rather than assuming they
    # are exactly 1..n, so filtered or re-indexed inputs work too.
    user_ids = users_df["user_id"].tolist()
    product_ids = products_df["product_id"].tolist()
    n_products = len(product_ids)
    n_pairs = len(user_ids) * n_products
    if n_pairs == 0:
        return pd.DataFrame(columns=columns)

    user_cat_prefs = {
        uid: _split_preference_field(raw)
        for uid, raw in zip(user_ids, users_df["preferred_categories"])
    }
    user_brand_prefs = {
        uid: _split_preference_field(raw)
        for uid, raw in zip(user_ids, users_df["favorite_brands"])
    }
    # Per-user budget lookup built once, instead of filtering users_df for
    # every single rating.
    user_budget = dict(zip(user_ids, zip(users_df["budget_min"], users_df["budget_max"])))

    prod_cat = dict(zip(product_ids, products_df["category"]))
    prod_brand = dict(zip(product_ids, products_df["brand"]))
    prod_price = dict(zip(product_ids, products_df["price"]))

    # Sample flat indices into the (user x product) grid; index i stands for
    # user i // n_products and product i % n_products, so the full list of
    # pairs never has to be materialized.
    selected_indices = np.random.choice(n_pairs, min(n_ratings, n_pairs), replace=False)

    ratings = []
    for flat_index in selected_indices:
        user_pos, product_pos = divmod(int(flat_index), n_products)
        uid = user_ids[user_pos]
        pid = product_ids[product_pos]

        base = 3.0
        if prod_cat[pid] in user_cat_prefs[uid]:
            base += np.random.uniform(0.5, 1.5)
        if prod_brand[pid] in user_brand_prefs[uid]:
            base += np.random.uniform(0.3, 1.0)

        budget_min, budget_max = user_budget[uid]
        if budget_min <= prod_price[pid] <= budget_max:
            base += np.random.uniform(0.0, 0.5)
        else:
            base -= np.random.uniform(0.0, 0.5)

        noise = np.random.normal(0, 0.5)
        rating = round(min(5.0, max(1.0, base + noise)), 1)

        ratings.append({
            "user_id": uid,
            "product_id": pid,
            "rating": rating,
        })

    return pd.DataFrame(ratings, columns=columns)


def generate_interactions(products_df, users_df, rating_df, n_purchases=2000):
    """Derive synthetic purchase records from a random subset of ratings.

    Args:
        products_df: Unused; kept so existing callers keep working.
        users_df: Unused; kept so existing callers keep working.
        rating_df: DataFrame with ``user_id`` and ``product_id`` columns.
        n_purchases: Maximum number of rated pairs to turn into purchases;
            capped at ``len(rating_df)``.

    Returns:
        A DataFrame with columns ``user_id``, ``product_id``, ``purchased``
        (always True) and ``quantity`` (1 to 3). It is empty (but keeps those
        columns) when ``rating_df`` has no rows.
    """
    columns = ["user_id", "product_id", "purchased", "quantity"]
    if rating_df.empty:
        return pd.DataFrame(columns=columns)

    selected = rating_df.sample(min(n_purchases, len(rating_df)))
    # Read the id columns directly instead of using iterrows(): iterrows()
    # packs each row into a single Series, which upcasts the integer ids to
    # float because the rating column is float.
    interactions = [
        {
            "user_id": uid,
            "product_id": pid,
            "purchased": True,
            "quantity": np.random.randint(1, 4),
        }
        for uid, pid in zip(selected["user_id"], selected["product_id"])
    ]
    return pd.DataFrame(interactions, columns=columns)


def main():
    """Generate all four synthetic datasets and save each as a CSV in DATA_DIR.

    Writes ``products.csv``, ``users.csv``, ``ratings.csv`` and
    ``interactions.csv`` (without the index column) and prints a progress
    line plus a row count for each step.
    """

    def csv_path(filename):
        # Every output file sits next to this script.
        return os.path.join(DATA_DIR, filename)

    print("Generating products...")
    product_frame = generate_products(n_per_category=8)
    product_frame.to_csv(csv_path("products.csv"), index=False)
    print(f"  {len(product_frame)} products generated")

    print("Generating users...")
    user_frame = generate_users(n_users=200)
    user_frame.to_csv(csv_path("users.csv"), index=False)
    print(f"  {len(user_frame)} users generated")

    print("Generating ratings...")
    rating_frame = generate_ratings(product_frame, user_frame, n_ratings=8000)
    rating_frame.to_csv(csv_path("ratings.csv"), index=False)
    print(f"  {len(rating_frame)} ratings generated")

    print("Generating interactions...")
    interaction_frame = generate_interactions(product_frame, user_frame, rating_frame)
    interaction_frame.to_csv(csv_path("interactions.csv"), index=False)
    print(f"  {len(interaction_frame)} interactions generated")

    print("All data generated successfully!")


# Generate the CSV files only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()