| | import streamlit as st |
| | import pandas as pd |
| | import numpy as np |
| | from sklearn.manifold import TSNE |
| | from scipy.spatial.distance import cdist |
| |
|
# Page header: recolour <h1> via injected CSS, then render the app title.
st.markdown('<style>h1{color: white;}</style>', unsafe_allow_html=True)
st.title('Call on Doc Skin Care Product Recommender')

# Introductory copy, written out one paragraph at a time in this order.
for paragraph in (
    'Find the Right Skin Care for you',
    "Hi there! If you have a skincare product you currently like I can help you find a similar one based on the ingredients.",
    'Please select a product below so I can recommend similar ones',
):
    st.write(paragraph)
| | |
# Load the product catalogue that backs every selector below.
df = pd.read_csv("./data/cosmetics.csv")

# Step 1: choose a category (options keep the order in which labels appear).
category_options = df['Label'].unique()
category = st.selectbox(label='Select a product category', options=category_options)
in_category = df['Label'] == category
category_subset = df.loc[in_category]

# Step 2: choose a brand among those present in that category.
brand_options = sorted(category_subset['Brand'].unique())
brand = st.selectbox(label='Select a brand', options=brand_options)
is_brand = category_subset['Brand'] == brand
category_brand_subset = category_subset.loc[is_brand]

# Step 3: choose one of that brand's products within the category.
product_options = sorted(category_brand_subset['Name'].unique())
product = st.selectbox(label='Select the product', options=product_options)
| |
|
| | |
| | |
| |
|
| | |
| | |
def oh_encoder(tokens, index=None, size=None):
    """Return a one-hot (multi-hot) vector marking the given ingredients.

    Args:
        tokens: Iterable of ingredient names to mark.
        index: Mapping from ingredient name to column position. When omitted,
            the module-level ``ingredient_idx`` is used, which keeps existing
            ``oh_encoder(tokens)`` calls working.
        size: Length of the returned vector. When omitted it is the
            module-level ``N`` if ``index`` was omitted too, otherwise
            ``len(index)``.

    Returns:
        A 1-D float array of zeros with a 1 at the position of every token.
        Repeated tokens simply set the same position again.

    Raises:
        KeyError: If a token is not a key of the mapping.
    """
    if index is None:
        # Legacy path: vocabulary and width come from the script's globals.
        index = ingredient_idx
        default_size = N
    else:
        default_size = len(index)
    if size is None:
        size = default_size

    x = np.zeros(size)
    for ingredient in tokens:
        x[index[ingredient]] = 1
    return x
| |
|
def closest_point(point, points):
    """Return the entry of ``points`` that lies nearest to ``point``.

    Distances are computed with ``cdist`` using its default metric; the
    entry at the position of the smallest distance is returned as stored
    in ``points``.
    """
    distances = cdist([point], points)
    nearest = distances.argmin()
    return points[nearest]
| |
|
| |
|
# Re-derive the rows of the chosen category whenever one is selected.
if category is not None:
    category_subset = df.loc[df['Label'] == category]

if product is not None:
    # Renumber the rows 0..M-1 so row i of the frame matches row i of A.
    category_subset = category_subset.reset_index(drop=True)

    # ingredient_idx: lower-cased ingredient -> column number, assigned in
    # order of first appearance. corpus: one token list per product row.
    # NOTE: ingredient_idx and N must keep these names; oh_encoder looks
    # them up at module level.
    ingredient_idx = {}
    corpus = []

    for ingredients in category_subset['Ingredients']:
        tokens = ingredients.lower().split(', ')
        corpus.append(tokens)
        for ingredient in tokens:
            # len() is evaluated before insertion, so an unseen ingredient
            # receives the next free column; known ones are left untouched.
            ingredient_idx.setdefault(ingredient, len(ingredient_idx))

    # Matrix dimensions: M products by N distinct ingredients.
    M = len(category_subset)
    N = len(ingredient_idx)

    # Fill the product-by-ingredient matrix one encoded row at a time.
    A = np.zeros((M, N))
    for row, tokens in enumerate(corpus):
        A[row, :] = oh_encoder(tokens)
| |
|
model_run = st.button('Find similar products!')

# Without a selected product there is no target row to compare against, so
# only run the recommendation when one is set.
if model_run and product is not None:

    st.write('Based on the ingredients of the product you selected')
    st.write('here are the top 10 products that are the most similar :sparkles:')

    # Embed the product-by-ingredient matrix A in two dimensions. The fixed
    # random_state keeps the embedding repeatable between reruns.
    model = TSNE(n_components=2, learning_rate=150, random_state=42)
    tsne_features = model.fit_transform(A)

    category_subset['X'] = tsne_features[:, 0]
    category_subset['Y'] = tsne_features[:, 1]

    # Match on brand as well as name: the product selector only lists names
    # of the chosen brand, while category_subset spans every brand, so a
    # name-only match could pick another brand's identically named product.
    is_target = (category_subset['Name'] == product) & (category_subset['Brand'] == brand)
    target = category_subset[is_target]

    target_x = target['X'].values[0]
    target_y = target['Y'].values[0]

    # Euclidean distance to the target in the embedding, stored as plain
    # floats (one vectorised call instead of a 1x1 cdist array per row).
    category_subset['distance'] = np.hypot(
        category_subset['X'] - target_x,
        category_subset['Y'] - target_y,
    )

    # Remove the selected product explicitly instead of dropping whichever
    # row sorts first, which is wrong when another product ties at distance
    # 0. mergesort is stable, so tied rows keep their catalogue order.
    top_matches = category_subset[~is_target].sort_values(by=['distance'], kind='mergesort')

    # Ingredients shared with the target, compared as comma-separated
    # entries with surrounding spaces stripped (case-sensitive).
    c1_set = {x.strip(' ') for x in target['Ingredients'].values[0].split(",")}
    top_matches['Ingredients in common'] = [
        c1_set.intersection(x.strip(' ') for x in listed.split(","))
        for listed in top_matches['Ingredients']
    ]

    top_matches = top_matches[['Label', 'Brand', 'Name', 'Price', 'Ingredients', 'Ingredients in common']]
    top_matches = top_matches.reset_index(drop=True)
    # Number the displayed rows from 1 so the index reads as a rank.
    top_matches.index = top_matches.index + 1

    st.dataframe(top_matches.head(10))
| |
|
| |
|
| |
|