| | import streamlit as st |
| | from transformers import pipeline |
| | import pickle |
| | import os |
| | import pandas as pd |
| | import ast |
| | import string |
| | import re |
| | from sentence_transformers import SentenceTransformer, util |
| |
|
# Configure the page before anything else touches Streamlit.
st.set_page_config(
    page_title="Offer Recommender",
    layout="wide",
)


@st.cache_resource
def _load_models():
    '''
    Build the two models used by the app once and keep them cached

    Streamlit re-executes this script on every interaction; without
    caching both models would be rebuilt on each rerun.

    Returns:
        classifier : zero-shot classification pipeline
        encoder (SentenceTransformer) : sentence embedding model
    '''
    classifier = pipeline(
        task="zero-shot-classification",
        model="valhalla/distilbart-mnli-12-3",
    )
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    return classifier, encoder


# Module-level handles used by the inference helpers in this file.
pipe, model = _load_models()

# Directory holding the CSV files read by the data loaders.
dire = "DS_NLP_search_data"
| |
|
| | |
@st.cache_data
def get_processed_offers():
    '''
    Read the pre-processed offers file and cache the result

    Returns:
        frame (pd.DataFrame) : contents of processed_offers.csv with the
            CATEGORY column parsed from text into Python objects
    '''
    csv_path = os.path.join(dire, "processed_offers.csv")
    frame = pd.read_csv(csv_path)

    # CATEGORY is stored as the text of a Python literal; turn each cell
    # back into the object it describes.
    frame["CATEGORY"] = frame["CATEGORY"].map(ast.literal_eval)

    return frame
| |
|
| |
|
@st.cache_data
def get_categories_data():
    '''
    Read categories.csv from the data directory and cache the result

    Returns:
        (pd.DataFrame) : raw category data
    '''
    csv_path = os.path.join(dire, "categories.csv")

    return pd.read_csv(csv_path)
| |
|
| |
|
@st.cache_data
def get_offers_data():
    '''
    Read offer_retailer.csv from the data directory and cache the result

    Returns:
        (pd.DataFrame) : raw offers data
    '''
    csv_path = os.path.join(dire, "offer_retailer.csv")

    return pd.read_csv(csv_path)
| |
|
| |
|
@st.cache_data
def get_categories(cats_):
    '''
    Collect the distinct IS_CHILD_CATEGORY_TO values and cache the result

    Parameters:
        cats_ (pd.DataFrame) : raw categories data

    Returns:
        (List) : unique values of the IS_CHILD_CATEGORY_TO column,
            with "Mature" left out
    '''
    excluded = ("Mature",)
    distinct_values = cats_["IS_CHILD_CATEGORY_TO"].unique()

    return [value for value in distinct_values if value not in excluded]
| |
|
| |
|
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Anything that is not an ASCII letter, digit, space, newline or dot.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9 \n\.]')


def _tokenize_offer_text(text):
    '''Lower-case text, strip punctuation/symbols and split it into words.'''
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    cleaned = _DISALLOWED_CHARS.sub('', cleaned)

    return cleaned.split()


def check_in_offer(search_str, offer_rets):
    '''
    Determine if the input text is directly in the offer with basic string matching

    The query and every offer are normalised the same way (lower-cased,
    punctuation removed) and compared word by word, so a multi-word query
    matches when its words appear consecutively in the offer.

    Parameters:
        search_str (string) : user text input
        offer_rets (pd.DataFrame) : raw offer data with an "OFFER" column

    Returns:
        df (pd.DataFrame) : single "OFFER" column holding the matching
            offers in their original order; empty when the query contains
            no searchable words
    '''
    query_tokens = _tokenize_offer_text(search_str)
    width = len(query_tokens)

    offers = []
    # A blank (or punctuation-only) query must not match anything.
    if width:
        for offer_str in offer_rets["OFFER"]:
            # Missing values are read by pandas as non-strings; skip them.
            if not isinstance(offer_str, str):
                continue

            tokens = _tokenize_offer_text(offer_str)
            last_start = len(tokens) - width
            if any(tokens[start:start + width] == query_tokens
                   for start in range(last_start + 1)):
                offers.append(offer_str)

    df = pd.DataFrame({"OFFER": offers})

    return df
| |
|
| |
|
def is_retailer(search_str, threshold=0.5):
    '''
    Decide whether the text input should be treated as a retailer

    The input is classified zero-shot against the labels "brand",
    "retailer" and "item"; it counts as a retailer only when "retailer"
    is the first label returned and its score is above the threshold.

    Parameters:
        search_str (string) : user text input
        threshold (float) : minimum score the "retailer" label must exceed

    Returns:
        (boolean) : true if retailer, false otherwise
    '''
    query = search_str.lower().capitalize()
    result = pipe(query, candidate_labels=["brand", "retailer", "item"])

    if result["labels"][0] != "retailer":
        return False

    return result["scores"][0] > threshold
| |
|
| |
|
def perform_cat_inference(search_str, categories, cats, processed_offers):
    '''
    Run zero-shot classification in two stages and pick the offers that
    belong to the best-scoring product categories

    Stage one scores the input against `categories`; stage two scores it
    against the PRODUCT_CATEGORY values whose IS_CHILD_CATEGORY_TO is one
    of the first three stage-one labels.

    Parameters:
        search_str (string) : user text input
        categories (List) : candidate labels for the first stage
        cats (pd.DataFrame) : raw category data
        processed_offers (pd.DataFrame) : processed offer data

    Returns:
        offers (pd.DataFrame) : OFFER values (plus their original index
            as a column) whose CATEGORY shares an entry with the first
            three stage-two labels
        labels (dict) : first-stage labels and their scores
        labels_2 (dict) : second-stage labels and their scores
    '''
    # Stage 1: score against the broad categories.
    labels = pipe(search_str, candidate_labels=categories)
    leading_parents = labels["labels"][:3]

    # Stage 2: score against the product categories under those parents.
    child_rows = cats[cats["IS_CHILD_CATEGORY_TO"].isin(leading_parents)]
    filtered_cats = list(child_rows["PRODUCT_CATEGORY"].unique())
    labels_2 = pipe(search_str, candidate_labels=filtered_cats)
    leading_children = set(labels_2["labels"][:3])

    # Keep offers with at least one category among the leading children.
    has_overlap = processed_offers["CATEGORY"].apply(
        lambda offer_cats: not leading_children.isdisjoint(offer_cats)
    )
    offers = processed_offers[has_overlap]["OFFER"].reset_index()

    return offers, labels, labels_2
| |
|
| |
|
def sort_by_similarity(search_str, related_offers):
    '''
    Use sentence embeddings to evaluate the similarity of relevant offers to the text input

    All offers are embedded in a single batched `encode` call rather than
    one call per offer, which matters when most of the offer table is
    passed in.

    Parameters:
        search_str (string) : user text input
        related_offers (pd.DataFrame) : candidate offers with an "OFFER" column

    Returns:
        df (pd.DataFrame) : up to 20 distinct offers ("OFFER") with their
            cosine similarity to the input ("scores"), highest first
    '''
    # Drop duplicate offer texts while keeping first-seen order.
    unique_offers = list(dict.fromkeys(related_offers["OFFER"]))
    if not unique_offers:
        return pd.DataFrame({"OFFER": [], "scores": []})

    query_embedding = model.encode(search_str, convert_to_tensor=True)
    offer_embeddings = model.encode(unique_offers, convert_to_tensor=True)

    # One query against N offers gives a 1 x N matrix; take its only row.
    similarities = util.pytorch_cos_sim(query_embedding, offer_embeddings)[0].tolist()

    ranked = sorted(
        zip(unique_offers, similarities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:20]
    df = pd.DataFrame({
        "OFFER": [offer for offer, _ in ranked],
        "scores": [score for _, score in ranked],
    })

    return df
| |
|
| |
|
def main():
    '''
    Render the search UI and, on "Search", show direct and related offers

    The first column holds the input and button; results are written to
    the second column. A blank query is rejected before any model runs.
    '''
    col_1, col_2, _ = st.columns(3)
    search_str = col_1.text_input("Enter a retailer, brand, or category").capitalize()
    processed_offers = get_processed_offers()
    cats = get_categories_data()
    offer_rets = get_offers_data()
    categories = get_categories(cats)

    if not col_1.button("Search", type="primary"):
        return

    # Nothing meaningful can be inferred from an empty query.
    if not search_str.strip():
        col_2.warning("Please enter a search term.")
        return

    retail = is_retailer(search_str)
    direct_offers = check_in_offer(search_str, offer_rets)
    direct_texts = direct_offers["OFFER"].tolist()

    col_2.write("Directly related offers")
    if direct_offers.empty:
        col_2.write("None found")
    else:
        col_2.table(direct_offers)

    if retail:
        # Retailer query: rank every offer not already shown as a direct match.
        related_offers = offer_rets[~offer_rets["OFFER"].isin(direct_texts)]
    else:
        # Otherwise narrow the candidates through category inference first.
        related_offers, labels_1, labels_2 = perform_cat_inference(
            search_str, categories, cats, processed_offers
        )
        related_offers = related_offers[~related_offers["OFFER"].isin(direct_texts)]

        col_2.write("Parent categories probabilities")
        col_2.table(pd.DataFrame({"labels": labels_1["labels"][:5], "scores": labels_1["scores"][:5]}))
        col_2.write("Child categories probabilities")
        col_2.table(pd.DataFrame({"labels": labels_2["labels"][:5], "scores": labels_2["scores"][:5]}))

    col_2.write("Other related offers")
    sorted_offers = sort_by_similarity(search_str, related_offers)

    if sorted_offers.empty:
        col_2.write("None found")
    else:
        col_2.table(sorted_offers)


if __name__ == "__main__":
    main()
| |
|
| |
|