Spaces:

cdy3870
/

Fetch_App

Sleeping

App Files Files Community

Calvin committed on Nov 30, 2023

Commit

f94a42e

1 Parent(s): 8a1aceb

final touches

Browse files

Files changed (3) hide show

Exploration.ipynb +0 -0
offer_pipeline.py +119 -24
requirements.txt +0 -1

Exploration.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

offer_pipeline.py CHANGED Viewed

@@ -3,7 +3,6 @@ from transformers import pipeline
 import pickle
 import os
 import pandas as pd
-# import seaborn as sns
 import ast
 import string
 import re
@@ -14,79 +13,165 @@ st.set_page_config(
 	layout="wide"
 )
-pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")
 model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 dire = "DS_NLP_search_data"
 @st.cache_data
 def get_processed_offers():
 	processed_offers = pd.read_csv(os.path.join(dire, "processed_offers.csv"))
 	processed_offers["CATEGORY"] = processed_offers["CATEGORY"].map(ast.literal_eval)
 	return processed_offers
 @st.cache_data
 def get_categories_data():
 	cats = pd.read_csv(os.path.join(dire, "categories.csv"))
 	return cats
 @st.cache_data
 def get_offers_data():
 	offers = pd.read_csv(os.path.join(dire, "offer_retailer.csv"))
 	return offers
 @st.cache_data
 def get_categories(cats_):
 	categories = list(cats_["IS_CHILD_CATEGORY_TO"].unique())
 	for x in ["Mature"]:
 		if x in categories:
 			categories.remove(x)
 	return categories
 def check_in_offer(search_str, offer_rets):
 	offers = []
-	# print(offer_rets)
 	for i in range(len(offer_rets)):
 		offer_str = offer_rets.iloc[i]["OFFER"]
-		# print(offer_str)
 		parsed_str = offer_str.lower().translate(str.maketrans('', '', string.punctuation))
 		parsed_str = re.sub('[^a-zA-Z0-9 \n\.]', '', parsed_str)
-		# print(parsed_str)
 		if search_str.lower() in parsed_str.split(" "):
 		  offers.append(offer_str)
 	df = pd.DataFrame({"OFFER":offers})
-	# print(df)
 	return df
 def is_retailer(search_str, threshold=0.5):
 	processed_search_str = search_str.lower().capitalize()
 	labels = pipe(processed_search_str,
 	  candidate_labels=["brand", "retailer", "item"],
 	)
-	return labels["labels"][0] == "retailer" and labels["scores"][0] > threshold
 def perform_cat_inference(search_str, categories, cats, processed_offers):
 	labels = pipe(search_str,
 		candidate_labels=categories,
 	)
-	print(labels)
 	# labels = [l for i, l in enumerate(labels["labels"]) if labels["scores"][i] > 0.20]
 	filtered_cats = list(cats[cats["IS_CHILD_CATEGORY_TO"].isin(labels["labels"][:3])]["PRODUCT_CATEGORY"].unique())
 	labels_2 = pipe(search_str,
 		candidate_labels=filtered_cats,
 	)
-	print(labels_2)
 	top_labels = labels_2["labels"][:3]
-	print(top_labels)
 	offers = processed_offers[processed_offers["CATEGORY"].apply(lambda x: bool(set(x) & set(top_labels)))]["OFFER"].reset_index()
 	return offers, labels, labels_2
 def sort_by_similarity(search_str, related_offers):
 	temp_dict = {}
 	embedding_1 = model.encode(search_str, convert_to_tensor=True)
@@ -96,42 +181,52 @@ def sort_by_similarity(search_str, related_offers):
 		temp_dict[offer] = float(util.pytorch_cos_sim(embedding_1, embedding_2))
 	sorted_dict = dict(sorted(temp_dict.items(), key=lambda x : x[1], reverse=True))
-	# casted_scores = list(map(lambda x : int(x), ))
 	df = pd.DataFrame({"OFFER":list(sorted_dict.keys())[:20], "scores":list(sorted_dict.values())[:20]})
 	return df
 def main():
 	col_1, col_2, col_3 = st.columns(3)
-	search_str = col_2.text_input("Enter a retailer, brand, or category").capitalize()
 	processed_offers = get_processed_offers()
 	cats = get_categories_data()
 	offer_rets = get_offers_data()
 	categories = get_categories(cats)
-	# retail_mapping = get_prod_categories()
-	if col_2.button("Search", type="primary"):
 		retail = is_retailer(search_str)
 		direct_offers = check_in_offer(search_str, offer_rets)
 		col_2.write("Directly related offers")
-		col_2.table(direct_offers)
 		if retail:
 			related_offers = offer_rets[~offer_rets["OFFER"].isin(list(direct_offers["OFFER"]))]
 		else:
 			related_offers, labels_1, labels_2 = perform_cat_inference(search_str, categories, cats, processed_offers)
 			related_offers = related_offers[~related_offers["OFFER"].isin(list(direct_offers["OFFER"]))]
 			col_2.table(pd.DataFrame({"labels": labels_1["labels"][:5], "scores": labels_1["scores"][:5]}))
 			col_2.table(pd.DataFrame({"labels": labels_2["labels"][:5], "scores": labels_2["scores"][:5]}))
-			# df = get_confidence_charts(labels_2)
-			# st.table(df)
 		col_2.write("Other related offers")
 		sorted_offers = sort_by_similarity(search_str, related_offers)
-		col_2.table(sorted_offers)
-if __name__ == "__main__":
 	main()

 import pickle
 import os
 import pandas as pd
 import ast
 import string
 import re
 	layout="wide"
 )
+# Download and cache models
+pipe = pipeline(task="zero-shot-classification", model="valhalla/distilbart-mnli-12-3")
 model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+# Directory of csv files
 dire = "DS_NLP_search_data"
+# Use Streamlit caching to load data once
 @st.cache_data
 def get_processed_offers():
+	'''
+	Load processed offers from exploration notebook and cache
+	Returns:
+		processed_offers (pd.DataFrame) :  zero-shot categorized offers
+	'''
 	processed_offers = pd.read_csv(os.path.join(dire, "processed_offers.csv"))
 	processed_offers["CATEGORY"] = processed_offers["CATEGORY"].map(ast.literal_eval)
 	return processed_offers
 @st.cache_data
 def get_categories_data():
+	'''
+	Load raw category data and cache
+	Returns:
+		cats (pd.DataFrame) :  raw category data
+	'''
 	cats = pd.read_csv(os.path.join(dire, "categories.csv"))
 	return cats
 @st.cache_data
 def get_offers_data():
+	'''
+	Load raw offers data and cache
+	Returns:
+		offers (pd.DataFrame) :  raw offers data
+	'''
 	offers = pd.read_csv(os.path.join(dire, "offer_retailer.csv"))
 	return offers
 @st.cache_data
 def get_categories(cats_):
+	'''
+	Extract, load categories and cache
+	Parameters:
+		cats_ (pd.DataFrame) : raw categories data
+	Returns:
+		categories (List) :  parent categories (unique IS_CHILD_CATEGORY_TO values, excluding "Mature")
+	'''
 	categories = list(cats_["IS_CHILD_CATEGORY_TO"].unique())
 	for x in ["Mature"]:
 		if x in categories:
 			categories.remove(x)
 	return categories
 def check_in_offer(search_str, offer_rets):
+	'''
+	Determine if the input text is directly in the offer with basic string matching
+	Parameters:
+		search_str (string) : user text input
+		offer_rets (pd.DataFrame) : raw offer data
+	Returns:
+		df (pd.DataFrame) :  offers with text input
+	'''
 	offers = []
 	for i in range(len(offer_rets)):
 		offer_str = offer_rets.iloc[i]["OFFER"]
 		parsed_str = offer_str.lower().translate(str.maketrans('', '', string.punctuation))
 		parsed_str = re.sub('[^a-zA-Z0-9 \n\.]', '', parsed_str)
 		if search_str.lower() in parsed_str.split(" "):
 		  offers.append(offer_str)
 	df = pd.DataFrame({"OFFER":offers})
 	return df
 def is_retailer(search_str, threshold=0.5):
+	'''
+	Determine if the text input is highly likely to be a retailer
+	Parameters:
+		search_str (string) : user text input
+		threshold (float) : probability threshold
+	Returns:
+		is_ret (boolean) :  true if retailer, false otherwise
+	'''
 	processed_search_str = search_str.lower().capitalize()
 	labels = pipe(processed_search_str,
 	  candidate_labels=["brand", "retailer", "item"],
 	)
+	is_ret = labels["labels"][0] == "retailer" and labels["scores"][0] > threshold
+	return is_ret
 def perform_cat_inference(search_str, categories, cats, processed_offers):
+	'''
+	Perform zero shot learning twice and return the offers relevant to the child categories
+	Parameters:
+		search_str (string) : user text input
+		categories (List) : list of parent categories used as candidate labels
+		cats (pd.DataFrame) : raw category data
+		processed_offers (pd.DataFrame) : processed_offer_data
+	Returns:
+		offers (pd.DataFrame) : relevant offers
+		labels (dict) : parent categories and their probability scores
+		labels_2 (dict) : child categories and their probability scores
+	'''
 	labels = pipe(search_str,
 		candidate_labels=categories,
 	)
 	# labels = [l for i, l in enumerate(labels["labels"]) if labels["scores"][i] > 0.20]
 	filtered_cats = list(cats[cats["IS_CHILD_CATEGORY_TO"].isin(labels["labels"][:3])]["PRODUCT_CATEGORY"].unique())
 	labels_2 = pipe(search_str,
 		candidate_labels=filtered_cats,
 	)
 	top_labels = labels_2["labels"][:3]
 	offers = processed_offers[processed_offers["CATEGORY"].apply(lambda x: bool(set(x) & set(top_labels)))]["OFFER"].reset_index()
 	return offers, labels, labels_2
 def sort_by_similarity(search_str, related_offers):
+	'''
+	Use sentence embeddings to evaluate the similarity of relevant offers to the text input
+	Parameters:
+		search_str (string) : user text input
+		related_offers (pd.DataFrame) : relevant offers discovered by zero shot learning
+	Returns:
+		df (pd.DataFrame) : relevant offers and their similarity scores
+	'''
 	temp_dict = {}
 	embedding_1 = model.encode(search_str, convert_to_tensor=True)
 		temp_dict[offer] = float(util.pytorch_cos_sim(embedding_1, embedding_2))
 	sorted_dict = dict(sorted(temp_dict.items(), key=lambda x : x[1], reverse=True))
 	df = pd.DataFrame({"OFFER":list(sorted_dict.keys())[:20], "scores":list(sorted_dict.values())[:20]})
 	return df
 def main():
+	# Load and cache data
 	col_1, col_2, col_3 = st.columns(3)
+	search_str = col_1.text_input("Enter a retailer, brand, or category").capitalize()
 	processed_offers = get_processed_offers()
 	cats = get_categories_data()
 	offer_rets = get_offers_data()
 	categories = get_categories(cats)
+	if col_1.button("Search", type="primary"):
+		# Check offers where the text is directly in it
 		retail = is_retailer(search_str)
 		direct_offers = check_in_offer(search_str, offer_rets)
 		col_2.write("Directly related offers")
+		if len(direct_offers) == 0:
+			col_2.write("None found")
+		else:
+			col_2.table(direct_offers)
 		if retail:
+			# If retail, we directly compare every offer using sentence embeddings
 			related_offers = offer_rets[~offer_rets["OFFER"].isin(list(direct_offers["OFFER"]))]
 		else:
+			# Otherwise, we use zero shot learning with processed offers to narrow down our search
 			related_offers, labels_1, labels_2 = perform_cat_inference(search_str, categories, cats, processed_offers)
 			related_offers = related_offers[~related_offers["OFFER"].isin(list(direct_offers["OFFER"]))]
+			col_2.write("Parent categories probabilities")
 			col_2.table(pd.DataFrame({"labels": labels_1["labels"][:5], "scores": labels_1["scores"][:5]}))
+			col_2.write("Child categories probabilities")
 			col_2.table(pd.DataFrame({"labels": labels_2["labels"][:5], "scores": labels_2["scores"][:5]}))
 		col_2.write("Other related offers")
 		sorted_offers = sort_by_similarity(search_str, related_offers)
+		if len(sorted_offers) == 0:
+			col_2.write("None found")
+		else:
+			col_2.table(sorted_offers)
+if __name__ == "__main__":
 	main()

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 streamlit
 transformers
 pandas
-seaborn
 torch
 sentence-transformers

 streamlit
 transformers
 pandas
 torch
 sentence-transformers