Spaces:

hjianganthony
/

fetch_ner

Sleeping

App Files Files Community

fetch_ner / src /utils.py

hjianganthony

Update src/utils.py

03a395a over 2 years ago

raw

history blame contribute delete

15.5 kB

	# helper functions
	import json
	import pickle
	import re
	from typing import Dict, List, Optional, Tuple

	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	# from IPython.display import clear_output

	import spacy
	from spacy.tokens import DocBin
	from spacy.training import offsets_to_biluo_tags
	import en_fetch_ner_spacy_tsf
	# Load the packaged NER pipeline once at import time; check_entity() calls it for every search input.
	nlp = en_fetch_ner_spacy_tsf.load()
	# clear_output()

	import nltk

	# The stopword corpus must be available locally before stopwords.words() is called.
	nltk.download('stopwords')

	from nltk.corpus import stopwords

	# Extra noise words removed on top of NLTK's English list.
	additional_stop_words = {'pack'}
	stop_words = set(stopwords.words('english')) | additional_stop_words


	# Load the lookup tables and the offer table that the search helpers use as defaults.
	path1 = "data/brand_belong_category_dict.json"
	path2 = "data/product_upper_category_dict.json"
	path3 = "data/offered_brands.pkl"
	path4 = "data/offer_retailer.csv"

	# Both JSON files are opened in binary mode so json.load detects the encoding
	# itself instead of depending on the platform's default text encoding.
	with open(path1, 'rb') as f:
	    # category name (upper case) -> brands; see category_to_brand()
	    brand_belong_category_dict = json.load(f)

	with open(path2, 'rb') as f:
	    # upper category -> list of product categories; see find_category()
	    category_dict = json.load(f)

	# NOTE(security): pickle.load can execute arbitrary code; only ever point
	# path3 at a file produced by this project.
	with open(path3, 'rb') as f:
	    offered_brands = pickle.load(f)

	df_offers_brand_retailer = pd.read_csv(path4)

	# Default query used by search_offers() when no input is given.
	example_search = "Simply Spiked Lemonade 12 pack at Walmart"

	# helper functions

	def single_text_cleaner(text: str, remove_stopwords: bool=False, upper_case: bool = False, remove_punctuation: bool=True) -> str:
	    """Clean one single text input.

	    Args:
	    - text: the string to clean
	    - remove_stopwords: drop every word found in the module-level ``stop_words`` set
	    - upper_case: return the result in upper case instead of lower case (the default)
	    - remove_punctuation: delete every character that is not a letter a-z or whitespace

	    Returns the cleaned string.
	    """
	    # All filtering is done on the lower-cased text: the regex only keeps a-z
	    # and the stop word set is lower case. Upper-casing first (as before) made
	    # the regex delete every letter and stopped stop words from matching.
	    text = text.lower()
	    if remove_punctuation:
	        text = re.sub(r'[^a-z\s]', '', text)
	    if remove_stopwords:
	        text = ' '.join(word for word in text.split() if word not in stop_words)
	    # The requested case is applied last, once filtering is finished.
	    return text.upper() if upper_case else text

	def list_text_cleaner(texts: List[str], upper_case: bool = False, remove_stopwords: bool = False, remove_punctuation: bool=True) -> List[str]:
	    """Clean every string in ``texts`` with single_text_cleaner.

	    The three flags are forwarded unchanged to single_text_cleaner for each
	    item; the output list has the same length and order as ``texts``.
	    """
	    cleaned = []
	    for raw in texts:
	        cleaned.append(
	            single_text_cleaner(
	                raw,
	                remove_stopwords=remove_stopwords,
	                upper_case=upper_case,
	                remove_punctuation=remove_punctuation,
	            )
	        )
	    return cleaned

	def match_product_category(s1: list[str], s2: list[str]) -> str:
	    """Return the first entry of ``s2`` that contains an item of ``s1`` as a substring.

	    Items of ``s1`` are tried in order, each against all of ``s2`` in order;
	    the scan stops at the first hit. Returns None when nothing matches.
	    """
	    for needle in s1:
	        for candidate in s2:
	            if needle in candidate:
	                return candidate
	    return None

	def find_category(search_input: str, search_dict: Dict) -> Optional[Tuple[str, str]]:
	    """Find the category of a search input based on a dictionary of categories.

	    Args:
	    - search_input: a string
	    - search_dict: a dictionary of product categories; each value is a list of
	      category names belonging to the key

	    Returns a tuple ``(key, matched_value)`` for the first key whose cleaned
	    values contain one of the search words as a substring, or None (after
	    printing a message) when no key matches.
	    """
	    tokens = re.split(r'[,\s]+', search_input)
	    # Cleaning can turn a token into an empty string; drop those.
	    search_list = [c for c in list_text_cleaner(tokens, remove_stopwords=True) if c]
	    for key, values in search_dict.items():
	        # Values are lower-cased only; their punctuation is kept.
	        cleaned_values = list_text_cleaner(values, remove_punctuation=False)
	        matched = match_product_category(search_list, cleaned_values)
	        if matched is not None:
	            return key, matched
	    print(f'Function find_category: No category has matched for input: {search_input}')
	    return None


	def check_entity(search_input) -> bool:
	    """Run the module-level ``nlp`` pipeline on ``search_input`` and look for entities.

	    Returns the parsed document when it holds at least one entity (callers
	    read ``.ents`` from it), otherwise False.
	    """
	    parsed = nlp(search_input)
	    if len(parsed.ents) == 0:
	        return False
	    return parsed

	def get_cosine_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
	    """Calculate the cosine similarity of the input text against a list of texts.

	    Takes in:
	    - input_text: a string
	    - texts: a list of strings

	    Returns a dataframe with two columns, ``OFFER`` (the original texts) and
	    ``Cosine Similarity Score``, sorted by score in descending order. Every
	    score is 0 when none of the cleaned texts contains a usable token.
	    """
	    input_text_cleaned = list_text_cleaner([input_text], remove_stopwords=True)[0]
	    cleaned_texts = list_text_cleaner(texts, remove_stopwords=True)

	    # Default: no similarity. Also the result for an empty ``texts`` list.
	    sim_scores = np.zeros(len(cleaned_texts))
	    if cleaned_texts:
	        try:
	            vectors = get_vectors(input_text_cleaned, *cleaned_texts)
	        except ValueError:
	            # CountVectorizer raises ValueError for an empty vocabulary, i.e.
	            # when cleaning left no token in any text; keep the zero scores.
	            vectors = None
	        if vectors is not None:
	            # Only the input row is compared with the other rows, instead of
	            # building the full pairwise matrix and discarding most of it.
	            sim_scores = cosine_similarity(vectors[:1], vectors[1:])[0]

	    df = pd.DataFrame({'OFFER': texts, 'Cosine Similarity Score': sim_scores})
	    return df.sort_values(by='Cosine Similarity Score', ascending=False).reset_index(drop=True)

	def get_vectors(*strs: str) -> np.ndarray:
	    """Convert the given strings into a dense array of token counts.

	    A fresh CountVectorizer is fitted on exactly these strings; the result has
	    one row per input string, in the order given.
	    """
	    corpus = list(strs)
	    return CountVectorizer().fit_transform(corpus).toarray()

	def jaccard_similarity(s1: List[str], s2: List[str]) -> float:
	    """Takes in two lists and returns the Jaccard similarity score (3 digits).

	    Both lists are treated as sets, so repeated items count once. Returns 0.0
	    when both lists are empty.
	    """
	    set1, set2 = set(s1), set(s2)
	    union = set1 | set2
	    if not union:
	        # Both inputs are empty: the ratio is undefined, report no similarity.
	        return 0.0
	    # |A ∩ B| / |A ∪ B| on the sets; using list lengths for the denominator
	    # deflated the score whenever an input contained duplicates.
	    return round(len(set1 & set2) / len(union), 3)

	def get_jaccard_sim(input_text: str, texts: List[str]) -> pd.DataFrame:
	    """Score every text in ``texts`` against ``input_text`` with jaccard_similarity.

	    Takes in:
	    - input_text: a string
	    - texts: a list of strings

	    Both sides are cleaned with stop words removed and split on whitespace
	    before scoring. Returns a dataframe with the columns ``OFFER`` (the
	    original texts) and ``Jaccard Similarity Score``, highest score first.
	    """
	    query_tokens = list_text_cleaner([input_text], remove_stopwords=True)[0].split()

	    scores = []
	    for cleaned in list_text_cleaner(texts, remove_stopwords=True):
	        scores.append(jaccard_similarity(query_tokens, cleaned.split()))

	    ranked = pd.DataFrame({'OFFER': texts, 'Jaccard Similarity Score': scores})
	    ranked = ranked.sort_values(by='Jaccard Similarity Score', ascending=False)
	    return ranked.reset_index(drop=True)

	def find_column(df: pd.DataFrame, keyword: str) -> str:
	    """Return the name of the first column whose name contains ``keyword``.

	    The comparison ignores case. Returns None when no column matches. The
	    similarity dataframes built in this module carry exactly one score column.
	    """
	    needle = keyword.lower()
	    matches = [name for name in df.columns if needle in name.lower()]
	    if not matches:
	        return None
	    return matches[0]

	def extract_similar_offers(data: pd.DataFrame, threshold: float = 0.0) -> pd.DataFrame:
	    """Keep the offers whose similarity score is at least ``threshold``.

	    Args:
	    - data: the result of get_cosine_sim() or get_jaccard_sim()
	    - threshold: minimum score (inclusive) an offer needs to be kept

	    Returns a new dataframe with the kept rows and the score rounded to 3
	    digits; ``data`` itself is not modified.
	    """
	    score = find_column(data, 'score')
	    # Copy the filtered rows so the assignment below writes to an independent
	    # frame rather than to a slice of ``data`` (avoids SettingWithCopyWarning).
	    similar_offers = data[data[score] >= threshold].copy()
	    similar_offers[score] = similar_offers[score].apply(lambda x: round(x, 3)) # round to 3 digits
	    return similar_offers

	def category_to_brand(category: str, offered_brands: List, brand_belong_category_dict: Dict) -> Optional[List[str]]:
	    """Use case: when a user searches for a category, we return a list of brands in that category.

	    Args:
	    - category: the category name; it is upper-cased before the lookup
	    - offered_brands: the brands that currently have offers
	    - brand_belong_category_dict: maps an upper-case category name to its brands

	    Returns the brands of the category that are also in ``offered_brands`` (in
	    no particular order, possibly empty), or None when the category is not a
	    key of the dictionary.
	    """
	    key = category.upper()  # because all keys are in upper case
	    if key not in brand_belong_category_dict:
	        print(f"Function category_to_brand | No offered brand is found in {category}")
	        return None
	    result = list(set(brand_belong_category_dict[key]) & set(offered_brands))
	    print(f"Function category_to_brand | Found {category} in offered brand")
	    return result

	class CatchErros(Exception):
	    """Namespace for the exceptions raised by the offer search functions."""

	    class ParamsInputError(Exception):
	        """A function received an argument value it does not support."""

	    class SearchFailedError(Exception):
	        """A search could not be completed."""

	    class UnknownError(Exception):
	        """An unexpected state was reached."""


	# The search functions raise these by their bare names, which are otherwise
	# only reachable as attributes of CatchErros; expose them at module level.
	ParamsInputError = CatchErros.ParamsInputError
	SearchFailedError = CatchErros.SearchFailedError
	UnknownError = CatchErros.UnknownError


	def offer_finder_by_category(search_input: str, search_category_tuple: Tuple, category_dict: Dict, offers: pd.DataFrame, offered_brands: List,
	                             brand_belong_category_dict: Dict, score: str, threshold: float = 0.0) -> Optional[pd.DataFrame]:
	    """Find offers based on a category identified from search input.

	    Args:
	    - search_input: a string
	    - search_category_tuple: a tuple of (upper_category, product_category)
	    - category_dict: a dictionary of categories. Keys are upper categories and values are lists of product categories
	    - offers: a dataframe of offers; the OFFER and BRAND columns are read here
	    - offered_brands: the brands that currently have offers
	    - brand_belong_category_dict: maps an upper-case category name to its brands
	    - score: a string of either 'cosine' or 'jaccard'
	    - threshold: minimum similarity score (inclusive) an offer needs to be returned

	    Returns a dataframe of similar offers, ordered by highest score, or None
	    when the product category has no entry in ``brand_belong_category_dict``.

	    Raises CatchErros.ParamsInputError when brands were found but ``score`` is
	    neither 'cosine' nor 'jaccard'.
	    """
	    # we assume people just search one category at a time
	    upper_category, product_category = search_category_tuple[0], search_category_tuple[1] # ('Alcohol', 'beer')
	    print(f'Function offer_finder_by_category | Found items:\n- Search input: {search_input}\n- Product category: {product_category}\n- Upper category: {upper_category}')
	    potential_brands = category_to_brand(product_category, offered_brands, brand_belong_category_dict)
	    if potential_brands is None:
	        potential_product_categories = category_dict[upper_category]
	        # we can still calculate similarity but this is computationally expensive
	        msg = f'{product_category} is not found. Do you wanna take a look at these similar offers in {upper_category}?\n We have: {potential_product_categories}'
	        print(msg)
	        return None

	    potential_offers = offers[offers['BRAND'].isin(potential_brands)]['OFFER'].tolist()
	    if score == 'cosine':
	        sim_scores = get_cosine_sim(search_input, potential_offers)
	    elif score == 'jaccard':
	        sim_scores = get_jaccard_sim(search_input, potential_offers)
	    else:
	        # The exception is referenced through CatchErros, where it is defined.
	        raise CatchErros.ParamsInputError(f'Please enter a valid score: cosine or jaccard; Not {score}')
	    return extract_similar_offers(sim_scores, threshold)

	def offer_finder_by_entity(search_input: str, entities: Tuple, offers_data: pd.DataFrame, score: str, threshold: float=0.0) -> Optional[pd.DataFrame]:
	    """Find offers based on entities identified from search input.

	    Args:
	    - search_input: a string
	    - entities: a tuple of entities; each must expose ``text`` and ``label_``
	    - offers_data: a dataframe of offers. The upper-cased entity label selects
	      the column to filter on; the OFFER column supplies the candidate texts
	    - score: a string of either 'cosine' or 'jaccard'
	    - threshold: minimum similarity score (inclusive) an offer needs to be returned

	    Returns a dataframe of similar offers collected over all entities, ordered
	    by highest score, or None when no entity matched any offer.

	    Raises ValueError when offers were found but ``score`` is neither 'cosine'
	    nor 'jaccard'.
	    """
	    collects = [] # collect all the results if there are more than one entity
	    for ent in entities:
	        ent_name, ent_label = ent.text, ent.label_
	        print(f'Function offer_finder_by_entity | Found entity: {ent_name} with label: {ent_label}')

	        column = ent_label.upper()
	        if column not in offers_data.columns:
	            # An entity type without a matching column cannot be used as a
	            # filter; skip it instead of failing the search with a KeyError.
	            print(f'Function offer_finder_by_entity | Skipping entity {ent_name}: offers data has no column {column}')
	            continue

	        # filter offers by entity
	        df_tmp = offers_data[offers_data[column] == ent_name.upper()]
	        if df_tmp.shape[0] == 0:
	            print(f'Function offer_finder_by_entity | No offer is found for the brand/retailer: {ent_name}')
	            continue

	        print(f'Function offer_finder_by_entity | Found {df_tmp.shape[0]} offer(s) for the brand/retailer: {ent_name}')
	        potential_offers = df_tmp['OFFER'].drop_duplicates().tolist()
	        if score == 'cosine':
	            sim_scores = get_cosine_sim(search_input, potential_offers)
	        elif score == 'jaccard':
	            sim_scores = get_jaccard_sim(search_input, potential_offers)
	        else:
	            raise ValueError(f'Please enter a valid score: cosine or jaccard; Not {score}')
	        collects.append(extract_similar_offers(sim_scores, threshold))

	    if not collects:
	        print('###' * 5 + 'FINAL SEARCH RESULTS' + '###' * 5)
	        print('Function offer_finder_by_entity | No offer is found for any of the entities.')
	        return None

	    # Every collected frame was scored with the same measure, so the score
	    # column name can be taken from the first one.
	    final_output = pd.concat(collects, ignore_index=True)
	    score_column = find_column(collects[0], 'score')
	    return final_output.sort_values(by=score_column, ascending=False).reset_index(drop=True)


	def search_offers(search_input: str=example_search, offers: pd.DataFrame=df_offers_brand_retailer, offer_brands: List=offered_brands,
	                  category_dict: Dict=category_dict, brand_belong_category_dict: Dict=brand_belong_category_dict,
	                  score: str="jaccard", score_threshold: float = 0.0) -> Optional[pd.DataFrame]:
	    """Main function. Takes in a search input, decides whether it contains entities and runs the matching search.

	    Entities are tried first. When the input has no entity, or no offer matches
	    any of its entities, the search falls back to the product category found
	    in the input.

	    Inputs:
	    - search_input: a string that a user enters
	    - offers: a dataframe of offers (OFFER, BRAND, RETAILER) that are available in our database
	    - offer_brands: the brands that currently have offers
	    - category_dict: a dictionary of categories. Keys are upper categories and values are lists of product categories
	    - brand_belong_category_dict: maps an upper-case category name to its brands
	    - score: a string of either 'cosine' or 'jaccard'
	    - score_threshold: minimum similarity score (inclusive) an offer needs to be returned

	    Returns a dataframe of similar offers, ordered by highest score, or None
	    when neither an entity nor a category leads to any offer.
	    """
	    print(f'Function main | Search input: {search_input}')
	    # The pipeline is run once and its result reused for both the check and
	    # the entity list.
	    check_ent = check_entity(search_input)
	    if check_ent:
	        entities = check_ent.ents # entities will be a tuple anyways
	        print(f'Found {len(entities)} entity object(s) in the search input.')
	        search_results = offer_finder_by_entity(search_input, entities, offers, score, score_threshold)
	        if search_results is not None:
	            return search_results
	        print('No offers matched retailer/category is found. Now trying to recommend based on category.')

	    # Category search: used when there is no entity or the entity search was empty.
	    # we assume people just search one category at a time
	    cat_tuple = find_category(search_input, category_dict) # ('Alcohol', 'beer')
	    if cat_tuple is None:
	        print('No brand/retailer/category is found. Please try again.')
	        return None
	    # Pass the caller's ``offer_brands`` on, not the module-level default.
	    return offer_finder_by_category(search_input, cat_tuple, category_dict, offers, offer_brands, brand_belong_category_dict, score, score_threshold)

	if __name__ == "__main__":
	    # Run one search with all default arguments when executed as a script.
	    search_offers()