Spaces:

Gainward777
/

Product_Matching

Sleeping

App Files Files Community

Product_Matching / preprocess /preprocess.py

Gainward777

Upload 22 files

1f22e94 verified 10 months ago

raw

history blame contribute delete

11.2 kB

	import json
	from tqdm import tqdm
	from preprocess.utils.items.attrs import *
	from preprocess.utils.common.extracters import *
	from preprocess.utils.common.brand_matching import *
	from preprocess.utils.common.parallel_brand_matching import *
	from preprocess.utils.common.utils import *
	from preprocess.utils.common.top_inserts import *
	import pandas as pd



	class Preprocessor:
	    """Normalise the item and product catalogues before matching.

	    The constructor only stores the vocabularies and merge dictionaries it
	    is given; all heavy lifting is delegated to helper functions imported
	    at the top of the file.
	    """

	    # Column order of the frames produced by process_items/process_products.
	    _COLUMNS = ('id', 'brand', 'name', 'fullname', 'type', 'type_wine',
	                'volume', 'year', 'alco')

	    def __init__(self, long_types_list, short_types_list, sour_list,
	                 type_wine, gbs, colors_for_trim, grapes, other_words,
	                 sour_merge_dict, type_merge_dict, color_merge_dict,
	                 country_list, normalized_names_dict):
	        """Store the word lists and merge dictionaries used by the pipeline.

	        ``types_n_others`` is built as a new list
	        (long types + other words + sour list + countries) with the first
	        occurrence of "Шерри" dropped when it is present; the input lists
	        are not modified.
	        """
	        self.long_types_list = long_types_list
	        self.short_types_list = short_types_list
	        self.sour = sour_list
	        self.type_wine = type_wine
	        self.gbs = gbs
	        self.colors_ft = colors_for_trim
	        self.grapes = grapes
	        self.other_words = other_words

	        self.types_n_others = (long_types_list + other_words
	                               + sour_list + country_list)
	        # list.remove raises ValueError for a missing value, so only remove
	        # the word when the supplied vocabularies actually contain it.
	        if "Шерри" in self.types_n_others:
	            self.types_n_others.remove("Шерри")

	        self.sour_dict = sour_merge_dict
	        self.type_dict = type_merge_dict
	        self.color_merge_dict = color_merge_dict
	        self.country_list = country_list
	        self.normalized_names_dict = normalized_names_dict

	    def preprocess_name(self, name):
	        """Return *name* with every newline replaced by a single space."""
	        return name.replace("\n", " ")

	    @classmethod
	    def _empty_result(cls):
	        """Return a fresh ``{column: []}`` accumulator in output order."""
	        return {column: [] for column in cls._COLUMNS}

	    @staticmethod
	    def _append_row(result, row):
	        """Append one fully built row to every column list of *result*."""
	        for column, values in result.items():
	            values.append(row[column])

	    def _detect_drink_type(self, attrs):
	        """Return the first non-None drink type from the fallback chain."""
	        detectors = (
	            lambda: get_type(attrs, self.long_types_list),
	            lambda: check_spark(attrs),
	            lambda: check_color_and_sour(attrs),
	            lambda: check_spark(attrs, col_name='type_wine'),
	            lambda: check_color_and_sour(attrs, types=self.sour),
	            lambda: check_color_and_sour(attrs, col_name='name'),
	        )
	        for detect in detectors:
	            drink_type = detect()
	            if drink_type is not None:
	                return drink_type
	        return None

	    def _item_row(self, item_id, raw_attrs):
	        """Build the output row for one item from its JSON ``attrs`` string.

	        Raises whatever the JSON decoding, the key lookups or the extractor
	        helpers raise; the caller decides how to handle a bad row.
	        """
	        attrs = json.loads(raw_attrs)
	        raw_name = attrs['name']
	        name = self.preprocess_name(raw_name)
	        drink_type = self._detect_drink_type(attrs)

	        # Explicit attributes win; otherwise fall back to parsing the raw
	        # (not newline-normalised) name, exactly as the extractors expect.
	        if 'volume' in attrs:
	            volume = attrs['volume']
	        else:
	            volume = extract_volume_or_number(raw_name)
	        if 'year' in attrs:
	            year = attrs['year']
	        else:
	            year = extract_production_year(raw_name)
	        alco = extract_alcohol_content(raw_name)

	        return {
	            'id': item_id,
	            'brand': attrs['brand'] if 'brand' in attrs else None,
	            'name': name,
	            'fullname': name,
	            'type': drink_type,
	            'type_wine': attrs['type_wine'] if 'type_wine' in attrs else None,
	            'volume': volume,
	            'year': year,
	            'alco': alco,
	        }

	    def process_items(self, df):
	        """Convert the raw items frame into the normalised items frame.

	        Reads the ``id`` and ``attrs`` columns of *df*; ``attrs`` is parsed
	        as JSON. A row that fails at any point is reported with ``print``
	        and skipped as a whole, so all output columns keep the same length.

	        Returns a DataFrame with the columns listed in ``_COLUMNS``.
	        """
	        result = self._empty_result()
	        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
	            try:
	                row = self._item_row(idf, raw_attrs)
	            except Exception as ex:
	                # Best-effort import: report the offending id and move on.
	                print(idf, ex)
	                continue
	            self._append_row(result, row)
	        return pd.DataFrame(result)

	    @staticmethod
	    def _product_row(row):
	        """Build the output row for one product record."""
	        return {
	            'id': row['id'],
	            'brand': row['brand'],
	            'name': row['name_long'],
	            'fullname': row['name_long'],
	            'type': row['product_type'],
	            'type_wine': row['category'],
	            'volume': extract_volume_or_number(row['name']),
	            # The year is looked up in the stringified name postfix.
	            'year': extract_production_year(str(row['name_postfix'])),
	            'alco': extract_alcohol_content(row['name']),
	        }

	    def process_products(self, products):
	        """Convert the raw products frame into the normalised products frame.

	        Reads ``id``, ``brand``, ``category``, ``product_type``,
	        ``name_long``, ``name`` and ``name_postfix`` from every row. A row
	        that fails is reported with ``print`` and skipped as a whole, so all
	        output columns keep the same length.

	        Returns a DataFrame with the columns listed in ``_COLUMNS``.
	        """
	        result = self._empty_result()
	        for _, row in tqdm(products.iterrows()):
	            try:
	                built = self._product_row(row)
	            except Exception as ex:
	                print(ex)
	                continue
	            self._append_row(result, built)
	        return pd.DataFrame(result)

	    def prcess_text(self, text):
	        """Strip recognised attributes out of *text* and return them.

	        Each attribute that is found (GB word, alcohol content, volume,
	        aging years, production year, colour, sour word) is removed from
	        the text before the next one is searched for.

	        Returns a tuple ``(cleaned_text, alcohol, volume_or_number, years,
	        production_year, gb, color, sour)`` where ``cleaned_text`` has been
	        passed through ``remove_quotes`` and every other element is None
	        when the corresponding helper found nothing.
	        """
	        gb = find_full_word(text, self.gbs)
	        if gb is not None:
	            text = text.replace(str(gb), '')

	        alcohol = extract_alcohol_content(text)
	        if alcohol is not None:
	            # Remove both the dotted and the comma-decimal spelling.
	            alcohol_with_comma = alcohol.replace('.', ',')
	            text = (text.replace(str(alcohol), '')
	                        .replace(str(alcohol_with_comma), ''))

	        volume_or_number = extract_volume_or_number(text)
	        if volume_or_number is not None:
	            volume_with_comma = str(volume_or_number).replace('.', ',')
	            text = (text.replace(str(volume_or_number), '')
	                        .replace(str(volume_with_comma), ''))
	            # NOTE(review): the two substitutions below are assumed to
	            # belong to this branch (the source had a disabled `else:`
	            # right after them) - confirm against the intended layout.
	            # They drop a whitespace-preceded л/Л/l/L token, first with one
	            # trailing character, then bare (presumably the litre unit).
	            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
	            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
	            # NOTE(review): the original stored this result in an unused
	            # local named `test`, so the cleaned value never reached
	            # `text`. That looks like a typo for `text`; behaviour is kept
	            # unchanged here until the intent is confirmed.
	            clean_wine_name(text)

	        years = extract_years(text)
	        if years is not None:
	            text = (text.replace(str(years), '')
	                        .replace('выдержка', '')
	                        .replace('Выдержка', '')
	                        .replace('aging', ''))

	        production_year = extract_production_year(text)
	        if production_year is not None:
	            text = text.replace(str(production_year), '')

	        color = find_full_word(text, self.colors_ft)
	        if color is not None:
	            text = text.replace(str(color), '')

	        sour = find_full_word(text, self.sour)
	        if sour is not None:
	            text = text.replace(str(sour), '')

	        return (remove_quotes(text), alcohol, volume_or_number, years,
	                production_year, gb, color, sour)

	    def process(self, products, items):
	        """Run the full preprocessing pipeline on both catalogues.

	        Works on copies of the inputs, prints a banner before each stage,
	        and returns ``(items, products)`` as the two processed DataFrames.
	        """
	        print('-----------Prepare items catalogue----------')
	        items = self.process_items(items.copy())
	        print('----------Prepare products catalogue----------')
	        products = self.process_products(products.copy())

	        # Brands are compared as stripped, lower-cased strings from here on
	        # (a missing brand therefore becomes the string 'none').
	        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
	        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

	        print('----------Split n match----------')
	        splited = split_n_match(products, items)
	        items["brand"] = items["brand"].replace(splited)

	        print('----------Fill brands in items----------')
	        # Called for its effect on `items`; the `new_brand` column used
	        # below is presumably created here - verify in the helper.
	        fill_brands_in_dataframe(products['brand'].unique(), items)

	        print('----------Brand matching----------')
	        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
	        # Item brands that have no identical product brand.
	        out_items = list(set(items_brand_list) - set(comp_list))
	        brand_map_improved = match_brands_improved(
	            out_items, list(products['brand'].unique()))
	        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

	        items['type'] = items['type'].replace(self.type_dict)

	        print('----------Unwrap brand cats step 1----------')
	        unwrap_b_match = unwrap_brands(products)
	        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
	        products["brand"] = products["brand"].replace(unwrap_b_match)

	        print('----------Unwrap brand cats step 2----------')
	        # Second pass over the product brands rewritten by step 1.
	        unwrap_b_match = unwrap_brands(products)
	        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
	        products["brand"] = products["brand"].replace(unwrap_b_match)

	        print('----------Finding brands in names----------')
	        items['new_brand'] = items['new_brand'].replace('none', None)
	        i_brands = items[items['new_brand'].isna()]['name'].values
	        # Only product brands longer than three characters are searched for.
	        p_brands = [i for i in products['brand'].unique()
	                    if i is not None and len(i) > 3]
	        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
	        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = \
	            items['name'].map(new_found_brands)

	        print('----------Top inserts----------')
	        process_unbrended_names(items, p_brands, self.prcess_text,
	                                self.short_types_list,
	                                self.grapes, self.other_words)

	        print('----------Adding service categories----------')
	        merge_wine_type(items, colors=self.type_wine,
	                        color_merge_dict=self.color_merge_dict)
	        merge_types(items, products, type_merge_dict=self.type_dict)
	        merge_wine_type(products, colors=self.type_wine,
	                        color_merge_dict=self.color_merge_dict)
	        merge_types(products, products, type_merge_dict=self.type_dict)

	        print('----------Name trimming----------')
	        item_timed_names, gb, sour = name_trimmer(
	            items, self.prcess_text, self.types_n_others)
	        # Overwrite the name only for ids the trimmer returned a value for.
	        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = \
	            items['id'].map(item_timed_names)
	        items['gb'] = gb
	        items['sour'] = sour
	        items['sour'] = items['sour'].replace(self.sour_dict)

	        products_trimed_names, gb, sour = name_trimmer(
	            products, self.prcess_text, self.types_n_others)
	        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = \
	            products['id'].map(products_trimed_names)
	        products['gb'] = gb
	        products['sour'] = sour
	        products['sour'] = products['sour'].replace(self.sour_dict)

	        print('----------Replacing product types----------')
	        products['type'] = products['type'].replace(self.type_dict)

	        return items, products