# Source path: Product_Matching/preprocess/preprocess.py
# Uploaded by Gainward777 ("Upload 22 files"), revision 1f22e94 (verified).
import json
from tqdm import tqdm
from preprocess.utils.items.attrs import *
from preprocess.utils.common.extracters import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.parallel_brand_matching import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.top_inserts import *
import pandas as pd
class Preprocessor:
    """Prepare the items and products catalogues for product matching.

    The class stores the vocabularies and merge dictionaries passed to the
    constructor and uses them, together with helper functions star-imported
    from ``preprocess.utils``, to build two DataFrames with the same column
    layout (see ``_COLUMNS``) and to normalise brands, types and names.
    """

    # Column layout shared by the frames built in process_items/process_products.
    _COLUMNS = ('id', 'brand', 'name', 'fullname', 'type', 'type_wine',
                'volume', 'year', 'alco')

    # Word dropped (first occurrence only) from the combined ``types_n_others`` list.
    _EXCLUDED_TRIM_WORD = "Шерри"

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict,
                 country_list, normalized_names_dict):
        """Store the vocabularies and merge dictionaries used by the pipeline.

        ``long_types_list``, ``other_words``, ``sour_list`` and
        ``country_list`` are concatenated with ``+`` (so they are expected to
        be lists) into ``self.types_n_others``; the inputs themselves are not
        modified. The remaining arguments are stored as given.
        """
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words

        combined = long_types_list + other_words + sour_list + country_list
        # list.remove raises ValueError when the value is absent, so only
        # call it when the word is really there.
        if self._EXCLUDED_TRIM_WORD in combined:
            combined.remove(self._EXCLUDED_TRIM_WORD)
        self.types_n_others = combined

        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list
        self.normalized_names_dict = normalized_names_dict

    @classmethod
    def _empty_result(cls):
        """Return a fresh ``{column: []}`` accumulator in ``_COLUMNS`` order."""
        return {column: [] for column in cls._COLUMNS}

    @staticmethod
    def _append_row(result, row):
        """Append one complete row (a dict with every column) to ``result``."""
        for column, values in result.items():
            values.append(row[column])

    def _detect_drink_type(self, attrs):
        """Return the first non-None drink type produced by the helper chain.

        The helpers are tried in a fixed order and each one is only called
        when all previous ones returned None.
        """
        detectors = (
            lambda: get_type(attrs, self.long_types_list),
            lambda: check_spark(attrs),
            lambda: check_color_and_sour(attrs),
            lambda: check_spark(attrs, col_name='type_wine'),
            lambda: check_color_and_sour(attrs, types=self.sour),
            lambda: check_color_and_sour(attrs, col_name='name'),
        )
        for detect in detectors:
            drink_type = detect()
            if drink_type is not None:
                return drink_type
        return None

    def preprocess_name(self, name: str) -> str:
        """Return ``name`` with every newline replaced by a single space."""
        return name.replace("\n", " ")

    def process_items(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the normalised items frame from ``df['id']`` and ``df['attrs']``.

        Each ``attrs`` value is parsed with ``json.loads``. ``brand`` and
        ``type_wine`` are taken from the parsed attributes when present
        (otherwise None); ``volume`` and ``year`` are taken from the
        attributes when present, otherwise extracted from the raw name;
        ``alco`` is always extracted from the raw name.

        A row that raises at any point is reported with ``print`` and skipped
        as a whole, so all columns keep the same length and the final
        ``pd.DataFrame`` construction cannot fail on ragged columns.
        """
        result = self._empty_result()
        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                attrs = json.loads(raw_attrs)
                brand = attrs['brand'] if 'brand' in attrs.keys() else None
                raw_name = attrs['name']
                name = self.preprocess_name(raw_name)
                drink_type = self._detect_drink_type(attrs)

                if 'volume' in attrs.keys():
                    volume = attrs['volume']
                else:
                    volume = extract_volume_or_number(raw_name)

                if 'year' in attrs.keys():
                    year = attrs['year']
                else:
                    year = extract_production_year(raw_name)

                alco = extract_alcohol_content(raw_name)
                type_wine = attrs['type_wine'] if 'type_wine' in attrs.keys() else None

                row = {
                    'id': idf,
                    'brand': brand,
                    'name': name,
                    'fullname': name,
                    'type': drink_type,
                    'type_wine': type_wine,
                    'volume': volume,
                    'year': year,
                    'alco': alco,
                }
            except Exception as ex:
                # Best-effort: report the broken record and move on.
                print(idf, ex)
                continue
            # Only a fully built row reaches the accumulator.
            self._append_row(result, row)
        return pd.DataFrame(result)

    def process_products(self, products: pd.DataFrame) -> pd.DataFrame:
        """Build the normalised products frame from the raw products frame.

        Reads the columns ``id``, ``brand``, ``category``, ``product_type``,
        ``name_long``, ``name`` and ``name_postfix``. Volume and alcohol are
        extracted from ``name``; the year from ``str(name_postfix)``.

        A row that raises is reported with ``print`` and skipped as a whole,
        so the resulting columns always have equal length.
        """
        result = self._empty_result()
        for _, row in tqdm(products.iterrows()):
            try:
                record = {
                    'id': row['id'],
                    'brand': row['brand'],
                    'type_wine': row['category'],
                    'type': row['product_type'],
                    'name': row['name_long'],
                    'fullname': row['name_long'],
                    'volume': extract_volume_or_number(row['name']),
                    'year': extract_production_year(str(row['name_postfix'])),
                    'alco': extract_alcohol_content(row['name']),
                }
            except Exception as ex:
                # Best-effort: report the broken record and move on.
                print(ex)
                continue
            self._append_row(result, record)
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip recognised attributes out of ``text`` and return them.

        In order, the following are looked up and, when found, removed from
        the text: a word from ``self.gbs``, the alcohol content, the volume
        (or number), the "years" value together with the words
        'выдержка' / 'Выдержка' / 'aging', the production year, a word from
        ``self.colors_ft`` and a word from ``self.sour``.

        Returns:
            A tuple ``(text, alcohol, volume_or_number, years,
            production_year, gb, color, sour)`` where ``text`` has been passed
            through ``remove_quotes`` and every other element is the value
            found by the corresponding helper (None when nothing was found).
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')

        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            # str() keeps this branch consistent with the volume branch below
            # and avoids an AttributeError should the helper return a number.
            alcohol_str = str(alcohol)
            text = text.replace(alcohol_str, '').replace(alcohol_str.replace('.', ','), '')

        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_str = str(volume_or_number)
            text = text.replace(volume_str, '').replace(volume_str.replace('.', ','), '')
            # NOTE(review): the nesting of the next three statements under this
            # `if` was reconstructed (the original indentation was lost) -
            # confirm against the upstream file.
            # NOTE(review): the '.' in the first pattern is unescaped and so
            # matches any character, not only a literal dot - confirm intent.
            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
            # NOTE(review): the original stored this result in an unused local
            # named `test` (likely a typo for `text`). The call is kept and the
            # result is still discarded so that the output does not change.
            clean_wine_name(text)

        years = extract_years(text)
        if years is not None:
            text = (text.replace(str(years), '')
                        .replace('выдержка', '')
                        .replace('Выдержка', '')
                        .replace('aging', ''))

        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')

        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')

        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')

        return (remove_quotes(text), alcohol, volume_or_number, years,
                production_year, gb, color, sour)

    def process(self, products, items):
        """Run the whole preprocessing pipeline on both catalogues.

        Both inputs are copied before use. The steps, in order: build the
        normalised frames, lower-case/strip brands, map item brands onto
        product brands (several helper passes), look for brands inside item
        names, run the "top inserts" step, merge wine/product types, trim the
        names (adding ``gb`` and ``sour`` columns) and finally replace the
        product types via ``self.type_dict``.

        Several helpers are called only for their effect on the frames passed
        to them (their return value is ignored) - presumably they modify the
        frames in place; e.g. the ``new_brand`` column read below is not
        created in this method. The statement order is therefore significant.

        Returns:
            The tuple ``(items, products)`` of processed DataFrames.
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())

        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())

        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products['brand'].unique(), items)

        print('-----*-----Brand matching-----*-----')
        comp_list, _prod_brand_list, items_brand_list = get_same_brands(products, items)
        # Item brands that have no exact counterpart among product brands.
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type'] = items['type'].replace(self.type_dict)

        print('-----*-----Unwrap brand cats step 1-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        # Second pass over the already-updated product brands.
        print('-----*-----Unwrap brand cats step 2-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Finding brands in names-----*-----')
        # Brands were stringified above, so a missing brand shows up as 'none'.
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items[items['new_brand'].isna()]['name'].values
        p_brands = [b for b in products['brand'].unique() if b is not None and len(b) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        found_mask = items['name'].isin(new_found_brands.keys())
        items.loc[found_mask, 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list,
                                self.grapes, self.other_words)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products, type_merge_dict=self.type_dict)

        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour = name_trimmer(items, self.prcess_text, self.types_n_others)
        trimmed_items_mask = items['id'].isin(item_timed_names.keys())
        items.loc[trimmed_items_mask, 'name'] = items['id'].map(item_timed_names)
        items['gb'] = gb
        items['sour'] = sour
        items['sour'] = items['sour'].replace(self.sour_dict)

        products_trimed_names, gb, sour = name_trimmer(products, self.prcess_text, self.types_n_others)
        trimmed_products_mask = products['id'].isin(products_trimed_names.keys())
        products.loc[trimmed_products_mask, 'name'] = products['id'].map(products_trimed_names)
        products['gb'] = gb
        products['sour'] = sour
        products['sour'] = products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)

        return items, products