import json
import re
from tqdm import tqdm
from preprocess.utils.items.attrs import *
from preprocess.utils.common.extracters import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.parallel_brand_matching import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.top_inserts import *
import pandas as pd


class Preprocessor():
    """Normalise the ``items`` and ``products`` catalogues before matching.

    The instance only stores the vocabularies and merge dictionaries passed
    to the constructor; the actual extraction and matching is delegated to
    helper functions pulled in by the wildcard imports at the top of the file.
    """

    # Column order shared by the frames built in process_items / process_products.
    _RESULT_COLUMNS = ('id', 'brand', 'name', 'fullname', 'type',
                       'type_wine', 'volume', 'year', 'alco')

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict,
                 country_list, normalized_names_dict):
        """Store the vocabularies and merge dictionaries used by the pipeline.

        ``types_n_others`` is the concatenation of ``long_types_list``,
        ``other_words``, ``sour_list`` and ``country_list`` with the first
        occurrence of "Шерри" removed.
        """
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words

        # List concatenation builds a new list, so the caller's lists are
        # not mutated by the removal below.
        self.types_n_others = (long_types_list + other_words
                               + sour_list + country_list)
        # Guarded: a bare list.remove() raises ValueError when the word is
        # absent from the supplied vocabularies.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")

        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list
        self.normalized_names_dict = normalized_names_dict

    def preprocess_name(self, name):
        """Return ``name`` with every newline replaced by a single space."""
        return name.replace("\n", " ")

    @classmethod
    def _empty_result(cls):
        """Return a fresh ``{column: []}`` accumulator for a result frame."""
        return {column: [] for column in cls._RESULT_COLUMNS}

    @classmethod
    def _append_row(cls, result, row):
        """Append a fully built ``row`` dict to every column of ``result``."""
        for column in cls._RESULT_COLUMNS:
            result[column].append(row[column])

    def _detect_drink_type(self, attrs):
        """Return the first non-None drink type from the fallback chain.

        The detectors are tried in a fixed order and evaluation stops at the
        first one that returns something other than ``None``.
        """
        detectors = (
            lambda: get_type(attrs, self.long_types_list),
            lambda: check_spark(attrs),
            lambda: check_color_and_sour(attrs),
            lambda: check_spark(attrs, col_name='type_wine'),
            lambda: check_color_and_sour(attrs, types=self.sour),
            lambda: check_color_and_sour(attrs, col_name='name'),
        )
        for detect in detectors:
            drink_type = detect()
            if drink_type is not None:
                return drink_type
        return None

    def _build_item_row(self, item_id, attrs):
        """Build one result row from the decoded ``attrs`` of an item.

        Raises whatever the lookups or extractors raise (e.g. ``KeyError``
        when ``attrs`` has no ``'name'``); the caller decides how to handle it.
        """
        raw_name = attrs['name']
        name = self.preprocess_name(raw_name)

        # Explicit attributes win; otherwise fall back to parsing the raw name.
        if 'volume' in attrs:
            volume = attrs['volume']
        else:
            volume = extract_volume_or_number(raw_name)
        if 'year' in attrs:
            year = attrs['year']
        else:
            year = extract_production_year(raw_name)

        return {
            'id': item_id,
            'brand': attrs.get('brand'),
            'name': name,
            'fullname': name,
            'type': self._detect_drink_type(attrs),
            'type_wine': attrs.get('type_wine'),
            'volume': volume,
            'year': year,
            'alco': extract_alcohol_content(raw_name),
        }

    def process_items(self, df):
        """Flatten the JSON ``attrs`` column of ``df`` into a tidy DataFrame.

        Reads ``df['id']`` and ``df['attrs']`` (a JSON string per row).
        A row that fails to decode or to build is reported with ``print``
        and skipped as a whole, so all output columns stay the same length.
        """
        result = self._empty_result()
        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                row = self._build_item_row(idf, json.loads(raw_attrs))
            except Exception as ex:
                print(idf, ex)
                continue
            # Append only after every field was computed: partial appends
            # would leave ragged columns and break pd.DataFrame() below.
            self._append_row(result, row)
        return pd.DataFrame(result)

    def process_products(self, products):
        """Project the ``products`` frame onto the shared result columns.

        Reads the columns ``id``, ``brand``, ``category``, ``product_type``,
        ``name_long``, ``name`` and ``name_postfix``. A row that raises is
        reported with ``print`` and skipped as a whole.
        """
        result = self._empty_result()
        for _, row in tqdm(products.iterrows()):
            try:
                record = {
                    'id': row['id'],
                    'brand': row['brand'],
                    'type_wine': row['category'],
                    'type': row['product_type'],
                    'name': row['name_long'],
                    'fullname': row['name_long'],
                    'volume': extract_volume_or_number(row['name']),
                    # The year is parsed from the stringified name postfix.
                    'year': extract_production_year(str(row['name_postfix'])),
                    'alco': extract_alcohol_content(row['name']),
                }
            except Exception as ex:
                print(ex)
                continue
            self._append_row(result, record)
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip recognised attributes out of ``text`` and return them.

        Returns the tuple ``(remove_quotes(text), alcohol, volume_or_number,
        years, production_year, gb, color, sour)`` where ``text`` has had
        every recognised fragment removed. Each extracted value is ``None``
        when the corresponding helper found nothing.
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')

        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            # Remove both the dotted and the comma-separated spelling.
            alco_w_comma = alcohol.replace('.', ',')
            text = text.replace(str(alcohol), '').replace(str(alco_w_comma), '')

        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_with_comma = str(volume_or_number).replace('.', ',')
            text = (text.replace(str(volume_or_number), '')
                        .replace(str(volume_with_comma), ''))
            # Drop a dangling "л"/"l" token (optionally followed by one more
            # character) that is left behind once the number is removed.
            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
            # NOTE(review): the original assigned this result to an unused
            # local named `test`, so the cleaned name is discarded. Possibly
            # a typo for `text = ...`; behaviour is kept as-is - confirm.
            clean_wine_name(text)

        years = extract_years(text)
        if years is not None:
            text = (text.replace(str(years), '')
                        .replace('выдержка', '')
                        .replace('Выдержка', '')
                        .replace('aging', ''))

        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')

        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')

        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')

        return (remove_quotes(text), alcohol, volume_or_number, years,
                production_year, gb, color, sour)

    def _trim_names(self, frame):
        """Trim ``frame['name']`` in place and add ``gb`` / ``sour`` columns."""
        trimmed_names, gb, sour = name_trimmer(frame, self.prcess_text,
                                               self.types_n_others)
        # Only rows whose id got a trimmed name are overwritten.
        has_trimmed = frame['id'].isin(trimmed_names.keys())
        frame.loc[has_trimmed, 'name'] = frame['id'].map(trimmed_names)
        frame['gb'] = gb
        frame['sour'] = sour
        frame['sour'] = frame['sour'].replace(self.sour_dict)

    def process(self, products, items):
        """Run the full preprocessing pipeline and return ``(items, products)``.

        Both inputs are copied before processing, so the caller's frames are
        not modified. Progress banners are written with ``print``.
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())

        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())

        # str() turns missing brands into the literal 'none' / 'nan' strings.
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(
            lambda x: str(x).strip().lower())

        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        # Presumably adds the 'new_brand' column to `items` in place, since
        # that column is read right below - verify against the helper.
        fill_brands_in_dataframe(products['brand'].unique(), items)

        print('-----*-----Brand matching-----*-----')
        comp_list, prod_brand_list, items_brand_list = get_same_brands(
            products, items)
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(
            out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)
        items['type'] = items['type'].replace(self.type_dict)

        # The same unwrap pass is applied twice; the second run works on the
        # brands already rewritten by the first one.
        for step in (1, 2):
            print(f'-----*-----Unwrap brand cats step {step}-----*-----')
            unwrap_b_match = unwrap_brands(products)
            items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
            products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Finding brands in names-----*-----')
        # Dict form on purpose: older pandas treats replace('none', None) as
        # "value omitted" and forward-fills instead of inserting None.
        items['new_brand'] = items['new_brand'].replace({'none': None})
        i_brands = items[items['new_brand'].isna()]['name'].values
        p_brands = [brand for brand in products['brand'].unique()
                    if brand is not None and len(brand) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        found_in_name = items['name'].isin(new_found_brands.keys())
        items.loc[found_in_name, 'new_brand'] = items['name'].map(
            new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text,
                                self.short_types_list,
                                self.grapes, self.other_words)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine,
                        color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict)
        merge_wine_type(products, colors=self.type_wine,
                        color_merge_dict=self.color_merge_dict)
        merge_types(products, products, type_merge_dict=self.type_dict)

        print('-----*-----Name trimming-----*-----')
        self._trim_names(items)
        self._trim_names(products)

        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)

        return items, products