Spaces:
Sleeping
Sleeping
| import json | |
| from tqdm import tqdm | |
| from preprocess.utils.items.attrs import * | |
| from preprocess.utils.common.extracters import * | |
| from preprocess.utils.common.brand_matching import * | |
| from preprocess.utils.common.parallel_brand_matching import * | |
| from preprocess.utils.common.utils import * | |
| from preprocess.utils.common.top_inserts import * | |
| import pandas as pd | |
class Preprocessor():
    """Normalise the items and products catalogues before matching.

    The class keeps the word lists and merge dictionaries handed to the
    constructor and exposes:

    * ``process_items`` / ``process_products`` - flatten each catalogue into
      a DataFrame with the columns listed in ``_RESULT_COLUMNS``;
    * ``prcess_text`` - strip extracted attributes out of a product name;
    * ``process`` - the full pipeline that runs the steps above plus the
      brand-matching / name-trimming helpers imported at module level.
    """

    # Column order shared by process_items() and process_products().
    _RESULT_COLUMNS = ('id', 'brand', 'name', 'fullname', 'type',
                       'type_wine', 'volume', 'year', 'alco')

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict,
                 country_list, normalized_names_dict):
        """Store the vocabularies and merge dictionaries used by the pipeline.

        ``long_types_list``, ``other_words``, ``sour_list`` and
        ``country_list`` are concatenated with ``+`` and therefore must be
        lists; the concatenation is a new list, so the arguments themselves
        are not modified.
        """
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words
        self.types_n_others = long_types_list + other_words + sour_list + country_list
        # "Шерри" is excluded from the combined list. Guard the removal:
        # list.remove() raises ValueError when the value is absent, which
        # would make construction fail for inputs that never contained it.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")
        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list
        self.normalized_names_dict = normalized_names_dict

    def preprocess_name(self, name: str) -> str:
        """Return ``name`` with every newline replaced by a single space."""
        return name.replace("\n", " ")

    def _item_row(self, idf, raw_attrs):
        """Build one output row from an item id and its JSON ``attrs`` string.

        Raises whatever ``json.loads`` or the extractor helpers raise, and
        ``KeyError`` when the decoded attributes have no ``name``.
        """
        attrs = json.loads(raw_attrs)
        raw_name = attrs['name']
        name = self.preprocess_name(raw_name)

        # Try progressively looser sources for the drink type; the first
        # helper that returns something other than None wins.
        drink_type = get_type(attrs, self.long_types_list)
        if drink_type is None:
            drink_type = check_spark(attrs)
        if drink_type is None:
            drink_type = check_color_and_sour(attrs)
        if drink_type is None:
            drink_type = check_spark(attrs, col_name='type_wine')
        if drink_type is None:
            drink_type = check_color_and_sour(attrs, types=self.sour)
        if drink_type is None:
            drink_type = check_color_and_sour(attrs, col_name='name')

        # Explicit attributes take precedence; otherwise fall back to what
        # can be parsed out of the raw (not newline-normalised) name.
        if 'volume' in attrs:
            volume = attrs['volume']
        else:
            volume = extract_volume_or_number(raw_name)
        if 'year' in attrs:
            year = attrs['year']
        else:
            year = extract_production_year(raw_name)

        return {
            'id': idf,
            'brand': attrs.get('brand'),
            'name': name,
            'fullname': name,
            'type': drink_type,
            'type_wine': attrs.get('type_wine'),
            'volume': volume,
            'year': year,
            'alco': extract_alcohol_content(raw_name),
        }

    def process_items(self, df):
        """Flatten the items catalogue into a DataFrame.

        ``df`` must provide an ``id`` column and an ``attrs`` column holding
        JSON strings. Rows that fail to parse are reported with ``print`` and
        skipped as a whole, so the returned columns always have equal length.
        """
        result = {column: [] for column in self._RESULT_COLUMNS}
        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                row = self._item_row(idf, raw_attrs)
            except Exception as ex:
                # Best-effort: one broken record must not abort the batch.
                print(idf, ex)
                continue
            # Append only after the whole row was built, otherwise a failure
            # half-way would leave the column lists with different lengths
            # and pd.DataFrame(result) would raise.
            for column, values in result.items():
                values.append(row[column])
        return pd.DataFrame(result)

    def _product_row(self, row):
        """Build one output row from a products-catalogue row."""
        return {
            'id': row['id'],
            'brand': row['brand'],
            'name': row['name_long'],
            'fullname': row['name_long'],
            'type': row['product_type'],
            'type_wine': row['category'],
            'volume': extract_volume_or_number(row['name']),
            # The year is parsed from the stringified name_postfix column,
            # not from the name itself.
            'year': extract_production_year(str(row['name_postfix'])),
            'alco': extract_alcohol_content(row['name']),
        }

    def process_products(self, products):
        """Flatten the products catalogue into a DataFrame.

        Reads the columns ``id``, ``brand``, ``category``, ``product_type``,
        ``name_long``, ``name`` and ``name_postfix``. Rows that raise are
        reported with ``print`` and skipped as a whole, so the returned
        columns always have equal length.
        """
        result = {column: [] for column in self._RESULT_COLUMNS}
        for _, row in tqdm(products.iterrows()):
            try:
                parsed = self._product_row(row)
            except Exception as ex:
                print(ex)
                continue
            # Atomic append keeps every column list the same length.
            for column, values in result.items():
                values.append(parsed[column])
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip extracted attributes from ``text`` and return them.

        Returns the tuple ``(cleaned_text, alcohol, volume_or_number, years,
        production_year, gb, color, sour)``; every element after the first is
        ``None`` when the corresponding helper found nothing.
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')

        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            # Remove both the dot and the comma spelling of the number.
            alco_w_comma = alcohol.replace('.', ',')
            text = text.replace(str(alcohol), '').replace(str(alco_w_comma), '')

        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_with_comma = str(volume_or_number).replace('.', ',')
            text = text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
            # Drop the litre marker left behind once the number is gone.
            # NOTE(review): this cleanup is assumed to run only when a volume
            # was found - confirm against the original layout.
            # NOTE(review): the '.' in the first pattern is unescaped, so it
            # matches any character (e.g. " La", " Le"), not only a literal
            # dot - confirm this is intended.
            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
            # NOTE(review): the result of clean_wine_name() is discarded (the
            # original assigned it to an unused name `test`); this looks like
            # a typo for `text`, behaviour is kept as-is - confirm.
            clean_wine_name(text)

        years = extract_years(text)
        if years is not None:
            text = text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')

        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')

        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')

        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')

        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour

    def process(self, products, items):
        """Run the full preprocessing pipeline on both catalogues.

        Works on copies of the inputs and returns ``(items, products)`` as the
        processed DataFrames. The ``new_brand`` column of ``items`` is not
        created here - presumably ``fill_brands_in_dataframe`` adds it in
        place (TODO confirm).
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())

        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())

        # str() turns missing brands into the literal 'none' after lower();
        # that marker is converted back to a missing value further below.
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products['brand'].unique(), items)

        print('-----*-----Brand matching-----*-----')
        comp_list, _prod_brand_list, items_brand_list = get_same_brands(products, items)
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type'] = items['type'].replace(self.type_dict)

        # The unwrap step is applied twice, recomputing the mapping from the
        # already-updated products each time.
        print('-----*-----Unwrap brand cats step 1-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Unwrap brand cats step 2-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Finding brands in names-----*-----')
        # Dict form on purpose: with a scalar `to_replace`, pandas < 1.4
        # ignores an explicit value=None and forward-fills instead.
        items['new_brand'] = items['new_brand'].replace({'none': None})
        i_brands = items[items['new_brand'].isna()]['name'].values
        p_brands = [i for i in products['brand'].unique() if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list,
                                self.grapes, self.other_words)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products, type_merge_dict=self.type_dict)

        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour = name_trimmer(items, self.prcess_text, self.types_n_others)
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb'] = gb
        items['sour'] = sour
        items['sour'] = items['sour'].replace(self.sour_dict)

        products_trimed_names, gb, sour = name_trimmer(products, self.prcess_text, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb'] = gb
        products['sour'] = sour
        products['sour'] = products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)

        return items, products