# NOTE: `json` and `datetime` are imported explicitly so the module does not
# rely on the wildcard imports to provide them. They come first on purpose:
# if a wildcard import below exports the same name, it still wins, exactly as
# before. The remaining imports keep their original relative order because
# wildcard imports may shadow each other.
# `tqdm` and the extract_* / normalize_* / merge_* helpers used below are
# presumably provided by the wildcard imports - TODO confirm.
import json
from datetime import datetime
import os.path
from preprocess.utils.common.utils import normalize_name
from math import isnan
from preprocess.utils.items.attrs import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.top_inserts import *
from preprocess.utils.products.products import *
import pandas as pd
from processor.matching import prepare_groups_with_ids_ex

# Marker substituted for line breaks inside item attributes (see
# Preprocessor.process_items) and split on by process_multiline_name.
_LINE_SEPARATOR = " ##### "


def _append_record(result, record):
    """Append one row to a dict-of-lists table, all columns or none.

    Every column value is looked up before the first append, so a missing key
    raises KeyError without leaving the columns with different lengths.
    """
    values = [record[column] for column in result]
    for column, value in zip(result, values):
        result[column].append(value)


class Preprocessor():
    """Normalizes product catalogue rows and feed items into comparable tables."""

    def __init__(self, long_types_list, short_types_list, sour_list, type_wine, gbs, grapes, other_words,
                 type_merge_dict, color_merge_dict, country_list):
        """Store the dictionaries and word lists used by the processing methods.

        `long_types_list` is stored lower-cased; `types_n_others` is built from
        the *original-case* `long_types_list` plus `other_words`, `sour_list`
        and `country_list`, with the first "Шерри" entry removed when present.
        `type_merge_dict` is stored as `self.type_dict`.
        """
        self.long_types_list = [element.lower() for element in long_types_list]
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.grapes = grapes
        self.other_words = other_words
        self.types_n_others = long_types_list + other_words + sour_list + country_list
        # list.remove raises ValueError when the value is absent; a word list
        # without this entry must not prevent construction.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list

        # Make every brand key also reachable through its normalized form.
        # NOTE(review): this rebinds the name in THIS module only; if the dict
        # is defined in another module, code there keeps seeing the original
        # object - confirm this is intended.
        global TYPES_FROM_BRAND_DICT
        updated = {}
        for k, v in TYPES_FROM_BRAND_DICT.items():
            updated[k] = v
            updated[normalize_name(k)] = v
        TYPES_FROM_BRAND_DICT = updated

    def write_log(self, logfn, s):
        """Print `s` and append it, prefixed with a timestamp, to file `logfn`."""
        print(s + "\n")
        with open(logfn, 'a') as logf:
            logf.write(datetime.now().strftime('[%Y-%m-%d %H:%M:%S]: ') + s + "\n")

    def _build_product_record(self, index, row):
        """Parse one catalogue row into a dict keyed by output column.

        Returns None when the row has no usable product type (NaN). Any other
        problem propagates as an exception to the caller.
        """
        product_type = row['product_type']
        if isinstance(product_type, (int, float)) and isnan(product_type):
            print("Product type is not specified or incorrect for product id=[" + str(row['id']) + "]. Product is ignored")
            return None

        product_id = row['id']
        orig_brand = row['brand']
        orig_name = row['name']
        orig_name_2 = row['name_translit']
        orig_type_wine = row['category']

        brand = preprocess_product_brand(orig_brand)
        name = preprocess_product_name(orig_name)
        name_translit = preprocess_product_name(orig_name_2)

        # First of all let's check if it is sparkling wine
        drink_type, _ = extract_spark(product_type, False)
        _, name = extract_spark(name, True)
        if not drink_type:
            drink_type, _ = extract_type(product_type, False)
            # NOTE(review): the source indentation was lost; this call is
            # assumed to belong to the `if` above - confirm against history.
            _, name = extract_type(name, True)
        if not drink_type:
            drink_type = product_type.lower()

        type_wine = None
        sour_wine = ''
        type_prefix = row['type_prefix']
        if isinstance(type_prefix, str) and type_prefix:
            type_wine, sour_wine, _ = extract_color_and_sour(type_prefix, remove=False)

        volume = is_volume(row['volume'])
        year, _ = extract_production_year(str(row['name_postfix']))
        gb, _ = extract_gb(row['name_postfix'], False)
        alco, _ = extract_alcohol_content(name)

        name, _, volume_n, _, year_n, _, color_n, sour_wine_n, other_n = extract_attributes_from_name(name)
        name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')
        name = normalize_and_clean_name(name)

        name_translit, _, _, _, _, _, _, _, _ = extract_attributes_from_name(name_translit)
        name_translit = trim_name(name_translit, self.types_n_others).replace(',', ' ').replace('.', ' ')
        name_translit = normalize_and_clean_name(name_translit)

        # Explicit row attributes win; values parsed from the name fill gaps.
        if not year:
            year = year_n
        if not type_wine:
            type_wine = color_n
        if not sour_wine:
            sour_wine = sour_wine_n
        if not volume:
            volume = volume_n
        elif volume_n and volume and (volume_n != volume):
            print("Product volume conflict detected for product id=[" + str(product_id) + "]: " + str(volume) + " vs " + str(volume_n))

        return {
            'index': index,
            'id': product_id,
            'orig_brand': orig_brand,
            'brand': brand,
            'brand_unwrap': '',
            'orig_name': orig_name,
            'name': name,
            'name_wo_brand': '',
            'name_with_brand': '',
            'orig_name_2': orig_name_2,
            'name_2': name_translit,
            'orig_type': product_type,
            'type': drink_type.lower(),
            'type_l1': '',
            'type_l0': '',
            'orig_type_wine': orig_type_wine,
            'type_wine': (type_wine or '').lower(),
            'sour': sour_wine or '',
            'volume': volume,
            'gb': gb,
            'year': year,
            'alco': alco,
            'other': other_n,
        }

    def process_products(self, products):
        """Build the normalized products table from the raw catalogue.

        `products` is iterated with `iterrows()`; each row must provide 'id',
        'brand', 'name', 'name_translit', 'product_type', 'category',
        'type_prefix', 'volume' and 'name_postfix'.

        Rows with a NaN product type are reported and skipped. Rows that raise
        while being parsed are reported and skipped as a whole, so the
        resulting columns always have equal length. 'index' numbers the kept
        rows consecutively from 0.

        Returns a pandas DataFrame.
        """
        result = {'index': [], 'id': [], 'orig_brand': [], 'brand': [], 'brand_unwrap': [],
                  'orig_name': [], 'name': [], 'name_wo_brand': [], 'name_with_brand': [],
                  'orig_name_2': [], 'name_2': [],
                  'orig_type': [], 'type': [], 'type_l1': [], 'type_l0': [],
                  'orig_type_wine': [], "type_wine": [], 'sour': [], "volume": [], "gb": [],
                  "year": [], 'alco': [], 'other': []}
        index = 0
        for idx, row in tqdm(products.iterrows()):
            try:
                record = self._build_product_record(index, row)
            except Exception as ex:
                print("Error processing product id=" + str(idx) + ": " + str(ex))
                continue
            if record is None:
                continue
            _append_record(result, record)
            index += 1
        return pd.DataFrame(result)

    def process_products_full(self, products_data):
        """Read, normalize, group and persist the products catalogue.

        Reads the CSV at `products_data['path']`, writes progress to
        "update_log.txt" inside `products_data['dir']`, and fills
        `products_data` with the processed frame ("df_products"), the list of
        product types ("dict_types"), the (currently empty) brand-unwrapping
        maps and the groupings returned by `prepare_groups_with_ids_ex`.

        Returns `products_data` on success, or None after logging the error if
        any step raises.
        """
        logfn = os.path.join(products_data['dir'], "update_log.txt")
        try:
            self.write_log(logfn, "Products processing started")

            prods_file = products_data['path']
            products_delimiter = get_delimiter(prods_file)
            products = pd.read_csv(prods_file, sep=products_delimiter)

            # 1)
            self.write_log(logfn, '------*-----Prepare products catalogue-----*-----')
            products = self.process_products(products.copy())
            products_data["dict_types"] = products['type'].unique().tolist()

            self.write_log(logfn, '------*-----Unwrapping brands-----*-----')
            products["brand_unwrap"] = products["brand"]
            # 4) Brand unwrapping is disabled: both maps are empty, so the two
            # replace() calls below leave the brands unchanged.
            products_data["unwrap_brands_1"] = {}
            # 5)
            products["brand_unwrap"] = products["brand"].replace(products_data["unwrap_brands_1"])
            # 7)
            products_data["unwrap_brands_2"] = {}
            # 8)
            products["brand_unwrap"] = products["brand_unwrap"].replace(products_data["unwrap_brands_2"])
            # Keep the unwrapped brand only where it differs from the brand.
            products["brand_unwrap"] = products.apply(
                lambda row: row["brand_unwrap"] if row["brand_unwrap"] != row["brand"] else '', axis=1)

            # 9)
            self.write_log(logfn, '-----*-----Adding service categories-----*-----')
            merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
            merge_types(products, products, type_merge_dict=self.type_dict)

            # Now we can normalize and clean brands and names (only after trimming)
            products['brand'] = products['brand'].apply(normalize_and_clean_brand)
            products['norm_name'] = products['name']

            # 11)
            self.write_log(logfn, '-----*-----Replacing product types-----*-----')
            products['type'] = products['type'].replace(self.type_dict)
            products['new_brand'] = products['brand']

            products["name_wo_brand"] = products.apply(
                lambda row: remove_brand_from_name(row['name'], row['brand']), axis=1)
            products["name_with_brand"] = products.apply(
                lambda row: insert_brand_in_name(row['name'], row['brand']), axis=1)

            products_data["groups_brand_type_vol"] = prepare_groups_with_ids_ex(
                products, ["new_brand", 'type', 'volume'], "name_wo_brand")

            products['type_l1'] = products['type'].replace(TYPES_LEVEL_1_DICT)
            products['type_l0'] = products['type_l1'].replace(TYPES_LEVEL_0_DICT)

            products_data["groups_brand_typel1_vol"] = prepare_groups_with_ids_ex(
                products, ['new_brand', 'type_l1', 'volume'], "name_wo_brand")
            products_data["groups_brand_typel0_vol"] = prepare_groups_with_ids_ex(
                products, ['new_brand', 'type_l0', 'volume'], "name_wo_brand")
            products_data["groups_typewine_type_vol"] = prepare_groups_with_ids_ex(
                products, ['new_type_wine', 'new_type', 'volume'], "name_with_brand")
            products_data["groups_typel0"] = prepare_groups_with_ids_ex(
                products, ['type_l0'], "name_with_brand")

            products_data["df_products"] = products

            save_products_data(products_data)
            remove_old_products(products_data)

            self.write_log(logfn, "Products processing finished")
        except Exception as ex:
            self.write_log(logfn, "An error occurred: " + str(ex))
            return None

        return products_data

    def preprocess_item_brand(self, brand):
        """Split a "main/alternative" brand string into its two stripped parts.

        Returns `(str(brand), '')` for non-string input and `(brand.strip(), '')`
        when there is no slash. Anything after a second slash is ignored.
        """
        if not isinstance(brand, str):
            return str(brand), ''
        parts = brand.split('/', 2)
        if len(parts) > 1:
            return parts[0].strip(), parts[1].strip()
        return brand.strip(), ''

    def detect_language_simple_2(self, name, reverse=False):
        """Classify `name` as 'ru', 'en' or 'xx' by counting letters.

        Counts characters in А-Я/а-я and in A-Z/a-z. Returns 'xx' when both
        counts are below 2, 'ru' when Cyrillic letters are the majority and
        'en' otherwise (including ties).

        `reverse` is accepted for compatibility; the scan direction cannot
        change the counts, so it has no effect on the result.
        """
        ru_count = 0
        en_count = 0
        for ch in name:
            if 'А' <= ch <= 'Я' or 'а' <= ch <= 'я':
                ru_count += 1
            elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
                en_count += 1

        if ru_count < 2 and en_count < 2:
            return 'xx'
        if ru_count > en_count:
            return 'ru'
        return 'en'

    def check_alternative_name(self, name, check_len = True, simple_lang_check=True):
        """Split "name/alternative name" when the halves are in different languages.

        Tries every "/" from left to right. With `check_len`, a split whose
        shorter side is less than half of the longer side is skipped. Stops
        with `(name, '')` as soon as the right side is shorter than 3
        characters or no slash is left. Returns `(left, right)` for the first
        split whose sides are detected as 'ru' and 'en' (in either order).

        `simple_lang_check` is currently unused.
        """
        # NOTE(review): the source indentation was lost; the loop is assumed to
        # continue with the next slash after a failed language check.
        startpos = 0
        while True:
            pos = name.find("/", startpos)
            if pos == -1:
                return name, ''
            left, right = name[:pos], name[pos + 1:]
            startpos = pos + 1
            if check_len:
                shortest = min(len(left), len(right))
                longest = max(len(left), len(right))
                # longest == 0 means both sides are empty (name is just "/");
                # skip it instead of dividing by zero.
                if longest == 0 or float(shortest) / longest < 0.5:
                    continue
            if len(right) < 3:
                return name, ''
            lang1 = self.detect_language_simple_2(left, True)
            lang2 = self.detect_language_simple_2(right)
            if (lang1 == 'ru' and lang2 == 'en') or (lang1 == 'en' and lang2 == 'ru'):
                return left, right

    def merge_multiline_name(self, name_parts):
        """Regroup the lines of a multi-line name by language.

        Lines whose `detect_language` result equals that of the first line are
        joined into the primary name; all other lines are joined into the
        alternative name (which therefore starts with a space when non-empty).
        """
        name = name_parts[0]
        name_2 = ""
        lang_0 = detect_language(name)
        for part in name_parts[1:]:
            if detect_language(part) == lang_0:
                name += " " + part
            else:
                name_2 += " " + part
        return name, name_2

    def process_multiline_name(self, name, check_len = True, simple_lane_check=True):
        """Split a name on the line-break marker into (name, alternative name).

        Returns `(name, '')` for empty input or when the marker is absent.
        `check_len` and `simple_lane_check` are currently unused.
        """
        if not name:
            return name, ''
        if _LINE_SEPARATOR not in name:
            return name, ''
        parts = name.split(_LINE_SEPARATOR)
        # Special processing for complex multiline names like:
        # "Луи Мемори До\nВыдержка: от 30 до 50 лет\nLouis Memory Deau\nAgeing: from 30 to 50 years"
        if len(parts) > 2:
            return self.merge_multiline_name(parts)
        return parts[0], parts[1]

    def _build_item_record(self, idf, attrs):
        """Parse one item's JSON attribute string into a dict keyed by output column."""
        item = json.loads(attrs.lower().replace("\\n", _LINE_SEPARATOR).replace("\n", _LINE_SEPARATOR))

        if 'brand' in item:
            orig_brand = item['brand']
            brand, brand_2 = self.preprocess_item_brand(orig_brand)
            brand = normalize_and_clean_brand(brand)
            brand_2 = normalize_and_clean_brand(brand_2)
        else:
            orig_brand = None
            brand = brand_2 = None

        orig_name = item['name']

        # First of all remove from the name specific brands that cause
        # collisions during name parsing and trimming
        name, specific_brand, specific_name = replace_specific_brand_and_name(orig_name)
        if specific_brand:
            if brand and (brand != specific_brand):
                print("Conflict between brand and specific brand for item id=[" + str(idf) + "]")
            else:
                brand = specific_brand = normalize_and_clean_brand(specific_brand)
                brand_2 = None
        if specific_name:
            specific_name = normalize_and_clean_name(specific_name)

        # Some items contain several lines separated by a new line. They are
        # easy to process because the new line is a universal separator.
        # Names whose alternatives are separated with "/" are handled later
        # (check_alternative_name), after all attributes are extracted.
        name, name_2 = self.process_multiline_name(name)

        type_wine = None
        sour_wine = None
        volume = None
        year = None

        # First of all let's check if it is sparkling wine
        drink_type, name = extract_spark(name, False)
        if not drink_type and ('type_wine' in item):
            drink_type, _ = extract_spark(item['type_wine'], False)

        # Next let's check any other known type
        if not drink_type and ('type' in item):
            drink_type, _ = extract_type(item['type'], False)
        if not drink_type and ('type_wine' in item):
            drink_type, _ = extract_type(item['type_wine'], False)
        if not drink_type and ('category' in item):
            drink_type, _ = extract_type(item['category'], False)

        # Special case for some brands like 'jaegermeister' which sometimes is
        # the only thing specified in the name, so we try to detect the drink
        # type using only the brand if it is possible
        if not drink_type and brand:
            drink_type = extract_type_by_brand_name(brand)

        if 'type_wine' in item:
            type_wine, sour_wine, _ = extract_color_and_sour(item['type_wine'], remove=False)
            if drink_type is None and (type_wine or sour_wine):
                drink_type = 'вино'

        # Try to extract type_wine and sour from "color" attribute if exists
        if 'color' in item:
            if not type_wine:
                type_wine, _ = extract_color(item['color'])
                if type_wine and drink_type is None:
                    drink_type = 'вино'
            if not sour_wine:
                sour_wine, _ = extract_sour(item['color'])
                if sour_wine and drink_type is None:
                    drink_type = 'вино'

        # Try to extract sour from "sugar" attribute if exists
        if 'sugar' in item:
            if sour_wine is None:
                sour_wine, _ = extract_sour(item['sugar'])
                if sour_wine and drink_type is None:
                    drink_type = 'вино'

        if 'volume' in item:
            volume = item['volume']
        if 'year' in item:
            year = item['year']

        drink_type_n, name = extract_type(name, True)
        name, alcohol_n, volume_n, aging, year_n, gb, color_n, sour_wine_n, _ = extract_attributes_from_name(name)
        name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')

        # If the alternative name is not specified, it is time to look for it:
        # after removing the attributes that could break the logic, but before
        # normalization, in order to keep the language difference.
        if not name_2:
            name, name_2 = self.check_alternative_name(name)
        name = normalize_and_clean_name(name)

        if name_2:
            name_2, _, _, _, _, _, _, _, _ = extract_attributes_from_name(name_2)
            name_2 = trim_name(name_2, self.types_n_others).replace(',', ' ').replace('.', ' ')
            name_2 = normalize_and_clean_name(name_2)

        if specific_brand or specific_name:
            name = restore_specific_brand_and_name(name, specific_brand, specific_name)

        # Explicit item attributes win; values parsed from the name fill gaps.
        if not drink_type:
            drink_type = drink_type_n
        if not volume:
            volume = volume_n
        if not year:
            year = year_n
        if not type_wine:
            type_wine = color_n
        if not sour_wine:
            sour_wine = sour_wine_n

        return {
            'id': idf,
            'orig_brand': orig_brand,
            'brand': brand,
            'brand_short': '',
            'brand_2': brand_2,
            'brand_2_short': '',
            'alt_brands': [],
            'orig_name': orig_name,
            'name': name,
            'name_wo_brand': '',
            'name_with_brand': '',
            'name_2': name_2,
            'name_2_wo_brand': '',
            'name_2_with_brand': '',
            'names_wo_alt_brands': [],
            'names_with_alt_brands': [],
            'names_2_wo_alt_brands': [],
            'names_2_with_alt_brands': [],
            'type': drink_type,
            'new_type': '',
            'type_n': '',
            'type_wine': type_wine,
            'new_type_wine': '',
            'type_wine_n': '',
            'sour': sour_wine,
            'volume': volume,
            'gb': gb,
            'year': year,
            'aging': aging,
            # The item attributes carry no alcohol value; only the name does.
            'alco': alcohol_n,
        }

    def process_items(self, df):
        """Build the normalized items table from `df['id']` and `df['attrs']`.

        Each `attrs` value is expected to be a JSON object string. It is
        lower-cased and its line breaks are replaced with a marker before
        parsing. Entries that are empty or not strings are skipped silently.
        Entries that raise while being parsed are reported and skipped as a
        whole, so the resulting columns always have equal length.

        Returns a pandas DataFrame.
        """
        result = {'id': [], 'orig_brand': [], 'brand': [], 'brand_short': [], 'brand_2': [],
                  'brand_2_short': [], 'alt_brands': [],
                  'orig_name': [], 'name': [], 'name_wo_brand': [], 'name_with_brand': [],
                  'name_2': [], 'name_2_wo_brand': [], 'name_2_with_brand': [],
                  'names_wo_alt_brands': [], 'names_with_alt_brands': [],
                  'names_2_wo_alt_brands': [], 'names_2_with_alt_brands': [],
                  'type': [], 'new_type': [], 'type_n': [],
                  "type_wine": [], "new_type_wine": [], "type_wine_n": [],
                  "sour": [], "volume": [], 'gb': [], "year": [], 'aging': [], 'alco': []}

        for idf, attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            if not isinstance(attrs, str) or not attrs:
                continue
            try:
                record = self._build_item_record(idf, attrs)
            except Exception as ex:
                print("Error occurred while processing item id=" + str(idf), ex)
                continue
            _append_record(result, record)

        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip known attributes from `text` and return them with the remainder.

        Removes, in this order: a gift-box word from `self.gbs`, alcohol
        content, aging years (plus the words 'выдержка' / 'aging' / 'ageing'
        when years were found), production year, volume or number, and
        colour / sweetness.

        Returns `(text, alcohol, volume_or_number, years, production_year, gb,
        color, sour)`.
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), ' ')

        alcohol, text = extract_alcohol_content(text, True)

        years, text = extract_years(text, True)
        if years is not None:
            text = text.replace('выдержка', ' ').replace('aging', ' ').replace('ageing', ' ')

        production_year, text = extract_production_year(text, True)
        volume_or_number, text = extract_volume_or_number(text, True)
        color, sour, text = extract_color_and_sour(text, True)

        return text, alcohol, volume_or_number, years, production_year, gb, color, sour

    def process_new(self, products_data, items):
        """Normalize `items` and enrich them with brands and types from products.

        Runs `process_products_full` first when `products_data` has no
        "df_products" entry yet.

        Returns `(items, products)` as DataFrames.

        Raises:
            RuntimeError: if the products catalogue could not be processed.
        """
        if "df_products" not in products_data:
            processed = self.process_products_full(products_data)
            if processed is None:
                # process_products_full has already logged the cause.
                raise RuntimeError("Products processing failed, see update_log.txt for details")
            products_data = processed

        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())

        products = products_data["df_products"]
        products_brands = products['brand'].unique()

        items['type'] = items['type'].replace(self.type_dict)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict, product_types=products_data["dict_types"])

        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products_brands, items)
        fill_brands_in_dataframe_2(products_brands, items)

        print('-----*-----Brand matching-----*-----')
        # NOTE(review): this call used to be issued twice in a row with the
        # same arguments; one call is kept on the assumption that it is
        # idempotent - confirm.
        comp_list, _, items_brand_list = get_same_brands(products, items)
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products_brands))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        print('-----*-----Finding brands in names-----*-----')
        # Dict form on purpose: with a scalar and value=None, pandas older than
        # 1.4 forward-fills instead of storing None.
        items['new_brand'] = items['new_brand'].replace({'none': None})
        i_brands = items['name'].values
        p_brands = [b for b in products_brands if b is not None and len(b) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, self.grapes, self.other_words)
        items['brand'] = items['brand'].replace({'none': None})

        # 11)
        items['new_type'] = items['new_type'].replace(self.type_dict)

        items['type_l1'] = items['type'].replace(TYPES_LEVEL_1_DICT)
        items['type_l0'] = items['type_l1'].replace(TYPES_LEVEL_0_DICT)

        return items, products