Spaces:
Build error
Build error
| import os.path | |
| from preprocess.utils.common.utils import normalize_name | |
| from math import isnan | |
| from preprocess.utils.items.attrs import * | |
| from preprocess.utils.common.brand_matching import * | |
| from preprocess.utils.common.top_inserts import * | |
| from preprocess.utils.products.products import * | |
| import pandas as pd | |
| from processor.matching import prepare_groups_with_ids_ex | |
class Preprocessor():
    """Turn raw product-catalogue rows and feed items into normalized DataFrames.

    The heavy lifting (attribute extraction, brand matching, grouping) is done
    by helper functions pulled in through the module's imports; this class
    wires them together and keeps the word lists they need.
    """

    # Column order of the DataFrame returned by process_products().
    _PRODUCT_COLUMNS = (
        'index', 'id', 'orig_brand', 'brand', 'brand_unwrap',
        'orig_name', 'name', 'name_wo_brand', 'name_with_brand',
        'orig_name_2', 'name_2',
        'orig_type', 'type', 'type_l1', 'type_l0',
        'orig_type_wine', 'type_wine', 'sour',
        'volume', 'gb', 'year', 'alco', 'other',
    )

    # Column order of the DataFrame returned by process_items().
    _ITEM_COLUMNS = (
        'id', 'orig_brand', 'brand', 'brand_short', 'brand_2', 'brand_2_short', 'alt_brands',
        'orig_name', 'name', 'name_wo_brand', 'name_with_brand',
        'name_2', 'name_2_wo_brand', 'name_2_with_brand',
        'names_wo_alt_brands', 'names_with_alt_brands',
        'names_2_wo_alt_brands', 'names_2_with_alt_brands',
        'type', 'new_type', 'type_n',
        'type_wine', 'new_type_wine', 'type_wine_n',
        'sour', 'volume', 'gb', 'year', 'aging', 'alco',
    )

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Store the word lists / merge dictionaries used by the processing steps.

        Also extends TYPES_FROM_BRAND_DICT with a normalized-name alias for
        every existing key.
        """
        self.long_types_list = [element.lower() for element in long_types_list]
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.grapes = grapes
        self.other_words = other_words
        # Words that trim_name() is asked to strip from names.
        self.types_n_others = long_types_list + other_words + sour_list + country_list
        # "Шерри" is excluded from the trim list; tolerate word lists that
        # do not contain it instead of raising ValueError.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list

        updated = {}
        for k, v in TYPES_FROM_BRAND_DICT.items():
            updated[k] = v
            updated[normalize_name(k)] = v
        # Mutate the dictionary in place rather than rebinding the name:
        # rebinding only changes this module's reference, so code in the
        # module that defines the dictionary would never see the aliases.
        # NOTE(review): assumes TYPES_FROM_BRAND_DICT is a plain mutable dict.
        TYPES_FROM_BRAND_DICT.clear()
        TYPES_FROM_BRAND_DICT.update(updated)

    def write_log(self, logfn, s):
        """Print `s` and append it, prefixed with a timestamp, to file `logfn`."""
        # Imported locally: the module does not import datetime explicitly.
        from datetime import datetime

        print(s + "\n")
        # Explicit UTF-8 so non-ASCII messages do not depend on the
        # platform's default encoding.
        with open(logfn, 'a', encoding='utf-8') as logf:
            logf.write(datetime.now().strftime('[%Y-%m-%d %H:%M:%S]: ') + s + "\n")

    def _trim_and_clean_name(self, name):
        """Strip known type/other words and punctuation, then normalize the name."""
        name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')
        return normalize_and_clean_name(name)

    def _build_product_record(self, row):
        """Build one output record (all columns except 'index') for a product row.

        Returns None when the row has no usable product type. Any exception
        raised by the helpers propagates to the caller.
        """
        if isinstance(row['product_type'], (int, float)) and isnan(row['product_type']):
            print("Product type is not specified or incorrect for product id=[" + str(row['id']) + "]. Product is ignored")
            return None

        brand = preprocess_product_brand(row['brand'])
        name = preprocess_product_name(row['name'])
        name_translit = preprocess_product_name(row['name_translit'])

        # First of all let's check if it is sparkling wine
        drink_type, _ = extract_spark(row['product_type'], False)
        _, name = extract_spark(name, True)
        if not drink_type:
            drink_type, _ = extract_type(row['product_type'], False)
            _, name = extract_type(name, True)
        if not drink_type:
            drink_type = row['product_type'].lower()

        type_wine = None
        sour_wine = ''
        if isinstance(row['type_prefix'], str) and row['type_prefix']:
            type_wine, sour_wine, _ = extract_color_and_sour(row['type_prefix'], remove=False)
        if drink_type is None and (type_wine or sour_wine):
            drink_type = 'вино'

        volume = is_volume(row['volume'])
        year, _ = extract_production_year(str(row['name_postfix']))
        gb, _ = extract_gb(row['name_postfix'], False)
        alco, _ = extract_alcohol_content(name)

        name, _, volume_n, _, year_n, _, color_n, sour_wine_n, other_n = extract_attributes_from_name(name)
        name = self._trim_and_clean_name(name)

        name_translit, *_ = extract_attributes_from_name(name_translit)
        name_translit = self._trim_and_clean_name(name_translit)

        # Values taken from dedicated columns win; values parsed out of the
        # name are only used as a fallback.
        if not year:
            year = year_n
        if not type_wine:
            type_wine = color_n
        if not sour_wine:
            sour_wine = sour_wine_n
        if not volume:
            volume = volume_n
        elif volume_n and (volume_n != volume):
            print("Product volume conflict detected for product id=[" + str(row['id']) + "]: " + str(volume) + " vs " + str(volume_n))

        if not type_wine:
            type_wine = ''
        if not sour_wine:
            sour_wine = ''

        return {
            'id': row['id'],
            'orig_brand': row['brand'],
            'brand': brand,
            'brand_unwrap': '',
            'orig_name': row['name'],
            'name': name,
            'name_wo_brand': '',
            'name_with_brand': '',
            'orig_name_2': row['name_translit'],
            'name_2': name_translit,
            'orig_type': row['product_type'],
            'type': drink_type.lower(),
            'type_l1': '',
            'type_l0': '',
            'orig_type_wine': row['category'],
            'type_wine': type_wine.lower(),
            'sour': sour_wine,
            'volume': volume,
            'gb': gb,
            'year': year,
            'alco': alco,
            'other': other_n,
        }

    def process_products(self, products):
        """Convert the raw products DataFrame into the normalized products DataFrame.

        Rows without a product type are skipped with a message. A row whose
        processing raises is reported and skipped as a whole, so every output
        column always has the same length.
        """
        result = {column: [] for column in self._PRODUCT_COLUMNS}
        index = 0
        for idx, row in tqdm(products.iterrows()):
            try:
                record = self._build_product_record(row)
            except Exception as ex:
                print("Error processing product id=" + str(idx) + ": " + str(ex))
                continue
            if record is None:
                continue
            record['index'] = index
            # Append only after the whole row was built successfully.
            for column in self._PRODUCT_COLUMNS:
                result[column].append(record[column])
            index += 1
        return pd.DataFrame(result)

    def process_products_full(self, products_data):
        """Load the products CSV, normalize it and fill `products_data` with results.

        Reads products_data['dir'] (log location) and products_data['path']
        (CSV file). Progress is logged to "update_log.txt" in that directory.

        Returns the updated `products_data`, or None if any step raised (the
        error text is written to the log).
        """
        logfn = os.path.join(products_data['dir'], "update_log.txt")
        try:
            self.write_log(logfn, "Products processing started")
            prods_file = products_data['path']
            products_delimiter = get_delimiter(prods_file)
            products = pd.read_csv(prods_file, sep=products_delimiter)

            self.write_log(logfn, '------*-----Prepare products catalogue-----*-----')
            products = self.process_products(products.copy())
            products_data["dict_types"] = products['type'].unique().tolist()

            self.write_log(logfn, '------*-----Unwrapping brands-----*-----')
            products["brand_unwrap"] = products["brand"]
            # Brand unwrapping is currently disabled: both maps are empty, so
            # the replace() calls below leave the brands unchanged.
            products_data["unwrap_brands_1"] = {}
            products["brand_unwrap"] = products["brand"].replace(products_data["unwrap_brands_1"])
            products_data["unwrap_brands_2"] = {}
            products["brand_unwrap"] = products["brand_unwrap"].replace(products_data["unwrap_brands_2"])
            # Keep brand_unwrap only where it differs from the brand itself.
            products["brand_unwrap"] = products.apply(
                lambda row: row["brand_unwrap"] if row["brand_unwrap"] != row["brand"] else '', axis=1)

            self.write_log(logfn, '-----*-----Adding service categories-----*-----')
            merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
            merge_types(products, products, type_merge_dict=self.type_dict)

            # Now we can normalize and clean brands and names (only after trimming)
            products['brand'] = products['brand'].apply(normalize_and_clean_brand)
            products['norm_name'] = products['name']

            self.write_log(logfn, '-----*-----Replacing product types-----*-----')
            products['type'] = products['type'].replace(self.type_dict)
            products['new_brand'] = products['brand']
            products["name_wo_brand"] = products.apply(
                lambda row: remove_brand_from_name(row['name'], row['brand']), axis=1)
            products["name_with_brand"] = products.apply(
                lambda row: insert_brand_in_name(row['name'], row['brand']), axis=1)

            products_data["groups_brand_type_vol"] = prepare_groups_with_ids_ex(
                products, ["new_brand", 'type', 'volume'], "name_wo_brand")

            products['type_l1'] = products['type'].replace(TYPES_LEVEL_1_DICT)
            products['type_l0'] = products['type_l1'].replace(TYPES_LEVEL_0_DICT)

            products_data["groups_brand_typel1_vol"] = prepare_groups_with_ids_ex(
                products, ['new_brand', 'type_l1', 'volume'], "name_wo_brand")
            products_data["groups_brand_typel0_vol"] = prepare_groups_with_ids_ex(
                products, ['new_brand', 'type_l0', 'volume'], "name_wo_brand")
            products_data["groups_typewine_type_vol"] = prepare_groups_with_ids_ex(
                products, ['new_type_wine', 'new_type', 'volume'], "name_with_brand")
            products_data["groups_typel0"] = prepare_groups_with_ids_ex(
                products, ['type_l0'], "name_with_brand")

            products_data["df_products"] = products
            save_products_data(products_data)
            remove_old_products(products_data)
            self.write_log(logfn, "Products processing finished")
        except Exception as ex:
            self.write_log(logfn, "An error occurred: " + str(ex))
            return None
        return products_data

    def preprocess_item_brand(self, brand):
        """Split a "brand / alternative brand" string into a (brand, brand_2) pair.

        Non-string input is returned as `str(brand)` with an empty second
        element. Without a slash the second element is ''.
        """
        if not isinstance(brand, str):
            return str(brand), ''
        parts = brand.split('/', 2)
        if len(parts) > 1:
            return parts[0].strip(), parts[1].strip()
        return brand.strip(), ''

    def detect_language_simple_2(self, name, reverse=False):
        """Guess the language of `name` by counting Cyrillic vs. Latin letters.

        Returns 'xx' when fewer than two letters of each alphabet are present,
        'ru' when Cyrillic letters are in the majority, otherwise 'en'.

        `reverse` is accepted for backward compatibility only: the counts do
        not depend on character order, so it never influenced the result.
        """
        ru_count = 0
        en_count = 0
        for ch in name:
            # 'А'..'я' is one contiguous range; 'Ё'/'ё' sit outside it and
            # have to be listed explicitly.
            if 'А' <= ch <= 'я' or ch in 'Ёё':
                ru_count += 1
            elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
                en_count += 1
        if ru_count < 2 and en_count < 2:
            return 'xx'
        if ru_count > en_count:
            return 'ru'
        return 'en'

    def check_alternative_name(self, name, check_len=True, simple_lang_check=True):
        """Split "name / alternative name" at a slash separating two languages.

        Each "/" is tried from left to right. With `check_len` set, a slash
        whose shorter side is less than half the longer side is skipped. The
        split is accepted when one side is detected as 'ru' and the other as
        'en'.

        Returns (name, alternative_name), or (name, '') when no suitable split
        exists. `simple_lang_check` is currently unused.
        """
        startpos = 0
        while True:
            pos = name.find("/", startpos)
            if pos == -1:
                return name, ''
            left, right = name[:pos], name[pos + 1:]
            startpos = pos + 1
            if check_len:
                shorter, longer = sorted((len(left), len(right)))
                # longer == 0 means both sides are empty (name is just "/");
                # skip it instead of dividing by zero.
                if longer == 0 or shorter / longer < 0.5:
                    continue
            if len(right) < 3:
                return name, ''
            lang1 = self.detect_language_simple_2(left, True)
            lang2 = self.detect_language_simple_2(right)
            if (lang1 == 'ru' and lang2 == 'en') or (lang1 == 'en' and lang2 == 'ru'):
                return left, right

    def merge_multiline_name(self, name_parts):
        """Merge name lines into (name, name_2) by detected language.

        Lines whose detected language equals that of the first line are
        appended to `name`; all other lines are appended to `name_2`. Every
        appended line is preceded by a space.
        """
        name = name_parts[0]
        name_2 = ""
        lang_0 = detect_language(name)
        for n in name_parts[1:]:
            if detect_language(n) == lang_0:
                name += " " + n
            else:
                name_2 += " " + n
        return name, name_2

    def process_multiline_name(self, name, check_len=True, simple_lane_check=True):
        """Split a name on the " ##### " line marker into (name, name_2).

        `check_len` and `simple_lane_check` are currently unused.
        """
        if not name:
            return name, ''
        parts = name.split(" ##### ")
        if len(parts) == 1:
            return name, ''
        # Special processing for complex multiline names like:
        # "Луи Мемори До\nВыдержка: от 30 до 50 лет\nLouis Memory Deau\nAgeing: from 30 to 50 years"
        if len(parts) > 2:
            return self.merge_multiline_name(parts)
        return parts[0], parts[1]

    def _build_item_record(self, idf, attrs):
        """Build one output record for a feed item from its JSON attribute string.

        Any exception raised while parsing propagates to the caller.
        """
        # Imported locally: the module does not import json explicitly.
        import json

        # Lower-case everything and turn new lines into an explicit marker so
        # multi-line names can be split later.
        item = json.loads(attrs.lower().replace("\\n", " ##### ").replace("\n", " ##### "))

        if 'brand' in item:
            orig_brand = item['brand']
            brand, brand_2 = self.preprocess_item_brand(item['brand'])
            brand = normalize_and_clean_brand(brand)
            brand_2 = normalize_and_clean_brand(brand_2)
        else:
            orig_brand = None
            brand = brand_2 = None

        name = item['name']
        orig_name = name

        # First of all remove from the name specific brands that cause
        # collisions during name parsing and trimming.
        name, specific_brand, specific_name = replace_specific_brand_and_name(name)
        if specific_brand:
            if brand and (brand != specific_brand):
                print("Conflict between brand and specific brand for item id=[" + str(idf) + "]")
            else:
                brand = specific_brand = normalize_and_clean_brand(specific_brand)
                brand_2 = None
        if specific_name:
            specific_name = normalize_and_clean_name(specific_name)

        # Names separated by new lines are split here; names separated by
        # "/" are handled later, once the attributes have been removed.
        name, name_2 = self.process_multiline_name(name)

        type_wine = None
        sour_wine = None
        volume = None
        year = None

        # First of all let's check if it is sparkling wine
        drink_type, name = extract_spark(name, False)
        if not drink_type and 'type_wine' in item:
            drink_type, _ = extract_spark(item['type_wine'], False)
        # Next let's check any other known type
        if not drink_type and 'type' in item:
            drink_type, _ = extract_type(item['type'], False)
        if not drink_type and 'type_wine' in item:
            drink_type, _ = extract_type(item['type_wine'], False)
        if not drink_type and 'category' in item:
            drink_type, _ = extract_type(item['category'], False)
        # Special case for brands like 'jaegermeister', where the brand is
        # sometimes the only thing specified: try to detect the type from it.
        if not drink_type and brand:
            drink_type = extract_type_by_brand_name(brand)

        if 'type_wine' in item:
            type_wine, sour_wine, _ = extract_color_and_sour(item['type_wine'], remove=False)
            if drink_type is None and (type_wine or sour_wine):
                drink_type = 'вино'

        # Try to extract type_wine and sour from the "color" attribute
        if 'color' in item:
            if not type_wine:
                type_wine, _ = extract_color(item['color'])
                if type_wine and drink_type is None:
                    drink_type = 'вино'
            if not sour_wine:
                sour_wine, _ = extract_sour(item['color'])
                if sour_wine and drink_type is None:
                    drink_type = 'вино'

        # Try to extract sour from the "sugar" attribute
        if 'sugar' in item:
            if sour_wine is None:
                sour_wine, _ = extract_sour(item['sugar'])
                if sour_wine and drink_type is None:
                    drink_type = 'вино'

        if 'volume' in item:
            volume = item['volume']
        if 'year' in item:
            year = item['year']

        drink_type_n, name = extract_type(name, True)
        name, alcohol_n, volume_n, aging, year_n, gb, color_n, sour_wine_n, _ = extract_attributes_from_name(name)
        name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')

        # If no alternative name was found yet, look for it now: after the
        # attributes were removed, but before normalization.
        if not name_2:
            name, name_2 = self.check_alternative_name(name)
        name = normalize_and_clean_name(name)

        if name_2:
            name_2, *_ = extract_attributes_from_name(name_2)
            name_2 = self._trim_and_clean_name(name_2)

        if specific_brand or specific_name:
            name = restore_specific_brand_and_name(name, specific_brand, specific_name)

        # Explicit item attributes win; values parsed from the name are
        # only used as a fallback.
        if not drink_type:
            drink_type = drink_type_n
        if not volume:
            volume = volume_n
        if not year:
            year = year_n
        if not type_wine:
            type_wine = color_n
        if not sour_wine:
            sour_wine = sour_wine_n

        return {
            'id': idf,
            'orig_brand': orig_brand,
            'brand': brand,
            'brand_short': '',
            'brand_2': brand_2,
            'brand_2_short': '',
            'alt_brands': [],
            'orig_name': orig_name,
            'name': name,
            'name_wo_brand': '',
            'name_with_brand': '',
            'name_2': name_2,
            'name_2_wo_brand': '',
            'name_2_with_brand': '',
            'names_wo_alt_brands': [],
            'names_with_alt_brands': [],
            'names_2_wo_alt_brands': [],
            'names_2_with_alt_brands': [],
            'type': drink_type,
            'new_type': '',
            'type_n': '',
            'type_wine': type_wine,
            'new_type_wine': '',
            'type_wine_n': '',
            'sour': sour_wine,
            'volume': volume,
            'gb': gb,
            'year': year,
            'aging': aging,
            # There is no explicit alcohol attribute; only the name is used.
            'alco': alcohol_n,
        }

    def process_items(self, df):
        """Convert the raw items DataFrame ('id', 'attrs' columns) into normalized items.

        'attrs' is expected to hold a JSON string per item. Items with a
        missing or non-string 'attrs' are skipped silently. An item whose
        processing raises is reported and skipped as a whole, so every output
        column always has the same length.
        """
        result = {column: [] for column in self._ITEM_COLUMNS}
        for idf, attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            if not isinstance(attrs, str) or not attrs:
                continue
            try:
                record = self._build_item_record(idf, attrs)
            except Exception as ex:
                print("Error occurred while processing item id=" + str(idf), ex)
                continue
            # Append only after the whole item was built successfully.
            for column in self._ITEM_COLUMNS:
                result[column].append(record[column])
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip known attributes from `text` and return them next to the rest.

        Returns a tuple: (remaining text, alcohol, volume_or_number, years,
        production_year, gb, color, sour).
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), ' ')

        alcohol, text = extract_alcohol_content(text, True)

        years, text = extract_years(text, True)
        if years is not None:
            # The ageing value is gone; drop the keyword that introduced it.
            text = text.replace('выдержка', ' ').replace('aging', ' ').replace('ageing', ' ')

        production_year, text = extract_production_year(text, True)
        volume_or_number, text = extract_volume_or_number(text, True)
        color, sour, text = extract_color_and_sour(text, True)

        return text, alcohol, volume_or_number, years, production_year, gb, color, sour

    def process_new(self, products_data, items):
        """Normalize `items` and match their brands against the products catalogue.

        Products are (re)built through process_products_full() when
        `products_data` has no "df_products" entry yet.

        Returns a tuple (items DataFrame, products DataFrame).
        """
        if "df_products" not in products_data:
            products_data = self.process_products_full(products_data)

        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())
        products = products_data["df_products"]
        products_brands = products['brand'].unique()
        items['type'] = items['type'].replace(self.type_dict)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict, product_types=products_data["dict_types"])
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products_brands, items)
        fill_brands_in_dataframe_2(products_brands, items)

        print('-----*-----Brand matching-----*-----')
        # NOTE(review): get_same_brands is called twice with the same
        # arguments. Kept as is because its side effects are not visible
        # here; drop one call if the helper is pure.
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products_brands))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        print('-----*-----Finding brands in names-----*-----')
        # Missing brands were stringified earlier; turn 'none' back into None.
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items['name'].values
        p_brands = [i for i in products_brands if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, self.grapes, self.other_words)
        items['brand'] = items['brand'].replace('none', None)

        items['new_type'] = items['new_type'].replace(self.type_dict)
        items['type_l1'] = items['type'].replace(TYPES_LEVEL_1_DICT)
        items['type_l0'] = items['type_l1'].replace(TYPES_LEVEL_0_DICT)

        return items, products