# NOTE: import order is kept exactly as in the original file. The wildcard
# import below can rebind module-level names, so the imports that follow it
# (rapidfuzz, pandas) must stay after it to keep the same final bindings.
# Some imported names are not used by the live code in this module; they are
# kept because other code may import them from here.
from preprocess.preprocess import Preprocessor
from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex
from preprocess.utils.common.utils import get_delimiter, verify_csv
import os.path
from preprocess.utils.products.products import *
from rapidfuzz import fuzz, process
import pandas as pd


class Processor:
    """Preprocess an items CSV and match it against the latest products data.

    All preprocessing is delegated to a ``Preprocessor`` instance stored in
    ``self.preprocessor``; matching is delegated to
    ``new_find_matches_with_ids``.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 *preprocessor_args, **preprocessor_kwargs):
        """Build the underlying ``Preprocessor``.

        Every argument is forwarded to ``Preprocessor`` unchanged and in the
        same order.

        NOTE(review): the original parameter list continued after
        ``other_words`` with a partly commented-out fragment
        (``#sour_merge_dict, type_merge_dict, color_merge_dict,
        country_list``) whose extent could not be determined. The trailing
        arguments are therefore accepted generically and passed straight
        through, which keeps both possible call shapes working. Trailing
        arguments given by keyword are forwarded by keyword, which assumes
        ``Preprocessor`` uses the same parameter names -- confirm, and
        restore the explicit names once verified against ``Preprocessor``.
        """
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list, type_wine, gbs,
            grapes, other_words, *preprocessor_args, **preprocessor_kwargs)

    def process_products_full(self, products_data):
        """Delegate to ``Preprocessor.process_products_full``.

        The delegate's result is not returned; this method returns ``None``.
        """
        self.preprocessor.process_products_full(products_data)

    def process_new(self, items_file, is_items_first=False, threshold=65,
                    order_invariant_names_matching=False, thread_count=8):
        """Load an items CSV, preprocess it and match items to products.

        Args:
            items_file: Path of the items CSV. Its delimiter is detected with
                ``get_delimiter``; the file must contain an ``attrs`` column.
            is_items_first: Must be truthy. The opposite matching direction
                is not implemented (its code had been disabled), so the
                default value ``False`` fails -- see ``Raises``. The default
                is kept only for signature compatibility.
            threshold: Forwarded to the matcher as ``name_threshold``.
            order_invariant_names_matching: Forwarded to the matcher.
            thread_count: Forwarded to the matcher.

        Returns:
            A 3-tuple ``(matches, items, products)`` where ``matches`` is the
            matcher result with the ``type``, ``type_wine``, ``alco`` and
            ``gb`` columns dropped, and ``items`` / ``products`` are the
            values returned by ``Preprocessor.process_new``.

        Raises:
            Exception: If the latest products data is missing or its
                ``path`` is not an existing file, if ``items_file`` is
                empty, or if the CSV has no ``attrs`` column.
            NotImplementedError: If ``is_items_first`` is falsy. Previously
                this case crashed with ``UnboundLocalError`` at the final
                ``return`` because no result was ever assigned.
        """
        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")
        if not items_file:
            raise Exception("Items CSV not specified")

        items_delimiter = get_delimiter(items_file)
        print('items delimiter: "' + items_delimiter + '"')
        # Malformed rows are silently skipped by pandas, not reported.
        row_items = pd.read_csv(items_file, sep=items_delimiter,
                                on_bad_lines='skip')
        if 'attrs' not in row_items.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items, products = self.preprocessor.process_new(prods_data, row_items)

        print('-----*-----Matching-----*-----')
        if not is_items_first:
            # Raised at the point where the missing branch would run, so that
            # validation errors and preprocessing keep their original order.
            raise NotImplementedError(
                "process_new only supports is_items_first=True; "
                "the products-first matching branch is disabled")

        # Persist the preprocessed items next to the products data.
        # NOTE(review): the meaning of the third positional argument (True)
        # is defined by save_df_to_file -- confirm there.
        fullpath = os.path.join(prods_data["dir"], "_items.pkl")
        save_df_to_file(items, fullpath, True)

        res = new_find_matches_with_ids(
            items,
            products,
            name_threshold=threshold,
            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
            order_invariant_names_matching=order_invariant_names_matching,
            thread_count=thread_count,
        )
        return res.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products