Spaces:
Build error
Build error
| from preprocess.preprocess import Preprocessor | |
| from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex | |
| from preprocess.utils.common.utils import get_delimiter, verify_csv | |
| import os.path | |
| from preprocess.utils.products.products import * | |
| from rapidfuzz import fuzz, process | |
| import pandas as pd | |
class Processor:
    """Match uploaded item rows against the latest products data.

    The class owns a ``Preprocessor`` (``self.preprocessor``) that prepares
    the data, and passes the prepared frames to
    ``new_find_matches_with_ids`` for the actual matching.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 # sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Create the underlying ``Preprocessor``.

        All arguments are forwarded to ``Preprocessor`` positionally, in the
        order they are declared here; this class does not inspect them.
        """
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list,
            type_wine, gbs, grapes, other_words,
            # sour_merge_dict,
            type_merge_dict, color_merge_dict,
            country_list,
        )

    # NOTE: disabled legacy implementation. It is kept as a bare string
    # literal, as in the original source, and is never executed.
    '''def process(self, prods_data, items, is_items_first=False, threshold=65, include_alternatives=True):
        items, products=self.preprocessor.process(prods_data, items)
        return self.match(items, products, is_items_first, threshold, include_alternatives)

    def match(self, items, products, is_items_first=False, threshold=65, include_alternatives=True):
        print('-----*-----Matching-----*-----')
        if is_items_first:
            #products['new_brand']=products['brand']
            products['new_brand'] = products['brand_sndex_7']
            #items['brand']=items['new_brand']
            items['brand'] = items['brand_sndex_7']
            products_groups = prepare_groups_with_ids(products)
            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=threshold, include_alternatives=include_alternatives)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products'''

    def process_products_full(self, products_data):
        """Run the preprocessor's full products processing on *products_data*.

        The preprocessor's return value is not propagated; this method
        returns ``None``.
        """
        self.preprocessor.process_products_full(products_data)

    def process_new(self, items_file, is_items_first=False, threshold=65,
                    order_invariant_names_matching=False, thread_count=8):
        """Match the rows of an items CSV against the latest products data.

        Args:
            items_file: Path of the items CSV. Its delimiter is detected with
                ``get_delimiter``; malformed lines are skipped while reading.
                The file must contain an ``attrs`` column.
            is_items_first: Must be truthy. Only the items-first matching is
                implemented; the other direction is disabled.
            threshold: Passed to the matcher as ``name_threshold``.
            order_invariant_names_matching: Passed through to the matcher.
            thread_count: Passed through to the matcher.

        Returns:
            A tuple ``(matches, items, products)``. ``matches`` is the
            matcher's result with the ``type``, ``type_wine``, ``alco`` and
            ``gb`` columns dropped. ``items`` and ``products`` are the frames
            returned by ``Preprocessor.process_new``.

        Raises:
            NotImplementedError: If ``is_items_first`` is falsy.
            Exception: If the latest products data or its file is missing,
                if ``items_file`` is empty, or if the CSV has no ``attrs``
                column.

        Side effects:
            Prints progress messages and writes the preprocessed items to
            ``_items.pkl`` inside ``prods_data["dir"]``.
        """
        if not is_items_first:
            # The match result is only produced by the items-first matching
            # below, so reject the other mode before any expensive work
            # instead of failing on an unbound result afterwards.
            # NOTE(review): this source reached review without indentation.
            # The disabled `else:` that directly followed the matching call
            # indicates the matching was the items-first branch - confirm.
            # Disabled products-first branch from the original source:
            #     items_groups = prepare_groups_with_ids(items)
            #     items_alt_groups = prepare_groups_by_alternative_keys(items)
            #     res=new_find_matches_with_ids(products, items_groups, None, items_alt_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
            raise NotImplementedError(
                "process_new currently supports only is_items_first=True"
            )

        # presumably provided by the star import from
        # preprocess.utils.products.products - verify
        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")

        if not items_file:
            raise Exception("Items CSV not specified")

        #bad_lines = verify_csv(items_file, items_file + ".fixed")
        #if bad_lines:
        #    items_file = items_file + ".fixed"
        #raise Exception("Uploaded Items CSV contains bad lines:\n" + "\n".join(bad_lines))

        items_delimiter = get_delimiter(items_file)
        print('items delimiter: "' + items_delimiter + '"')

        row_items = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if 'attrs' not in row_items.columns:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items, products = self.preprocessor.process_new(prods_data, row_items)

        print('-----*-----Matching-----*-----')

        # Persist the preprocessed items next to the products data.
        # NOTE(review): the meaning of the third argument (True) is defined
        # by save_df_to_file, which is not visible here.
        fullpath = os.path.join(prods_data["dir"], "_items.pkl")
        save_df_to_file(items, fullpath, True)

        # The product groupings are read from the products data and handed
        # to the matcher unchanged.
        res = new_find_matches_with_ids(
            items, products,
            name_threshold=threshold,
            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
            order_invariant_names_matching=order_invariant_names_matching,
            thread_count=thread_count,
        )

        return res.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products
| '''def score_correct_items_to_products(self, manual_matchings): | |
| result = [] | |
| for mm in manual_matchings: | |
| item = mm[0] | |
| product = mm[1] | |
| item_to_compare = item['name'] | |
| if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']: | |
| item_to_compare = item['brand'] + " " + item['name'] | |
| product_to_compare = product['name_with_brand'] | |
| product2_to_compare = product['name_2'] | |
| item_with_name = item['orig_name'] | |
| if 'orig_brand' in item.keys() and item['orig_brand'] and item['orig_brand'] not in item['orig_name']: | |
| item_with_name = item['orig_brand'] + " - " + item['orig_name'] | |
| product_brand = product['orig_brand'].values[0] | |
| product_with_brand = product['orig_name'].values[0] | |
| if product_brand and isinstance(product_brand, str) and product_brand not in product_with_brand: | |
| product_with_brand = product_brand + " - " + product_with_brand | |
| match, score, _ = process.extractOne(item_to_compare, product_to_compare) | |
| match2, score2, _ = process.extractOne(item_to_compare, product2_to_compare) | |
| if score2 > score: | |
| score = score2 | |
| result.append({"item_id":item['id'], "product_id":product['id'].values[0], 'score':score, | |
| "item_orig":item_with_name, "product_orig":product_with_brand, | |
| "item": item_to_compare, "product":product_to_compare.values[0] | |
| }) | |
| return result | |
| def verify_correct_matching(self, correct_file, items_file, thread_count = 8): | |
| prods_data = get_latest_products() | |
| if not prods_data or not os.path.isfile(prods_data["path"]): | |
| raise Exception("Actual products data not found") | |
| products_df = prods_data["df_products"] | |
| if not correct_file: | |
| raise Exception("Correct CSV not specified") | |
| if not items_file: | |
| raise Exception("Items CSV not specified") | |
| csv_delimiter = get_delimiter(correct_file) | |
| manual_df = pd.read_csv(correct_file, sep=csv_delimiter) | |
| items_delimiter = get_delimiter(items_file) | |
| items_df = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip') | |
| if not 'attrs' in items_df.columns.values: | |
| raise Exception("Uploaded Items CSV does not seem to be valid") | |
| items = self.preprocessor.process_items(items_df.copy()) | |
| manual_matchings = [] | |
| count = len(items) | |
| for index, row in items.iterrows(): | |
| print("Processing row #" + str(index) + "/" + str(count) + "\n") | |
| manual = manual_df[manual_df['item_id'] == row["id"]]['state'] | |
| if (len(manual) > 0) and (manual.values[0] == 1): | |
| p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]] | |
| if len(p.values) > 0: | |
| if isinstance(row, float): | |
| row = row | |
| manual_matchings.append([row, p, -1]) | |
| else: | |
| print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found") | |
| return self.score_correct_items_to_products(manual_matchings)''' |