# Spaces:
# Build error
# Build error
# File size: 8,461 Bytes
from preprocess.preprocess import Preprocessor
from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex
from preprocess.utils.common.utils import get_delimiter, verify_csv
import os.path
from preprocess.utils.products.products import *
from rapidfuzz import fuzz, process
import pandas as pd
class Processor:
    """Preprocess an items CSV and match its rows against the latest products.

    The class wraps a ``Preprocessor`` (stored as ``self.preprocessor``) and
    delegates the actual matching to ``new_find_matches_with_ids``.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 #sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Create the wrapped ``Preprocessor``.

        Every argument is forwarded positionally, in the same order, to the
        ``Preprocessor`` constructor; this class does not inspect them.
        """
        self.preprocessor = Preprocessor(long_types_list, short_types_list, sour_list,
                                         type_wine, gbs, grapes, other_words,
                                         #sour_merge_dict,
                                         type_merge_dict, color_merge_dict,
                                         country_list)

    # NOTE: disabled legacy implementation, kept verbatim as an inert string.
    '''def process(self, prods_data, items, is_items_first=False, threshold=65, include_alternatives=True):
        items, products=self.preprocessor.process(prods_data, items)
        return self.match(items, products, is_items_first, threshold, include_alternatives)

    def match(self, items, products, is_items_first=False, threshold=65, include_alternatives=True):
        print('-----*-----Matching-----*-----')
        if is_items_first:
            #products['new_brand']=products['brand']
            products['new_brand'] = products['brand_sndex_7']
            #items['brand']=items['new_brand']
            items['brand'] = items['brand_sndex_7']
            products_groups = prepare_groups_with_ids(products)
            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=threshold, include_alternatives=include_alternatives)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products'''

    def process_products_full(self, products_data):
        """Delegate full products preprocessing to the wrapped preprocessor.

        The delegate's return value is discarded; this method returns ``None``.
        """
        self.preprocessor.process_products_full(products_data)

    def process_new(self, items_file, is_items_first=False, threshold=65,
                    order_invariant_names_matching=False, thread_count=8):
        """Match the rows of an items CSV against the latest products data.

        Args:
            items_file: Path of the items CSV; its delimiter is detected with
                ``get_delimiter`` and it must contain an ``attrs`` column.
            is_items_first: Must be truthy. The products-first direction is
                disabled in this file, so a falsy value is rejected up front.
            threshold: Forwarded to the matcher as ``name_threshold``.
            order_invariant_names_matching: Forwarded unchanged to the matcher.
            thread_count: Forwarded unchanged to the matcher.

        Returns:
            A tuple ``(matches, items, products)`` where ``matches`` is the
            matcher result with the ``type``, ``type_wine``, ``alco`` and
            ``gb`` columns dropped.

        Raises:
            NotImplementedError: If ``is_items_first`` is falsy.
            Exception: If the products data is missing, ``items_file`` is
                empty, or the CSV has no ``attrs`` column.
        """
        if not is_items_first:
            # Previously this path ran the whole preprocessing step and then
            # crashed with UnboundLocalError on ``res``; fail fast instead.
            # The disabled products-first branch was:
            #   items_groups = prepare_groups_with_ids(items)
            #   items_alt_groups = prepare_groups_by_alternative_keys(items)
            #   res=new_find_matches_with_ids(products, items_groups, None, items_alt_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
            raise NotImplementedError(
                "Products-first matching is disabled; call process_new with is_items_first=True")

        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")

        if not items_file:
            raise Exception("Items CSV not specified")

        #bad_lines = verify_csv(items_file, items_file + ".fixed")
        #if bad_lines:
        #    items_file = items_file + ".fixed"
        #raise Exception("Uploaded Items CSV contains bad lines:\n" + "\n".join(bad_lines))

        items_delimiter = get_delimiter(items_file)
        print('items delimiter: "' + items_delimiter + '"')

        # on_bad_lines='skip' makes pandas drop malformed rows instead of raising.
        row_items = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if 'attrs' not in row_items.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items, products = self.preprocessor.process_new(prods_data, row_items)

        print('-----*-----Matching-----*-----')

        #items['brand']=items['new_brand']
        # Dump the preprocessed items next to the products data.
        # NOTE(review): the meaning of the third argument (True) is defined by
        # save_df_to_file, which is not visible here - confirm.
        fullpath = os.path.join(prods_data["dir"], "_items.pkl")
        save_df_to_file(items, fullpath, True)
        #exit(1)
        #items['brand']=items['brand_sndex_5']
        #products_groups = prods_data["dict_groups"]

        res = new_find_matches_with_ids(
            items, products,
            name_threshold=threshold,
            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
            order_invariant_names_matching=order_invariant_names_matching,
            thread_count=thread_count)

        return res.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products

    # NOTE: disabled legacy verification helpers, kept verbatim as an inert string.
    '''def score_correct_items_to_products(self, manual_matchings):
        result = []
        for mm in manual_matchings:
            item = mm[0]
            product = mm[1]
            item_to_compare = item['name']
            if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']:
                item_to_compare = item['brand'] + " " + item['name']
            product_to_compare = product['name_with_brand']
            product2_to_compare = product['name_2']
            item_with_name = item['orig_name']
            if 'orig_brand' in item.keys() and item['orig_brand'] and item['orig_brand'] not in item['orig_name']:
                item_with_name = item['orig_brand'] + " - " + item['orig_name']
            product_brand = product['orig_brand'].values[0]
            product_with_brand = product['orig_name'].values[0]
            if product_brand and isinstance(product_brand, str) and product_brand not in product_with_brand:
                product_with_brand = product_brand + " - " + product_with_brand
            match, score, _ = process.extractOne(item_to_compare, product_to_compare)
            match2, score2, _ = process.extractOne(item_to_compare, product2_to_compare)
            if score2 > score:
                score = score2
            result.append({"item_id":item['id'], "product_id":product['id'].values[0], 'score':score,
                           "item_orig":item_with_name, "product_orig":product_with_brand,
                           "item": item_to_compare, "product":product_to_compare.values[0]
                           })
        return result

    def verify_correct_matching(self, correct_file, items_file, thread_count = 8):
        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")
        products_df = prods_data["df_products"]
        if not correct_file:
            raise Exception("Correct CSV not specified")
        if not items_file:
            raise Exception("Items CSV not specified")
        csv_delimiter = get_delimiter(correct_file)
        manual_df = pd.read_csv(correct_file, sep=csv_delimiter)
        items_delimiter = get_delimiter(items_file)
        items_df = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if not 'attrs' in items_df.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")
        items = self.preprocessor.process_items(items_df.copy())
        manual_matchings = []
        count = len(items)
        for index, row in items.iterrows():
            print("Processing row #" + str(index) + "/" + str(count) + "\n")
            manual = manual_df[manual_df['item_id'] == row["id"]]['state']
            if (len(manual) > 0) and (manual.values[0] == 1):
                p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
                if len(p.values) > 0:
                    if isinstance(row, float):
                        row = row
                    manual_matchings.append([row, p, -1])
                else:
                    print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")
        return self.score_correct_items_to_products(manual_matchings)'''