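# NOTE(review): the next four lines look like web-page header residue captured
# together with this file (path, uploader avatar caption, date, commit id).
# They are kept as comments so the module remains valid Python.
# WineMatching / processor/processor.py
# j-s-v's picture
# 2025-07-30
# eedd5dc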
from preprocess.preprocess import Preprocessor
from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex
from preprocess.utils.common.utils import get_delimiter, verify_csv
import os.path
from preprocess.utils.products.products import *
from rapidfuzz import fuzz, process
import pandas as pd
class Processor:
    """Couple a ``Preprocessor`` with the item-to-product matching step.

    The instance owns a single ``Preprocessor`` (``self.preprocessor``) and
    delegates all data preparation to it; matching itself is performed by
    ``new_find_matches_with_ids``.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 # sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Create the underlying ``Preprocessor``.

        Every argument is forwarded unchanged, and in the same order, to the
        ``Preprocessor`` constructor; this class does not inspect them.
        """
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list,
            type_wine, gbs, grapes, other_words,
            # sour_merge_dict,
            type_merge_dict, color_merge_dict,
            country_list,
        )

    # NOTE(review): disabled legacy implementation, kept as an inert string
    # literal exactly as the original file did. It calls
    # `prepare_groups_with_ids`, which this module does not import by name.
    '''def process(self, prods_data, items, is_items_first=False, threshold=65, include_alternatives=True):
        items, products=self.preprocessor.process(prods_data, items)
        return self.match(items, products, is_items_first, threshold, include_alternatives)
    def match(self, items, products, is_items_first=False, threshold=65, include_alternatives=True):
        print('-----*-----Matching-----*-----')
        if is_items_first:
            #products['new_brand']=products['brand']
            products['new_brand'] = products['brand_sndex_7']
            #items['brand']=items['new_brand']
            items['brand'] = items['brand_sndex_7']
            products_groups = prepare_groups_with_ids(products)
            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=threshold, include_alternatives=include_alternatives)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products'''

    def process_products_full(self, products_data):
        """Delegate full products processing to the preprocessor.

        Calls ``self.preprocessor.process_products_full(products_data)`` and
        discards its result, so this method always returns ``None``.
        """
        self.preprocessor.process_products_full(products_data)

    def process_new(self, items_file, is_items_first=False, threshold=65,
                    order_invariant_names_matching=False, thread_count=8):
        """Preprocess an items CSV and match its rows against the latest products.

        Args:
            items_file: Items CSV to load; handed to ``get_delimiter`` and
                ``pandas.read_csv``.
            is_items_first: Must be truthy. The products-first branch is
                disabled, so a falsy value (including the default, which is
                kept for interface compatibility) is rejected up front.
            threshold: Forwarded to the matcher as ``name_threshold``.
            order_invariant_names_matching: Forwarded to the matcher unchanged.
            thread_count: Forwarded to the matcher unchanged.

        Returns:
            A 3-tuple ``(matches, items, products)`` where ``matches`` is the
            matcher result with the ``type``, ``type_wine``, ``alco`` and
            ``gb`` columns dropped, and ``items`` / ``products`` are the
            values returned by ``self.preprocessor.process_new``.

        Raises:
            NotImplementedError: If ``is_items_first`` is falsy.
            Exception: If no products data is available, ``items_file`` is
                empty, or the CSV has no ``attrs`` column.

        Side effects:
            Prints progress messages and saves the preprocessed items to
            ``_items.pkl`` inside ``prods_data["dir"]`` via ``save_df_to_file``.
        """
        # Previously a falsy flag ran the whole pipeline and then crashed on an
        # unbound `res` at the return statement; reject it before doing any
        # work. The disabled products-first branch was:
        #   else:
        #       items_groups = prepare_groups_with_ids(items)
        #       items_alt_groups = prepare_groups_by_alternative_keys(items)
        #       res=new_find_matches_with_ids(products, items_groups, None, items_alt_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
        if not is_items_first:
            raise NotImplementedError(
                "Products-first matching is disabled; "
                "call process_new with is_items_first=True"
            )

        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")
        if not items_file:
            raise Exception("Items CSV not specified")

        # Disabled CSV verification, kept for reference:
        # bad_lines = verify_csv(items_file, items_file + ".fixed")
        # if bad_lines:
        #     items_file = items_file + ".fixed"
        #     raise Exception("Uploaded Items CSV contains bad lines:\n" + "\n".join(bad_lines))

        items_delimiter = get_delimiter(items_file)
        print('items delimiter: "' + items_delimiter + '"')

        # on_bad_lines='skip' makes pandas drop malformed rows instead of raising.
        raw_items = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if 'attrs' not in raw_items.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items, products = self.preprocessor.process_new(prods_data, raw_items)

        print('-----*-----Matching-----*-----')

        # Persist the preprocessed items next to the products data.
        fullpath = os.path.join(prods_data["dir"], "_items.pkl")
        save_df_to_file(items, fullpath, True)

        # The grouping tables are read straight from the products data.
        res = new_find_matches_with_ids(
            items, products,
            name_threshold=threshold,
            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
            order_invariant_names_matching=order_invariant_names_matching,
            thread_count=thread_count,
        )

        return res.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products

    # NOTE(review): disabled legacy verification helpers, kept as an inert
    # string literal exactly as the original file did.
    '''def score_correct_items_to_products(self, manual_matchings):
        result = []
        for mm in manual_matchings:
            item = mm[0]
            product = mm[1]
            item_to_compare = item['name']
            if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']:
                item_to_compare = item['brand'] + " " + item['name']
            product_to_compare = product['name_with_brand']
            product2_to_compare = product['name_2']
            item_with_name = item['orig_name']
            if 'orig_brand' in item.keys() and item['orig_brand'] and item['orig_brand'] not in item['orig_name']:
                item_with_name = item['orig_brand'] + " - " + item['orig_name']
            product_brand = product['orig_brand'].values[0]
            product_with_brand = product['orig_name'].values[0]
            if product_brand and isinstance(product_brand, str) and product_brand not in product_with_brand:
                product_with_brand = product_brand + " - " + product_with_brand
            match, score, _ = process.extractOne(item_to_compare, product_to_compare)
            match2, score2, _ = process.extractOne(item_to_compare, product2_to_compare)
            if score2 > score:
                score = score2
            result.append({"item_id":item['id'], "product_id":product['id'].values[0], 'score':score,
                           "item_orig":item_with_name, "product_orig":product_with_brand,
                           "item": item_to_compare, "product":product_to_compare.values[0]
                           })
        return result
    def verify_correct_matching(self, correct_file, items_file, thread_count = 8):
        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")
        products_df = prods_data["df_products"]
        if not correct_file:
            raise Exception("Correct CSV not specified")
        if not items_file:
            raise Exception("Items CSV not specified")
        csv_delimiter = get_delimiter(correct_file)
        manual_df = pd.read_csv(correct_file, sep=csv_delimiter)
        items_delimiter = get_delimiter(items_file)
        items_df = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if not 'attrs' in items_df.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")
        items = self.preprocessor.process_items(items_df.copy())
        manual_matchings = []
        count = len(items)
        for index, row in items.iterrows():
            print("Processing row #" + str(index) + "/" + str(count) + "\n")
            manual = manual_df[manual_df['item_id'] == row["id"]]['state']
            if (len(manual) > 0) and (manual.values[0] == 1):
                p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
                if len(p.values) > 0:
                    if isinstance(row, float):
                        row = row
                    manual_matchings.append([row, p, -1])
                else:
                    print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")
        return self.score_correct_items_to_products(manual_matchings)'''