Spaces:

j-s-v
/

WineMatching

Build error

App Files Files Community

WineMatching / processor /processor.py

j-s-v

2025-07-30

eedd5dc 10 months ago

raw

history blame contribute delete

8.46 kB

	from preprocess.preprocess import Preprocessor
	from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex
	from preprocess.utils.common.utils import get_delimiter, verify_csv
	import os.path
	from preprocess.utils.products.products import *
	from rapidfuzz import fuzz, process
	import pandas as pd


	class Processor():
	    """Couple a ``Preprocessor`` with the item-to-product matching step.

	    The instance owns one ``Preprocessor`` (``self.preprocessor``) built from
	    the dictionaries passed to the constructor; every public method delegates
	    the data preparation to it.
	    """

	    def __init__(self, long_types_list, short_types_list, sour_list,
	                 type_wine, gbs, grapes, other_words,
	                 type_merge_dict, color_merge_dict,
	                 country_list):
	        """Build the underlying ``Preprocessor``.

	        All arguments are forwarded positionally, unchanged and in the same
	        order, to ``Preprocessor``; this class does not inspect them.
	        """
	        self.preprocessor = Preprocessor(
	            long_types_list, short_types_list, sour_list,
	            type_wine, gbs, grapes, other_words,
	            type_merge_dict, color_merge_dict,
	            country_list)

	    def process_products_full(self, products_data):
	        """Delegate full products processing to the preprocessor.

	        The preprocessor's return value (if any) is intentionally not
	        propagated; this method returns ``None``.
	        """
	        self.preprocessor.process_products_full(products_data)

	    def process_new(self, items_file, is_items_first=False, threshold=65,
	                    order_invariant_names_matching=False, thread_count=8):
	        """Match the items of an uploaded CSV against the latest products.

	        Args:
	            items_file: Path of the items CSV. The file must contain an
	                ``attrs`` column.
	            is_items_first: Must be true. Only the items-first direction is
	                implemented; the products-first branch was disabled.
	            threshold: Forwarded to ``new_find_matches_with_ids`` as
	                ``name_threshold``.
	            order_invariant_names_matching: Forwarded unchanged to
	                ``new_find_matches_with_ids``.
	            thread_count: Forwarded unchanged to
	                ``new_find_matches_with_ids``.

	        Returns:
	            A tuple ``(matches, items, products)`` where ``matches`` is the
	            matcher result without the ``type``, ``type_wine``, ``alco`` and
	            ``gb`` columns, and ``items`` / ``products`` are the frames
	            returned by ``Preprocessor.process_new``.

	        Raises:
	            NotImplementedError: If ``is_items_first`` is false.
	            Exception: If ``items_file`` is empty, the latest products data
	                is missing, or the CSV has no ``attrs`` column.
	        """
	        # Without this guard the function used to run the whole (expensive)
	        # preprocessing and then die with UnboundLocalError on ``res``,
	        # because the products-first branch no longer exists.
	        if not is_items_first:
	            raise NotImplementedError(
	                "Products-first matching is not implemented; "
	                "call process_new with is_items_first=True")

	        # Cheap argument check first, before any product data is loaded.
	        if not items_file:
	            raise Exception("Items CSV not specified")

	        prods_data = get_latest_products()
	        if not prods_data or not os.path.isfile(prods_data["path"]):
	            raise Exception("Actual products data not found")

	        items_delimiter = get_delimiter(items_file)
	        print(f'items delimiter: "{items_delimiter}"')
	        # on_bad_lines='skip': malformed rows are dropped rather than
	        # rejecting the whole upload.
	        raw_items = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
	        if 'attrs' not in raw_items.columns:
	            raise Exception("Uploaded Items CSV does not seem to be valid")

	        items, products = self.preprocessor.process_new(prods_data, raw_items)

	        print('----------Matching----------')

	        # Dump the preprocessed items next to the products data. The third
	        # positional flag is passed as True; its meaning is defined by
	        # save_df_to_file (not visible here).
	        save_df_to_file(items, os.path.join(prods_data["dir"], "_items.pkl"), True)

	        res = new_find_matches_with_ids(
	            items, products,
	            name_threshold=threshold,
	            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
	            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
	            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
	            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
	            order_invariant_names_matching=order_invariant_names_matching,
	            thread_count=thread_count)

	        return res.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products