Spaces:

j-s-v
/

WineMatching

Build error

App Files Files Community

WineMatching / processor /matching_score.py

j-s-v

2025-07-30

eedd5dc 10 months ago

raw

history blame contribute delete

13.9 kB

	import json
	import os
	import queue
	import re
	from multiprocessing import Process, Queue

	import pandas as pd
	from rapidfuzz import process

	from preprocess.utils.common.utils import get_delimiter
	from preprocess.utils.products.products import *


	# Output switches read by compare_matching_with_correct():
	# OUTPUT_COUNTS gates writing the JSON summary of counters,
	# OUTPUT_CSV gates writing the per-item CSV.
	OUTPUT_COUNTS = True
	OUTPUT_CSV = True

	# String id -> string id table; an empty value means no replacement id is given.
	# NOTE(review): presumably item id -> corrected product id for manual matchings
	# known to be wrong; the table is not referenced in the visible part of this
	# module -- confirm against its users.
	INCORRECT_MANUAL_MATCHINGS = {
	    "47": "64088",
	    "50": "",
	    "59": "",
	    "133": "77024",
	    "207": "",
	    "252": "94238",
	    "367": "104051",
	    "674": "",
	    "2686": "",
	    "7986": "",
	    "21204": "",
	    "2007154": "15248",
	    "2007498": "108089",
	    "2007609": "61397",
	    "2007652": "2383",
	    "2008041": "",
	    "2008052": "",
	    "2008131": "",
	    "2008606": "",
	    "2008647": "2036",
	    "2009069": "97208",
	    "2009093": "81511",
	    "2009521": "34044",
	    "2010101": "107433",
	    "2010586": "98170",
	    "2017376": "",
	    "2018418": "",
	    "2033420": "15745",
	    "2038482": "",
	    "2051521": "",
	    "1261": "",
	    "2214": "",
	    "142744": "88696",
	    "142748": "",
	    "142757": "7770",
	    "142760": "13584",
	    "4665045": "116169",
	    "22845": "",
	    "105736": "102244",
	    "106425": "",
	    "106539": "61254",
	}

	# Ids with an empty value each.
	# NOTE(review): presumably matchings too complex to resolve to one product -- confirm.
	COMPLEX_MANUAL_MATCHINGS = dict.fromkeys(("22918", "22938", "22973", "22978"), "")


	def score_correct_item_to_product(item, product):
	    """Return the best fuzzy score between an item and a product's names.

	    The query is ``item['name']``, prefixed with ``item['brand']`` when a
	    non-empty brand is present and is not already part of the name. The query
	    is scored with ``rapidfuzz.process.extractOne`` against
	    ``product['name_with_brand']`` and ``product['name_2']``, and the higher of
	    the two scores is returned.
	    """
	    name = item['name']
	    brand = item['brand'] if 'brand' in item.keys() else None

	    query = name
	    if brand and brand not in name:
	        query = brand + " " + name

	    # extractOne yields (choice, score, key); only the score matters here.
	    _, score_with_brand, _ = process.extractOne(query, product['name_with_brand'])
	    _, score_alt_name, _ = process.extractOne(query, product['name_2'])

	    return max(score_with_brand, score_alt_name)


	# Matches every `"id":` field of a space-stripped auto-match string. The optional
	# group captures the complete integer that follows, so id 12 can never be
	# confused with id 123.
	_ID_FIELD_RE = re.compile(r'"id":(-?\d+)?')


	def _rank_of_product_id(auto_match, product_id):
	    """Return the 0-based position of ``product_id`` among the ``"id":`` fields of ``auto_match``, or -1."""
	    compact = auto_match.replace(" ", "")
	    for rank, field in enumerate(_ID_FIELD_RE.finditer(compact)):
	        digits = field.group(1)
	        if digits is not None and int(digits) == product_id:
	            return rank
	    return -1


	def _partial_grade(rank):
	    """Map a non-zero rank to its ``(match_score, counter_key)`` pair."""
	    if rank < 5:
	        return "P5", "partial5_match_count"
	    if rank < 10:
	        return "P10", "partial10_match_count"
	    return "P100", "partial100_match_count"


	def _format_manual_match(product):
	    """Render the first row of the ``product`` frame as a JSON object string."""
	    def first(column):
	        return product[column].values[0]

	    def quoted(column):
	        # json.dumps adds the surrounding quotes and escapes embedded ones.
	        return json.dumps(str(first(column)), ensure_ascii=False)

	    return (
	        '{'
	        + '"id": ' + str(first("id")) + ','
	        + '"brand": ' + quoted("brand") + ','
	        + '"name": ' + quoted("orig_name") + ','
	        + '"volume": ' + quoted("volume") + ','
	        + '"year": ' + quoted("year")
	        + '}'
	    )


	def compare_matching_with_correct_func(data, qresult):
	    """Compare automatic and manual matches for one chunk of items.

	    Args:
	        data: dict with the keys
	            ``items_df`` (rows with ``id``, ``attrs``, ``orig_brand``,
	            ``orig_name`` plus whatever ``score_correct_item_to_product`` reads),
	            ``products_df`` (``id``, ``brand``, ``orig_name``, ``volume``, ``year``),
	            ``match_df`` (``id``, ``best_score_ex``, ``matched_items``),
	            ``manual_df`` (``item_id``, ``state``, ``product_id``) and
	            ``initial_results`` (counter dict). An ``index`` key may be present
	            but is not used.
	        qresult: object with a ``put`` method; receives ``[results, result_list]``.

	    For each item a result dict is produced whose ``match_score`` is one of:
	    ``N`` (no match on either side), ``A`` (only an automatic match),
	    ``M`` (only a manual match), ``E`` (the manual product is the first id in
	    the automatic match), ``P5``/``P10``/``P100`` (the manual product appears
	    at rank 1-4 / 5-9 / 10 and beyond), or ``D`` (the manual product is absent
	    from the automatic match).

	    Only manual rows with ``state == 1`` count as manual matches.
	    """
	    items_df = data["items_df"]
	    products_df = data["products_df"]
	    match_df = data["match_df"]
	    manual_df = data["manual_df"]
	    # Count on a private copy so the caller's dict is never mutated in place.
	    results = dict(data["initial_results"])

	    result_list = []

	    for _, row in items_df.iterrows():
	        item_id = row["id"]
	        result_data = {
	            "id": item_id,
	            "match_side": "no_match",
	            "match_score": "N",
	            "best_score_ex": "",
	            "manual_match_score": -1,
	        }

	        auto_rows = match_df[match_df['id'] == item_id]
	        if len(auto_rows) == 0:
	            print("Auto matched for item id=" + str(item_id) + " not found")
	            results["auto_match_count_no_products"] += 1
	            auto_match = ""
	        else:
	            result_data["best_score_ex"] = auto_rows["best_score_ex"].values[0]
	            auto_match = auto_rows["matched_items"].values[0]

	        # An automatic match only counts when it is a string of more than two
	        # characters (a missing CSV cell arrives as a non-string value).
	        has_auto = isinstance(auto_match, str) and len(auto_match) > 2
	        if has_auto:
	            results["auto_match_total_count"] += 1

	        manual_match = None
	        manual_rows = manual_df[manual_df['item_id'] == item_id]
	        if len(manual_rows) > 0 and manual_rows['state'].values[0] == 1:
	            # Read the product id from the filtered row itself rather than
	            # re-addressing manual_df by index label.
	            product_id = manual_rows['product_id'].values[0]
	            candidates = products_df[products_df["id"] == product_id]

	            if len(candidates) > 0:
	                manual_match = candidates
	                results["manual_match_total_count"] += 1
	                result_data["manual_match_score"] = score_correct_item_to_product(row, candidates)
	            else:
	                print("Manually matched product id=" + str(product_id) + " for item=" + str(item_id) + " not found")
	                results["manual_match_count_no_products"] += 1

	        if has_auto and manual_match is not None:
	            result_data["match_side"] = "both"
	            results["both_match_count"] += 1

	            rank = _rank_of_product_id(auto_match, int(manual_match["id"].values[0]))
	            if rank == 0:
	                result_data["match_score"] = 'E'
	                results["equal_match_count"] += 1
	            elif rank > 0:
	                grade, counter_key = _partial_grade(rank)
	                result_data["match_score"] = grade
	                results[counter_key] += 1
	                results["partial_match_count"] += 1
	            else:
	                result_data["match_score"] = 'D'
	        elif has_auto:
	            result_data["match_score"] = 'A'
	            result_data["match_side"] = "only_auto"
	            results["only_auto_match_count"] += 1
	        elif manual_match is not None:
	            result_data["match_score"] = 'M'
	            result_data["match_side"] = "only_manual"
	            results["only_manual_match_count"] += 1

	        result_data["item"] = row["attrs"]

	        result_data["item_id"] = item_id
	        if row["orig_brand"]:
	            result_data["item_name"] = row["orig_brand"] + " " + row["orig_name"]
	        else:
	            result_data["item_name"] = row["orig_name"]

	        result_data["auto_match"] = auto_match

	        if manual_match is not None:
	            result_data["manual_match"] = _format_manual_match(manual_match)
	            result_data["product_id"] = manual_match["id"].values[0]
	            result_data["product_name"] = manual_match["orig_name"].values[0]
	        else:
	            result_data["manual_match"] = ""
	            result_data["product_id"] = ""
	            result_data["product_name"] = ""

	        result_list.append(result_data)

	    qresult.put([results, result_list])


	# Counters accumulated by each worker and summed by the parent.
	# "total_match_count" is left out because it is derived after aggregation.
	_WORKER_COUNTERS = (
	    "equal_match_count",
	    "partial_match_count",
	    "partial5_match_count",
	    "partial10_match_count",
	    "partial100_match_count",
	    "only_auto_match_count",
	    "only_manual_match_count",
	    "auto_match_total_count",
	    "manual_match_total_count",
	    "manual_match_count_no_products",
	    "auto_match_count_no_products",
	    "both_match_count",
	)


	def _percent(part, whole):
	    """Return ``part`` as a percentage of ``whole`` (2 decimals); 0.0 when ``whole`` is 0."""
	    if not whole:
	        return 0.0
	    return round(part * 100 / whole, 2)


	def _await_worker_result(worker, result_queue):
	    """Block until ``worker`` publishes its result; raise if it exits without one."""
	    while True:
	        try:
	            return result_queue.get(timeout=1)
	        except queue.Empty:
	            if worker.is_alive():
	                continue
	            # The worker may have published just before exiting: look once more
	            # before declaring the result lost.
	            try:
	                return result_queue.get(timeout=1)
	            except queue.Empty:
	                raise RuntimeError(
	                    "Matching worker exited with code " + str(worker.exitcode) + " without returning a result"
	                ) from None


	def compare_matching_with_correct(products_file, items_file, match_result_file, manual_result_file, processor, csv_result_file, counts_result_file, num_workers=2):
	    """Score automatic matching results against manual matchings.

	    Args:
	        products_file: Not used; products come from ``get_latest_products()``.
	        items_file: CSV of items; must contain an ``attrs`` column.
	        match_result_file: CSV with the automatic matching results.
	        manual_result_file: CSV with the manual matchings.
	        processor: object whose ``preprocessor.process_items`` prepares the items.
	        csv_result_file: destination of the per-item CSV (when ``OUTPUT_CSV``).
	        counts_result_file: destination of the JSON summary (when ``OUTPUT_COUNTS``).
	        num_workers: maximum number of worker processes the items are split across.

	    Returns:
	        dict of counters and percentages. The ``*_cm_*`` percentages are
	        relative to the number of manual matches, the others to the number
	        of items.

	    Raises:
	        FileNotFoundError: no products data is available.
	        RuntimeError: a worker process ended without delivering its result.
	    """
	    prods_data = get_latest_products()
	    if not prods_data or not os.path.isfile(prods_data["path"]):
	        raise FileNotFoundError("Actual products data not found")

	    products_df = prods_data["df_products"]

	    items_df_raw = pd.read_csv(items_file, sep=get_delimiter(items_file), on_bad_lines='skip')
	    match_df = pd.read_csv(match_result_file, sep=get_delimiter(match_result_file))
	    manual_df = pd.read_csv(manual_result_file, sep=get_delimiter(manual_result_file))

	    items_df = processor.preprocessor.process_items(items_df_raw.copy())
	    items_df["attrs"] = items_df_raw["attrs"]

	    # Row counts: len() counts every row, whatever the first column contains.
	    item_count = len(items_df)

	    results = {
	        "item_count": item_count,
	        "product_count": len(products_df),

	        "total_cm_match_count_percent": 0,
	        "equal_cm_match_count_percent": 0,
	        "partial_cm_match_count_percent": 0,
	        "partial5_cm_match_count_percent": 0,
	        "partial10_cm_match_count_percent": 0,
	        "partial100_cm_match_count_percent": 0,

	        "total_match_count": 0,
	        "total_match_count_percent": 0,

	        "equal_match_count": 0,
	        "equal_match_count_percent": 0,

	        "partial_match_count": 0,
	        "partial_match_count_percent": 0,

	        "partial5_match_count": 0,
	        "partial10_match_count": 0,
	        "partial100_match_count": 0,

	        "only_auto_match_count": 0,
	        "only_auto_match_percent": 0,

	        "only_manual_match_count": 0,
	        "only_manual_match_percent": 0,

	        "auto_match_total_count": 0,
	        "auto_match_total_percent": 0,

	        "manual_match_total_count": 0,
	        "manual_match_total_percent": 0,
	        "manual_match_count_no_products": 0,

	        "auto_match_count_no_products": 0,
	        "both_match_count": 0,
	    }

	    # Split the items into at most `num_workers` contiguous, near-equal chunks;
	    # no process is started for an empty chunk.
	    worker_count = max(1, int(num_workers))
	    chunk_size = max(1, -(-item_count // worker_count))

	    workers = []
	    for index, start in enumerate(range(0, item_count, chunk_size)):
	        data = {
	            "index": index,
	            "items_df": items_df.iloc[start:start + chunk_size],
	            "products_df": products_df,
	            "match_df": match_df,
	            "manual_df": manual_df,
	            "initial_results": results,
	        }
	        result_queue = Queue()
	        worker = Process(target=compare_matching_with_correct_func, args=(data, result_queue))
	        worker.start()
	        workers.append((worker, result_queue))

	    result_list = []
	    for worker, result_queue in workers:
	        # Read the queue before joining: a process that still has queued data
	        # to flush does not terminate.
	        worker_counts, worker_rows = _await_worker_result(worker, result_queue)
	        worker.join()

	        result_list.extend(worker_rows)
	        for key in _WORKER_COUNTERS:
	            results[key] += worker_counts[key]

	    manual_total = results["manual_match_total_count"]

	    results['manual_match_total_percent'] = _percent(manual_total, item_count)
	    results['auto_match_total_percent'] = _percent(results["auto_match_total_count"], item_count)

	    results["only_auto_match_percent"] = _percent(results["only_auto_match_count"], item_count)
	    results["only_manual_match_percent"] = _percent(results["only_manual_match_count"], item_count)

	    results['equal_match_count_percent'] = _percent(results["equal_match_count"], item_count)
	    results['partial_match_count_percent'] = _percent(results["partial_match_count"], item_count)

	    results['total_match_count'] = results['equal_match_count'] + results['partial_match_count']
	    # Re-round: adding two rounded floats can leave binary noise in the last digits.
	    results['total_match_count_percent'] = round(
	        results['equal_match_count_percent'] + results['partial_match_count_percent'], 2
	    )

	    results['total_cm_match_count_percent'] = _percent(results['total_match_count'], manual_total)
	    results['equal_cm_match_count_percent'] = _percent(results['equal_match_count'], manual_total)
	    results['partial_cm_match_count_percent'] = _percent(results['partial_match_count'], manual_total)

	    results['partial5_cm_match_count_percent'] = _percent(results['partial5_match_count'], manual_total)
	    results['partial10_cm_match_count_percent'] = _percent(results['partial10_match_count'], manual_total)
	    results['partial100_cm_match_count_percent'] = _percent(results['partial100_match_count'], manual_total)

	    if OUTPUT_CSV:
	        pd.DataFrame(result_list).to_csv(csv_result_file, float_format='%.2f')

	    if OUTPUT_COUNTS:
	        with open(counts_result_file, 'w', encoding='utf-8') as fn:
	            json.dump(results, fn, indent=4)

	    print(results)

	    return results