WineMatching / processor /matching_score.py
j-s-v's picture
2025-07-30
eedd5dc
import json
import pandas as pd
from multiprocessing import Process, Queue
from preprocess.utils.common.utils import get_delimiter
from rapidfuzz import process
from preprocess.utils.products.products import *
OUTPUT_COUNTS = True
OUTPUT_CSV = True
INCORRECT_MANUAL_MATCHINGS = {"47":"64088", "50":"", "59":"", "133":"77024", "207":"", "252":"94238", "367":"104051",
"674":"", "2686":"", "7986":"", "21204":"", "2007154":"15248", "2007498":"108089", "2007609":"61397",
"2007652":"2383", "2008041":"", "2008052":"", "2008131":"", "2008606":"", "2008647":"2036",
"2009069":"97208", "2009093": "81511", "2009521":"34044", "2010101":"107433", "2010586":"98170",
"2017376":"", "2018418":"", "2033420":"15745", "2038482":"", "2051521":"", "1261":"", "2214":"",
"142744":"88696", "142748":"", "142757":"7770", "142760":"13584", "4665045":"116169",
"22845":"", "105736":"102244", "106425":"", "106539":"61254"}
COMPLEX_MANUAL_MATCHINGS = {"22918":"", "22938":"", "22973":"", "22978":""}
def score_correct_item_to_product(item, product):
item_to_compare = item['name']
if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']:
item_to_compare = item['brand'] + " " + item['name']
match, score, _ = process.extractOne(item_to_compare, product['name_with_brand'])
match2, score2, _ = process.extractOne(item_to_compare, product['name_2'])
if score2 > score:
score = score2
return score
def compare_matching_with_correct_func(data, qresult):
job_index = data["index"]
items_df = data["items_df"]
products_df = data["products_df"]
match_df = data["match_df"]
manual_df = data["manual_df"]
results = data["initial_results"]
row_index = 0
row_count = int(items_df.count()[0])
result_list = []
for index, row in items_df.iterrows():
result_data = {}
row_index = row_index + 1
#print("Processing row #" + str(job_index) + "-" + str(row_index) + "/" + str(row_count))
#if row["id"] == 25197:
# row_index == row_index
result_data["id"] = row["id"]
result_data["match_side"] = "no_match"
result_data["match_score"] = "N"
result_data["best_score_ex"] = ""
result_data["manual_match_score"] = -1
auto_match = match_df[match_df['id'] == row["id"]]
if len(auto_match) == 0:
print("Auto matched for item id=" + str(row["id"]) + " not found")
results["auto_match_count_no_products"] = results["auto_match_count_no_products"] + 1
auto_match = ""
else:
result_data["best_score_ex"] = auto_match["best_score_ex"].values[0]
auto_match = auto_match["matched_items"].values[0]
if (auto_match is not None) and isinstance(auto_match, str) and len(auto_match) > 2:
results["auto_match_total_count"] = results["auto_match_total_count"] + 1
manual_match = None
manual = manual_df[manual_df['item_id'] == row["id"]]['state']
if (len(manual) > 0) and (manual.values[0] == 1):
p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
if len(p.values) > 0:
manual_match = p
results["manual_match_total_count"] = results["manual_match_total_count"] + 1
result_data["manual_match_score"] = score_correct_item_to_product(row, p)
else:
print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")
results["manual_match_count_no_products"] = results["manual_match_count_no_products"] + 1
if (auto_match is not None) and isinstance(auto_match, str) and len(auto_match) > 2 and (manual_match is not None):
result_data["match_side"] = "both"
results["both_match_count"] = results["both_match_count"] + 1
manual_id = int(manual_match["id"].values[0])
auto_match_ns = auto_match.replace(" ", "")
i1 = auto_match_ns.find('"id":')
i2 = auto_match_ns.find('"id":' + str(manual_id))
if i1 == i2 and i1 != -1 and i2 != -1:
result_data["match_score"] = 'E'
results["equal_match_count"] = results["equal_match_count"] + 1
elif i2 > i1:
pos = 0
partial_index = 0
result_data["match_score"] = 'P'
while pos != -1:
pos = auto_match_ns.find('"id":', pos+1)
if pos == i2:
if partial_index < 5:
result_data["match_score"] = 'P5'
results["partial5_match_count"] = results["partial5_match_count"] + 1
elif partial_index < 10:
result_data["match_score"] = 'P10'
results["partial10_match_count"] = results["partial10_match_count"] + 1
else:
result_data["match_score"] = 'P100'
results["partial100_match_count"] = results["partial100_match_count"] + 1
break
partial_index = partial_index + 1
#result_data["match_score"] = 'P'
results["partial_match_count"] = results["partial_match_count"] + 1
else:
result_data["match_score"] = 'D'
elif (auto_match is not None) and isinstance(auto_match, str) and len(auto_match) > 2:
result_data["match_score"] = 'A'
result_data["match_side"] = "only_auto"
results["only_auto_match_count"] = results["only_auto_match_count"] + 1
elif manual_match is not None:
result_data["match_score"] = 'M'
result_data["match_side"] = "only_manual"
results["only_manual_match_count"] = results["only_manual_match_count"] + 1
result_data["item"] = row["attrs"]
result_data["item_id"] = row["id"]
if row["orig_brand"]:
result_data["item_name"] = row["orig_brand"] + " " + row["orig_name"]
else:
result_data["item_name"] = row["orig_name"]
result_data["auto_match"] = auto_match
manual_string = ""
if (manual_match is not None):
manual_string = '{' + \
'"id": ' + str(manual_match["id"].values[0]) + ',' + \
'"brand": "' + str(manual_match["brand"].values[0]) + '",' + \
'"name": "' + str(manual_match["orig_name"].values[0]) + '",' + \
'"volume": ' + str(manual_match["volume"].values[0]) + '",' + \
'"year": ' + str(manual_match["year"].values[0]) + '"}'
result_data["manual_match"] = manual_string
if manual_match is not None:
result_data["product_id"] = manual_match["id"].values[0]
result_data["product_name"] = manual_match["orig_name"].values[0]
else:
result_data["product_id"] = ""
result_data["product_name"] = ""
result_list.append(result_data)
qresult.put([results, result_list])
def compare_matching_with_correct(products_file, items_file, match_result_file, manual_result_file, processor, csv_result_file, counts_result_file):
#csv_delimiter = get_delimiter(products_file)
#products_df = pd.read_csv(products_file, sep=csv_delimiter, on_bad_lines='skip')
prods_data = get_latest_products()
if not prods_data or not os.path.isfile(prods_data["path"]):
raise Exception("Actual products data not found")
products_df = prods_data["df_products"]
csv_delimiter = get_delimiter(items_file)
items_df_raw = pd.read_csv(items_file, sep=csv_delimiter, on_bad_lines='skip')
csv_delimiter = get_delimiter(match_result_file)
match_df = pd.read_csv(match_result_file, sep=csv_delimiter)
csv_delimiter = get_delimiter(manual_result_file)
manual_df = pd.read_csv(manual_result_file, sep=csv_delimiter)
#items_df_raw = items_df_raw[0:100]
items_df = processor.preprocessor.process_items(items_df_raw.copy())
items_df["attrs"] = items_df_raw["attrs"]
results = {
"item_count" : int(items_df.count()[0]),
"product_count" : int(products_df.count()[0]),
"total_cm_match_count_percent": 0,
"equal_cm_match_count_percent": 0,
"partial_cm_match_count_percent": 0,
"partial5_cm_match_count_percent": 0,
"partial10_cm_match_count_percent": 0,
"partial100_cm_match_count_percent": 0,
"total_match_count": 0,
"total_match_count_percent": 0,
"equal_match_count": 0,
"equal_match_count_percent": 0,
"partial_match_count": 0,
"partial_match_count_percent": 0,
"partial5_match_count": 0,
"partial10_match_count": 0,
"partial100_match_count": 0,
"only_auto_match_count": 0,
"only_auto_match_percent": 0,
"only_manual_match_count": 0,
"only_manual_match_percent": 0,
"auto_match_total_count": 0,
"auto_match_total_percent": 0,
"manual_match_total_count" : 0,
"manual_match_total_percent": 0,
"manual_match_count_no_products": 0,
"auto_match_count_no_products": 0,
"both_match_count": 0,
}
result_list = []
threads_data = list()
chunk_size = len(items_df) // 2 + 1
num_chunks = len(items_df) // chunk_size + 1
for i in range(num_chunks):
chunk = items_df[i * chunk_size:(i + 1) * chunk_size]
data = {"index": i, "items_df": chunk, "products_df": products_df, "match_df":match_df, "manual_df":manual_df, "initial_results":results }
q = Queue()
p = Process(target=compare_matching_with_correct_func, args=(data, q,))
p.start()
threads_data.append({"index": i, "q": q})
for td in threads_data:
td["result"] = td["q"].get()
for td in threads_data:
t_result = td["result"][0]
t_result_list = td["result"][1]
result_list.extend(t_result_list)
results["total_match_count"] = results["total_match_count"] + t_result["total_match_count"]
results["equal_match_count"] = results["equal_match_count"] + t_result["equal_match_count"]
results["partial_match_count"] = results["partial_match_count"] + t_result["partial_match_count"]
results["partial5_match_count"] = results["partial5_match_count"] + t_result["partial5_match_count"]
results["partial10_match_count"] = results["partial10_match_count"] + t_result["partial10_match_count"]
results["partial100_match_count"] = results["partial100_match_count"] + t_result["partial100_match_count"]
results["only_auto_match_count"] = results["only_auto_match_count"] + t_result["only_auto_match_count"]
results["only_manual_match_count"] = results["only_manual_match_count"] + t_result["only_manual_match_count"]
results["auto_match_total_count"] = results["auto_match_total_count"] + t_result["auto_match_total_count"]
results["manual_match_total_count"] = results["manual_match_total_count"] + t_result["manual_match_total_count"]
results["manual_match_count_no_products"] = results["manual_match_count_no_products"] + t_result["manual_match_count_no_products"]
results["auto_match_count_no_products"] = results["auto_match_count_no_products"] + t_result["auto_match_count_no_products"]
results["both_match_count"] = results["both_match_count"] + t_result["both_match_count"]
results['manual_match_total_percent'] = round(results["manual_match_total_count"] * 100 / results["item_count"], 2)
results['auto_match_total_percent'] = round(results["auto_match_total_count"] * 100 / results["item_count"], 2)
results["only_auto_match_percent"] = round(results["only_auto_match_count"] * 100 / results["item_count"], 2)
results["only_manual_match_percent"] = round(results["only_manual_match_count"] * 100 / results["item_count"], 2)
results['equal_match_count_percent'] = round(results["equal_match_count"] * 100 / results["item_count"], 2)
results['partial_match_count_percent'] = round(results["partial_match_count"] * 100 / results["item_count"], 2)
results['total_match_count'] = results['equal_match_count'] + results['partial_match_count']
results['total_match_count_percent'] = results['equal_match_count_percent'] + results['partial_match_count_percent']
results['total_cm_match_count_percent'] = round(results['total_match_count'] * 100 / results["manual_match_total_count"], 2)
results['equal_cm_match_count_percent'] = round(results['equal_match_count'] * 100 / results["manual_match_total_count"], 2)
results['partial_cm_match_count_percent'] = round(results['partial_match_count'] * 100 / results["manual_match_total_count"], 2)
results['partial5_cm_match_count_percent'] = round(results['partial5_match_count'] * 100 / results["manual_match_total_count"], 2)
results['partial10_cm_match_count_percent'] = round(results['partial10_match_count'] * 100 / results["manual_match_total_count"], 2)
results['partial100_cm_match_count_percent'] = round(results['partial100_match_count'] * 100 / results["manual_match_total_count"], 2)
if OUTPUT_CSV:
results_df = pd.DataFrame(result_list)
results_df.to_csv(csv_result_file, float_format='%.2f')
if OUTPUT_COUNTS:
with open(counts_result_file, 'w') as fn:
fn.write(json.dumps(results, indent=4))
print(results)
return results