import json import pandas as pd from multiprocessing import Process, Queue from preprocess.utils.common.utils import get_delimiter from rapidfuzz import process from preprocess.utils.products.products import * OUTPUT_COUNTS = True OUTPUT_CSV = True INCORRECT_MANUAL_MATCHINGS = {"47":"64088", "50":"", "59":"", "133":"77024", "207":"", "252":"94238", "367":"104051", "674":"", "2686":"", "7986":"", "21204":"", "2007154":"15248", "2007498":"108089", "2007609":"61397", "2007652":"2383", "2008041":"", "2008052":"", "2008131":"", "2008606":"", "2008647":"2036", "2009069":"97208", "2009093": "81511", "2009521":"34044", "2010101":"107433", "2010586":"98170", "2017376":"", "2018418":"", "2033420":"15745", "2038482":"", "2051521":"", "1261":"", "2214":"", "142744":"88696", "142748":"", "142757":"7770", "142760":"13584", "4665045":"116169", "22845":"", "105736":"102244", "106425":"", "106539":"61254"} COMPLEX_MANUAL_MATCHINGS = {"22918":"", "22938":"", "22973":"", "22978":""} def score_correct_item_to_product(item, product): item_to_compare = item['name'] if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']: item_to_compare = item['brand'] + " " + item['name'] match, score, _ = process.extractOne(item_to_compare, product['name_with_brand']) match2, score2, _ = process.extractOne(item_to_compare, product['name_2']) if score2 > score: score = score2 return score def compare_matching_with_correct_func(data, qresult): job_index = data["index"] items_df = data["items_df"] products_df = data["products_df"] match_df = data["match_df"] manual_df = data["manual_df"] results = data["initial_results"] row_index = 0 row_count = int(items_df.count()[0]) result_list = [] for index, row in items_df.iterrows(): result_data = {} row_index = row_index + 1 #print("Processing row #" + str(job_index) + "-" + str(row_index) + "/" + str(row_count)) #if row["id"] == 25197: # row_index == row_index result_data["id"] = row["id"] result_data["match_side"] = "no_match" result_data["match_score"] = "N" result_data["best_score_ex"] = "" result_data["manual_match_score"] = -1 auto_match = match_df[match_df['id'] == row["id"]] if len(auto_match) == 0: print("Auto matched for item id=" + str(row["id"]) + " not found") results["auto_match_count_no_products"] = results["auto_match_count_no_products"] + 1 auto_match = "" else: result_data["best_score_ex"] = auto_match["best_score_ex"].values[0] auto_match = auto_match["matched_items"].values[0] if (auto_match is not None) and isinstance(auto_match, str) and len(auto_match) > 2: results["auto_match_total_count"] = results["auto_match_total_count"] + 1 manual_match = None manual = manual_df[manual_df['item_id'] == row["id"]]['state'] if (len(manual) > 0) and (manual.values[0] == 1): p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]] if len(p.values) > 0: manual_match = p results["manual_match_total_count"] = results["manual_match_total_count"] + 1 result_data["manual_match_score"] = score_correct_item_to_product(row, p) else: print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found") results["manual_match_count_no_products"] = results["manual_match_count_no_products"] + 1 if (auto_match is not None) and isinstance(auto_match, str) and len(auto_match) > 2 and (manual_match is not None): result_data["match_side"] = "both" results["both_match_count"] = results["both_match_count"] + 1 manual_id = int(manual_match["id"].values[0]) auto_match_ns = auto_match.replace(" ", "") i1 = auto_match_ns.find('"id":') i2 = auto_match_ns.find('"id":' + str(manual_id)) if i1 == i2 and i1 != -1 and i2 != -1: result_data["match_score"] = 'E' results["equal_match_count"] = results["equal_match_count"] + 1 elif i2 > i1: pos = 0 partial_index = 0 result_data["match_score"] = 'P' while pos != -1: pos = auto_match_ns.find('"id":', pos+1) if pos == i2: if partial_index < 5: result_data["match_score"] = 'P5' results["partial5_match_count"] = results["partial5_match_count"] + 1 elif partial_index < 10: result_data["match_score"] = 'P10' results["partial10_match_count"] = results["partial10_match_count"] + 1 else: result_data["match_score"] = 'P100' results["partial100_match_count"] = results["partial100_match_count"] + 1 break partial_index = partial_index + 1 #result_data["match_score"] = 'P' results["partial_match_count"] = results["partial_match_count"] + 1 else: result_data["match_score"] = 'D' elif (auto_match is not None) and isinstance(auto_match, str) and len(auto_match) > 2: result_data["match_score"] = 'A' result_data["match_side"] = "only_auto" results["only_auto_match_count"] = results["only_auto_match_count"] + 1 elif manual_match is not None: result_data["match_score"] = 'M' result_data["match_side"] = "only_manual" results["only_manual_match_count"] = results["only_manual_match_count"] + 1 result_data["item"] = row["attrs"] result_data["item_id"] = row["id"] if row["orig_brand"]: result_data["item_name"] = row["orig_brand"] + " " + row["orig_name"] else: result_data["item_name"] = row["orig_name"] result_data["auto_match"] = auto_match manual_string = "" if (manual_match is not None): manual_string = '{' + \ '"id": ' + str(manual_match["id"].values[0]) + ',' + \ '"brand": "' + str(manual_match["brand"].values[0]) + '",' + \ '"name": "' + str(manual_match["orig_name"].values[0]) + '",' + \ '"volume": ' + str(manual_match["volume"].values[0]) + '",' + \ '"year": ' + str(manual_match["year"].values[0]) + '"}' result_data["manual_match"] = manual_string if manual_match is not None: result_data["product_id"] = manual_match["id"].values[0] result_data["product_name"] = manual_match["orig_name"].values[0] else: result_data["product_id"] = "" result_data["product_name"] = "" result_list.append(result_data) qresult.put([results, result_list]) def compare_matching_with_correct(products_file, items_file, match_result_file, manual_result_file, processor, csv_result_file, counts_result_file): #csv_delimiter = get_delimiter(products_file) #products_df = pd.read_csv(products_file, sep=csv_delimiter, on_bad_lines='skip') prods_data = get_latest_products() if not prods_data or not os.path.isfile(prods_data["path"]): raise Exception("Actual products data not found") products_df = prods_data["df_products"] csv_delimiter = get_delimiter(items_file) items_df_raw = pd.read_csv(items_file, sep=csv_delimiter, on_bad_lines='skip') csv_delimiter = get_delimiter(match_result_file) match_df = pd.read_csv(match_result_file, sep=csv_delimiter) csv_delimiter = get_delimiter(manual_result_file) manual_df = pd.read_csv(manual_result_file, sep=csv_delimiter) #items_df_raw = items_df_raw[0:100] items_df = processor.preprocessor.process_items(items_df_raw.copy()) items_df["attrs"] = items_df_raw["attrs"] results = { "item_count" : int(items_df.count()[0]), "product_count" : int(products_df.count()[0]), "total_cm_match_count_percent": 0, "equal_cm_match_count_percent": 0, "partial_cm_match_count_percent": 0, "partial5_cm_match_count_percent": 0, "partial10_cm_match_count_percent": 0, "partial100_cm_match_count_percent": 0, "total_match_count": 0, "total_match_count_percent": 0, "equal_match_count": 0, "equal_match_count_percent": 0, "partial_match_count": 0, "partial_match_count_percent": 0, "partial5_match_count": 0, "partial10_match_count": 0, "partial100_match_count": 0, "only_auto_match_count": 0, "only_auto_match_percent": 0, "only_manual_match_count": 0, "only_manual_match_percent": 0, "auto_match_total_count": 0, "auto_match_total_percent": 0, "manual_match_total_count" : 0, "manual_match_total_percent": 0, "manual_match_count_no_products": 0, "auto_match_count_no_products": 0, "both_match_count": 0, } result_list = [] threads_data = list() chunk_size = len(items_df) // 2 + 1 num_chunks = len(items_df) // chunk_size + 1 for i in range(num_chunks): chunk = items_df[i * chunk_size:(i + 1) * chunk_size] data = {"index": i, "items_df": chunk, "products_df": products_df, "match_df":match_df, "manual_df":manual_df, "initial_results":results } q = Queue() p = Process(target=compare_matching_with_correct_func, args=(data, q,)) p.start() threads_data.append({"index": i, "q": q}) for td in threads_data: td["result"] = td["q"].get() for td in threads_data: t_result = td["result"][0] t_result_list = td["result"][1] result_list.extend(t_result_list) results["total_match_count"] = results["total_match_count"] + t_result["total_match_count"] results["equal_match_count"] = results["equal_match_count"] + t_result["equal_match_count"] results["partial_match_count"] = results["partial_match_count"] + t_result["partial_match_count"] results["partial5_match_count"] = results["partial5_match_count"] + t_result["partial5_match_count"] results["partial10_match_count"] = results["partial10_match_count"] + t_result["partial10_match_count"] results["partial100_match_count"] = results["partial100_match_count"] + t_result["partial100_match_count"] results["only_auto_match_count"] = results["only_auto_match_count"] + t_result["only_auto_match_count"] results["only_manual_match_count"] = results["only_manual_match_count"] + t_result["only_manual_match_count"] results["auto_match_total_count"] = results["auto_match_total_count"] + t_result["auto_match_total_count"] results["manual_match_total_count"] = results["manual_match_total_count"] + t_result["manual_match_total_count"] results["manual_match_count_no_products"] = results["manual_match_count_no_products"] + t_result["manual_match_count_no_products"] results["auto_match_count_no_products"] = results["auto_match_count_no_products"] + t_result["auto_match_count_no_products"] results["both_match_count"] = results["both_match_count"] + t_result["both_match_count"] results['manual_match_total_percent'] = round(results["manual_match_total_count"] * 100 / results["item_count"], 2) results['auto_match_total_percent'] = round(results["auto_match_total_count"] * 100 / results["item_count"], 2) results["only_auto_match_percent"] = round(results["only_auto_match_count"] * 100 / results["item_count"], 2) results["only_manual_match_percent"] = round(results["only_manual_match_count"] * 100 / results["item_count"], 2) results['equal_match_count_percent'] = round(results["equal_match_count"] * 100 / results["item_count"], 2) results['partial_match_count_percent'] = round(results["partial_match_count"] * 100 / results["item_count"], 2) results['total_match_count'] = results['equal_match_count'] + results['partial_match_count'] results['total_match_count_percent'] = results['equal_match_count_percent'] + results['partial_match_count_percent'] results['total_cm_match_count_percent'] = round(results['total_match_count'] * 100 / results["manual_match_total_count"], 2) results['equal_cm_match_count_percent'] = round(results['equal_match_count'] * 100 / results["manual_match_total_count"], 2) results['partial_cm_match_count_percent'] = round(results['partial_match_count'] * 100 / results["manual_match_total_count"], 2) results['partial5_cm_match_count_percent'] = round(results['partial5_match_count'] * 100 / results["manual_match_total_count"], 2) results['partial10_cm_match_count_percent'] = round(results['partial10_match_count'] * 100 / results["manual_match_total_count"], 2) results['partial100_cm_match_count_percent'] = round(results['partial100_match_count'] * 100 / results["manual_match_total_count"], 2) if OUTPUT_CSV: results_df = pd.DataFrame(result_list) results_df.to_csv(csv_result_file, float_format='%.2f') if OUTPUT_COUNTS: with open(counts_result_file, 'w') as fn: fn.write(json.dumps(results, indent=4)) print(results) return results