Spaces:
Build error
Build error
| import json | |
| import pandas as pd | |
| from multiprocessing import Process, Queue | |
| from preprocess.utils.common.utils import get_delimiter | |
| from rapidfuzz import process | |
| from preprocess.utils.products.products import * | |
# Output switches read by compare_matching_with_correct():
# OUTPUT_COUNTS -> write the aggregated counters as JSON,
# OUTPUT_CSV    -> write the per-item comparison rows as CSV.
OUTPUT_COUNTS = True
OUTPUT_CSV = True

# NOTE(review): presumably item id -> corrected product id for manual
# matchings known to be wrong ("" where no correct product exists); the
# table is not referenced in this part of the file -- confirm against users.
INCORRECT_MANUAL_MATCHINGS = {
    "47": "64088",
    "50": "",
    "59": "",
    "133": "77024",
    "207": "",
    "252": "94238",
    "367": "104051",
    "674": "",
    "2686": "",
    "7986": "",
    "21204": "",
    "2007154": "15248",
    "2007498": "108089",
    "2007609": "61397",
    "2007652": "2383",
    "2008041": "",
    "2008052": "",
    "2008131": "",
    "2008606": "",
    "2008647": "2036",
    "2009069": "97208",
    "2009093": "81511",
    "2009521": "34044",
    "2010101": "107433",
    "2010586": "98170",
    "2017376": "",
    "2018418": "",
    "2033420": "15745",
    "2038482": "",
    "2051521": "",
    "1261": "",
    "2214": "",
    "142744": "88696",
    "142748": "",
    "142757": "7770",
    "142760": "13584",
    "4665045": "116169",
    "22845": "",
    "105736": "102244",
    "106425": "",
    "106539": "61254",
}

# NOTE(review): presumably item ids whose manual matching needs special
# handling; also unused in this part of the file -- confirm.
COMPLEX_MANUAL_MATCHINGS = {
    "22918": "",
    "22938": "",
    "22973": "",
    "22978": "",
}
def score_correct_item_to_product(item, product):
    """Return the best fuzzy-match score between an item and a product.

    The text to compare is ``item['name']``, prefixed with ``item['brand']``
    when a string brand is present and not already contained in the name.
    That text is matched with ``process.extractOne`` against the choices in
    ``product['name_with_brand']`` and in ``product['name_2']``; the higher
    of the two scores wins.

    Args:
        item: Mapping-like row exposing ``keys()`` with a ``name`` entry and,
            optionally, a ``brand`` entry.
        product: Object whose ``name_with_brand`` and ``name_2`` entries are
            collections of candidate strings.

    Returns:
        The higher of the two scores, or ``0`` when neither column produced
        a match.
    """
    item_to_compare = item['name']
    brand = item['brand'] if 'brand' in item.keys() else None
    # Only a real, non-empty string can be prepended; a NaN brand is truthy
    # and would otherwise raise TypeError in the containment check.
    if isinstance(brand, str) and brand and brand not in item['name']:
        item_to_compare = brand + " " + item['name']

    scores = []
    for column in ('name_with_brand', 'name_2'):
        best = process.extractOne(item_to_compare, product[column])
        # extractOne returns None when no choice qualifies (e.g. every
        # candidate is missing), so it cannot be unpacked blindly.
        if best is not None:
            scores.append(best[1])
    return max(scores) if scores else 0
def _has_auto_match(auto_match):
    """Return True when ``auto_match`` is a string longer than two characters."""
    # NOTE(review): the length threshold presumably filters out empty
    # serialisations such as "" or "[]" -- confirm against the match file.
    return isinstance(auto_match, str) and len(auto_match) > 2


def _rank_of_product_id(auto_match, product_id):
    """Return the 0-based position of ``product_id`` among the ``"id":`` entries.

    Spaces are stripped from ``auto_match`` first.  An entry only counts as a
    hit when the digits after ``"id":`` are exactly ``product_id`` (so id 12
    does not match ``"id":123``).  Returns ``-1`` when the id is absent.
    """
    compact = auto_match.replace(" ", "")
    marker = '"id":'
    needle = marker + str(product_id)
    rank = 0
    pos = compact.find(marker)
    while pos != -1:
        end = pos + len(needle)
        if compact.startswith(needle, pos) and not compact[end:end + 1].isdigit():
            return rank
        rank += 1
        pos = compact.find(marker, pos + 1)
    return -1


def _format_manual_match(manual_match):
    """Serialise the first row of ``manual_match`` as a JSON-like object string."""
    def first(column):
        return manual_match[column].values[0]

    def quoted(column):
        # json.dumps balances and escapes the quotes; values are stringified
        # first so that NaN / numpy scalars cannot break the output.
        return json.dumps(str(first(column)), ensure_ascii=False)

    return (
        '{"id": ' + str(first("id"))
        + ',"brand": ' + quoted("brand")
        + ',"name": ' + quoted("orig_name")
        + ',"volume": ' + quoted("volume")
        + ',"year": ' + quoted("year")
        + '}'
    )


def compare_matching_with_correct_func(data, qresult):
    """Compare automatic and manual matchings for one chunk of items.

    For every row of ``data["items_df"]`` the automatic match is looked up in
    ``data["match_df"]`` (by ``id``) and the manual match in
    ``data["manual_df"]`` (by ``item_id``; only used when ``state == 1`` and
    the referenced product exists in ``data["products_df"]``).  Each item is
    then classified:

    * ``E``  - the manual product is the first ``"id":`` entry of the auto match
    * ``P5`` / ``P10`` / ``P100`` - it appears later (position < 5, < 10, else)
    * ``D``  - both sides matched but the manual product is not in the auto match
    * ``A`` / ``M`` - only the automatic / only the manual side matched
    * ``N``  - neither side matched

    Args:
        data: Dict with keys ``items_df``, ``products_df``, ``match_df``,
            ``manual_df`` and ``initial_results`` (a dict of counters).  An
            ``index`` key may be present; it is not used here.
        qresult: Object with a ``put`` method; receives
            ``[results, result_list]`` where ``results`` is the updated copy
            of the counters and ``result_list`` holds one dict per item.
    """
    items_df = data["items_df"]
    products_df = data["products_df"]
    match_df = data["match_df"]
    manual_df = data["manual_df"]
    # Count on a copy so the caller's template dict is never mutated in place.
    results = dict(data["initial_results"])
    result_list = []

    for _, row in items_df.iterrows():
        item_id = row["id"]
        result_data = {
            "id": item_id,
            "match_side": "no_match",
            "match_score": "N",
            "best_score_ex": "",
            "manual_match_score": -1,
        }

        # --- automatic match -------------------------------------------------
        auto_rows = match_df[match_df['id'] == item_id]
        if len(auto_rows) == 0:
            print("Auto matched for item id=" + str(item_id) + " not found")
            results["auto_match_count_no_products"] += 1
            auto_match = ""
        else:
            result_data["best_score_ex"] = auto_rows["best_score_ex"].values[0]
            auto_match = auto_rows["matched_items"].values[0]
        has_auto = _has_auto_match(auto_match)
        if has_auto:
            results["auto_match_total_count"] += 1

        # --- manual match ----------------------------------------------------
        manual_match = None
        manual_rows = manual_df[manual_df['item_id'] == item_id]
        if len(manual_rows) > 0 and manual_rows['state'].values[0] == 1:
            # Read the product id from the filtered row itself rather than
            # re-indexing manual_df positionally with an index label.
            product_id = manual_rows['product_id'].values[0]
            product = products_df[products_df["id"] == product_id]
            if len(product) > 0:
                manual_match = product
                results["manual_match_total_count"] += 1
                result_data["manual_match_score"] = score_correct_item_to_product(row, product)
            else:
                print("Manually matched product id=" + str(product_id) + " for item=" + str(item_id) + " not found")
                results["manual_match_count_no_products"] += 1

        # --- classification --------------------------------------------------
        if has_auto and manual_match is not None:
            result_data["match_side"] = "both"
            results["both_match_count"] += 1
            rank = _rank_of_product_id(auto_match, int(manual_match["id"].values[0]))
            if rank == 0:
                result_data["match_score"] = 'E'
                results["equal_match_count"] += 1
            elif rank > 0:
                if rank < 5:
                    result_data["match_score"] = 'P5'
                    results["partial5_match_count"] += 1
                elif rank < 10:
                    result_data["match_score"] = 'P10'
                    results["partial10_match_count"] += 1
                else:
                    result_data["match_score"] = 'P100'
                    results["partial100_match_count"] += 1
                results["partial_match_count"] += 1
            else:
                result_data["match_score"] = 'D'
        elif has_auto:
            result_data["match_score"] = 'A'
            result_data["match_side"] = "only_auto"
            results["only_auto_match_count"] += 1
        elif manual_match is not None:
            result_data["match_score"] = 'M'
            result_data["match_side"] = "only_manual"
            results["only_manual_match_count"] += 1

        # --- descriptive columns ---------------------------------------------
        result_data["item"] = row["attrs"]
        result_data["item_id"] = item_id
        if row["orig_brand"]:
            result_data["item_name"] = row["orig_brand"] + " " + row["orig_name"]
        else:
            result_data["item_name"] = row["orig_name"]
        result_data["auto_match"] = auto_match
        if manual_match is not None:
            result_data["manual_match"] = _format_manual_match(manual_match)
            result_data["product_id"] = manual_match["id"].values[0]
            result_data["product_name"] = manual_match["orig_name"].values[0]
        else:
            result_data["manual_match"] = ""
            result_data["product_id"] = ""
            result_data["product_name"] = ""
        result_list.append(result_data)

    qresult.put([results, result_list])
def _safe_percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` (2 decimals); 0 if ``whole`` is 0."""
    if not whole:
        return 0
    return round(part * 100 / whole, 2)


def _collect_worker_result(worker, result_queue):
    """Wait for a worker's result; raise RuntimeError if it exits without one."""
    import queue as queue_module

    while worker.is_alive():
        try:
            return result_queue.get(timeout=1)
        except queue_module.Empty:
            continue
    # The worker has exited: whatever it managed to post should already be
    # readable, so one last bounded read tells success from a crash.
    try:
        return result_queue.get(timeout=1)
    except queue_module.Empty:
        raise RuntimeError(
            "Worker process exited with code " + str(worker.exitcode) + " without posting a result"
        ) from None


def compare_matching_with_correct(products_file, items_file, match_result_file, manual_result_file, processor, csv_result_file, counts_result_file):
    """Compare automatic matching results with manual ones and report statistics.

    Products come from ``get_latest_products()``; items, automatic matches and
    manual matches are read from CSV files.  The items are preprocessed with
    ``processor.preprocessor.process_items``, split into chunks that are
    evaluated in separate processes by ``compare_matching_with_correct_func``,
    and the per-chunk counters are summed and turned into percentages.

    Args:
        products_file: Kept for interface compatibility; not read here.
        items_file: CSV with the items (delimiter detected by ``get_delimiter``).
        match_result_file: CSV with the automatic matching results.
        manual_result_file: CSV with the manual matching results.
        processor: Object exposing ``preprocessor.process_items(df)``.
        csv_result_file: Destination of the per-item rows when ``OUTPUT_CSV``.
        counts_result_file: Destination of the counters JSON when ``OUTPUT_COUNTS``.

    Returns:
        The dict of aggregated counters and percentages.

    Raises:
        Exception: When the latest products data is unavailable.
        RuntimeError: When a worker process dies without delivering a result.
    """
    # Imported locally: the name is not among the module's explicit imports.
    import os

    prods_data = get_latest_products()
    if not prods_data or not os.path.isfile(prods_data["path"]):
        raise Exception("Actual products data not found")
    products_df = prods_data["df_products"]

    items_df_raw = pd.read_csv(items_file, sep=get_delimiter(items_file), on_bad_lines='skip')
    match_df = pd.read_csv(match_result_file, sep=get_delimiter(match_result_file))
    manual_df = pd.read_csv(manual_result_file, sep=get_delimiter(manual_result_file))

    items_df = processor.preprocessor.process_items(items_df_raw.copy())
    items_df["attrs"] = items_df_raw["attrs"]

    results = {
        # .iloc[0]: non-null count of the first column, accessed positionally
        # (plain [0] on a label-indexed Series is deprecated).
        "item_count" : int(items_df.count().iloc[0]),
        "product_count" : int(products_df.count().iloc[0]),
        "total_cm_match_count_percent": 0,
        "equal_cm_match_count_percent": 0,
        "partial_cm_match_count_percent": 0,
        "partial5_cm_match_count_percent": 0,
        "partial10_cm_match_count_percent": 0,
        "partial100_cm_match_count_percent": 0,
        "total_match_count": 0,
        "total_match_count_percent": 0,
        "equal_match_count": 0,
        "equal_match_count_percent": 0,
        "partial_match_count": 0,
        "partial_match_count_percent": 0,
        "partial5_match_count": 0,
        "partial10_match_count": 0,
        "partial100_match_count": 0,
        "only_auto_match_count": 0,
        "only_auto_match_percent": 0,
        "only_manual_match_count": 0,
        "only_manual_match_percent": 0,
        "auto_match_total_count": 0,
        "auto_match_total_percent": 0,
        "manual_match_total_count" : 0,
        "manual_match_total_percent": 0,
        "manual_match_count_no_products": 0,
        "auto_match_count_no_products": 0,
        "both_match_count": 0,
    }

    # Split the items into (at most) two chunks, one worker process each.
    # Stepping by chunk_size never yields an empty trailing chunk.
    chunk_size = len(items_df) // 2 + 1
    workers = []
    for index, start in enumerate(range(0, len(items_df), chunk_size)):
        chunk = items_df[start:start + chunk_size]
        data = {"index": index, "items_df": chunk, "products_df": products_df, "match_df":match_df, "manual_df":manual_df, "initial_results":results }
        q = Queue()
        p = Process(target=compare_matching_with_correct_func, args=(data, q,))
        p.start()
        workers.append({"index": index, "q": q, "process": p})

    # Drain each queue BEFORE joining: a child cannot exit while its queued
    # payload is still unread, so joining first could deadlock.
    for worker in workers:
        worker["result"] = _collect_worker_result(worker["process"], worker["q"])
        worker["process"].join()

    summed_counters = (
        "equal_match_count",
        "partial_match_count",
        "partial5_match_count",
        "partial10_match_count",
        "partial100_match_count",
        "only_auto_match_count",
        "only_manual_match_count",
        "auto_match_total_count",
        "manual_match_total_count",
        "manual_match_count_no_products",
        "auto_match_count_no_products",
        "both_match_count",
    )
    result_list = []
    for worker in workers:
        chunk_counts, chunk_rows = worker["result"]
        result_list.extend(chunk_rows)
        for name in summed_counters:
            results[name] += chunk_counts[name]

    item_count = results["item_count"]
    manual_total = results["manual_match_total_count"]

    # Shares relative to all items.
    results['manual_match_total_percent'] = _safe_percent(manual_total, item_count)
    results['auto_match_total_percent'] = _safe_percent(results["auto_match_total_count"], item_count)
    results["only_auto_match_percent"] = _safe_percent(results["only_auto_match_count"], item_count)
    results["only_manual_match_percent"] = _safe_percent(results["only_manual_match_count"], item_count)
    results['equal_match_count_percent'] = _safe_percent(results["equal_match_count"], item_count)
    results['partial_match_count_percent'] = _safe_percent(results["partial_match_count"], item_count)
    results['total_match_count'] = results['equal_match_count'] + results['partial_match_count']
    results['total_match_count_percent'] = round(
        results['equal_match_count_percent'] + results['partial_match_count_percent'], 2
    )

    # Shares relative to the manually matched items ("cm"); zero manual
    # matches yield 0 instead of a ZeroDivisionError.
    results['total_cm_match_count_percent'] = _safe_percent(results['total_match_count'], manual_total)
    results['equal_cm_match_count_percent'] = _safe_percent(results['equal_match_count'], manual_total)
    results['partial_cm_match_count_percent'] = _safe_percent(results['partial_match_count'], manual_total)
    results['partial5_cm_match_count_percent'] = _safe_percent(results['partial5_match_count'], manual_total)
    results['partial10_cm_match_count_percent'] = _safe_percent(results['partial10_match_count'], manual_total)
    results['partial100_cm_match_count_percent'] = _safe_percent(results['partial100_match_count'], manual_total)

    if OUTPUT_CSV:
        results_df = pd.DataFrame(result_list)
        results_df.to_csv(csv_result_file, float_format='%.2f')
    if OUTPUT_COUNTS:
        with open(counts_result_file, 'w', encoding='utf-8') as counts_fh:
            counts_fh.write(json.dumps(results, indent=4))
    print(results)
    return results