vinhngba2704's picture
First commit to this repo
51db8d1
import json
from rapidfuzz import process, fuzz
# Initialized Modules
from modules.normalization import normalization
# Mapping merchant
def mapping_merchant(information, json_path, normalization_rule, score_threshold=80):
    """Attach ``seller_id`` and ``buyer_id`` to each record via fuzzy name matching.

    The JSON file at ``json_path`` is read as a ``{merchant name: merchant id}``
    object. Names from the file and from the records are passed through
    ``normalization`` with the same ``normalization_rule`` before being compared
    with ``fuzz.token_sort_ratio``.

    Args:
        information: Iterable of dict-like records. Each record's ``"seller"``
            and ``"buyer"`` values are looked up; the records are modified in
            place.
        json_path: Path to the merchant JSON file (read as UTF-8).
        normalization_rule: Forwarded unchanged to ``normalization``.
        score_threshold: Minimum score (inclusive) for a match to be accepted.
            Defaults to 80, the value that was previously hard-coded.

    Returns:
        The same ``information`` object, with ``"seller_id"`` and ``"buyer_id"``
        set on every record (``None`` when no acceptable match exists).
    """
    # Load from merchant JSON file
    with open(json_path, "r", encoding="utf-8") as f:
        name_id_dict = json.load(f)

    # Create cached normalized dictionary
    normalized_cached_map = {
        normalization(text=name, normalization_rule=normalization_rule): merchant_id
        for name, merchant_id in name_id_dict.items()
    }
    name_list = list(normalized_cached_map)

    # Seller and buyer go through exactly the same lookup; only the keys differ.
    field_pairs = (("seller", "seller_id"), ("buyer", "buyer_id"))

    for item in information:
        for name_field, id_field in field_pairs:
            raw_name = item.get(name_field)
            match = None
            # extractOne returns None for an empty choice list, and a missing
            # name cannot be normalized, so both cases are treated as "no match"
            # instead of aborting the whole batch.
            if raw_name is not None and name_list:
                match = process.extractOne(
                    normalization(text=raw_name, normalization_rule=normalization_rule),
                    name_list,
                    scorer=fuzz.token_sort_ratio,
                )
            if match is not None and match[1] >= score_threshold:
                item[id_field] = normalized_cached_map[match[0]]
            else:
                item[id_field] = None
    return information
# Mapping unit
def mapping_unit(information, json_path, normalization_rule, score_threshold=80):
    """Attach ``unit_id`` to each record by fuzzy-matching its ``unit`` text.

    The JSON file at ``json_path`` is read as a ``{unit name: unit id}`` object.
    Unit names from the file and from the records are passed through
    ``normalization`` with the same ``normalization_rule`` before being compared
    with ``fuzz.token_sort_ratio``.

    Args:
        information: Iterable of dict-like records. Each record's ``"unit"``
            value is looked up; the records are modified in place.
        json_path: Path to the unit JSON file (read as UTF-8).
        normalization_rule: Forwarded unchanged to ``normalization``.
        score_threshold: Minimum score (inclusive) for a match to be accepted.
            Defaults to 80, the value that was previously hard-coded.

    Returns:
        The same ``information`` object. ``"unit_id"`` is the mapped id when a
        match is accepted; otherwise it falls back to the record's raw ``unit``
        value (``None`` if the record has no unit).
    """
    # Load from unit JSON file
    with open(json_path, "r", encoding="utf-8") as f:
        unit_id_dict = json.load(f)

    # Create cached normalized dictionary
    normalized_cached_map = {
        normalization(text=name, normalization_rule=normalization_rule): unit_id
        for name, unit_id in unit_id_dict.items()
    }
    unit_list = list(normalized_cached_map)

    for item in information:
        raw_unit = item.get("unit")
        match = None
        # extractOne returns None for an empty choice list, and a missing unit
        # cannot be normalized; both cases take the fallback path below.
        if raw_unit is not None and unit_list:
            match = process.extractOne(
                normalization(text=raw_unit, normalization_rule=normalization_rule),
                unit_list,
                scorer=fuzz.token_sort_ratio,
            )
        if match is not None and match[1] >= score_threshold:
            item["unit_id"] = normalized_cached_map[match[0]]
        else:
            # Unlike the other mappers, an unmatched unit keeps its raw text
            # rather than becoming None.
            item["unit_id"] = raw_unit
    return information
# Mapping employee
def mapping_employee(information, json_path, normalization_rule):
    """Placeholder employee mapper: loads the employee JSON and returns the records.

    No employee matching is implemented yet. The function still opens and
    parses ``json_path`` (so a missing or malformed file raises as before) and
    then hands ``information`` back unchanged, so it can be chained like the
    other ``mapping_*`` functions instead of yielding ``None``.

    Args:
        information: Records to pass through; not modified.
        json_path: Path to the employee JSON file (read as UTF-8).
        normalization_rule: Currently unused.

    Returns:
        The same ``information`` object that was passed in.
    """
    # Load from employee JSON file
    with open(json_path, "r", encoding="utf-8") as f:
        employee_id_dict = json.load(f)  # noqa: F841 - not used until matching exists

    # TODO: implement employee matching; which record field to match on is not
    # defined anywhere in this module.
    return information
# Mapping product
def mapping_product(information, json_path, normalization_rule, score_threshold=80):
    """Attach ``product_id`` to each record by matching product name and unit together.

    The JSON file at ``json_path`` is read as a
    ``{product id: [product name, unit id]}`` object. Both parts of each
    catalogue entry, and the record's ``"product_name"`` and ``"unit_id"``,
    are passed through ``normalization`` with the same ``normalization_rule``.

    Each catalogue entry is scored as a whole: the mean of the
    ``fuzz.token_sort_ratio`` for the name and for the unit. The best-scoring
    entry wins. Scoring name and unit independently and then recombining them
    can produce a (name, unit) pair that is not in the catalogue, which made
    the final lookup raise ``KeyError``.

    Args:
        information: Iterable of dict-like records; modified in place.
        json_path: Path to the product JSON file (read as UTF-8).
        normalization_rule: Forwarded unchanged to ``normalization``.
        score_threshold: Minimum mean score (inclusive) for a match to be
            accepted. Defaults to 80, the value that was previously hard-coded.

    Returns:
        The same ``information`` object, with ``"product_id"`` set on every
        record (``None`` when no catalogue entry reaches the threshold, the
        catalogue is empty, or the record lacks a product name or unit id).
    """
    # Load from product JSON file
    with open(json_path, "r", encoding="utf-8") as f:
        product_id_dict = json.load(f)

    # Create cached normalized dictionary keyed by (name, unit)
    normalized_cached_map = {
        (
            normalization(text=product_name, normalization_rule=normalization_rule),
            normalization(text=unit_id, normalization_rule=normalization_rule),
        ): product_id
        for product_id, (product_name, unit_id) in product_id_dict.items()
    }

    for item in information:
        item["product_id"] = None

        raw_name = item.get("product_name")
        raw_unit = item.get("unit_id")
        if raw_name is None or raw_unit is None:
            continue

        # Normalize product_name and unit_id
        query_name = normalization(text=raw_name, normalization_rule=normalization_rule)
        query_unit = normalization(text=raw_unit, normalization_rule=normalization_rule)

        # Score every catalogue entry jointly so the winning key always exists.
        # Strict ">" keeps the first entry on ties.
        best_key = None
        best_score = -1.0
        for candidate in normalized_cached_map:
            candidate_name, candidate_unit = candidate
            name_score = fuzz.token_sort_ratio(query_name, candidate_name)
            unit_score = fuzz.token_sort_ratio(query_unit, candidate_unit)
            average_score = (name_score + unit_score) / 2
            if average_score > best_score:
                best_key = candidate
                best_score = average_score

        if best_key is not None and best_score >= score_threshold:
            item["product_id"] = normalized_cached_map[best_key]
    return information