Spaces:

j-s-v
/

WineMatching

Build error

File size: 8,461 Bytes

3cc4e3f
d3ca850
5b19d8a
606ca5f
 
d4bade4
 
3cc4e3f
 
 
2956b24
95c9287
 
 
c5b2790
3cc4e3f
2956b24
95c9287
 
 
c5b2790
3cc4e3f
 
d3ca850
606ca5f
 
 
 
3cc4e3f
 
cb92a0f
 
 
 
 
3cc4e3f
606ca5f
 
3cc4e3f
 
606ca5f
3cc4e3f
d3ca850
606ca5f
 
 
 
 
 
c5b2790
606ca5f
 
 
 
 
 
 
5b19d8a
 
 
 
 
606ca5f
5b19d8a
 
606ca5f
 
 
c5b2790
606ca5f
 
 
c5b2790
606ca5f
2956b24
 
5b19d8a
606ca5f
2956b24
 
d3ca850
 
 
 
 
 
 
5b19d8a
d3ca850
 
 
c5b2790
 
 
d3ca850
606ca5f
 
d3ca850
606ca5f
c5b2790
d4bade4
eedd5dc
d4bade4
 
 
 
 
 
 
 
 
 
 
eedd5dc
d4bade4
 
 
 
 
 
 
 
 
 
 
 
 
eedd5dc
 
 
 
d4bade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eedd5dc

from preprocess.preprocess import Preprocessor
from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex
from preprocess.utils.common.utils import get_delimiter, verify_csv
import os.path
from preprocess.utils.products.products import *
from rapidfuzz import fuzz, process
import pandas as pd


class Processor():
    """Match uploaded items against the latest prepared products data.

    Wraps a ``Preprocessor`` (built from the vocabularies given to
    ``__init__``) and feeds its output to ``new_find_matches_with_ids``.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 #sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Create the underlying ``Preprocessor``.

        Every argument is forwarded, unchanged and in the same order, to
        ``Preprocessor``; this class does not inspect them itself.
        """
        self.preprocessor = Preprocessor(long_types_list, short_types_list, sour_list,
                                         type_wine, gbs, grapes, other_words,
                                         #sour_merge_dict,
                                         type_merge_dict, color_merge_dict,
                                         country_list)

    # NOTE: disabled legacy implementation, kept as an inert string literal.
    '''def process(self, prods_data, items, is_items_first=False, threshold=65, include_alternatives=True):
        items, products=self.preprocessor.process(prods_data, items)
        return self.match(items, products, is_items_first, threshold, include_alternatives)

    def match(self, items, products, is_items_first=False, threshold=65, include_alternatives=True):
        print('-----*-----Matching-----*-----')
        if is_items_first:

            #products['new_brand']=products['brand']
            products['new_brand'] = products['brand_sndex_7']
            #items['brand']=items['new_brand']
            items['brand'] = items['brand_sndex_7']
            products_groups = prepare_groups_with_ids(products)

            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=threshold, include_alternatives=include_alternatives)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)

        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products'''

    def process_products_full(self, products_data):
        """Delegate full products preprocessing to the preprocessor.

        The preprocessor's return value is discarded; this method returns
        ``None``.
        """
        self.preprocessor.process_products_full(products_data)

    def process_new(self, items_file, is_items_first=False, threshold=65,
                    order_invariant_names_matching=False, thread_count=8):
        """Preprocess an items CSV and match it against the latest products.

        Args:
            items_file: Path of the items CSV. Its delimiter is detected with
                ``get_delimiter`` and it must contain an ``attrs`` column.
            is_items_first: Matching direction. Only ``True`` is implemented.
            threshold: Passed to the matcher as ``name_threshold``.
            order_invariant_names_matching: Passed through to the matcher.
            thread_count: Passed through to the matcher.

        Returns:
            A tuple ``(matches, items, products)`` where ``matches`` is the
            matcher result without the ``type``, ``type_wine``, ``alco`` and
            ``gb`` columns, and ``items`` / ``products`` are the preprocessed
            frames returned by the preprocessor.

        Raises:
            NotImplementedError: If ``is_items_first`` is falsy.
            Exception: If the products data is missing, ``items_file`` is
                empty, or the CSV has no ``attrs`` column.
        """
        # Only the items-first direction produces a result. Previously any
        # other call ran the whole preprocessing and then crashed with an
        # UnboundLocalError on the return statement, so reject it up front.
        #
        # Legacy products-first branch (disabled):
        #     items_groups = prepare_groups_with_ids(items)
        #     items_alt_groups = prepare_groups_by_alternative_keys(items)
        #     res=new_find_matches_with_ids(products, items_groups, None, items_alt_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
        if not is_items_first:
            raise NotImplementedError(
                "Matching is only implemented for is_items_first=True")

        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")

        if not items_file:
            raise Exception("Items CSV not specified")

        #bad_lines = verify_csv(items_file, items_file + ".fixed")
        #if bad_lines:
        #    items_file = items_file + ".fixed"
            #raise Exception("Uploaded Items CSV contains bad lines:\n" + "\n".join(bad_lines))

        items_delimiter = get_delimiter(items_file)
        print('items delimiter: "' + items_delimiter + '"')
        # Malformed rows are skipped rather than aborting the whole upload.
        row_items = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if 'attrs' not in row_items.columns:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items, products = self.preprocessor.process_new(prods_data, row_items)

        print('-----*-----Matching-----*-----')

        # Persist the preprocessed items next to the products data.
        # NOTE(review): meaning of the third positional argument (True) is
        # defined by save_df_to_file -- confirm there.
        fullpath = os.path.join(prods_data["dir"], "_items.pkl")
        save_df_to_file(items, fullpath, True)

        # The product groupings come precomputed with the products data.
        res = new_find_matches_with_ids(
            items, products,
            name_threshold=threshold,
            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
            order_invariant_names_matching=order_invariant_names_matching,
            thread_count=thread_count)

        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products

    # NOTE: disabled legacy implementation, kept as an inert string literal.
    '''def score_correct_items_to_products(self, manual_matchings):
        result = []

        for mm in manual_matchings:
            item = mm[0]
            product = mm[1]

            item_to_compare = item['name']
            if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']:
                item_to_compare = item['brand'] + " " + item['name']

            product_to_compare = product['name_with_brand']
            product2_to_compare = product['name_2']

            item_with_name = item['orig_name']
            if 'orig_brand' in item.keys() and item['orig_brand'] and item['orig_brand'] not in item['orig_name']:
                item_with_name = item['orig_brand'] + " - " + item['orig_name']


            product_brand = product['orig_brand'].values[0]
            product_with_brand = product['orig_name'].values[0]
            if product_brand and isinstance(product_brand, str) and product_brand not in product_with_brand:
                product_with_brand = product_brand + " - " + product_with_brand


            match, score, _ = process.extractOne(item_to_compare, product_to_compare)
            match2, score2, _ = process.extractOne(item_to_compare, product2_to_compare)
            if score2 > score:
                score = score2

            result.append({"item_id":item['id'], "product_id":product['id'].values[0], 'score':score,
                           "item_orig":item_with_name, "product_orig":product_with_brand,
                           "item": item_to_compare, "product":product_to_compare.values[0]
                           })

        return result

    def verify_correct_matching(self, correct_file, items_file, thread_count = 8):
        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")

        products_df = prods_data["df_products"]


        if not correct_file:
            raise Exception("Correct CSV not specified")

        if not items_file:
            raise Exception("Items CSV not specified")

        csv_delimiter = get_delimiter(correct_file)
        manual_df = pd.read_csv(correct_file, sep=csv_delimiter)

        items_delimiter = get_delimiter(items_file)
        items_df = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if not 'attrs' in items_df.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items = self.preprocessor.process_items(items_df.copy())

        manual_matchings = []
        count = len(items)
        for index, row in items.iterrows():
            print("Processing row #" + str(index) + "/" + str(count) + "\n")
            manual = manual_df[manual_df['item_id'] == row["id"]]['state']
            if (len(manual) > 0) and (manual.values[0] == 1):
                p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]

                if len(p.values) > 0:
                    if isinstance(row, float):
                        row = row
                    manual_matchings.append([row, p, -1])
                else:
                    print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")

        return self.score_correct_items_to_products(manual_matchings)'''