File size: 8,461 Bytes
3cc4e3f
d3ca850
5b19d8a
606ca5f
 
d4bade4
 
3cc4e3f
 
 
2956b24
95c9287
 
 
c5b2790
3cc4e3f
2956b24
95c9287
 
 
c5b2790
3cc4e3f
 
d3ca850
606ca5f
 
 
 
3cc4e3f
 
cb92a0f
 
 
 
 
3cc4e3f
606ca5f
 
3cc4e3f
 
606ca5f
3cc4e3f
d3ca850
606ca5f
 
 
 
 
 
c5b2790
606ca5f
 
 
 
 
 
 
5b19d8a
 
 
 
 
606ca5f
5b19d8a
 
606ca5f
 
 
c5b2790
606ca5f
 
 
c5b2790
606ca5f
2956b24
 
5b19d8a
606ca5f
2956b24
 
d3ca850
 
 
 
 
 
 
5b19d8a
d3ca850
 
 
c5b2790
 
 
d3ca850
606ca5f
 
d3ca850
606ca5f
c5b2790
d4bade4
eedd5dc
d4bade4
 
 
 
 
 
 
 
 
 
 
eedd5dc
d4bade4
 
 
 
 
 
 
 
 
 
 
 
 
eedd5dc
 
 
 
d4bade4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eedd5dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from preprocess.preprocess import Preprocessor
from processor.matching import new_find_matches_with_ids, prepare_groups_with_ids_ex
from preprocess.utils.common.utils import get_delimiter, verify_csv
import os.path
from preprocess.utils.products.products import *
from rapidfuzz import fuzz, process
import pandas as pd


class Processor:
    """Combine a ``Preprocessor`` with the item-to-product matching step.

    The instance owns a single ``Preprocessor`` (``self.preprocessor``) and
    delegates all data preparation to it; matching itself is performed by
    ``new_find_matches_with_ids``.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 #sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Build the underlying ``Preprocessor``.

        Every argument is forwarded positionally, in the same order, to the
        ``Preprocessor`` constructor; this class does not inspect them.
        """
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list,
            type_wine, gbs, grapes, other_words,
            #sour_merge_dict,
            type_merge_dict, color_merge_dict,
            country_list)

    # NOTE: disabled legacy API, kept verbatim as an inert string literal.
    '''def process(self, prods_data, items, is_items_first=False, threshold=65, include_alternatives=True):
        items, products=self.preprocessor.process(prods_data, items)
        return self.match(items, products, is_items_first, threshold, include_alternatives)

    def match(self, items, products, is_items_first=False, threshold=65, include_alternatives=True):
        print('-----*-----Matching-----*-----')
        if is_items_first:

            #products['new_brand']=products['brand']
            products['new_brand'] = products['brand_sndex_7']
            #items['brand']=items['new_brand']
            items['brand'] = items['brand_sndex_7']
            products_groups = prepare_groups_with_ids(products)

            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=threshold, include_alternatives=include_alternatives)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)

        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products'''

    def process_products_full(self, products_data):
        """Delegate to ``Preprocessor.process_products_full``.

        The delegate's return value is discarded; this method returns None.
        """
        self.preprocessor.process_products_full(products_data)

    def process_new(self, items_file, is_items_first=False, threshold=65,
                    order_invariant_names_matching=False, thread_count=8):
        """Preprocess an items CSV and match it against the latest products.

        Args:
            items_file: Path of the items CSV. Its delimiter is detected with
                ``get_delimiter`` and the file must contain an ``attrs``
                column. Malformed lines are skipped while reading.
            is_items_first: Must be truthy. The products-first direction is
                not implemented (its legacy code is disabled), so a falsy
                value is rejected up front.
            threshold: Forwarded to ``new_find_matches_with_ids`` as
                ``name_threshold``.
            order_invariant_names_matching: Forwarded unchanged to
                ``new_find_matches_with_ids``.
            thread_count: Forwarded unchanged to
                ``new_find_matches_with_ids``.

        Returns:
            A 3-tuple ``(matches, items, products)`` where ``matches`` is the
            matcher's result with the ``type``, ``type_wine``, ``alco`` and
            ``gb`` columns dropped, and ``items`` / ``products`` are what
            ``Preprocessor.process_new`` returned.

        Raises:
            NotImplementedError: If ``is_items_first`` is falsy.
            Exception: If no current products data is available, if
                ``items_file`` is empty, or if the CSV has no ``attrs``
                column.

        Side effects:
            Prints progress lines and saves the preprocessed items to
            ``_items.pkl`` inside ``prods_data["dir"]``.
        """
        # Previously this case ran the whole (expensive) preprocessing and
        # then crashed with UnboundLocalError on the return statement because
        # no branch assigned the result. Reject it immediately instead.
        #
        # Disabled legacy branch, kept for reference:
        # else:
        #     items_groups = prepare_groups_with_ids(items)
        #     items_alt_groups = prepare_groups_by_alternative_keys(items)
        #     res=new_find_matches_with_ids(products, items_groups, None, items_alt_groups, items, name_threshold=threshold, include_alternatives=include_alternatives)
        if not is_items_first:
            raise NotImplementedError(
                "Matching is only implemented for is_items_first=True")

        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")

        if not items_file:
            raise Exception("Items CSV not specified")

        #bad_lines = verify_csv(items_file, items_file + ".fixed")
        #if bad_lines:
        #    items_file = items_file + ".fixed"
            #raise Exception("Uploaded Items CSV contains bad lines:\n" + "\n".join(bad_lines))

        items_delimiter = get_delimiter(items_file)
        print('items delimiter: "' + items_delimiter + '"')
        row_items = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if 'attrs' not in row_items.columns:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items, products = self.preprocessor.process_new(prods_data, row_items)

        print('-----*-----Matching-----*-----')

        # Persist the preprocessed items next to the products data.
        # NOTE(review): the meaning of the third argument (True) is defined
        # by save_df_to_file, which is not visible here - confirm.
        fullpath = os.path.join(prods_data["dir"], "_items.pkl")
        save_df_to_file(items, fullpath, True)

        # The product groupings are read from the products data as-is and
        # handed to the matcher.
        res = new_find_matches_with_ids(
            items, products,
            name_threshold=threshold,
            products_groups_brand_type_vol=prods_data["groups_brand_type_vol"],
            products_groups_brand_typel1_vol=prods_data["groups_brand_typel1_vol"],
            products_groups_brand_typel0_vol=prods_data["groups_brand_typel0_vol"],
            products_groups_typewine_type_vol=prods_data["groups_typewine_type_vol"],
            order_invariant_names_matching=order_invariant_names_matching,
            thread_count=thread_count)

        return res.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products

    # NOTE: disabled legacy verification helpers, kept verbatim as an inert
    # string literal.
    '''def score_correct_items_to_products(self, manual_matchings):
        result = []

        for mm in manual_matchings:
            item = mm[0]
            product = mm[1]

            item_to_compare = item['name']
            if 'brand' in item.keys() and item['brand'] and item['brand'] not in item['name']:
                item_to_compare = item['brand'] + " " + item['name']

            product_to_compare = product['name_with_brand']
            product2_to_compare = product['name_2']

            item_with_name = item['orig_name']
            if 'orig_brand' in item.keys() and item['orig_brand'] and item['orig_brand'] not in item['orig_name']:
                item_with_name = item['orig_brand'] + " - " + item['orig_name']


            product_brand = product['orig_brand'].values[0]
            product_with_brand = product['orig_name'].values[0]
            if product_brand and isinstance(product_brand, str) and product_brand not in product_with_brand:
                product_with_brand = product_brand + " - " + product_with_brand


            match, score, _ = process.extractOne(item_to_compare, product_to_compare)
            match2, score2, _ = process.extractOne(item_to_compare, product2_to_compare)
            if score2 > score:
                score = score2

            result.append({"item_id":item['id'], "product_id":product['id'].values[0], 'score':score,
                           "item_orig":item_with_name, "product_orig":product_with_brand,
                           "item": item_to_compare, "product":product_to_compare.values[0]
                           })

        return result

    def verify_correct_matching(self, correct_file, items_file, thread_count = 8):
        prods_data = get_latest_products()
        if not prods_data or not os.path.isfile(prods_data["path"]):
            raise Exception("Actual products data not found")

        products_df = prods_data["df_products"]


        if not correct_file:
            raise Exception("Correct CSV not specified")

        if not items_file:
            raise Exception("Items CSV not specified")

        csv_delimiter = get_delimiter(correct_file)
        manual_df = pd.read_csv(correct_file, sep=csv_delimiter)

        items_delimiter = get_delimiter(items_file)
        items_df = pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
        if not 'attrs' in items_df.columns.values:
            raise Exception("Uploaded Items CSV does not seem to be valid")

        items = self.preprocessor.process_items(items_df.copy())

        manual_matchings = []
        count = len(items)
        for index, row in items.iterrows():
            print("Processing row #" + str(index) + "/" + str(count) + "\n")
            manual = manual_df[manual_df['item_id'] == row["id"]]['state']
            if (len(manual) > 0) and (manual.values[0] == 1):
                p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]

                if len(p.values) > 0:
                    if isinstance(row, float):
                        row = row
                    manual_matchings.append([row, p, -1])
                else:
                    print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")

        return self.score_correct_items_to_products(manual_matchings)'''