Spaces:

Gainward777
/

Product_Matching

Sleeping

File size: 11,203 Bytes

import json
from tqdm import tqdm
from preprocess.utils.items.attrs import *
from preprocess.utils.common.extracters import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.parallel_brand_matching import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.top_inserts import *
import pandas as pd



class Preprocessor():
    """Normalise the ``items`` and ``products`` catalogues before matching.

    The class stores the vocabularies and merge dictionaries passed to the
    constructor and exposes :meth:`process`, which runs the whole pipeline
    (attribute extraction, brand alignment, category merging and name
    trimming) by delegating to the helpers imported from
    ``preprocess.utils``.
    """

    # Column layout shared by the frames built in process_items / process_products.
    _COLUMNS = ('id', 'brand', 'name', 'fullname', 'type', 'type_wine',
                'volume', 'year', 'alco')

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict,
                 country_list, normalized_names_dict):
        """Store the vocabularies used by the pipeline.

        Args:
            long_types_list: list handed to ``get_type`` when detecting the
                drink type of an item.
            short_types_list: list handed to ``process_unbrended_names``.
            sour_list: list used as ``types`` for ``check_color_and_sour``
                and as the word list when looking for the "sour" token.
            type_wine: passed as ``colors`` to ``merge_wine_type``.
            gbs: word list searched (and stripped) first in ``prcess_text``.
            colors_for_trim: word list of colours stripped from names.
            grapes: list handed to ``process_unbrended_names``.
            other_words: list handed to ``process_unbrended_names`` and
                included in the name-trimming vocabulary.
            sour_merge_dict: mapping applied to the ``sour`` column.
            type_merge_dict: mapping applied to the ``type`` column.
            color_merge_dict: mapping handed to ``merge_wine_type``.
            country_list: list included in the name-trimming vocabulary.
            normalized_names_dict: stored on the instance; not read by any
                method visible in this class.
        """
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words

        # Concatenation builds a new list, so the caller's lists stay intact.
        self.types_n_others = long_types_list + other_words + sour_list + country_list
        # "Шерри" is excluded from the trimming vocabulary; guard the removal
        # so vocabularies that do not contain it no longer raise ValueError.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")

        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list
        self.normalized_names_dict = normalized_names_dict

    @staticmethod
    def _empty_result():
        """Return a fresh ``{column: []}`` accumulator for a result frame."""
        return {column: [] for column in Preprocessor._COLUMNS}

    def preprocess_name(self, name):
        """Return ``name`` with every newline replaced by a single space."""
        return name.replace("\n", " ")

    def _detect_drink_type(self, attrs):
        """Return the first non-None drink type found by the fallback chain.

        The detectors are tried lazily in a fixed order; later ones are only
        called when all earlier ones returned ``None``.
        """
        detectors = (
            lambda: get_type(attrs, self.long_types_list),
            lambda: check_spark(attrs),
            lambda: check_color_and_sour(attrs),
            lambda: check_spark(attrs, col_name='type_wine'),
            lambda: check_color_and_sour(attrs, types=self.sour),
            lambda: check_color_and_sour(attrs, col_name='name'),
        )
        for detect in detectors:
            drink_type = detect()
            if drink_type is not None:
                return drink_type
        return None

    def process_items(self, df):
        """Build a normalised frame from the raw items table.

        Args:
            df: frame with an ``id`` column and an ``attrs`` column holding
                JSON-encoded attribute dictionaries.

        Returns:
            ``pd.DataFrame`` with the columns listed in ``_COLUMNS``. Rows
            whose attributes cannot be processed are reported via ``print``
            and skipped as a whole.
        """
        result = self._empty_result()
        for item_id, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values),
                                       total=len(df)):
            try:
                attrs = json.loads(raw_attrs)
                raw_name = attrs['name']
                name = self.preprocess_name(raw_name)
                row = {
                    'id': item_id,
                    'brand': attrs.get('brand'),
                    'name': name,
                    'fullname': name,
                    'type': self._detect_drink_type(attrs),
                    # Explicit attributes win; otherwise fall back to parsing
                    # the raw (un-preprocessed) name.
                    'volume': (attrs['volume'] if 'volume' in attrs
                               else extract_volume_or_number(raw_name)),
                    'year': (attrs['year'] if 'year' in attrs
                             else extract_production_year(raw_name)),
                    'type_wine': attrs.get('type_wine'),
                    'alco': extract_alcohol_content(raw_name),
                }
            except Exception as ex:
                print(item_id, ex)
                continue
            # Append only complete rows so every column keeps the same length
            # (a partially appended row would break pd.DataFrame below).
            for column, value in row.items():
                result[column].append(value)
        return pd.DataFrame(result)

    def process_products(self, products):
        """Build a normalised frame from the raw products table.

        Args:
            products: frame providing the ``id``, ``brand``, ``category``,
                ``product_type``, ``name_long``, ``name`` and
                ``name_postfix`` columns.

        Returns:
            ``pd.DataFrame`` with the columns listed in ``_COLUMNS``. Rows
            that raise while being processed are reported via ``print`` and
            skipped as a whole.
        """
        result = self._empty_result()
        for _, row in tqdm(products.iterrows(), total=len(products)):
            try:
                record = {
                    'id': row['id'],
                    'brand': row['brand'],
                    'type_wine': row['category'],
                    'type': row['product_type'],
                    'name': row['name_long'],
                    'fullname': row['name_long'],
                    'volume': extract_volume_or_number(row['name']),
                    # The year is parsed from the postfix, not from the name.
                    'year': extract_production_year(str(row['name_postfix'])),
                    'alco': extract_alcohol_content(row['name']),
                }
            except Exception as ex:
                print(ex)
                continue
            # Append only complete rows so every column keeps the same length.
            for column, value in record.items():
                result[column].append(value)
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip recognised attribute tokens from a product name.

        Each recognised token (the ``gbs`` word, alcohol content, volume,
        ageing years, production year, colour, "sour" word) is removed from
        ``text`` in that order, so later extractors see the already-trimmed
        string.

        Args:
            text: the name to trim.

        Returns:
            Tuple ``(trimmed_text, alcohol, volume_or_number, years,
            production_year, gb, color, sour)`` where ``trimmed_text`` has
            been passed through ``remove_quotes`` and the remaining entries
            are the raw extractor results (``None`` when nothing was found).
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')

        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            # Remove both the dot and the comma decimal spelling. str() keeps
            # this consistent with the volume handling below.
            alcohol_str = str(alcohol)
            alco_w_comma = alcohol_str.replace('.', ',')
            text = text.replace(alcohol_str, '').replace(alco_w_comma, '')

        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_str = str(volume_or_number)
            volume_with_comma = volume_str.replace('.', ',')
            text = text.replace(volume_str, '').replace(volume_with_comma, '')
            # Drop the litre marker left behind once the number is removed.
            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
            # NOTE(review): the result is bound to `test`, not `text`, so it is
            # discarded. This looks like a typo; behaviour is kept as-is until
            # confirmed, because fixing it would change the trimmed names.
            test = clean_wine_name(text)

        years = extract_years(text)
        if years is not None:
            text = text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')

        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')

        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')

        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')

        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour

    def process(self, products, items):
        """Run the full preprocessing pipeline on both catalogues.

        Args:
            products: raw products frame (see :meth:`process_products`).
            items: raw items frame (see :meth:`process_items`).

        Returns:
            Tuple ``(items, products)`` of the processed frames. The inputs
            are copied before processing.
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())
        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())

        # Brands are compared as stripped, lower-cased strings; note that a
        # missing brand therefore becomes the literal string 'none'.
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        # Presumably adds the `new_brand` column to `items` in place; the
        # following steps rely on that column existing.
        fill_brands_in_dataframe(products['brand'].unique(), items)

        print('-----*-----Brand matching-----*-----')
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        # Item brands with no exact counterpart among the product brands.
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type'] = items['type'].replace(self.type_dict)

        # The unwrap step is applied twice; the second pass works on the
        # product brands already rewritten by the first one.
        print('-----*-----Unwrap brand cats step 1-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Unwrap brand cats step 2-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Finding brands in names-----*-----')
        # Turn the stringified missing brand back into a real missing value.
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items[items['new_brand'].isna()]['name'].values
        p_brands = [i for i in products['brand'].unique() if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list,
                                self.grapes, self.other_words)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products, type_merge_dict=self.type_dict)

        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour = name_trimmer(items, self.prcess_text, self.types_n_others)
        # Only rows whose id has a trimmed name are overwritten.
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb'] = gb
        items['sour'] = sour
        items['sour'] = items['sour'].replace(self.sour_dict)

        products_trimed_names, gb, sour = name_trimmer(products, self.prcess_text, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb'] = gb
        products['sour'] = sour
        products['sour'] = products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)

        return items, products