Spaces:

j-s-v
/

WineMatching

Build error

File size: 31,173 Bytes

import os.path
from preprocess.utils.common.utils import normalize_name
from math import isnan

from preprocess.utils.items.attrs import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.top_inserts import *
from preprocess.utils.products.products import *
import pandas as pd
from processor.matching import prepare_groups_with_ids_ex

class Preprocessor():

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 #sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Store the dictionaries and word lists used during preprocessing.

        Args:
            long_types_list: Full drink type names; stored lower-cased.
            short_types_list: Short drink type names, stored as given.
            sour_list: Words stored as ``self.sour``.
            type_wine: Values stored as ``self.type_wine`` (passed as
                ``colors`` to ``merge_wine_type`` elsewhere in the class).
            gbs: Words stored as ``self.gbs`` (looked up in ``prcess_text``).
            grapes: Values stored as ``self.grapes``.
            other_words: Additional words included in the trim list.
            type_merge_dict: Mapping used to replace/merge drink types.
            color_merge_dict: Mapping passed to ``merge_wine_type``.
            country_list: Country names included in the trim list.

        Side effects:
            Adds a ``normalize_name(key)`` alias for every key of the shared
            ``TYPES_FROM_BRAND_DICT``.
        """
        self.long_types_list = [element.lower() for element in long_types_list]
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.grapes = grapes
        self.other_words = other_words

        # Words passed to trim_name(); built from the original-case lists.
        self.types_n_others = long_types_list + other_words + sour_list + country_list
        # "Шерри" must not be in the trim list. Guard the removal so a
        # configuration without this entry does not raise ValueError.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")

        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list

        # Update the shared dict IN PLACE. Rebinding the name through
        # ``global`` would only change this module's reference, so helper
        # modules holding the same dict object would never see the aliases.
        normalized = {}
        for key, value in TYPES_FROM_BRAND_DICT.items():
            normalized[key] = value
            normalized[normalize_name(key)] = value
        TYPES_FROM_BRAND_DICT.update(normalized)



    def write_log(self, logfn, s):
        print(s + "\n")
        with open(logfn, 'a') as logf:
            logf.write(datetime.now().strftime('[%Y-%m-%d %H:%M:%S]: ') + s + "\n")



    def process_products(self, products):
        """Convert the raw product catalogue into a flat attribute table.

        For every row the brand, name and transliterated name are cleaned,
        the drink type is detected, and colour, sweetness, volume, year,
        gift-box and alcohol attributes are extracted from the row fields
        and from the name itself.

        A row is added to the output only after it has been processed
        completely. Previously values were appended column by column, so an
        exception in the middle of a row left the column lists with
        different lengths and the final ``pd.DataFrame`` construction failed
        for the whole catalogue.

        Args:
            products: DataFrame with at least the columns ``id``, ``brand``,
                ``name``, ``name_translit``, ``product_type``, ``category``,
                ``type_prefix``, ``volume`` and ``name_postfix``.

        Returns:
            pandas.DataFrame with one row per successfully processed product.
            ``index`` is a running counter over the accepted rows.
        """
        columns = ['index', 'id', 'orig_brand', 'brand', 'brand_unwrap',
                   'orig_name', 'name', 'name_wo_brand', 'name_with_brand',
                   'orig_name_2', 'name_2',
                   'orig_type', 'type', 'type_l1', 'type_l0',
                   'orig_type_wine', 'type_wine', 'sour',
                   'volume', 'gb', 'year', 'alco', 'other']
        result = {column: [] for column in columns}

        index = 0
        for idx, row in tqdm(products.iterrows()):
            try:
                # A missing product type is read by pandas as NaN (a float).
                if isinstance(row['product_type'], (int, float)) and isnan(row['product_type']):
                    print("Product type is not specified or incorrect for product id=[" + str(row['id']) + "]. Product is ignored")
                    continue

                record = {
                    'index': index,
                    'id': row['id'],
                    'orig_brand': row['brand'],
                    'orig_name': row['name'],
                    'orig_name_2': row['name_translit'],
                    'orig_type': row['product_type'],
                    'orig_type_wine': row['category'],
                }

                brand = preprocess_product_brand(row['brand'])
                name = preprocess_product_name(row['name'])
                name_translit = preprocess_product_name(row['name_translit'])

                # First of all let's check if it is sparkling wine
                drink_type, _ = extract_spark(row['product_type'], False)
                _, name = extract_spark(name, True)

                if not drink_type:
                    drink_type, _ = extract_type(row['product_type'], False)
                    _, name = extract_type(name, True)

                # Fall back to the raw product type, so drink_type is always
                # a string from here on.
                if not drink_type:
                    drink_type = row['product_type'].lower()

                type_wine = None
                sour_wine = ''
                if isinstance(row['type_prefix'], str) and row['type_prefix']:
                    type_wine, sour_wine, _ = extract_color_and_sour(row['type_prefix'], remove=False)

                volume = is_volume(row['volume'])
                year, _ = extract_production_year(str(row['name_postfix']))
                gb, _ = extract_gb(row['name_postfix'], False)
                alco, _ = extract_alcohol_content(name)

                name, _, volume_n, _, year_n, _, color_n, sour_wine_n, other_n = extract_attributes_from_name(name)
                name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')
                name = normalize_and_clean_name(name)

                name_translit, _, _, _, _, _, _, _, _ = extract_attributes_from_name(name_translit)
                name_translit = trim_name(name_translit, self.types_n_others).replace(',', ' ').replace('.', ' ')
                name_translit = normalize_and_clean_name(name_translit)

                # Values taken from dedicated fields win; values parsed from
                # the name are only used to fill the gaps.
                if not year:
                    year = year_n

                if not type_wine:
                    type_wine = color_n

                if not sour_wine:
                    sour_wine = sour_wine_n

                if not volume:
                    volume = volume_n
                elif volume_n and (volume_n != volume):
                    print("Product volume conflict detected for product id=[" + str(row['id']) + "]: " + str(volume) + " vs " + str(volume_n))

                if not type_wine:
                    type_wine = ''

                if not sour_wine:
                    sour_wine = ''

                record.update({
                    'brand': brand,
                    'brand_unwrap': '',
                    'name': name,
                    'name_2': name_translit,
                    'name_wo_brand': '',
                    'name_with_brand': '',
                    'type': drink_type.lower(),
                    'type_wine': type_wine.lower(),
                    'type_l1': '',
                    'type_l0': '',
                    'sour': sour_wine,
                    'volume': volume,
                    'year': year,
                    'gb': gb,
                    'alco': alco,
                    'other': other_n,
                })
            except Exception as ex:
                # Note: idx is the DataFrame index label of the row.
                print("Error processing product id=" + str(idx) + ": " + str(ex))
                continue

            # The row is complete: commit it to every column at once.
            for column in columns:
                result[column].append(record[column])
            index += 1

        return pd.DataFrame(result)


    def process_products_full(self, products_data):
        """Run the full products pipeline and store the results in products_data.

        Reads the catalogue CSV at ``products_data['path']``, preprocesses it
        with ``process_products``, derives brand/type helper columns, builds
        the grouping tables and stores everything back into ``products_data``.
        Progress and errors are written via ``write_log`` to
        ``update_log.txt`` inside ``products_data['dir']``.

        Args:
            products_data: Mapping with at least the keys ``dir`` and
                ``path``. It is updated in place with ``dict_types``,
                ``unwrap_brands_1``, ``unwrap_brands_2``, several
                ``groups_*`` entries and ``df_products``.

        Returns:
            The updated ``products_data``, or ``None`` if any exception was
            raised inside the pipeline (the error text is logged).
        """
        # Computed outside the try block: a missing 'dir' key propagates to the caller.
        logfn = os.path.join(products_data['dir'], "update_log.txt")
        try:
            self.write_log(logfn, "Products processing started")

            prods_file = products_data['path']
            products_delimiter = get_delimiter(prods_file)
            # row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')
            products = pd.read_csv(prods_file, sep=products_delimiter)

            # 1) Parse and normalize every catalogue row
            self.write_log(logfn, '------*-----Prepare products catalogue-----*-----')
            products = self.process_products(products.copy())

            # Distinct product types; read back as product_types in process_new
            products_data["dict_types"] = products['type'].unique().tolist()

            # 2)
            #products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

            # 3)
            #products_data["brand_3"] = products['brand'].unique()

            self.write_log(logfn, '------*-----Unwrapping brands-----*-----')
            products["brand_unwrap"] = products["brand"]
            # 4) Brand unwrapping is disabled: the map is an empty dict
            ##products_data["unwrap_brands_1"] = unwrap_brands(products)
            products_data["unwrap_brands_1"] = {}

            # 5) Replacing with the empty map above
            products["brand_unwrap"] = products["brand"].replace(products_data["unwrap_brands_1"])

            # 6)
            #products_data["unwrap_brand_2"] = unwrap_brands(products)

            # 7) Second unwrap pass is disabled as well
            ##products_data["unwrap_brands_2"] = unwrap_brands(products, products['brand_unwrap'].unique())
            products_data["unwrap_brands_2"] = {}

            # 8) Keep brand_unwrap only where it differs from brand, otherwise store ''
            products["brand_unwrap"] = products["brand_unwrap"].replace(products_data["unwrap_brands_2"])
            products["brand_unwrap"] = products.apply(lambda row: row["brand_unwrap"] if row["brand_unwrap"] != row["brand"] else '', axis=1)

            # 9) Both helpers receive the products frame; return values are ignored here.
            # NOTE(review): presumably they add the 'new_type_wine' / 'new_type' columns
            # used for grouping below - confirm against their implementation.
            self.write_log(logfn, '-----*-----Adding service categories-----*-----')
            merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
            merge_types(products, products, type_merge_dict=self.type_dict)

            # Now we can normalize and clean brands and names (only after trimming)
            products['brand'] = products['brand'].apply(normalize_and_clean_brand)
            products['norm_name'] = products['name']

            # 11) Map product types through the type merge dictionary
            self.write_log(logfn, '-----*-----Replacing product types-----*-----')
            products['type']=products['type'].replace(self.type_dict)

            products['new_brand']=products['brand']
            #products["name_with_brand"] = products["name"]

            # Two name variants per product: brand removed and brand inserted
            products["name_wo_brand"] = products.apply(lambda row: remove_brand_from_name(row['name'], row['brand']), axis=1)
            products["name_with_brand"] = products.apply(lambda row: insert_brand_in_name(row['name'], row['brand']), axis=1)
            #products["name_wo_brand_len"] = products['name_wo_brand'].apply(lambda x: len(x))



            # Grouping tables keyed by different column combinations
            #products_data["dict_groups_brand_type_vol_typewine"] = prepare_groups_with_ids_ex(products, ["new_brand", 'type', 'volume', 'new_type_wine'])
            products_data["groups_brand_type_vol"] = prepare_groups_with_ids_ex(products, ["new_brand", 'type', 'volume'], "name_wo_brand")

            # Change it from type_wine to type
            # type_l1 is derived from type, type_l0 from type_l1
            products['type_l1'] = products['type'].replace(TYPES_LEVEL_1_DICT)
            products['type_l0'] = products['type_l1'].replace(TYPES_LEVEL_0_DICT)

            products_data["groups_brand_typel1_vol"] = prepare_groups_with_ids_ex(products, ['new_brand', 'type_l1', 'volume'], "name_wo_brand")
            products_data["groups_brand_typel0_vol"] = prepare_groups_with_ids_ex(products, ['new_brand', 'type_l0', 'volume'], "name_wo_brand")

            products_data["groups_typewine_type_vol"] = prepare_groups_with_ids_ex(products, ['new_type_wine', 'new_type', 'volume'], "name_with_brand")

            products_data["groups_typel0"] = prepare_groups_with_ids_ex(products, ['type_l0'], "name_with_brand")

            #products_data["dict_groups_typel1_vol"] = prepare_groups_with_ids_ex(products, ['type_l1','volume'])
            #products_data["dict_groups_typel0_vol"] = prepare_groups_with_ids_ex(products, ['type_l0','volume'])
            #products_data["dict_groups_vol"] = prepare_groups_with_ids_ex(products, ['volume'])

            products_data["df_products"] = products
            save_products_data(products_data)

            remove_old_products(products_data)

            self.write_log(logfn, "Products processing finished")
        except Exception as ex:
            # Any failure is logged and signalled to the caller by returning None
            self.write_log(logfn, "An error occurred: " + str(ex))
            return None

        return products_data


    def preprocess_item_brand(self, brand):
        if not isinstance(brand, str):
            return str(brand), ''

        parts = brand.split('/', 2)
        if len(parts) > 1:
            return parts[0].strip(), parts[1].strip()

        return brand.strip(), ''



    def detect_language_simple_2(self, name, reverse=False):
        if reverse:
            name = name[::-1]

        ru_count = 0
        en_count = 0

        for ch in name:
            if (ord(ch) >= ord('А') and ord(ch) <= ord('Я')) or \
                (ord(ch) >= ord('а') and ord(ch) <= ord('я')):
                ru_count += 1
            elif (ord(ch) >= ord('A') and ord(ch) <= ord('Z')) or \
                (ord(ch) >= ord('a') and ord(ch) <= ord('z')):
                en_count += 1


        if ru_count < 2 and en_count < 2:
            return 'xx'

        if ru_count > en_count:
            return 'ru'

        return 'en'


    def check_alternative_name(self, name, check_len = True, simple_lang_check=True):
        startpos = 0
        while True:
            pos = name.find("/", startpos)
            if pos == -1:
                return name, ''

            parts = [name[:pos], name[pos+1:]]
            startpos = pos + 1

            if check_len:
                if float(min(len(parts[0]), len(parts[1]))) / max(len(parts[0]), len(parts[1])) < 0.5:
                    continue

                if len(parts[1]) < 3:
                    return name, ''

            lang1 = self.detect_language_simple_2(parts[0], True)
            lang2 = self.detect_language_simple_2(parts[1])
            if (lang1 == 'ru' and lang2=='en') or (lang1 == 'en' and lang2=='ru'):
                return parts[0], parts[1]

        return name, ''


    def merge_multiline_name(self, name_parts):
        """Regroup the lines of a multi-line name by language.

        The language of the first line (as reported by ``detect_language``)
        is the reference. Later lines in the same language are appended to
        the primary name; all other lines are appended to the alternative
        name.

        Args:
            name_parts: Non-empty sequence of name lines.

        Returns:
            Tuple ``(name, name_2)``. Each appended line is preceded by a
            single space, so a non-empty ``name_2`` starts with a space.
        """
        primary = name_parts[0]
        secondary = ""
        reference_lang = detect_language(primary)

        for line in name_parts[1:]:
            same_language = detect_language(line) == reference_lang
            if same_language:
                primary = primary + " " + line
            else:
                secondary = secondary + " " + line

        return primary, secondary


    def process_multiline_name(self, name, check_len = True, simple_lane_check=True):
        if not name:
            return name, ''

        pos = name.find(" ##### ")
        if pos >= 0:
            parts = name.split(" ##### ")
            # Special processing for complex multiline names like;
            # "Луи Мемори До\nВыдержка: от 30 до 50 лет\nLouis Memory Deau\nAgeing: from 30 to 50 years"
            if len(parts) > 2:
                return self.merge_multiline_name(parts)

            return parts[0], parts[1]

        return name, ''



    def process_items(self, df):
        """Parse item attribute JSON into a flat attribute table.

        Each entry of ``df['attrs']`` is expected to be a JSON object string
        with a ``name`` key and optional ``brand``, ``type``, ``type_wine``,
        ``category``, ``color``, ``sugar``, ``volume`` and ``year`` keys. The
        JSON text is lower-cased and line breaks are replaced by the
        `` ##### `` marker before parsing.

        A row is added to the output only after the item has been processed
        completely. Previously values were appended column by column, so an
        exception in the middle of an item left the column lists with
        different lengths and the final ``pd.DataFrame`` construction failed
        for the whole feed.

        Args:
            df: DataFrame with the columns ``id`` and ``attrs``.

        Returns:
            pandas.DataFrame with one row per successfully processed item.
            Items with empty or non-string ``attrs`` and items that raise
            during processing are skipped.
        """
        columns = ['id', 'orig_brand', 'brand', 'brand_short', 'brand_2', 'brand_2_short', 'alt_brands',
                   'orig_name', 'name', 'name_wo_brand', 'name_with_brand',
                   'name_2', 'name_2_wo_brand', 'name_2_with_brand',
                   'names_wo_alt_brands', 'names_with_alt_brands', 'names_2_wo_alt_brands', 'names_2_with_alt_brands',
                   'type', 'new_type', 'type_n',
                   'type_wine', 'new_type_wine', 'type_wine_n',
                   'sour', 'volume', 'gb', 'year', 'aging', 'alco']
        result = {column: [] for column in columns}

        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                if not isinstance(raw_attrs, str) or not raw_attrs:
                    continue

                attrs = json.loads(raw_attrs.lower().replace("\\n", " ##### ").replace("\n", " ##### "))

                if 'brand' in attrs:
                    orig_brand = attrs['brand']
                    brand, brand_2 = self.preprocess_item_brand(orig_brand)

                    brand = normalize_and_clean_brand(brand)
                    brand_2 = normalize_and_clean_brand(brand_2)
                else:
                    orig_brand = brand = brand_2 = None

                orig_name = name = attrs['name']

                # First of all remove from name specific brands that make collisions while name parsing and trimming
                name, specific_brand, specific_name = replace_specific_brand_and_name(name)
                if specific_brand:
                    if brand and (brand != specific_brand):
                        print("Conflict between brand and specific brand for item id=[" + str(idf) + "]")
                    else:
                        brand = specific_brand = normalize_and_clean_brand(specific_brand)
                        brand_2 = None

                if specific_name:
                    specific_name = normalize_and_clean_name(specific_name)

                # Some items contain several lines. The line-break marker is a
                # reliable separator, so those are split right away. Names
                # separated with a slash are handled later, once all
                # attributes have been removed from the name.
                name, name_2 = self.process_multiline_name(name)

                type_wine = None
                sour_wine = None
                volume = None
                year = None

                # First of all let's check if it is sparkling wine
                drink_type, name = extract_spark(name, False)

                if not drink_type and ('type_wine' in attrs):
                    drink_type, _ = extract_spark(attrs['type_wine'], False)

                # Next let's check any other known type
                if not drink_type and ('type' in attrs):
                    drink_type, _ = extract_type(attrs['type'], False)

                if not drink_type and ('type_wine' in attrs):
                    drink_type, _ = extract_type(attrs['type_wine'], False)

                if not drink_type and ('category' in attrs):
                    drink_type, _ = extract_type(attrs['category'], False)

                # Special case for some brands like 'jaegermeister' which is sometimes the only thing specified in name,
                # so we try to detect drink type using only brand / name if it is possible
                if not drink_type and brand:
                    drink_type = extract_type_by_brand_name(brand)

                if 'type_wine' in attrs:
                    type_wine, sour_wine, _ = extract_color_and_sour(attrs['type_wine'], remove=False)
                    if drink_type is None and (type_wine or sour_wine):
                        drink_type='вино'

                # Try to extract type_wine and sour from "color" attribute if exists
                if 'color' in attrs:
                    if not type_wine:
                        type_wine, _ = extract_color(attrs['color'])
                        if type_wine and drink_type is None:
                            drink_type='вино'

                    if not sour_wine:
                        sour_wine, _ = extract_sour(attrs['color'])
                        if sour_wine and drink_type is None:
                            drink_type='вино'

                # Try to extract sour from "sugar" attribute if exists.
                # NOTE(review): this tests `is None` while the "color" branch
                # tests falsiness - confirm whether '' should also fall
                # through to the "sugar" attribute.
                if 'sugar' in attrs:
                    if sour_wine is None:
                        sour_wine, _ = extract_sour(attrs['sugar'])
                        if sour_wine and drink_type is None:
                            drink_type='вино'

                if 'volume' in attrs:
                    volume = attrs['volume']

                if 'year' in attrs:
                    year = attrs['year']

                drink_type_n, name = extract_type(name, True)

                name, alcohol_n, volume_n, aging, year_n, gb, color_n, sour_wine_n, _ = extract_attributes_from_name(name)
                name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')

                # If alternative name is not specified, then it is time to check it
                # (after we removed all attributes that could break the logic, but before normalization in order to save language difference)
                if not name_2:
                    name, name_2 = self.check_alternative_name(name)

                name = normalize_and_clean_name(name)

                if name_2:
                    name_2, _, _, _, _, _, _, _, _ = extract_attributes_from_name(name_2)
                    name_2 = trim_name(name_2, self.types_n_others).replace(',', ' ').replace('.', ' ')
                    name_2 = normalize_and_clean_name(name_2)

                if specific_brand or specific_name:
                    name = restore_specific_brand_and_name(name, specific_brand, specific_name)

                # Explicit item attributes win; values parsed from the name
                # only fill the gaps.
                if not drink_type:
                    drink_type = drink_type_n

                # There is no explicit alcohol attribute, only the parsed one.
                alcohol = alcohol_n

                if not volume:
                    volume = volume_n

                if not year:
                    year = year_n

                if not type_wine:
                    type_wine = color_n

                if not sour_wine:
                    sour_wine = sour_wine_n

                record = {
                    'id': idf,
                    'orig_brand': orig_brand,
                    'brand': brand,
                    'brand_short': '',
                    'brand_2': brand_2,
                    'brand_2_short': '',
                    'alt_brands': [],
                    'orig_name': orig_name,
                    'name': name,
                    'name_wo_brand': '',
                    'name_with_brand': '',
                    'name_2': name_2,
                    'name_2_wo_brand': '',
                    'name_2_with_brand': '',
                    'names_wo_alt_brands': [],
                    'names_with_alt_brands': [],
                    'names_2_wo_alt_brands': [],
                    'names_2_with_alt_brands': [],
                    'type': drink_type,
                    'new_type': '',
                    'type_n': '',
                    'type_wine': type_wine,
                    'new_type_wine': '',
                    'type_wine_n': '',
                    'sour': sour_wine,
                    'volume': volume,
                    'gb': gb,
                    'year': year,
                    'aging': aging,
                    'alco': alcohol,
                }
            except Exception as ex:
                print("Error occurred while processing item id=" + str(idf), ex)
                continue

            # The item is complete: commit it to every column at once.
            for column in columns:
                result[column].append(record[column])

        return pd.DataFrame(result)


    def prcess_text(self, text):
        """Strip known attributes from a text and return them separately.

        The helpers are applied in a fixed order, each one receiving the text
        left over by the previous step: gift-box word, alcohol content, aging
        years, production year, volume (or number), then colour and
        sweetness.

        Args:
            text: Name text to clean.

        Returns:
            Tuple ``(text, alcohol, volume_or_number, years, production_year,
            gb, color, sour)`` where ``text`` is the remaining text.
        """
        gift_box = find_full_word(text, self.gbs)
        if gift_box is not None:
            text = text.replace(str(gift_box), ' ')

        alcohol, text = extract_alcohol_content(text, True)

        aging_years, text = extract_years(text, True)
        if aging_years is not None:
            # Once an aging value was found, drop the aging keywords too.
            for keyword in ('выдержка', 'aging', 'ageing'):
                text = text.replace(keyword, ' ')

        production_year, text = extract_production_year(text, True)

        volume_or_number, text = extract_volume_or_number(text, True)

        color, sour, text = extract_color_and_sour(text, True)

        return (text, alcohol, volume_or_number, aging_years,
                production_year, gift_box, color, sour)


    def process_new(self, products_data, items):
        """Preprocess a feed of items and align it with the products catalogue.

        Args:
            products_data: Mapping describing the products catalogue. When it
                has no ``df_products`` entry, ``process_products_full`` is run
                first to build it.
            items: DataFrame of raw items (see ``process_items``).

        Returns:
            Tuple ``(items, products)`` of the processed items DataFrame and
            the products DataFrame taken from ``products_data``.

        Raises:
            RuntimeError: If the products catalogue had to be built and
                ``process_products_full`` reported a failure.
        """
        if "df_products" not in products_data:
            products_data = self.process_products_full(products_data)
            # process_products_full() logs the error and returns None on
            # failure. Fail with a clear message here instead of an opaque
            # "'NoneType' object is not subscriptable" a few lines below.
            if products_data is None:
                raise RuntimeError("Products preprocessing failed; see update_log.txt in the products directory for details")

        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())

        products = products_data["df_products"]
        products_brands = products['brand'].unique()

        items['type'] = items['type'].replace(self.type_dict)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict, product_types=products_data["dict_types"])

        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products_brands, items)
        fill_brands_in_dataframe_2(products_brands, items)

        print('-----*-----Brand matching-----*-----')
        # NOTE(review): get_same_brands() is called twice with the same
        # arguments. Kept as is because its side effects on `items` are not
        # visible here - confirm whether the second call is needed.
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products_brands))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        print('-----*-----Finding brands in names-----*-----')
        # Brands were stringified above, so a missing brand is the text 'none'.
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items['name'].values
        p_brands = [i for i in products_brands if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, self.grapes, self.other_words)

        items['brand'] = items['brand'].replace('none', None)

        # 11) Map item types through the type merge dictionary
        items['new_type'] = items['new_type'].replace(self.type_dict)

        items['type_l1'] = items['type'].replace(TYPES_LEVEL_1_DICT)
        items['type_l0'] = items['type_l1'].replace(TYPES_LEVEL_0_DICT)

        return items, products