# Origin: Hugging Face Space "j-s-v/WineMatching" (page status at capture: "Build error").
# File size reported by the page: 32,000 bytes.

import json
import datetime
import settings

from preprocess.utils.common.extracters import *
from multiprocessing import Process, Queue

import pandas as pd
from rapidfuzz import fuzz, process
from math import isnan
from preprocess.utils.common.utils import *
from time import perf_counter

# Template of the 8-slot per-candidate score vector. It is a mutable list, so
# take a copy (SCORE_EX_EMPTY.copy()) before filling it in.
SCORE_EX_EMPTY = [0,0,0,0,0,0,0,0]

# Slot positions inside a score vector.
SCORE_EX_BRAND_INDEX = 0
SCORE_EX_NAME_INDEX = 1
SCORE_EX_SIMILARITY_INDEX = 2
SCORE_EX_TYPE_INDEX = 3
SCORE_EX_COLORSOUR_INDEX = 4
SCORE_EX_VOLUME_INDEX = 5
SCORE_EX_YEAR_INDEX = 6
SCORE_EX_GB_INDEX = 7

'''def compare_names(name1, name2, scorer=fuzz.ratio, score_cutoff=50):
    print("Scoring: " + name1 + " vs " + name2)
    words1 = name1.split(" ")
    words2 = name2.split(" ")

    score = 0
    for w1 in words1:
        for w2 in words2:
            r = scorer(w1, w2)
            print("\t " + w1 + " - " + w2 + " ; " + str(r))
            if r >= score_cutoff:
                score = score + r

    print("Score result: " + str(score / (100*len(words1))))
    return score / (100*len(words1))

def compare_name_with_list(name, names_list, scorer=fuzz.ratio, score_cutoff=50):
    result = []
    index = 0
    for name2 in names_list:
        result.append((name2, compare_names(name, name2, scorer, score_cutoff), index))
        index = index + 1
    return result'''


'''def prepare_groups_with_ids(items_df, brand_col_name = "new_brand"):
    """
    Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
    с учетом нормализованного названия.

    Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.

    :param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
    :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
    """
    #items_df = items_df.copy()
    #items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)

    #grouped = items_df.groupby([brand_col_name, 'type', 'volume', 'new_type_wine', 'sour']).apply(
    grouped = items_df.groupby([brand_col_name, 'type', 'volume', 'new_type_wine']).apply(
        lambda x: list(zip(x['id'], x[brand_col_name], x['name'], x['orig_name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
    ).to_dict()

    #print(grouped)

    return grouped'''

def split_name(name):
    """Return the words of *name*, split on single space characters.

    Consecutive spaces produce empty strings in the result, and an empty
    name yields ``[""]``.
    """
    words = name.split(sep=" ")
    return words

def prepare_groups_with_ids_ex(items_df, key_cols, name_col="name"):
    """
    Group the rows of items_df by key_cols and collect parallel per-group lists.

    Two helper columns are added to items_df IN PLACE before grouping:
    ``<name_col>_splitted`` and ``name_2_splitted``, holding the
    space-separated words of the corresponding name columns.

    :param items_df: DataFrame that has at least the columns 'index', 'id',
        'name_2', name_col and every column listed in key_cols.
    :param key_cols: column name(s) to group by.
    :param name_col: column holding the primary name to match against.
    :return: dict {group key: [indexes, ids, names, names_splitted,
        names_2, names_2_splitted]} where the six lists are aligned by position.
    """
    splitted_col = name_col + "_splitted"
    items_df[splitted_col] = items_df[name_col].apply(split_name)
    items_df["name_2_splitted"] = items_df["name_2"].apply(split_name)

    # Order matters: consumers read these lists by position (0..5).
    collected_columns = ('index', 'id', name_col, splitted_col, "name_2", "name_2_splitted")

    def _columns_as_lists(group):
        return [list(group[column]) for column in collected_columns]

    return items_df.groupby(key_cols).apply(_columns_as_lists).to_dict()


def parse_year(year):
    """
    Convert a raw year value to an int, or return False when it is unusable.

    :param year: str, int, float (possibly NaN) or None.
    :return: the year as int; False for empty values, NaN, infinities,
        unsupported types and strings that are not numeric (e.g. "NV").
    """
    if not year:
        return False

    if isinstance(year, str):
        text = year.strip()
        try:
            return int(text)
        except ValueError:
            pass
        # Accept float-looking strings such as "2015.0"; anything else
        # (including "nan" / "inf") is treated as "no year".
        try:
            return int(float(text))
        except (ValueError, OverflowError):
            return False

    if isinstance(year, (int, float)) and not isnan(year):
        try:
            return int(year)
        except OverflowError:
            # +/- infinity cannot be converted to int.
            return False

    return False

def order_by_best_year(matched_items, year):
    """
    Assign 'score_year' to every candidate in matched_items (in place).

    Despite the name, the list is not reordered; only the score is set:

    - the item has no usable year: every candidate gets 3;
    - candidate year equals the item year: 3;
    - candidate has the newest year among the non-exact candidates: 2;
    - anything else (including candidates with no or unparsable year): 0.

    :param matched_items: list of dicts with at least 'id' and 'year'.
    :param year: raw year of the item being matched (see parse_year).
    :return: the same matched_items list.
    """
    year = parse_year(year)

    # Ids are collected (not the dicts themselves) because the scoring pass
    # below looks candidates up by id.
    exact_ids = set()
    newest_ids = set()
    newest_year = 0

    for mi in matched_items:
        raw_year = mi['year']
        try:
            if isinstance(raw_year, str) or (isinstance(raw_year, (int, float)) and not isnan(raw_year)):
                mi_year = int(raw_year)
            else:
                mi_year = False
        except (TypeError, ValueError, OverflowError) as ex:
            # Best effort: a candidate with a broken year is simply not ranked by year.
            print("Error processing best year for product " + str(mi["id"]) + " value " + str(raw_year) + ": " + str(ex))
            continue

        if year and mi_year and mi_year == year:
            exact_ids.add(mi['id'])
        elif mi_year:
            if mi_year > newest_year:
                newest_ids = {mi['id']}
                newest_year = mi_year
            elif mi_year == newest_year:
                newest_ids.add(mi['id'])

    for mi in matched_items:
        if not year:
            mi['score_year'] = 3
        elif mi['id'] in exact_ids:
            mi['score_year'] = 3
        elif mi['id'] in newest_ids:
            mi['score_year'] = 2
        else:
            mi['score_year'] = 0

    return matched_items

# Module-level profiling counters. In each pair "<name>s" is meant to hold the
# perf_counter() timestamp at which a timed section started and "<name>" the
# accumulated seconds spent in that section.

# time_11 .. time_14 are updated by find_matches_from_candidates.
time_11s = time_11 = 0
time_12s = time_12 = 0
time_13s = time_13 = 0
time_14s = time_14 = 0


# time_20 .. time_25 are the values printed by show_stat.
time_20s = time_20 = 0
time_21s = time_21 = 0
time_22s = time_22 = 0
time_23s = time_23 = 0
time_24s = time_24 = 0
time_25s = time_25 = 0


def compare_names_for_same_brand(name, name_candidates, name_candidates_splitted, scorer, score_cutoff, limit):
    """
    Score candidates by exact word overlap with *name*.

    A candidate scores 100 when it shares at least one word with *name*
    and 0 otherwise. Similarity is 3 when every word of *name* occurs in
    the candidate, else 2.

    The previous version updated the module-level time_2x counters without
    declaring them global (UnboundLocalError on every non-empty call) and
    left ``score`` unassigned when no word matched; both are fixed here and
    the never-initialised timing is dropped.

    :param name: space-separated name to look for; falsy returns [].
    :param name_candidates: candidate names.
    :param name_candidates_splitted: words of each candidate, aligned by
        position with name_candidates.
    :param scorer: unused, kept for signature compatibility.
    :param score_cutoff: minimum score for a candidate to be returned.
    :param limit: unused, kept for signature compatibility.
    :return: list of (candidate, score, similarity, candidate_position).
    """
    if not name:
        return []

    parts = name.split(" ")
    result = []

    for idx_candidate, candidate in enumerate(name_candidates):
        parts_c = name_candidates_splitted[idx_candidate]
        similar_words_count = sum(1 for p1 in parts if p1 in parts_c)

        score = 100 if similar_words_count > 0 else 0
        similarity = 3 if similar_words_count == len(parts) else 2

        if score >= score_cutoff:
            result.append((candidate, score, similarity, idx_candidate))

    return result


def compare_names_invariant_order(name, name_candidates, name_candidates_splitted, scorer, score_cutoff, limit):
    """
    Compare *name* with each candidate word by word, ignoring word order.

    Each word of *name* found verbatim among the candidate's words adds
    100 points; the score is the average over the words of *name*.
    Similarity is 3 when both names have the same number of words, else 2.

    :param name: space-separated name to look for; falsy returns [].
    :param name_candidates: candidate names.
    :param name_candidates_splitted: words of each candidate, aligned by
        position with name_candidates.
    :param scorer: not used by this implementation.
    :param score_cutoff: minimum score for a candidate to be returned.
    :param limit: not used by this implementation.
    :return: list of (candidate, score, similarity, candidate_position).
    """
    if not name:
        return []

    words = name.split(" ")
    word_count = len(words)
    found = []

    for position, candidate in enumerate(name_candidates):
        candidate_words = name_candidates_splitted[position]

        points = 0
        for word in words:
            if word in candidate_words:
                points += 100
        score = points / word_count

        similarity = 3 if word_count == len(candidate_words) else 2

        if score >= score_cutoff:
            found.append((candidate, score, similarity, position))

    return found


def show_stat():
    """Print the module-level profiling counters time_20 .. time_25, one per line."""
    counters = (
        ("20", time_20),
        ("21", time_21),
        ("22", time_22),
        ("23", time_23),
        ("24", time_24),
        ("25", time_25),
    )
    report = "".join(label + " : " + str(value) + "\n" for label, value in counters)
    print(report)


def find_matches_from_candidates(item_name, candidates, order_invariant_names_matching, name_threshold, limit, brand_score):
    """
    Fuzzy-match item_name against one candidate group.

    :param item_name: name to look for.
    :param candidates: six position-aligned sequences
        [indexes, ids, names, names_splitted, names_2, names_2_splitted];
        a falsy value returns [].
    :param order_invariant_names_matching: when true, also run the
        word-overlap comparison (compare_names_invariant_order).
    :param name_threshold: score cutoff; forced to 0 when brand_score > 0.
    :param limit: max fuzzy hits per name column; forced to 100 when brand_score > 0.
    :param brand_score: brand component stored in every score vector.
    :return: list of (product_index, product_id, score, matched_name, score_ex).
        The same product may appear more than once.

    Side effect: accumulates timings in the module-level time_11/12/14 counters.
    """
    global time_11s, time_11, time_12s, time_12, time_14s, time_14

    if not candidates:
        return []

    time_11s = perf_counter()
    products_indexes, products_ids = candidates[0], candidates[1]
    names, names_splitted = candidates[2], candidates[3]
    names_2, names_2_splitted = candidates[4], candidates[5]
    time_11 += perf_counter() - time_11s

    time_12s = perf_counter()
    if brand_score > 0:
        # A brand hit is trusted: accept any name score and widen the result window.
        name_threshold = 0
        limit = 100

    fuzzy_hits = process.extract(item_name, list(names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=limit)
    fuzzy_hits.extend(
        process.extract(item_name, list(names_2), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=limit)
    )

    word_hits = []
    if order_invariant_names_matching:
        word_hits = compare_names_invariant_order(item_name, list(names), list(names_splitted), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=limit)
        word_hits.extend(
            compare_names_invariant_order(item_name, list(names_2), list(names_2_splitted), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=limit)
        )
    time_12 += perf_counter() - time_12s

    time_14s = perf_counter()
    matched_products = []

    def _record(match, score, similarity, idx_candidate):
        # Build a fresh score vector for this hit and store the result tuple.
        score_ex = SCORE_EX_EMPTY.copy()
        score_ex[SCORE_EX_BRAND_INDEX] = brand_score
        score_ex[SCORE_EX_NAME_INDEX] = score
        score_ex[SCORE_EX_SIMILARITY_INDEX] = similarity
        matched_products.append((products_indexes[idx_candidate], products_ids[idx_candidate], score, match, score_ex))

    # Fuzzy hits always carry the top similarity value.
    for match, score, idx_candidate in fuzzy_hits:
        _record(match, score, 3, idx_candidate)

    for match, score, similarity, idx_candidate in word_hits:
        _record(match, score, similarity, idx_candidate)

    time_14 += perf_counter() - time_14s

    return matched_products


def score_and_filter_matched_items_by_attributes(matched_items, item):
    """
    Score every candidate against the item's attributes and drop unwanted alternatives.

    Sets (in place) 'score_volume', 'score_type' (only when some type level
    matches), 'score_colorsour' and 'score_gb' on each candidate, and marks
    the candidate as 'alternative' when volume, colour/sweetness or gb
    clearly disagree. Alternatives are kept only when
    SETTINGS_MATCHING_INCLUDE_ALTERNATIVES is truthy.
    NOTE(review): that name is not defined in this file; presumably it comes
    from one of the star imports - confirm.

    :param matched_items: candidate dicts with 'volume', 'type', 'type_l1',
        'type_l0', 'color', 'sour', 'gb' and 'alternative'.
    :param item: mapping with 'volume', 'type', 'type_l1', 'type_l0',
        'type_wine', 'sour' and 'gb'.
    :return: list of the candidates that were kept (same dict objects).
    """

    def _optional_attr_match(item_value, candidate_value):
        # 2: both missing or both equal; 1: only one side known; 0: conflict.
        if not item_value and not candidate_value:
            return 2
        if not item_value or not candidate_value:
            return 1
        return 2 if item_value == candidate_value else 0

    kept = []

    for candidate in matched_items:
        # --- volume: within 15% relative difference counts as equal ---
        if not item['volume'] and not candidate['volume']:
            candidate['score_volume'] = 3
        elif not item['volume'] or not candidate['volume']:
            candidate['score_volume'] = 2
        else:
            candidate_volume = float(candidate['volume'])
            item_volume = float(item['volume'])
            relative_gap = abs(candidate_volume - item_volume) / max(candidate_volume, item_volume)
            if relative_gap < 0.15:
                candidate['score_volume'] = 3
            else:
                candidate['score_volume'] = 0
                candidate['alternative'] = 1

        # --- type: exact type, then level 1, then level 0; otherwise left untouched ---
        if item['type'] == candidate['type']:
            candidate['score_type'] = 3
        elif item['type_l1'] == candidate['type_l1'] or item['type_l0'] == "unmatched":
            candidate['score_type'] = 2
        elif item['type_l0'] == candidate['type_l0']:
            candidate['score_type'] = 1

        # --- colour and sweetness ---
        type_wine_match = _optional_attr_match(item['type_wine'], candidate['color'])
        sour_match = _optional_attr_match(item['sour'], candidate['sour'])

        if type_wine_match and sour_match:
            candidate['score_colorsour'] = 3
        elif type_wine_match:
            candidate['score_colorsour'] = 2
        elif sour_match:
            candidate['score_colorsour'] = 1
        else:
            candidate['score_colorsour'] = 0
            candidate['alternative'] = 1

        # --- gb flag: both set or both unset ---
        if bool(item['gb']) == bool(candidate['gb']):
            candidate['score_gb'] = 3
        else:
            candidate['score_gb'] = 0
            candidate['alternative'] = 1

        if not candidate['alternative'] or SETTINGS_MATCHING_INCLUDE_ALTERNATIVES:
            kept.append(candidate)

    return kept


def find_matches_for_brand(brand, item,
                           products_groups_brand_type_vol,
                           products_groups_brand_typel1_vol,
                           products_groups_brand_typel0_vol,
                           order_invariant_names_matching,
                           name_threshold,
                           brand_score):
    """
    Search the brand-keyed product groups for candidates matching the item.

    The brand is stripped from the item's names (when more than two
    characters remain), then three groups are queried in order: by exact
    type, by type_l1 and by type_l0, each keyed (brand, type value, volume).
    In every group the primary name is searched first and the secondary
    name ('name_2') second, provided it is a non-empty string.

    A missing 'name_2' (None / NaN) is skipped instead of raising TypeError
    on the ``brand in name`` test.

    :param brand: brand used both as group key and for stripping.
    :param item: mapping with 'name', 'name_2', 'volume', 'type',
        'type_l1' and 'type_l0'.
    :param products_groups_brand_type_vol: dict keyed (brand, type, volume).
    :param products_groups_brand_typel1_vol: dict keyed (brand, type_l1, volume).
    :param products_groups_brand_typel0_vol: dict keyed (brand, type_l0, volume).
    :param order_invariant_names_matching: forwarded to find_matches_from_candidates.
    :param name_threshold: forwarded to find_matches_from_candidates.
    :param brand_score: forwarded to find_matches_from_candidates.
    :return: concatenated list of matches from all three groups (may contain duplicates).
    """

    def _without_brand(text):
        # Drop the brand from the name unless that leaves (almost) nothing.
        if brand and isinstance(text, str) and brand in text:
            stripped = text.replace(brand, '').strip()
            if len(stripped) > 2:
                return stripped
        return text

    item_name_wo_brand = _without_brand(item['name'])

    item_name_2 = item['name_2']
    item_name_2_wo_brand = _without_brand(item_name_2) if isinstance(item_name_2, str) else None

    item_volume = item['volume']

    # (group dict, type value used in its key), from most to least specific.
    search_levels = (
        (products_groups_brand_type_vol, item['type']),
        (products_groups_brand_typel1_vol, item['type_l1']),
        (products_groups_brand_typel0_vol, item['type_l0']),
    )

    matches = []
    for groups, type_value in search_levels:
        products_candidates = groups.get((brand, type_value, item_volume), [])

        matches.extend(find_matches_from_candidates(item_name_wo_brand, products_candidates, order_invariant_names_matching, name_threshold, 100, brand_score))

        if item_name_2_wo_brand:
            matches.extend(find_matches_from_candidates(item_name_2_wo_brand, products_candidates, order_invariant_names_matching, name_threshold, 100, brand_score))

    return matches


def calculate_total_score(all_matched_items):
    """
    Compute the weighted total 'score' of every candidate, in place.

    Each component is normalised by its maximum (3, or 100 for the name
    score) and multiplied by its weight; the weights add up to 100.
    The similarity component currently has weight 0.

    :param all_matched_items: candidate dicts carrying all 'score_*' components.
    :return: None; the result is stored under the 'score' key.
    """
    # (weight, component key, component maximum) - summed in this exact order.
    components = (
        (28.0, 'score_brand', 3),
        (45.0, 'score_name', 100),
        (0.0, 'score_similarity', 3),
        (10.0, 'score_year', 3),
        (4.0, 'score_volume', 3),
        (4.0, 'score_type', 3),
        (5.0, 'score_colorsour', 3),
        (4.0, 'score_gb', 3),
    )

    for candidate in all_matched_items:
        running_total = 0.0
        for weight, key, maximum in components:
            running_total += weight * candidate[key] / maximum
        candidate['score'] = running_total


def new_find_matches_with_ids_func(items_df, products_df, name_threshold=85,
                                products_groups_brand_type_vol=None,
                                products_groups_brand_typel1_vol=None,
                                products_groups_brand_typel0_vol=None,
                                products_groups_typewine_type_vol=None,
                                order_invariant_names_matching=False,
                                index=None,
                                qresult=None):
    """
    Match every row of items_df against the product catalogue and return
    the scored candidates for each item.

    For each item the candidates are gathered in this order:

    1. brand-keyed groups (find_matches_for_brand) for item['brand'] and
       item['brand_2'] with brand score 3, then for item['new_brand'] and
       every entry of item['alt_brands'] not used yet, with brand score 2;
    2. the (new_type_wine, type, volume) group, brand score 0, limit 30;
    3. the whole catalogue using the product names with brand and, only when
       the item has no brand, also the product names without brand.

    The candidates are then scored by attributes and by year, given a total
    score, sorted by it in descending order, de-duplicated by product id
    (the best-scored occurrence wins) and cut to the top 10.

    :param items_df: items to match. Columns read here: 'id', 'brand',
        'brand_2', 'new_brand', 'alt_brands', 'type', 'type_l1', 'type_l0',
        'name', 'name_2', 'volume', 'new_type_wine', 'sour', 'year'; the
        whole row is also passed to the scoring helpers.
    :param products_df: product catalogue. Columns read here: 'index', 'id',
        'new_brand', 'name_wo_brand', 'name_with_brand', 'name_2',
        'orig_name', 'volume', 'type', 'type_l1', 'type_l0',
        'new_type_wine', 'sour', 'year', 'gb'.
    :param name_threshold: fuzzy-matching score cutoff; values below 50 are raised to 50.
    :param products_groups_brand_type_vol: dict of candidate groups keyed (brand, type, volume).
    :param products_groups_brand_typel1_vol: dict of candidate groups keyed (brand, type_l1, volume).
    :param products_groups_brand_typel0_vol: dict of candidate groups keyed (brand, type_l0, volume).
    :param products_groups_typewine_type_vol: dict of candidate groups keyed (new_type_wine, type, volume).
        NOTE(review): the four group arguments default to None but are used
        with .get() unconditionally, so callers apparently must always pass dicts.
    :param order_invariant_names_matching: also compare names word by word, ignoring order.
    :param index: label of this worker, used only in the progress messages.
    :param qresult: optional queue; when truthy the result list is also put on it.
    :return: list of dicts {'item_id', 'best_score_ex', 'matched_items'},
        one per row of items_df.
    """
    results = []

    print("starting [" + str(index) + "]")

    if name_threshold < 50:
        name_threshold = 50

    # Column-wise copies of the catalogue; they are indexed below with the
    # value taken from the 'index' column.
    # NOTE(review): assumes products_df['index'] holds the 0-based row position - TODO confirm.
    all_products_indexes = list(products_df["index"])
    all_products_ids = list(products_df["id"])
    all_products_brands = list(products_df["new_brand"])
    all_products_names = list(products_df["name_wo_brand"])
    all_products_names_splitted = list(products_df['name_wo_brand'].apply(split_name))
    all_products_names_with_brand = list(products_df["name_with_brand"])
    all_products_names_with_brand_splitted = list(products_df['name_with_brand'].apply(split_name))

    all_products_names_2 = list(products_df["name_2"])
    all_products_names_2_splitted = list(products_df['name_2'].apply(split_name))
    #all_products_names_wo_brand = list(products_df["name_wo_brand"])

    all_products_orig_names = list(products_df["orig_name"])
    all_products_volumes = list(products_df["volume"])
    all_products_types = list(products_df["type"])
    all_products_types_l1 = list(products_df["type_l1"])
    all_products_types_l0 = list(products_df["type_l0"])
    all_products_type_wine = list(products_df["new_type_wine"])
    all_products_sour = list(products_df["sour"])
    all_products_year = list(products_df["year"])
    all_products_gbs = list(products_df["gb"])

    # Whole-catalogue candidate groups in the six-list layout expected by
    # find_matches_from_candidates: names without brand / names with brand.
    all_products = [all_products_indexes, all_products_ids, all_products_names, all_products_names_splitted, all_products_names_2, all_products_names_2_splitted]
    all_products_with_brands = [all_products_indexes, all_products_ids, all_products_names_with_brand, all_products_names_with_brand_splitted, all_products_names_2, all_products_names_2_splitted]

    # Local profiling variables. Only time_0, time_1 and some start stamps are
    # written below, and none of them is reported anywhere in this function.
    time_0s = time_0 = 0
    time_1s = time_1 = 0
    time_2s = time_2 = 0
    time_3s = time_3 = 0
    time_4s = time_4 = 0
    time_5s = time_5 = 0
    time_6s = time_6 = 0
    time_7s = time_7 = 0
    time_8s = time_8 = 0
    time_9s = time_9 = 0

    #for idx, item in tqdm(items_df.iterrows(), total=len(items_df)):
    total=len(items_df)
    row_index = 0
    for idx, item in items_df.iterrows():
        time_0s = perf_counter()
        #print("Matching row " + str(index) + " - " + str(row_index) + "/" + str(total))
        row_index += 1

        time_1s = perf_counter()

        # Several of these locals (item_brand_2, item_sour, item_type_l1,
        # item_type_l0, matched_items) are not used further down.
        item_brand = item['brand']
        item_brand_2 = item['brand_2']
        item_type = item['type']
        item_name = item['name']
        item_name_2 = item['name_2']
        #item_name_with_brand = item['name_with_brand']
        item_volume = item['volume']
        item_type_wine = item['new_type_wine']
        item_sour = item['sour']
        item_type_l1 = item['type_l1']
        item_type_l0 = item['type_l0']
        matched_items = []

        time_1 += perf_counter() - time_1s
        time_2s = perf_counter()

        all_matches = []

        # First let's find matches for all brands we found for the item so far
        # Explicit brands get brand score 3, derived/alternative brands get 2.
        used_brands = []
        if item['brand']:
            matches = find_matches_for_brand(item['brand'], item, products_groups_brand_type_vol, products_groups_brand_typel1_vol,
                                                  products_groups_brand_typel0_vol, order_invariant_names_matching,
                                                    name_threshold, 3)
            all_matches.extend(matches)
            used_brands.append(item['brand'])


        if item['brand_2']:
            matches = find_matches_for_brand(item['brand_2'], item, products_groups_brand_type_vol, products_groups_brand_typel1_vol,
                                                  products_groups_brand_typel0_vol, order_invariant_names_matching,
                                                    name_threshold, 3)
            all_matches.extend(matches)
            used_brands.append(item['brand_2'])


        if item['new_brand'] and (not item['new_brand'] in used_brands):
            matches = find_matches_for_brand(item['new_brand'], item, products_groups_brand_type_vol, products_groups_brand_typel1_vol,
                                                  products_groups_brand_typel0_vol, order_invariant_names_matching,
                                                    name_threshold, 2)
            all_matches.extend(matches)
            used_brands.append(item['new_brand'])


        # NOTE(review): item['alt_brands'] must be iterable for every row - confirm upstream.
        for ab in item['alt_brands']:
            if not ab in used_brands:
                matches = find_matches_for_brand(ab, item, products_groups_brand_type_vol, products_groups_brand_typel1_vol,
                                                  products_groups_brand_typel0_vol, order_invariant_names_matching,
                                                    name_threshold, 2)
                all_matches.extend(matches)
                used_brands.append(ab)


        # All further searchings is performed using full name with brand
        item_name_with_brand = item_name
        if item_brand and not item_brand in item_name:
            item_name_with_brand = item_brand + " " + item_name


        item_name_2_with_brand = item_name_2
        if item_name_2 and item_brand and not item_brand in item_name_2:
            item_name_2_with_brand = item_brand + " " + item_name_2


        # Brand-independent search inside the (wine type, type, volume) group.
        alt_key = (item_type_wine, item_type, item_volume)
        products_candidates = products_groups_typewine_type_vol.get(alt_key, [])
        matches = find_matches_from_candidates(item_name_with_brand, products_candidates, order_invariant_names_matching, name_threshold, 30, 0)
        all_matches.extend(matches)

        if item_name_2_with_brand:
            matches = find_matches_from_candidates(item_name_2_with_brand, products_candidates, order_invariant_names_matching, name_threshold, 30, 0)
            all_matches.extend(matches)



        # Finally search among all products
        matches = find_matches_from_candidates(item_name_with_brand, all_products_with_brands, order_invariant_names_matching, name_threshold, 30, 0)
        all_matches.extend(matches)

        if item_name_2_with_brand:
            matches = find_matches_from_candidates(item_name_2_with_brand, all_products_with_brands, order_invariant_names_matching, name_threshold, 30, 0)
            all_matches.extend(matches)


        # Items without a brand are additionally compared with the brand-less product names.
        if not item['brand']:
            matches = find_matches_from_candidates(item_name_with_brand, all_products, order_invariant_names_matching, name_threshold, 30, 0)
            all_matches.extend(matches)

            if item_name_2_with_brand:
                matches = find_matches_from_candidates(item_name_2_with_brand, all_products, order_invariant_names_matching, name_threshold, 30, 0)
                all_matches.extend(matches)



        # Expand every raw match tuple into a candidate dict carrying the
        # product attributes and the score components known so far.
        all_matched_items = [
            {
                'id': all_products_ids[product_index],
                'brand': all_products_brands[product_index],
                'item_name': all_products_names[product_index],
                'score': 0,
                'alternative': 0,
                'score_ex': score_ex,
                'score_brand': score_ex[SCORE_EX_BRAND_INDEX],
                'score_name': int(score_ex[SCORE_EX_NAME_INDEX]),
                'score_similarity': score_ex[SCORE_EX_SIMILARITY_INDEX],
                'score_type': score_ex[SCORE_EX_TYPE_INDEX],
                'score_colorsour': score_ex[SCORE_EX_COLORSOUR_INDEX],
                'score_volume': score_ex[SCORE_EX_VOLUME_INDEX],
                'score_year': score_ex[SCORE_EX_YEAR_INDEX],
                'score_gb': score_ex[SCORE_EX_GB_INDEX],
                'item_orig_name': all_products_orig_names[product_index],
                'volume': all_products_volumes[product_index],
                'type': all_products_types[product_index],
                'type_l1': all_products_types_l1[product_index],
                'type_l0': all_products_types_l0[product_index],
                'color': all_products_type_wine[product_index],
                'sour': all_products_sour[product_index],
                'year': all_products_year[product_index],
                'gb': all_products_gbs[product_index],
            }
            for product_index, product_id, score, match, score_ex in all_matches
        ]

        all_matched_items = score_and_filter_matched_items_by_attributes(all_matched_items, item)
        all_matched_items = order_by_best_year(all_matched_items, item['year'])

        calculate_total_score(all_matched_items)

        # Now it's time to sort by all scores
        all_matched_items = sorted(all_matched_items, key=lambda d: d['score'], reverse=True)

        # Keep only the first (best-scored) occurrence of every product id and
        # replace the numeric score vector with its compact text form,
        # e.g. "B3,N95,S3,T3,C3,V3,Y3,G3".
        duplicate_ids = []
        best_score_ex = ''
        all_matched_items_new = []
        for product in all_matched_items:
            if not product['id'] in duplicate_ids:
                score_ex = 'B' + str(product["score_brand"]) + ',' + \
                           'N' + str(product["score_name"]) + ',' + \
                           'S' + str(product["score_similarity"]) + ',' + \
                           'T' + str(product["score_type"]) + ',' + \
                           'C' + str(product["score_colorsour"]) + ',' + \
                           'V' + str(product["score_volume"]) + ',' + \
                           'Y' + str(product["score_year"]) + ',' + \
                           'G' + str(product["score_gb"])

                # The list is sorted, so the first product seen is the best one.
                if not best_score_ex:
                    best_score_ex = score_ex

                product['score_ex'] = score_ex

                all_matched_items_new.append(product)
                duplicate_ids.append(product['id'])

        results.append({
            'item_id': item['id'],
            #"matched_top_id": top_matched_id,
            'best_score_ex':  best_score_ex,
            'matched_items': all_matched_items_new[:10],
            #"alternative_top_id": "",
            #'alternative': []  # Filled in during the second pass
        })

        #results[idx]['matched_items'].extend(alt_matched_items)
        #results[idx]['match_type'] = "".join(match_type)

        time_0 += perf_counter() - time_0s


    print("finished [" + str(index) + "]")
    # When run as a worker process the results go back through the queue.
    if qresult:
        qresult.put(results)


    return results


def new_find_matches_with_ids(items_df, products_df, name_threshold=85,
                                products_groups_brand_type_vol=None,
                                products_groups_brand_typel1_vol=None,
                                products_groups_brand_typel0_vol=None,
                                products_groups_typewine_type_vol=None,
                                order_invariant_names_matching = False,
                                thread_count = 8):
    """
    Match all items against the catalogue and merge the results into items_df.

    Fewer than 1000 items are processed in the current process (and the
    profiling counters are printed afterwards). Larger inputs are split
    into consecutive chunks, each handled by its own worker process running
    new_find_matches_with_ids_func; the per-chunk results are concatenated
    in chunk order.

    :param items_df: items to match; must contain an 'id' column.
    :param products_df: product catalogue.
    :param name_threshold: fuzzy-matching score cutoff.
    :param products_groups_brand_type_vol: candidate groups keyed (brand, type, volume).
    :param products_groups_brand_typel1_vol: candidate groups keyed (brand, type_l1, volume).
    :param products_groups_brand_typel0_vol: candidate groups keyed (brand, type_l0, volume).
    :param products_groups_typewine_type_vol: candidate groups keyed (new_type_wine, type, volume).
    :param order_invariant_names_matching: also compare names ignoring word order.
    :param thread_count: number of worker processes to aim for (values below 1 are treated as 1).
    :return: items_df merged with 'best_score_ex' and 'matched_items'
        (the latter serialised as a JSON string).
    """
    print("Started matching at " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")

    if len(items_df) < 1000:
        results = new_find_matches_with_ids_func(items_df, products_df, name_threshold,
                            products_groups_brand_type_vol,
                            products_groups_brand_typel1_vol,
                            products_groups_brand_typel0_vol,
                            products_groups_typewine_type_vol,
                            order_invariant_names_matching,
                            0)

        show_stat()

    else:
        results = []

        workers = []
        chunk_size = len(items_df) // max(thread_count, 1) + 1
        # Stepping by chunk_size never produces an empty trailing chunk.
        for i, start in enumerate(range(0, len(items_df), chunk_size)):
            chunk = items_df[start:start + chunk_size]

            q = Queue()
            p = Process(target=new_find_matches_with_ids_func, args=(chunk, products_df, name_threshold,
                                                                     products_groups_brand_type_vol,
                                                                     products_groups_brand_typel1_vol,
                                                                     products_groups_brand_typel0_vol,
                                                                     products_groups_typewine_type_vol,
                                                                     order_invariant_names_matching,
                                                                     i, q))
            p.start()

            workers.append((p, q))

        # Drain every queue BEFORE joining: a child that still has queued
        # data cannot terminate, so joining first could deadlock.
        chunk_results = [q.get() for _, q in workers]

        for p, _ in workers:
            p.join()

        for chunk_result in chunk_results:
            results.extend(chunk_result)

    for r in results:
        r['matched_items'] = json.dumps(r['matched_items'], ensure_ascii=False)

    results_df = pd.DataFrame(results)
    merged_df = items_df.merge(results_df, left_on='id', right_on='item_id').drop(columns=['item_id'])

    print("Finished matching at " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
    return merged_df