Spaces:

Gainward777
/

Product_Matching

Sleeping

App Files Files Community

Upload 12 files

by Gainward777 - opened Mar 3, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+1298

-37

Files changed (12) hide show

app.py +27 -37
constants/constants.py +285 -0
preprocess/preprocess.py +223 -0
preprocess/utills/common/brand_matching.py +137 -0
preprocess/utills/common/extracters.py +66 -0
preprocess/utills/common/parallel_brand_mutching.py +97 -0
preprocess/utills/common/top_inserts.py +66 -0
preprocess/utills/common/utils.py +130 -0
preprocess/utills/items/attrs.py +40 -0
processor/matching.py +157 -0
processor/processor.py +25 -0
ui/gradio_ui.py +45 -0

app.py CHANGED Viewed

@@ -1,37 +1,27 @@
-import gradio as gr
-from Funcs import new_run
-import pandas as pd
-import tempfile
-def process_files(file1, file2, threshold):
-    row_items=pd.read_csv(file2, sep='\t')
-    row_products=pd.read_csv(file1, sep='\t', on_bad_lines='skip')
-    df, items, products= new_run(row_products, row_items, threshold)
-    # Создаём временный CSV файл для сохранения результата
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
-        output_csv = tmp.name
-    df.to_csv(output_csv, sep='\t', index=False)
-    return output_csv
-# Определяем пользовательский интерфейс с помощью gr.Blocks
-with gr.Blocks() as demo:
-    gr.Markdown("## Обработка CSV файлов")
-    with gr.Row():
-        # Используем type="filepath", чтобы получить путь к файлу
-        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
-        file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])
-    threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
-    process_button = gr.Button("Обработать файлы")
-    output_file = gr.File(label="Скачать результат (CSV)")
-    # При нажатии кнопки вызывается функция process_files
-    process_button.click(fn=process_files, inputs=[file_input1, file_input2, threshold_input], outputs=output_file)
-demo.launch()

+from processor.processor import Processor
+from constants.constants import *
+from ui.gradio_ui import GradioUI
# Build the matching pipeline from the vocabularies star-imported from
# constants/constants.py. The arguments are positional.
# NOTE(review): Processor's signature is not visible here; the order looks
# like it mirrors Preprocessor.__init__ -- confirm in processor/processor.py.
processor=Processor(LONG_TYPES_LIST,
                    SHORT_TYPES_LIST,
                    SOUR,
                    WINE_TYPES,
                    GBS,
                    COLORS_FOR_TRIM,
                    GRAPES,
                    OTHER_WORDS,
                    SOUR_MERGE_DICT,
                    TYPES_WINES_DICT,
                    COLOR_MERGE_DICT)
# Hand the processor to the Gradio front end and start it
# (presumably this launches the web UI -- see ui/gradio_ui.py).
ui=GradioUI(processor)
ui.run_ui()

constants/constants.py ADDED Viewed

	@@ -0,0 +1,285 @@

# Full vocabulary of drink / product type words.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (previously 'Шоколад' 'Сок' became one
# bogus entry 'ШоколадСок').
LONG_TYPES_LIST = [
    "Пиво",
    "Вино",  # covers red, white, rose, sparkling and champagne
    "Водка",
    "Виски",
    "Бурбон",
    "Коньяк",
    "Бренди",
    "Арманьяк",
    "Ром",
    "Джин",
    "Текила",
    "Мескаль",
    "Ликер",
    "Самбука",
    "Сидр",
    "Саке",
    "Абсент",
    "Граппа",
    "Портвейн",
    "Мадера",
    "Шерри",
    "Кальвадос",
    "Писко",
    "Вермута",
    "Вермут",
    "Аперитив",
    "Биттер",
    "Эль",
    "Глинтвейн",
    "Пунш",
    "Медовуха",
    "Ламбик",
    "Крем-ликер",
    "Арак",
    "Чача",
    "Самогон",
    "Кумыс",
    "Сливовица",
    "Шнапс",
    "Настойка",
    "Наливка",
    "Игристое вино",
    "Херес",
    "Пуаре",
    "Пуарэ",
    "Ликер",
    "Ликёр",
    "Спиртной напиток со вкусом",
    "Напиток винный",
    "Винный напиток",
    "Шомпань",
    'Сироп',
    'Конфеты',
    'Шоколад',
    'Сок',
    'Вода',
    'Табачная продукция',
    ]
# Reduced list of drink-type words.
# NOTE(review): presumably forwarded through Processor to
# Preprocessor(short_types_list=...), which uses it when trimming unbranded names.
SHORT_TYPES_LIST=['Вино', "Сidр", "Водка", "Коньяк", "Настойка", "Ликер", "Виски", "Джин"]
# Sweetness / dryness markers (English, Russian and abbreviated Russian forms).
# Order matters for first-match lookups such as find_full_word.
SOUR = [
        'brut',
        'semi-sweet',
        'sweet',
        'брют',
        'сухое',
        'полусухое',
        'полусладкое',
        'сладкое',
        'п/сух',
        'п/сл',
        'п/с',
        'сл',
        'сух',
    ]
# Wine colour / style markers, including abbreviations and foreign spellings.
# NOTE(review): "шомпанское"/"шомп" are spelled with "о" here and in
# COLOR_MERGE_DICT; confirm this matches the spelling found in the data.
WINE_TYPES = [
    'красное',
    'белое',
    'розовое',
    'роз',
    'кр',
    'бел',
    'розе',
    'rosso',
    'roso',
    'roseto',
    'rosetto',
    'red',
    'white',
    "игристое",
    "игр",
    "шомпанское",
    "шомп",
    ]
# Gift-box / packaging phrases that are stripped from product names.
# Longer phrases come first so that a first-match search removes the most
# specific wording. Every entry must end with a comma: two entries were
# previously fused by implicit string concatenation
# ('in wood case' + 'in wood box', and the two "из картона" phrases).
GBS = [
    'cristal decanter in oak gift box',
    'in the carton gift box with 2 glasses',
    'decanter in the carton gift box',
    'in the carton gift box',
    'in the wooden gift box',
    'in gift box in the carton',
    'in gift box in carton',
    'gift box in the carton',
    'gift box in carton',
    'in gift box in the wood',
    'in gift box in wood',
    'gift box in the wood',
    'gift box in wood',
    'gift box with 2 glasses',
    'in gift box',
    'gift box',
    'in carton',
    'in wooden case',
    'in wooden box',
    'in wood case',
    'in wood box',
    'in wood',
    'хрустальный декантер в подарочной упаковке из дуба',
    'декантер в подарочной упаковке из картона',
    'в подарочной упаковке из картона с 2 бокалами',
    'в подарочной упаковке из картона',
    'в подарочной упаковке из Дуба',
    'в П У графин и деревянная коробка',
    'в подарочной упаковке',
    'подарочная упаковка',
    'подарочный набор',
    'в деревянной коробке',
    'деревянная коробка',
    'в п/у+2 бокаланов',
    'в п/у из картона',
    'в п/у+бокал',
    'в п/у (дер.коробке)',
    'в п/у солома',
    'в п/у',
    'в п у',
    'п/уп',
    'п/у',
    'в тубе',
    'туба',
    'ПУ',
    ]
# Colour words removed from names during trimming.
# The comma after 'розовое' was missing, which fused it with 'кр' into the
# single bogus entry 'розовоекр' and dropped both real entries.
COLORS_FOR_TRIM = [
    'красное',
    'белое',
    'розовое',
    'кр',
    'бел',
    'розе',
    'rosso',
    'roso',
    'roseto',
    'rosetto',
    'red',
    'white',
    ]
# Grape variety names (Russian transliterations) removed from product names.
# The entry for Pinot Grigio contained mis-encoded bytes and is restored to
# "Пино Гри", in line with its own comment and the neighbouring "Пино Нуар".
GRAPES = [
    "Каберне Совиньон",    # Cabernet Sauvignon
    "Каберне-Совиньон",    # Cabernet Sauvignon (hyphenated spelling)
    "Каберне",             # Cabernet (short form)
    "Мерло",               # Merlot
    "Пино Нуар",           # Pinot Noir
    "Шардоне",             # Chardonnay
    "Совиньон Блан",       # Sauvignon Blanc
    "Сира",                # Syrah
    "Гренаш",              # Grenache
    "Рислинг",             # Riesling
    "Мальбек",             # Malbec
    "Темпранильо",         # Tempranillo
    "Зинфандель",          # Zinfandel
    "Санджовезе",          # Sangiovese
    "Каберне Фран",        # Cabernet Franc
    "Вионье",              # Viognier
    "Мурведр",             # Mourvedre
    "Шенен Блан",          # Chenin Blanc
    "Пино Гри",            # Pinot Grigio
    "Гевюрцтраминер",      # Gewurztraminer
    "Неббиоло",            # Nebbiolo
    "Барбера",             # Barbera
    "Petit Verdot",        # Petit Verdot (usually left in the original spelling)
    "Карменер",            # Carmenere
    "Таннат",              # Tannat
    "Гамей",               # Gamay
    "Семильон",            # Semillon
    "Мускат",              # Muscat
    "Верментино",          # Vermentino
    "Фиано",               # Fiano
    "Аглианико",           # Aglianico
    "Кариньян",            # Carignan (may also appear as Carinena)
    "Торронтес",           # Torrontes (especially for Argentine wines)
    "Рислинг",
    "Кефессия",
    "Алиготе",
    "Фурминт",
]
# Generic descriptive words; Preprocessor concatenates them with the long
# type list (types_n_others) and also trims them from unbranded names.
OTHER_WORDS=[
    "Игристое",
    "Жемчужное",
    "Газированный",
    "Традиционный",
    "Двухслойный",
    "Кофе",
    "Напиток",
    "Спиртной",
    "Горькая",
    "Виноградная",
    "Выдержанная",
    "Шотландский",
    "Купажированный",
    "креп",
    "Ординарный",
    "Выдержанный",
    "Отборное",
    "Десертный",
    "Вкус",
    "Сорт",
    ]
# Maps sweetness markers (see SOUR) to a canonical Russian form.
# Applied with Series.replace, so values without a key are left unchanged.
# NOTE(review): the None -> 'unmatched' entry presumably labels rows with no
# marker -- confirm that replace() treats the None key as intended.
SOUR_MERGE_DICT={
    'brut':'брют',
    'semi-sweet':'полусладкое',
    'sweet':'сладкое',
    'сухое':'сухое',
    'п/сух':'полусухое',
    'п/сл':'полусладкое',
    'п/с':'полусухое',
    'сл':'сладкое',
    'сух':'сухое',
    None: 'unmatched',
    }
# Maps detected drink types to the coarser categories used for matching.
# NOTE(review): the key 'Шампань' is spelled differently from the
# LONG_TYPES_LIST entry "Шомпань"; one of the two is probably a typo.
TYPES_WINES_DICT={
    'Пуарэ':'Слабоалкогольные и энергетические напитки',
    'Пуаре':'Слабоалкогольные и энергетические напитки',
    'Сидр':'Слабоалкогольные и энергетические напитки',
    'Шампань': 'Шампанское',
    'Игристое': 'Шампанское',
    'Сироп':'Сиропы',
    'Арманьяк':'Коньяк',
    'Бренди':'Коньяк',
    'Ликер':'Ликеры',
    'Граппа':'Водка',
    'Настойка':'Водка',
    'Конфеты':'Сладости',
    'Портвейн':'Вино',
    'Херес':'Вино',
    'Кальвадос':'Коньяк',
    'Винный напиток': "Вино",
    "Игристое вино":'Шампанское',
    "Самогон": "Водка",
    }
# Maps colour / style markers (see WINE_TYPES) to a canonical Russian form.
# NOTE(review): "rosso"/"roso" are mapped to 'розовое' (rose); Italian
# "rosso" normally means red -- confirm this mapping is intentional.
# NOTE(review): the key "rose" has no counterpart in WINE_TYPES.
COLOR_MERGE_DICT={
    "кр":'красное',
    "red":"красное",
    "бел":"белое",
    "white":"белое",
    "роз":'розовое',
    "розе":'розовое',
    "roso":'розовое',
    "rosso":'розовое',
    "rose":'розовое',
    "rosetto":'розовое',
    "roseto":'розовое',
    "игр":"игристое",
    "шомп":"шомпанское",
    None: 'unmatched'
    }

preprocess/preprocess.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import json
+from tqdm import tqdm
+from preprocess.utils.items.attrs import *
+from preprocess.utils.common.extracters import *
+from preprocess.utils.common.brand_matching import *
+from preprocess.utils.common.parallel_brand_matching import *
+from preprocess.utils.common.utils import *
+import pandas as pd
class Preprocessor():
    """Normalise the raw ``items`` and ``products`` catalogues before matching.

    The constructor stores the vocabularies (type words, sweetness markers,
    colour words, packaging phrases, grape names, merge dictionaries) that
    the processing steps use to extract attributes and trim names.
    """

    # Column order shared by both normalised catalogues.
    _COLUMNS = ('id', 'brand', 'name', 'type', 'type_wine', 'volume', 'year', 'alco')

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words
        # Combined vocabulary handed to name_trimmer.
        self.types_n_others = long_types_list + other_words
        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict

    def _detect_type(self, attrs):
        """Return the drink type for one item, trying each detector in turn."""
        # The original code read ``self.types``, which is never assigned, so
        # every item raised AttributeError.
        # NOTE(review): ``long_types_list`` is assumed to be the intended
        # vocabulary -- confirm against get_type in attrs.py.
        drink_type = get_type(attrs, self.long_types_list)
        if drink_type is None:
            drink_type = check_spark(attrs)
        if drink_type is None:
            drink_type = check_color_and_sour(attrs)
        if drink_type is None:
            drink_type = check_spark(attrs, col_name='type_wine')
        if drink_type is None:
            drink_type = check_color_and_sour(attrs, types=self.sour)
        return drink_type

    def process_items(self, df):
        """Parse the JSON ``attrs`` column of the items table into flat columns.

        :param df: DataFrame with at least the columns ``id`` and ``attrs``
            (``attrs`` holds a JSON object per row).
        :return: DataFrame with the columns listed in ``_COLUMNS``.

        A row that fails to parse is reported and skipped as a whole. Values
        are appended only after the full row was built, so all columns keep
        the same length.
        """
        result = {column: [] for column in self._COLUMNS}
        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                attrs = json.loads(raw_attrs)
                name = attrs['name']
                row = {
                    'id': idf,
                    'brand': attrs['brand'] if 'brand' in attrs else None,
                    'name': name,
                    'type': self._detect_type(attrs),
                    'type_wine': attrs['type_wine'] if 'type_wine' in attrs else None,
                    # Prefer explicit attributes; fall back to parsing the name.
                    'volume': attrs['volume'] if 'volume' in attrs
                    else extract_volume_or_number(name),
                    'year': attrs['year'] if 'year' in attrs
                    else extract_production_year(name),
                    'alco': extract_alcohol_content(name),
                }
            except Exception as ex:
                # Best effort: report the broken row and continue.
                print(idf, ex)
                continue
            for column in self._COLUMNS:
                result[column].append(row[column])
        return pd.DataFrame(result)

    def process_products(self, products):
        """Flatten the products table into the same column layout as the items.

        :param products: DataFrame with the columns ``id``, ``brand``,
            ``category``, ``product_type``, ``name_long``, ``name`` and
            ``name_postfix``.
        :return: DataFrame with the columns listed in ``_COLUMNS``.

        As in :meth:`process_items`, a failing row is skipped as a whole.
        """
        result = {column: [] for column in self._COLUMNS}
        for idx, product in tqdm(products.iterrows()):
            try:
                row = {
                    'id': product['id'],
                    'brand': product['brand'],
                    'name': product['name_long'],
                    'type': product['product_type'],
                    'type_wine': product['category'],
                    'volume': extract_volume_or_number(product['name']),
                    # The production year is read from the name postfix.
                    'year': extract_production_year(str(product['name_postfix'])),
                    'alco': extract_alcohol_content(product['name']),
                }
            except Exception as ex:
                print(ex)
                continue
            for column in self._COLUMNS:
                result[column].append(row[column])
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip recognised attributes from a product name.

        Each attribute that is found (gift box phrase, alcohol content,
        volume, age statement, production year, colour, sweetness) is removed
        from ``text`` by plain string replacement.

        :param text: raw product name.
        :return: tuple ``(cleaned_text, alcohol, volume_or_number, years,
            production_year, gb, color, sour)``; attributes that were not
            found are ``None``.
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')
        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            alco_w_comma = alcohol.replace('.', ',')
            text = text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_with_comma = str(volume_or_number).replace('.', ',')
            text = text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
            # Drop the unit letter left behind by the removed number. The
            # original assigned this to an unused variable ``test``, so the
            # cleaned text was thrown away.
            text = clean_wine_name(text)
        years = extract_years(text)
        if years is not None:
            text = text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')
        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')
        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')
        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour

    def process(self, items, products):
        """Run the full preprocessing pipeline on both catalogues.

        :param items: raw items DataFrame (see :meth:`process_items`).
        :param products: raw products DataFrame (see :meth:`process_products`).
        :return: tuple ``(items, products)`` of normalised DataFrames.
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())
        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())
        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)
        print('-----*-----Fill brands in items-----*-----')
        # Writes the 'new_brand' column of items in place.
        fill_brands_in_dataframe(products['brand'].unique(), items)
        print('-----*-----Brand matching-----*-----')
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        out_prods = list(set(prod_brand_list) - set(comp_list))
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)
        items['type'] = items['type'].replace(self.type_dict)
        print('-----*-----Unwrap brend cats step 1-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)
        # The unwrap is repeated on the already-unwrapped product brands.
        print('-----*-----Unwrap brend cats step 2-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)
        print('-----*-----Finding brands in names-----*-----')
        # str(None) was lower-cased to 'none' above; turn it back into a null.
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items[items['new_brand'].isna()]['name'].values
        p_brands = [i for i in products['brand'].unique() if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list,
                                self.grapes, self.other_words)
        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products)
        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour = name_trimmer(items, self.types_n_others)
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb'] = gb
        items['sour'] = sour
        items['sour'] = items['sour'].replace(self.sour_dict)
        products_trimed_names, gb, sour = name_trimmer(products, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb'] = gb
        products['sour'] = sour
        products['sour'] = products['sour'].replace(self.sour_dict)
        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)
        return items, products

preprocess/utills/common/brand_matching.py ADDED Viewed

	@@ -0,0 +1,137 @@

+from tqdm import tqdm
+import re
+from ahocorasick import Automaton
+from rapidfuzz import fuzz, process
def contains_full_word(word, text, case_sensitive=True):
    """
    Tell whether ``word`` occurs in ``text`` as a whole word.

    The word is escaped and wrapped in word boundaries; ``case_sensitive``
    selects between an exact-case and a case-insensitive search.
    """
    bounded = r'\b' + re.escape(word) + r'\b'
    if case_sensitive:
        found = re.search(bounded, text, 0)
    else:
        found = re.search(bounded, text, re.IGNORECASE)
    return found is not None
def unwrap_brands(products):
    """
    Map every product brand that contains another brand as a whole word
    (case-insensitively) to that contained brand.

    Brands are visited shortest first, so when several brands are contained
    in the same longer one, the last one visited wins.
    Returns a dict {containing brand: contained brand}.
    """
    candidates = [b for b in products['brand'].unique() if isinstance(b, str)]
    candidates.sort(key=len)
    mapping = {}
    for inner in tqdm(candidates):
        for outer in candidates:
            if inner == outer:
                continue
            if contains_full_word(inner, outer, case_sensitive=False):
                mapping[outer] = inner
    return mapping
def split_n_match(products, items, th_len=3):
    """
    Link item brands that contain a slash to a product brand found inside them.

    Only product brands longer than ``th_len`` characters are considered, and
    the product brand must occur in the item brand as a whole word
    (case-sensitive). Returns a dict {item brand: product brand}.
    """
    slashed = [brand for brand in tqdm(items['brand'].unique()) if '/' in brand]
    mapping = {}
    for prod_brand in tqdm(products['brand'].unique()):
        for item_brand in slashed:
            if len(prod_brand) > th_len:
                if contains_full_word(prod_brand, item_brand):
                    mapping[item_brand] = prod_brand
    return mapping
def fill_brands_in_dataframe(brands, df, col_name='new_brand', is_brand=True):
    """
    Write the brand found in each row's ``name`` into ``df[col_name]``.

    The DataFrame is modified in place and nothing is returned.

    :param brands: Iterable of brand names; non-string and empty entries are ignored.
    :param df: DataFrame with a ``name`` column (and a ``brand`` column when
        ``is_brand`` is true).
    :param col_name: Name of the column to write.
    :param is_brand: If true, rows without a detected brand fall back to the
        existing ``brand`` value; otherwise they get ``None``.
    """
    # Aho-Corasick automaton over the lower-cased brands for fast lookup.
    automaton = Automaton()
    has_words = False
    for idx, brand in enumerate(brands):
        if isinstance(brand, str) and brand:
            automaton.add_word(brand.lower(), (idx, brand))
            has_words = True
    if has_words:
        automaton.make_automaton()

    def find_brand(name):
        """Return the longest brand present in ``name`` as a whole word, or None."""
        # Missing names (None / NaN) and an empty brand list cannot match.
        if not has_words or not isinstance(name, str):
            return None
        lowered = name.lower()
        matched_brands = set()
        for _, (_, brand) in automaton.iter(lowered):
            # The automaton finds substrings; keep only whole-word hits.
            if re.search(rf'\b{re.escape(brand.lower())}\b', lowered):
                matched_brands.add(brand)
        if not matched_brands:
            return None
        # Longest brand wins; ties are broken alphabetically so the result
        # does not depend on set iteration order.
        return max(matched_brands, key=lambda b: (len(b), b))

    found = [find_brand(name) for name in df['name']]
    if is_brand:
        df[col_name] = [hit or current for hit, current in zip(found, df['brand'])]
    else:
        df[col_name] = found
def get_same_brands(products, items):
    """
    Collect the brands that appear in both catalogues.

    :param products: DataFrame with a ``brand`` column.
    :param items: DataFrame with a ``new_brand`` column.
    :return: tuple ``(common, product_brands, item_brands)`` where ``common``
        keeps the order of ``product_brands``.
    """
    prod_brand_list = list(products['brand'].unique())
    items_brand_list = list(items['new_brand'].unique())
    # Set lookup instead of scanning the item list for every product brand.
    item_lookup = set(items_brand_list)
    comp_list = [brand for brand in prod_brand_list if brand in item_lookup]
    return comp_list, prod_brand_list, items_brand_list
def match_brands_improved(items_brands, prods_brands, threshold=85):
    """
    Fuzzy-match item brands to product brands.

    Each item brand is split on ``/``, ``(`` and ``)``; every part is compared
    with the product brands and the best-scoring acceptable match is kept.

    :param items_brands: Brands taken from the items DataFrame.
    :param prods_brands: Brands taken from the products DataFrame.
    :param threshold: Minimum similarity score for a fuzzy match.
    :return: dict {item brand: closest product brand}.
    """
    brand_mapping = {}
    for item_brand in tqdm(items_brands):
        if not isinstance(item_brand, str):
            continue
        parts = [part.strip() for part in re.split(r"[\/\(\)]", item_brand) if part.strip()]
        best_match = None
        best_score = 0
        for part in parts:
            # extractOne returns None when there are no usable choices or
            # nothing reaches the cutoff; unpacking it directly would crash.
            found = process.extractOne(part, prods_brands, scorer=fuzz.ratio,
                                       score_cutoff=threshold)
            if found is None:
                continue
            match, score, _ = found
            # Reject matches whose length differs from the part by more than 30%.
            if abs(len(part) - len(match)) / len(part) > 0.3:
                continue
            if score > best_score:
                best_match = match
                best_score = score
        if best_match:
            brand_mapping[item_brand] = best_match
    return brand_mapping

preprocess/utills/common/extracters.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import re
def extract_years(text):
    """
    Find an age statement: a 1-2 digit number followed by 'лет' or 'years'
    (case-insensitive), e.g. '50 лет' or '21 years'.

    Returns the number and the word joined by a single space (the word keeps
    the case it had in the text), or None when nothing is found.
    """
    found = re.search(r'\b(?<!\d)(\d{1,2})\s*(лет|years)\b', text, re.IGNORECASE)
    if found is None:
        return None
    number, unit = found.groups()
    return f"{number} {unit}"
def extract_production_year(text):
    """
    Find a production year: a standalone four-digit number in 1900-2099.

    Returns the year as a string (e.g. '2019') or None when there is none.
    """
    found = re.search(r'\b(19\d{2}|20\d{2})\b', text)
    return found.group(1) if found else None
def extract_alcohol_content(text):
    """
    Find the alcohol content, e.g. '40%' or '12,5 %'.

    Returns it with spaces removed and the decimal comma replaced by a dot,
    or None when no percentage is present.
    """
    found = re.search(r'(\d{1,2}(?:[.,]\d+)?\s*%)', text)
    if not found:
        return None
    raw = found.group(1)
    without_spaces = raw.replace(' ', '')
    return without_spaces.replace(',', '.')
def is_volume(value):
    """
    Validate a volume: return it as a float when it is at most 10 litres.

    Returns None for larger values and for input that cannot be converted
    to a float (including None).
    """
    try:
        volume = float(value)
    except (TypeError, ValueError):
        return None
    return volume if volume <= 10 else None
def extract_volume_or_number(text):
    """
    Extract a volume in litres, or a bare decimal number, from a string.

    Examples: '0,75л', '1,5 л', '1 литр', '0.5'.

    The litre unit must be a complete word ('л' or 'литр...'), so an age such
    as '12 лет' is no longer read as '12 л'. Returns a float, or None when no
    volume is found or the value exceeds 10 litres.
    """
    # Number followed by the litre unit as a whole word.
    match_with_l = re.search(r'(\d+(?:[.,]\d+)?)\s*(?:литр[а-яё]*|л)\b', text, re.IGNORECASE)
    if match_with_l:
        return is_volume(match_with_l.group(1).replace(',', '.'))
    # Otherwise look for a bare decimal number that is not a catalogue number.
    match_number = re.search(r'(?<!№)\b(\d{1,2}(?:[\.,]\d+))\b(?!\s*(№|-er|er|\d{3,}))', text)
    if match_number:
        return is_volume(match_number.group(1).replace(',', '.'))
    return None

preprocess/utills/common/parallel_brand_mutching.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import re
+from ahocorasick import Automaton
+from rapidfuzz import fuzz, process
+from unidecode import unidecode
+from pqdm.threads import pqdm
def normalize(text):
    """
    Lower-case the text and transliterate it with ``unidecode``.
    """
    lowered = text.lower()
    return unidecode(lowered)
def build_regex_for_brands(brands):
    """
    Normalise the brands and build one regular expression for exact search.

    Returns the compiled pattern and a dict {normalised name: original name};
    when two brands normalise to the same text the first one is kept.

    Brands that normalise to an empty string are skipped, because an empty
    alternative would match everywhere. With no usable brands the returned
    pattern never matches.
    """
    norm_to_brand = {}
    for brand in brands:
        norm_brand = normalize(brand)
        if not norm_brand:
            continue
        norm_to_brand.setdefault(norm_brand, brand)
    if not norm_to_brand:
        # '(?!)' is a negative lookahead that can never succeed.
        return re.compile(r'(?!)'), norm_to_brand
    alternation = '|'.join(re.escape(nb) for nb in norm_to_brand)
    pattern = re.compile(r'\b(?:' + alternation + r')\b')
    return pattern, norm_to_brand
def process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold):
    """
    Find a brand for one string.

    1. Search the normalised string with the combined exact-match pattern.
    2. If that fails, split the string on '/', '(' and ')', add the whole
       string as a final candidate, and fuzzy-match each candidate against
       the normalised brands, keeping the highest score.

    Returns a tuple (original string, matched brand or None).
    """
    exact = regex_pattern.search(normalize(s))
    if exact:
        return s, norm_to_brand[exact.group(0)]

    candidates = [chunk.strip() for chunk in re.split(r"[\/\(\)]", s) if chunk.strip()]
    candidates.append(s)  # also try the string as a whole

    winner, top_score = None, 0
    for candidate in candidates:
        hit = process.extractOne(normalize(candidate), norm_brand_list,
                                 scorer=fuzz.ratio, score_cutoff=threshold)
        if hit is None:
            continue
        _, score, position = hit
        if score <= top_score:
            continue
        winner, top_score = index_to_brand[position], score
        if top_score == 100:
            # A perfect score cannot be beaten.
            break
    return s, (winner if winner else None)
def check_brands_in_strings_pqdm(strings, brands, threshold=85, n_jobs=8):
    """
    Look for brands in strings, tolerating spelling and transliteration variants.

    Every string first goes through an exact regular-expression search and,
    if that fails, through a fuzzy search. The strings are processed with
    pqdm, which also displays progress.

    :param strings: Strings to search in.
    :param brands: Brands to search for.
    :param threshold: Similarity threshold for the fuzzy search.
    :param n_jobs: Number of pqdm workers.
    :return: dict {string: matched brand}; strings without a match are omitted.
    """
    # Parallel lists: normalised brand text and the original brand at the same index.
    index_to_brand = list(brands)
    norm_brand_list = [normalize(original) for original in index_to_brand]

    # Combined pattern for the exact search.
    regex_pattern, norm_to_brand = build_regex_for_brands(brands)

    results = pqdm(
        strings,
        lambda s: process_string(s, regex_pattern, norm_to_brand,
                                 norm_brand_list, index_to_brand, threshold),
        n_jobs=n_jobs,
    )
    return {s: matched_brand for s, matched_brand in results if matched_brand}

preprocess/utills/common/top_inserts.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from preprocess.utils.common.extracters import *
+from preprocess.utils.common.utils import *
+from preprocess.utils.common.parallel_brand_matching import *
+from tqdm import tqdm
+import re
+import math
+import numpy as np
def top_inserts_matching(other_brands, p_brands, items, th=65):
    """
    Assign product brands to items whose cleaned name is a frequent candidate.

    Each candidate is reduced to its part before the first '/', with 'Шато'
    removed, and fuzzy-matched against ``p_brands`` using threshold ``th``.
    Matches are written into ``items['new_brand']`` for rows whose
    ``new_name`` equals the mapped key. ``items`` is modified in place.
    """
    # Reduced key -> original candidate, so a match can be mapped back.
    # NOTE(review): ``len(pieces) > 2`` needs two slashes; a name with a
    # single slash is not mapped back -- confirm whether ``> 1`` was meant.
    replaced = {}
    for full_name in other_brands:
        pieces = full_name.split('/')
        if len(pieces) > 2:
            replaced[pieces[0].replace('Шато', '')] = full_name
        elif 'Шато' in full_name:
            replaced[full_name.replace('Шато', '')] = full_name

    heads = [name.split('/')[0].replace('Шато', '') for name in other_brands]
    head_to_brand = check_brands_in_strings_pqdm(heads, p_brands, threshold=th)

    result = {replaced.get(head, head): brand for head, brand in head_to_brand.items()}
    items.loc[items['new_name'].isin(result.keys()), 'new_brand'] = items['new_name'].map(result)
def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """
    Derive brand candidates for items that still have no brand.

    Steps, all applied to ``items`` in place:
      1. Clean the name of every row without ``new_brand`` (attributes,
         type words, grape names, other words, punctuation) and store the
         result in a new ``new_name`` column.
      2. Count how often each cleaned name occurs and keep the names that
         occur more than a threshold derived from the mean and standard
         deviation of the counts.
      3. Match those frequent names against ``p_brands`` and write the
         matches into ``new_brand``.

    :param items: items DataFrame with ``name`` and ``new_brand`` columns.
    :param p_brands: product brands to match against.
    :param process_text: callable returning an 8-tuple whose first element
        is the cleaned text.
    :param types: type words to trim.
    :param grape_varieties: grape names to trim.
    :param onther_words: additional words to trim.
    """
    result = {}
    for n in tqdm(items[items['new_brand'].isna()]['name'].values):
        name, *_ = process_text(n)
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        name = name.replace('.', '').replace(',', '').replace('(', '').replace(')', '')
        result[n] = clean_wine_name(name).strip()
    items['new_name'] = None
    items.loc[items['name'].isin(result.keys()), 'new_name'] = items['name'].map(result)

    # Count all names in one pass instead of filtering the frame per name.
    counts = items['new_name'].value_counts()
    u_nn = list(items[~items['new_name'].isna()]['new_name'].unique())
    res = {name: int(counts[name]) for name in u_nn if counts[name] > 1}
    if not res:
        # No repeated names: there are no candidates, and the mean/std below
        # would be NaN.
        return

    frequencies = np.array(list(res.values()))
    # NOTE(review): '//' is floor division here; confirm it is intended.
    th = math.sqrt(((frequencies.mean() + frequencies.std()) ** 2) // 2)
    other_brands = [i for i, j in res.items() if j > th]
    reess = check_brands_in_strings_pqdm(other_brands, p_brands)
    items.loc[items['new_name'].isin(reess.keys()), 'new_brand'] = items['new_name'].map(reess)
    top_inserts_matching(other_brands, p_brands, items)

preprocess/utills/common/utils.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import re
+from tqdm import tqdm
def remove_quotes(text):
    """Return ``text`` with every double quote and apostrophe removed."""
    quote_chars = re.compile(r'["\']')
    return quote_chars.sub('', text)
def remove_l(text):
    """
    Drop the stand-alone letter "л" (any case) from ``text``.

    The letter is removed only when it forms a word of its own; runs of two
    or more whitespace characters are then collapsed to one space and the
    result is stripped.
    """
    without_letter = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
    # Removing the letter can leave doubled spaces behind; squeeze them.
    squeezed = re.sub(r'\s{2,}', ' ', without_letter)
    return squeezed.strip()
def clean_wine_name(name):
    """
    Remove a stand-alone single letter from the end of the string.

    Only a one-letter word (Latin or Cyrillic) that is preceded by whitespace
    and followed by nothing but optional whitespace is removed, together with
    that surrounding whitespace. For example, "токай   л" becomes "токай".
    """
    # \s+                – one or more whitespace characters;
    # \b[letter]\b       – exactly one Latin or Cyrillic letter as a whole word;
    # \s*$               – optional whitespace up to the end of the string.
    trailing_single_letter = r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$'
    return re.sub(trailing_single_letter, '', name)
def find_full_word(text, word_list):
    """
    Find the first entry of ``word_list`` that occurs in ``text`` as a whole word.

    Entries are tried in list order and compared case-insensitively, so the
    result is the first matching *list entry*, not the leftmost match in the
    text.

    :param text: String to search in. Non-string values (``None``, a pandas
        missing value, ...) are treated as "no match".
    :param word_list: Iterable of words to look for; ``None`` means no words.
    :return: The matching entry exactly as it appears in ``word_list``, or
        ``None`` when nothing matches.
    """
    # Missing cells read from a DataFrame arrive as float NaN, not as str.
    if word_list is None or not isinstance(text, str):
        return None
    for word in word_list:
        pattern = r'\b' + re.escape(word) + r'\b'
        if re.search(pattern, text, re.IGNORECASE):
            return word
    return None
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """
    Fill ``items['new_type_wine']`` with a colour found in each row.

    For every row the first entry of ``colors`` that occurs as a whole word
    in 'type_wine' is used; when 'type_wine' is missing, is not a string or
    contains no colour, 'name' is searched instead. Rows without any match get
    ``None``. The resulting column is finally passed through
    ``Series.replace(color_merge_dict)``.

    :param items: DataFrame with a 'name' column and optionally 'type_wine';
        modified in place.
    :param colors: Iterable of colour words to search for; ``None`` means none.
    :param color_merge_dict: Mapping given to ``Series.replace``; skipped
        when empty or ``None``.
    :return: None.
    """
    colors = [] if colors is None else list(colors)

    def _lookup(value):
        # Missing DataFrame cells are float NaN, which cannot be regex-searched.
        if not isinstance(value, str):
            return None
        return find_full_word(value, colors)

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        color = _lookup(row.get('type_wine'))
        if color is None:
            color = _lookup(row.get('name'))
        result.append(color)

    items['new_type_wine'] = result
    if color_merge_dict:
        items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
def merge_types(items, products):
    """
    Fill ``items['new_type']`` with a product type recognised in each row.

    The known types are the distinct, stripped and lower-cased string values
    of ``products['type']`` plus 'ликёр'. For every item the name is searched
    first; if it contains no known type, the item's own 'type' is searched and,
    failing that, kept as is. Rows with no usable information get 'unmatched',
    and 'ликёр' is normalised to 'ликер'.

    :param items: DataFrame with a 'name' column and optionally 'type';
        modified in place.
    :param products: DataFrame with a 'type' column.
    :return: None.
    """
    # Skip missing (non-string) and blank types: NaN has no .strip(), and an
    # empty word would produce a pattern that matches almost any text.
    alco_types = [
        t.strip().lower()
        for t in products['type'].unique()
        if isinstance(t, str) and t.strip()
    ]
    alco_types.append('ликёр')

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        name = row.get('name')
        found = find_full_word(name, alco_types) if isinstance(name, str) else None
        if found is None:
            item_type = row.get('type')
            if isinstance(item_type, str):
                found = find_full_word(item_type, alco_types)
                if found is None:
                    # Unknown type: keep the item's own value untouched.
                    found = item_type
        result.append(found)

    normalized = []
    for value in result:
        if value is None:
            normalized.append('unmatched')
        elif value == 'ликёр':
            normalized.append('ликер')
        else:
            normalized.append(value)
    items['new_type'] = normalized
def trim_name(text, words_to_remove):
    """
    Remove from ``text`` every whole-word occurrence of the given words.

    Matching is case-insensitive and only whole words are removed. Longer
    entries are tried before shorter ones, so a phrase such as
    "красное сухое" is removed as a unit even when "красное" is also listed.
    Whitespace is collapsed to single spaces and the result is stripped.

    :param text: Source string.
    :param words_to_remove: Iterable of words/phrases to delete; empty entries
        are ignored and ``None`` means nothing to delete.
    :return: The updated string.
    """
    if words_to_remove is None:
        words_to_remove = ()
    # Longest first: regex alternation takes the first alternative that
    # matches, so a short entry would otherwise shadow a longer phrase.
    candidates = sorted((word for word in words_to_remove if word), key=len, reverse=True)

    new_text = text
    if candidates:
        # re.escape keeps special characters inside the words literal.
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in candidates) + r')\b'
        new_text = re.sub(pattern, '', new_text, flags=re.IGNORECASE)

    # Deleting words leaves gaps; squeeze them into single spaces.
    return re.sub(r'\s+', ' ', new_text).strip()
def name_trimmer(df, prcess_text, types_and_others):
    """
    Build cleaned, lower-cased names for every row of ``df``.

    Each row's 'name' is converted to ``str`` and parsed with ``prcess_text``,
    which must return eight values. The first one (the text) has the words in
    ``types_and_others`` removed via ``trim_name`` and commas and dots
    stripped. The sixth and eighth returned values are collected unchanged.

    :param df: DataFrame with the columns 'name' and 'id'.
    :param prcess_text: Callable taking a string and returning an 8-tuple.
    :param types_and_others: Words to remove from the parsed text.
    :return: ``(names_by_id, gbs, sours)`` - a dict mapping row 'id' to the
        cleaned name, and two lists with one entry per row.
    """
    names_by_id = {}
    gb_values = []
    sour_values = []
    for _, record in tqdm(df.iterrows()):
        parsed = prcess_text(str(record['name']))
        text, _alcohol, _volume, _years, _production_year, gb, _color, sour = parsed
        cleaned = trim_name(text, types_and_others)
        cleaned = cleaned.replace(',', '').replace('.', '')
        names_by_id[record['id']] = cleaned.lower().strip()
        gb_values.append(gb)
        sour_values.append(sour)
    return names_by_id, gb_values, sour_values

preprocess/utills/items/attrs.py ADDED Viewed

	@@ -0,0 +1,40 @@

def check_spark(row, col_name='name', types=('Игристое', 'игр')):
    """
    Tell whether a row describes a sparkling wine.

    :param row: Mapping-like row (dict or pandas Series).
    :param col_name: Key of the text field to inspect.
    :param types: Substrings that mark a sparkling wine (case-insensitive).
    :return: 'Игристое' when any marker occurs in the text, otherwise ``None``
        (also when the key is absent, the value is not a string, or the text
        mentions "Пилигрим").
    """
    if col_name not in row.keys():
        return None
    value = row[col_name]
    # Missing cells read from a DataFrame are float NaN, not str.
    if not isinstance(value, str):
        return None
    text = value.lower()
    # "пилигрим" itself contains "игр"; compare in lower case so the
    # exclusion actually takes effect.
    if 'пилигрим' in text:
        return None
    if any(marker.lower() in text for marker in types):
        return 'Игристое'
    return None
def check_color_and_sour(row, col_name='type_wine', types=('Белое', 'Розовое', 'Красное')):
    """
    Tell whether a row's wine-type field mentions one of the given markers.

    :param row: Mapping-like row (dict or pandas Series).
    :param col_name: Key of the text field to inspect.
    :param types: Substrings to look for (case-insensitive).
    :return: 'Вино' when any marker occurs in the field, otherwise ``None``
        (also when the key is absent or the value is not a string).
    """
    if col_name not in row.keys():
        return None
    value = row[col_name]
    # Missing cells read from a DataFrame are float NaN, not str.
    if not isinstance(value, str):
        return None
    text = value.lower()
    if any(marker.lower() in text for marker in types):
        return 'Вино'
    return None
def is_type_exist(row, types):
    """
    Return the first entry of ``types`` contained in ``row['type']``.

    The comparison is a case-insensitive substring test.

    :param row: Mapping-like row with a 'type' key.
    :param types: Iterable of type names to look for.
    :return: The matching entry as given in ``types``, or ``None`` when
        nothing matches or the 'type' value is not a string.
    """
    value = row['type']
    # Missing cells read from a DataFrame are float NaN, not str.
    if not isinstance(value, str):
        return None
    text = value.lower()
    for candidate in types:
        if candidate.lower() in text:
            return candidate
    return None
def check_type(row, types):
    """
    Return the first entry of ``types`` contained in ``row['name']``.

    The comparison is a case-insensitive substring test.

    :param row: Mapping-like row with a 'name' key.
    :param types: Iterable of type names to look for.
    :return: The matching entry as given in ``types``, or ``None`` when
        nothing matches or the 'name' value is not a string.
    """
    value = row['name']
    # Missing cells read from a DataFrame are float NaN, not str.
    if not isinstance(value, str):
        return None
    text = value.lower()
    for candidate in types:
        if candidate.lower() in text:
            return candidate
    return None
def get_type(row, types):
    """
    Determine a row's type, preferring its 'type' field over its name.

    When the row has a 'type' key and ``is_type_exist`` finds a match there,
    that match is returned; in every other case the result of
    ``check_type`` (a search in the row's name) is returned.

    :param row: Mapping-like row (dict or pandas Series).
    :param types: Iterable of type names to look for.
    :return: The matching entry of ``types`` or ``None``.
    """
    if 'type' in row.keys():
        found = is_type_exist(row, types)
        if found is not None:
            return found
    return check_type(row, types)

processor/matching.py ADDED Viewed

	@@ -0,0 +1,157 @@

+from tqdm import tqdm
+from transliterate import translit, detect_language
+import pandas as pd
+from rapidfuzz import fuzz, process
def normalize_name(name):
    """
    Normalise a name for fuzzy comparison.

    When the detected language is Russian the string is transliterated to
    Latin script; in every case the result is lower-cased.

    :param name: Name to normalise. Non-string values (``None``, a pandas
        missing value, ...) yield an empty string.
    :return: The normalised, lower-cased string.
    """
    # Missing cells read from a DataFrame are float NaN and have no .lower().
    if not isinstance(name, str):
        return ''
    try:
        if detect_language(name) == 'ru':
            return translit(name, 'ru', reversed=True).lower()
    except Exception:
        # Deliberate best effort: if language detection or transliteration
        # fails, fall back to plain lower-casing below.
        pass
    return name.lower()
def prepare_groups_with_ids(items_df):
    """
    Group items by (new_brand, type, volume, new_type_wine, sour).

    A 'norm_name' column is computed once with ``normalize_name`` on a copy of
    the input, so the caller's frame is not modified.

    :param items_df: DataFrame with the columns 'new_brand', 'type', 'name',
        'id', 'volume', 'new_type_wine', 'sour'.
    :return: Dict ``{(new_brand, type, volume, new_type_wine, sour):
        [(id, name, norm_name, volume, new_type_wine, sour), ...]}``.
        An input with no groups yields an empty dict.
    """
    items_df = items_df.copy()
    items_df['norm_name'] = items_df['name'].apply(normalize_name)

    key_columns = ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    value_columns = ['id', 'name', 'norm_name', 'volume', 'new_type_wine', 'sour']

    # Iterate the groups directly instead of using groupby.apply: the tuples
    # need the grouping columns, and apply's access to them is deprecated.
    grouped = {}
    for key, group in items_df.groupby(key_columns):
        grouped[key] = list(zip(*(group[column] for column in value_columns)))
    return grouped
def prepare_groups_by_alternative_keys(items_df):
    """
    Group items by (new_type_wine, new_type, volume, sour).

    Each group keeps the id, brand, original name and normalised name of its
    items. The 'norm_name' column is computed on a copy of the input, so the
    caller's frame is not modified.

    :param items_df: DataFrame with the columns 'new_brand', 'new_type_wine',
        'new_type', 'volume', 'name', 'id', 'sour'.
    :return: Dict ``{(new_type_wine, new_type, volume, sour):
        [(id, new_brand, name, norm_name, volume, new_type_wine, sour), ...]}``.
        An input with no groups yields an empty dict.
    """
    items_df = items_df.copy()
    items_df['norm_name'] = items_df['name'].apply(normalize_name)

    key_columns = ['new_type_wine', 'new_type', 'volume', 'sour']
    value_columns = ['id', 'new_brand', 'name', 'norm_name', 'volume', 'new_type_wine', 'sour']

    # Iterate the groups directly instead of using groupby.apply: the tuples
    # need the grouping columns, and apply's access to them is deprecated.
    grouped = {}
    for key, group in items_df.groupby(key_columns):
        grouped[key] = list(zip(*(group[column] for column in value_columns)))
    return grouped
def _collect_fuzzy_matches(norm_query, candidates, name_threshold):
    """
    Fuzzy-match ``norm_query`` against the normalised names of ``candidates``.

    :param norm_query: Normalised product name.
    :param candidates: Sequence of tuples
        ``(id, name, norm_name, volume, new_type_wine, sour)``.
    :param name_threshold: Value passed to rapidfuzz as ``score_cutoff``.
    :return: List of dicts with the keys 'item_id', 'item_name', 'score',
        'volume', 'color' and 'sour', one per match returned by rapidfuzz.
    """
    if not candidates:
        return []
    norm_names = [candidate[2] for candidate in candidates]
    # NOTE(review): no ``limit`` is passed, so rapidfuzz's default cap on the
    # number of returned matches applies - confirm that this is intended.
    matches = process.extract(
        norm_query, norm_names, scorer=fuzz.ratio, score_cutoff=name_threshold
    )
    found = []
    for _matched_name, score, position in matches:
        item_id, item_name, _norm_name, volume, type_wine, sour = candidates[position]
        found.append({
            'item_id': item_id,
            'item_name': item_name,
            'score': score,
            'volume': volume,
            'color': type_wine,
            'sour': sour
        })
    return found


def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """
    Find matching items for every product, keeping the ids of the items found.

    Two passes are made:

    - first, each product is compared with the items of its own group
      (brand, type, volume, new_type_wine, sour);
    - second, products without any match are compared with the items of the
      alternative group (new_type_wine, new_type, volume, sour), excluding
      items whose brand equals the product's brand.

    Names are compared in their normalised form; the original item name is
    reported in the output.

    :param products_df: DataFrame with the columns 'id', 'brand', 'type',
        'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
    :param items_groups: Dict produced by ``prepare_groups_with_ids``.
    :param items_df: Items DataFrame with the columns 'id', 'new_brand',
        'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
    :param name_threshold: Similarity threshold for the fuzzy matching.
    :return: ``products_df`` merged with the columns 'matched_items' (list of
        matches) and 'alternative' (list of alternative matches).
    """
    results = []
    # (position in ``results``, product row, normalised name) for every
    # product that found nothing in its own group.
    unmatched = []

    # First pass: search inside the (brand, type, volume, new_type_wine, sour) group.
    product_rows = tqdm(products_df.iterrows(), total=len(products_df))
    for position, (_, product) in enumerate(product_rows):
        key = (
            product['brand'],
            product['type'],
            product['volume'],
            product['new_type_wine'],
            product['sour'],
        )
        norm_product_name = normalize_name(product['name'])
        matched_items = _collect_fuzzy_matches(
            norm_product_name, items_groups.get(key, []), name_threshold
        )
        if not matched_items:
            # Remember the list position, not the DataFrame index label: the
            # label only equals the position for a default RangeIndex.
            unmatched.append((position, product, norm_product_name))
        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': []  # filled in by the second pass
        })

    # Alternative grouping by (new_type_wine, new_type, volume, sour).
    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

    # Second pass: products without matches are searched in the alternative groups.
    for position, product, norm_product_name in tqdm(unmatched):
        product_brand = product['brand']
        alt_key = (
            product['new_type_wine'],
            product['new_type'],
            product['volume'],
            product['sour'],
        )
        # Drop items of the product's own brand and the brand field itself so
        # the tuples have the layout the matching helper expects.
        candidates = [
            (item_id, name, norm_name, volume, type_wine, sour)
            for item_id, brand, name, norm_name, volume, type_wine, sour
            in groups_by_alternative_keys.get(alt_key, [])
            if brand != product_brand
        ]
        results[position]['alternative'] = _collect_fuzzy_matches(
            norm_product_name, candidates, name_threshold
        )

    # Explicit columns keep 'product_id' available for the merge even when
    # there are no products at all.
    results_df = pd.DataFrame(results, columns=['product_id', 'matched_items', 'alternative'])
    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
    return merged_df

processor/processor.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from preprocess.preprocess import Preprocessor
+from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
class Processor:
    """Run preprocessing and then match products against items."""

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        """Create the underlying ``Preprocessor``; all arguments are forwarded to it unchanged."""
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list,
            type_wine, gbs, colors_for_trim, grapes, other_words,
            sour_merge_dict, type_merge_dict, color_merge_dict,
        )

    def process(self, items, products, th=65):
        """
        Preprocess both frames and compute the matches.

        :param items: Items DataFrame.
        :param products: Products DataFrame.
        :param th: Name-similarity threshold passed to the matcher.
        :return: ``(matches, items, products)`` where ``matches`` is the
            matcher output without the columns 'type', 'type_wine', 'year',
            'alco' and 'gb', and the other two are the preprocessed frames.
        """
        # NOTE(review): ``self`` is passed explicitly to a bound method, so the
        # preprocessor receives this Processor as its first positional
        # argument - confirm against Preprocessor.process's signature.
        items, products = self.preprocessor.process(self, items, products)
        print('-----*-----Matching-----*-----')
        grouped_items = prepare_groups_with_ids(items)
        matches = new_find_matches_with_ids(
            products, grouped_items, items, name_threshold=th
        )
        columns_to_drop = ['type', 'type_wine', 'year', 'alco', 'gb']
        trimmed_matches = matches.drop(columns_to_drop, axis=1)
        return trimmed_matches, items, products

ui/gradio_ui.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import gradio as gr
+import pandas as pd
+import tempfile
class GradioUI:
    """Gradio front end that feeds two uploaded files to a processor."""

    def __init__(self, processor):
        """Keep the object whose ``process`` method does the actual work."""
        self.processor = processor

    def process_files(self, file1, file2, threshold):
        """
        Read both uploads, run the processor and save the result.

        :param file1: Path of the tab-separated "Products" file; malformed
            lines are skipped.
        :param file2: Path of the tab-separated "Items" file.
        :param threshold: Value from the threshold slider.
        :return: Path of a temporary tab-separated ``.csv`` file with the result.
        """
        row_items = pd.read_csv(file2, sep='\t')
        row_products = pd.read_csv(file1, sep='\t', on_bad_lines='skip')
        # NOTE(review): products are passed first here, while
        # Processor.process is declared as (items, products, th) - confirm
        # the intended argument order.
        outcome = self.processor.process(row_products, row_items, threshold)
        df, _items, _products = outcome
        # Reserve a temporary CSV file for the result; it is kept on disk
        # (delete=False) so the path stays valid after the handle is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as handle:
            output_csv = handle.name
        df.to_csv(output_csv, sep='\t', index=False)
        return output_csv

    def run_ui(self):
        """Build the Blocks interface and launch it."""
        with gr.Blocks() as demo:
            gr.Markdown("## Обработка CSV файлов")
            with gr.Row():
                # type="filepath" makes Gradio hand over the file path.
                file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])
            threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
            process_button = gr.Button("Обработать файлы")
            output_file = gr.File(label="Скачать результат (CSV)")
            # Clicking the button calls process_files with the three inputs.
            click_inputs = [file_input1, file_input2, threshold_input]
            process_button.click(
                fn=self.process_files,
                inputs=click_inputs,
                outputs=output_file,
            )
        demo.launch()