Spaces:

Gainward777
/

Product_Matching

Sleeping

App Files Files Community

Gainward777 committed on Apr 9, 2025

Commit

1f22e94

verified ·

1 Parent(s): 15d09e5

Upload 22 files

Browse files

Files changed (9) hide show

api.py +1 -0
app.py +4 -1
constants/constants.py +312 -286
preprocess/preprocess.py +25 -8
preprocess/utils/common/utils.py +18 -3
preprocess/utils/items/attrs.py +1 -1
processor/matching.py +102 -21
processor/processor.py +4 -2
ui/gradio_ui.py +1 -1

api.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import json
 import os
 import datetime

+import csv
 import json
 import os
 import datetime

app.py CHANGED Viewed

@@ -13,7 +13,10 @@ processor=Processor(LONG_TYPES_LIST,
                     OTHER_WORDS,
                     SOUR_MERGE_DICT,
                     TYPES_WINES_DICT,
-                    COLOR_MERGE_DICT)
 searcher=Searcher()

                     OTHER_WORDS,
                     SOUR_MERGE_DICT,
                     TYPES_WINES_DICT,
+                    COLOR_MERGE_DICT,
+                    COUNTRY_LIST,
+                    NORMALIZED_NAMES_ALTERNATIVES_DICT
+                    )
 searcher=Searcher()

constants/constants.py CHANGED Viewed

@@ -1,286 +1,312 @@
-LONG_TYPES_LIST = [
-    "Пиво",
-    "Вино",  # объединяет красное, белое, розовое, игристое и шампанское
-    "Водка",
-    "Виски",
-    "Бурбон",
-    "Коньяк",
-    "Бренди",
-    "Арманьяк",
-    "Ром",
-    "Джин",
-    "Текила",
-    "Мескаль",
-    "Ликер",
-    "Самбука",
-    "Сидр",
-    "Саке",
-    "Абсент",
-    "Граппа",
-    "Портвейн",
-    "Мадера",
-    "Шерри",
-    "Кальвадос",
-    "Писко",
-    "Вермута",
-    "Вермут",
-    "Аперитив",
-    "Биттер",
-    "Эль",
-    "Глинтвейн",
-    "Пунш",
-    "Медовуха",
-    "Ламбик",
-    "Крем-ликер",
-    "Арак",
-    "Чача",
-    "Самогон",
-    "Кумыс",
-    "Сливовица",
-    "Шнапс",
-    "Настойка",
-    "Наливка",
-    "Игристое вино",
-    "Херес",
-    "Пуаре",
-    "Пуарэ",
-    "Ликер",
-    "Ликёр",
-    "Спиртной напиток со вкусом",
-    "Напиток винный",
-    "Винный напиток",
-    "Шомпань",
-    'Сироп',
-    'Конфеты',
-    'Шоколад'
-    'Сок',
-    'Вода',
-    'Табачная продукция',
-    ]
-SHORT_TYPES_LIST=['Вино', "Сидр", "Водка", "Коньяк", "Настойка", "Ликер", "Виски", "Джин"]
-SOUR = [
-        'brut',
-        'semi-sweet',
-        'sweet',
-        'брют',
-        'сухое',
-        'полусухое',
-        'полусладкое',
-        'сладкое',
-        'п/сух',
-        'п/сл',
-        'п/с',
-        'сл',
-        'сух',
-    ]
-WINE_TYPES = [
-    'красное',
-    'белое',
-    'розовое',
-    'роз',
-    'кр',
-    'бел',
-    'розе',
-    'rosso',
-    'roso',
-    'roseto',
-    'rosetto',
-    'red',
-    'white',
-    "игристое",
-    "игр",
-    "шомпанское",
-    "шомп",
-    ]
-GBS = [
-    'cristal decanter in oak gift box',
-    'in the carton gift box with 2 glasses',
-    'decanter in the carton gift box',
-    'in the carton gift box',
-    'in the wooden gift box',
-    'in gift box in the carton',
-    'in gift box in carton',
-    'gift box in the carton',
-    'gift box in carton',
-    'in gift box in the wood',
-    'in gift box in wood',
-    'gift box in the wood',
-    'gift box in wood',
-    'gift box with 2 glasses',
-    'in gift box',
-    'gift box',
-    'in carton',
-    'in wooden case',
-    'in wooden box',
-    'in wood case'
-    'in wood box',
-    'in wood',
-    'хрустальный декантер в подарочной упаковке из дуба',
-    'декантер в подарочной упаковке из картона',
-    'в подарочной упаковке из картона с 2 бокалами'
-    'в подарочной упаковке из картона',
-    'в подарочной упаковке из Дуба',
-    'в П У графин и деревянная коробка',
-    'в подарочной упаковке',
-    'подарочная упаковка',
-    'подарочный набор',
-    'в деревянной коробке',
-    'деревянная коробка',
-    'в п/у+2 бокаланов',
-    'в п/у из картона',
-    'в п/у+бокал',
-    'в п/у (дер.коробке)',
-    'в п/у солома',
-    'в п/у',
-    'в п у',
-    'п/уп',
-    'п/у',
-    'в тубе',
-    'туба',
-    'ПУ',
-    ]
-COLORS_FOR_TRIM = [
-    'красное',
-    'белое',
-    'розовое'
-    'кр',
-    'бел',
-    'розе',
-    'rosso',
-    'roso',
-    'roseto',
-    'rosetto',
-    'red',
-    'white',
-    ]
-GRAPES = [
-    "Каберне Совиньон",
-    "Каберне-Совиньон",
-    "Каберне",# Cabernet Sauvignon
-    "Мерло",               # Merlot
-    "Пино Нуар",           # Pinot Noir
-    "Шардоне",            # Chardonnay
-    "Совиньон Блан",       # Sauvignon Blanc
-    "Сира",                # Syrah
-    "Гренаш",             # Grenache
-    "Рислинг",            # Riesling
-    "Мальбек",            # Malbec
-    "Темпранильо",         # Tempranillo
-    "Зинфандель",         # Zinfandel
-    "Санджовезе",         # Sangiovese
-    "Каберне Фран",       # Cabernet Franc
-    "Вионье",             # Viognier
-    "Мурведр",            # Mourvèdre
-    "Шенен Блан",         # Chenin Blanc
-    "Пино Гри",           # Pinot Grigio
-    "Гевюрцтраминер",     # Gewürztraminer
-    "Неббиоло",           # Nebbiolo
-    "Барбера",            # Barbera
-    "Petit Verdot",       # Petit Verdot (обычно оставляют в оригинале)
-    "Карменер",           # Carmenère
-    "Таннат",             # Tannat
-    "Гамей",              # Gamay
-    "Семильон",           # Semillon
-    "Мускат",             # Muscat
-    "Верментино",         # Vermentino
-    "Фиано",              # Fiano
-    "Аглианико",          # Aglianico
-    "Кариньян",           # Carignan (также может встречаться как Cariñena)
-    "Торронтес",
-    "Рислинг",
-    "Кефессия",
-    "Алиготе",
-    "Фурминт"# Torrontés (особенно для аргентинских вин)
-]
-OTHER_WORDS=[
-    "Игристое",
-    "Жемчужное",
-    "Газированный",
-    "Традиционный",
-    "Двухслойный",
-    "Кофе",
-    "Напиток",
-    "Спиртной",
-    "Горькая",
-    "Виноградная",
-    "Выдержанная",
-    "Шотландский",
-    "Купажированный",
-    "креп",
-    "Ординарный",
-    "Выдержанный",
-    "Отборное",
-    "Десертный",
-    "Вкус",
-    "Сорт",
-    ]
-SOUR_MERGE_DICT={
-    'brut': 'сухое',
-    'брют': 'сухое',
-    'semi-sweet':'полусладкое',
-    'sweet':'сладкое',
-    'сухое':'сухое',
-    'п/сух':'полусухое',
-    'п/сл':'полусладкое',
-    'п/с':'полусухое',
-    'сл':'сладкое',
-    'сух':'сухое',
-    None: 'unmatched',
-    }
-TYPES_WINES_DICT={
-    'Пуарэ':'Слабоалкогольные и энергетические напитки',
-    'Пуаре':'Слабоалкогольные и энергетические напитки',
-    'Сидр':'Слабоалкогольные и энергетические напитки',
-    'Шампань': 'Шампанское',
-    'Игристое': 'Шампанское',
-    'Сироп':'Сиропы',
-    'Арманьяк':'Коньяк',
-    'Бренди':'Коньяк',
-    'Ликер':'Ликеры',
-    'Граппа':'Водка',
-    'Настойка':'Водка',
-    'Конфеты':'Сладости',
-    'Портвейн':'Вино',
-    'Херес':'Вино',
-    'Кальвадос':'Коньяк',
-    'Винный напиток': "Вино",
-    "Игристое вино":'Шампанское',
-    "Самогон": "Водка",
-    }
-COLOR_MERGE_DICT={
-    "кр":'красное',
-    "red":"красное",
-    "бел":"белое",
-    "white":"белое",
-    "роз":'розовое',
-    "розе":'розовое',
-    "roso":'розовое',
-    "rosso":'розовое',
-    "rose":'розовое',
-    "rosetto":'розовое',
-    "roseto":'розовое',
-    "игр":"игристое",
-    "шомп":"шомпанское",
-    None: 'unmatched'
-    }

# Drink / product type names looked up in item data.
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# concatenated by Python, which silently merges two entries into one.
LONG_TYPES_LIST = [
    "Пиво",
    "Вино",  # covers red, white, rosé, sparkling and champagne
    "Водка",
    "Виски",
    "Бурбон",
    "Коньяк",
    "Бренди",
    "Арманьяк",
    "Ром",
    "Джин",
    "Текила",
    "Мескаль",
    "Ликер",
    "Самбука",
    "Сидр",
    "Саке",
    "Абсент",
    "Граппа",
    "Портвейн",
    "Мадера",
    "Шерри",
    "Кальвадос",
    "Писко",
    "Вермута",
    "Вермут",
    "Аперитив",
    "Биттер",
    "Эль",
    "Глинтвейн",
    "Пунш",
    "Медовуха",
    "Ламбик",
    "Крем-ликер",
    "Арак",
    "Чача",
    "Самогон",
    "Кумыс",
    "Сливовица",
    "Шнапс",
    "Настойка",
    "Наливка",
    "Игристое вино",
    "Херес",
    "Пуаре",
    "Пуарэ",
    "Ликер",
    "Ликёр",
    "Спиртной напиток со вкусом",
    "Напиток винный",
    "Винный напиток",
    # NOTE(review): spelled "Шомпань" here but "Шампань" in TYPES_WINES_DICT
    # and OTHER_WORDS - confirm which spelling is intended.
    "Шомпань",
    'Сироп',
    'Конфеты',
    'Шоколад',  # fixed: a missing comma used to merge this with 'Сок'
    'Сок',
    'Вода',
    'Табачная продукция',
]

# Shorter subset of drink type names.
SHORT_TYPES_LIST = ['Вино', "Сидр", "Водка", "Коньяк", "Настойка", "Ликер", "Виски", "Джин"]

# Sweetness descriptors (English and Russian, full words and abbreviations).
SOUR = [
    'brut',
    'semi-sweet',
    'sweet',
    'брют',
    'сухое',
    'полусухое',
    'полусладкое',
    'сладкое',
    'п/сух',
    'п/сл',
    'п/с',
    'сл',
    'сл.',
    'сух',
    'сух.',
]

# Wine colour / style descriptors (full words and abbreviations).
WINE_TYPES = [
    'красное',
    'белое',
    'розовое',
    'роз',
    'кр',
    'крас',
    'бел',
    'розе',
    'rosso',
    'roso',
    'roseto',
    'rosetto',
    'red',
    'white',
    "игристое",
    "игр",
    "шомпанское",
    "шомп",
]

# Gift-box / packaging phrases (English and Russian).
# The entries appear to be ordered from longer phrases to their shorter
# sub-phrases; presumably so the longest phrase matches first - confirm
# against the consumer before reordering.
GBS = [
    'cristal decanter in oak gift box',
    'in the carton gift box with 2 glasses',
    'decanter in the carton gift box',
    'in the carton gift box',
    'in the wooden gift box',
    'in gift box in the carton',
    'in gift box in carton',
    'gift box in the carton',
    'gift box in carton',
    'in gift box in the wood',
    'in gift box in wood',
    'gift box in the wood',
    'gift box in wood',
    'gift box with 2 glasses',
    'in gift box',
    'gift box',
    'in carton',
    'in wooden case',
    'in wooden box',
    'in wood case',  # fixed: a missing comma used to merge this with 'in wood box'
    'in wood box',
    'in wood',
    'хрустальный декантер в подарочной упаковке из дуба',
    'декантер в подарочной упаковке из картона',
    'в подарочной упаковке из картона с 2 бокалами',  # fixed: missing comma
    'в подарочной упаковке из картона',
    'в подарочной упаковке из Дуба',
    'в П У графин и деревянная коробка',
    'в подарочной упаковке',
    'подарочная упаковка',
    'подарочный набор',
    'в деревянной коробке',
    'деревянная коробка',
    'в п/у+2 бокаланов',
    'в п/у из картона',
    'в п/у+бокал',
    'в п/у (дер.коробке)',
    'в п/у солома',
    'в п/у',
    'в п у',
    'п/уп',
    'п/у',
    'в тубе',
    'туба',
    'ПУ',
]

# Colour words passed to the preprocessor as ``colors_for_trim``.
COLORS_FOR_TRIM = [
    'красное',
    'крас',
    'кр',
    'белое',
    'бел',
    'розовое',
    'розе',
    'rose',
    'rosso',
    'roso',
    'roseto',
    'rosetto',
    'red',
    'white',
]

# Grape variety names (Russian spelling unless noted).
GRAPES = [
    "Каберне Совиньон",    # Cabernet Sauvignon
    "Каберне-Совиньон",
    "Каберне",
    "Мерло",               # Merlot
    "Пино Нуар",           # Pinot Noir
    "Шардоне",             # Chardonnay
    "Совиньон Блан",       # Sauvignon Blanc
    "Сира",                # Syrah
    "Гренаш",              # Grenache
    "Рислинг",             # Riesling
    "Мальбек",             # Malbec
    "Темпранильо",         # Tempranillo
    "Зинфандель",          # Zinfandel
    "Санджовезе",          # Sangiovese
    "Каберне Фран",        # Cabernet Franc
    "Вионье",              # Viognier
    "Мурведр",             # Mourvèdre
    "Шенен Блан",          # Chenin Blanc
    "Пино Гри",            # Pinot Grigio
    "Гевюрцтраминер",      # Gewürztraminer
    "Неббиоло",            # Nebbiolo
    "Барбера",             # Barbera
    "Petit Verdot",        # Petit Verdot (usually left in the original spelling)
    "Карменер",            # Carmenère
    "Таннат",              # Tannat
    "Гамей",               # Gamay
    "Семильон",            # Semillon
    "Мускат",              # Muscat
    "Верментино",          # Vermentino
    "Фиано",               # Fiano
    "Аглианико",           # Aglianico
    "Кариньян",            # Carignan (may also appear as Cariñena)
    "Торронтес",           # Torrontés (especially for Argentine wines)
    "Рислинг",
    "Кефессия",
    "Алиготе",
    "Фурминт",
]

# Generic descriptive words; the preprocessor concatenates these with the type,
# sweetness and country lists into the word list used for name trimming.
OTHER_WORDS = [
    "Шампанское",
    "Шампань",
    "Игристое",
    "Жемчужное",
    "Газированный",
    "Традиционный",
    "Двухслойный",
    "Кофе",
    "Напиток",
    "Спиртной",
    "Горькая",
    "Виноградная",
    "Выдержанная",
    "Шотландский",
    "Купажированный",
    "креп",
    "Ординарный",
    "Выдержанный",
    "Отборное",
    "Десертный",
    "Вкус",
    "Сорт",
    "односолод.",
]

# Maps sweetness spellings / abbreviations to one canonical value;
# a missing value (None) becomes 'unmatched'.
SOUR_MERGE_DICT = {
    'brut': 'брют',
    'semi-sweet': 'полусладкое',
    'sweet': 'сладкое',
    'сухое': 'сухое',
    'п/сух': 'полусухое',
    'п/сух.': 'полусухое',
    'п/сл': 'полусладкое',
    'п/сл.': 'полусладкое',
    'п/с': 'полусухое',
    'сл': 'сладкое',
    'сл.': 'сладкое',
    'сух': 'сухое',
    'сух.': 'сухое',
    None: 'unmatched',
}

# Maps specific drink types to a broader category name;
# a missing value (None) becomes 'unmatched'.
TYPES_WINES_DICT = {
    'Пуарэ': 'Слабоалкогольные и энергетические напитки',
    'Пуаре': 'Слабоалкогольные и энергетические напитки',
    'Сидр': 'Слабоалкогольные и энергетические напитки',
    'Шампань': 'Шампанское',
    'Игристое': 'Шампанское',
    'Сироп': 'Сиропы',
    'Арманьяк': 'Коньяк',
    'Бренди': 'Коньяк',
    'Ликер': 'Ликер',
    'Ликёр': 'Ликер',
    'Граппа': 'Водка',
    'Настойка': 'Водка',
    'Конфеты': 'Сладости',
    'Портвейн': 'Вино',
    'Херес': 'Вино',
    'Кальвадос': 'Коньяк',
    'Винный напиток': "Вино",
    "Игристое вино": 'Шампанское',
    "Самогон": "Водка",
    None: 'unmatched',
}

# Maps colour / style abbreviations to one canonical value;
# a missing value (None) becomes 'unmatched'.
# NOTE(review): "rosso" is Italian for red, yet it is mapped to 'розовое'
# (rosé) here - confirm this is intended.
COLOR_MERGE_DICT = {
    "кр": 'красное',
    "крас": 'красное',
    "red": "красное",
    "бел": "белое",
    "white": "белое",
    "роз": 'розовое',
    "розе": 'розовое',
    "roso": 'розовое',
    "rosso": 'розовое',
    "rose": 'розовое',
    "rosetto": 'розовое',
    "roseto": 'розовое',
    "игр": "игристое",
    "шомп": "шомпанское",
    None: 'unmatched',
}

# Country names; the preprocessor adds these to the words trimmed from names.
COUNTRY_LIST = [
    "Франция",
    "Испания",
    "Италия",
    "Шотландия",
]

# Canonical spelling -> list of alternative (transliterated) spellings that
# should be folded into it when names are normalized.
NORMALIZED_NAMES_ALTERNATIVES_DICT = {
    "M&H": ["em end ejch"],
    "peats beast": ["pits bist"],
    "xo": ["ho"],
}

preprocess/preprocess.py CHANGED Viewed

@@ -14,7 +14,8 @@ class Preprocessor():
     def __init__(self, long_types_list, short_types_list, sour_list,
                  type_wine, gbs, colors_for_trim, grapes, other_words,
-                 sour_merge_dict, type_merge_dict, color_merge_dict):
         self.long_types_list=long_types_list
         self.short_types_list=short_types_list
@@ -24,10 +25,19 @@ class Preprocessor():
         self.colors_ft=colors_for_trim
         self.grapes=grapes
         self.other_words=other_words
-        self.types_n_others=long_types_list+other_words
         self.sour_dict=sour_merge_dict
         self.type_dict=type_merge_dict
         self.color_merge_dict=color_merge_dict
     def process_items(self, df):
@@ -41,8 +51,10 @@ class Preprocessor():
                 if 'brand' in i.keys():
                     result['brand'].append(i['brand'])
                 else: result['brand'].append(None)
-                result['name'].append(i['name'])
-                result['fullname'].append(i['name'])
                 drink_type=get_type(i, self.long_types_list)
                 if drink_type is None:
                     drink_type=check_spark(i)
@@ -52,6 +64,8 @@ class Preprocessor():
                     drink_type=check_spark(i, col_name='type_wine')
                 if drink_type is None:
                     drink_type=check_color_and_sour(i, types=self.sour)
                 #if 'type' in i.keys():
                 result['type'].append(drink_type)#i['type'])
                 #else: dd['type'].append(None)
@@ -116,6 +130,8 @@ class Preprocessor():
         if volume_or_number is not None:
             volume_with_comma=str(volume_or_number).replace('.', ',')
             text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
             test=clean_wine_name(text) #remove_l(text)
             #text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
         # else:
@@ -175,12 +191,12 @@ class Preprocessor():
         items['type']=items['type'].replace(self.type_dict)
-        print('-----*-----Unwrap brend cats step 1-----*-----')
         unwrap_b_match=unwrap_brands(products)
         items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
         products["brand"] = products["brand"].replace(unwrap_b_match)
-        print('-----*-----Unwrap brend cats step 2-----*-----')
         unwrap_b_match=unwrap_brands(products)
         items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
         products["brand"] = products["brand"].replace(unwrap_b_match)
@@ -198,9 +214,9 @@ class Preprocessor():
         print('-----*-----Adding service categories-----*-----')
         merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
-        merge_types(items, products)
         merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
-        merge_types(products, products)
         print('-----*-----Name trimming-----*-----')
@@ -210,6 +226,7 @@ class Preprocessor():
         items['gb']=gb
         items['sour']=sour
         items['sour']=items['sour'].replace(self.sour_dict)
         products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
         products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
         products['gb']=gb

     def __init__(self, long_types_list, short_types_list, sour_list,
                  type_wine, gbs, colors_for_trim, grapes, other_words,
+                 sour_merge_dict, type_merge_dict, color_merge_dict,
+                 country_list, normalized_names_dict):
         self.long_types_list=long_types_list
         self.short_types_list=short_types_list
         self.colors_ft=colors_for_trim
         self.grapes=grapes
         self.other_words=other_words
+        self.types_n_others=long_types_list+other_words+sour_list+country_list
+        self.types_n_others.remove("Шерри")
         self.sour_dict=sour_merge_dict
         self.type_dict=type_merge_dict
         self.color_merge_dict=color_merge_dict
+        self.country_list = country_list
+        self.normalized_names_dict=normalized_names_dict
+    def preprocess_name(self, name):
+        return name.replace("\n", " ")
     def process_items(self, df):
                 if 'brand' in i.keys():
                     result['brand'].append(i['brand'])
                 else: result['brand'].append(None)
+                name = self.preprocess_name(i['name'])
+                result['name'].append(name)
+                result['fullname'].append(name)
                 drink_type=get_type(i, self.long_types_list)
                 if drink_type is None:
                     drink_type=check_spark(i)
                     drink_type=check_spark(i, col_name='type_wine')
                 if drink_type is None:
                     drink_type=check_color_and_sour(i, types=self.sour)
+                if drink_type is None:
+                    drink_type=check_color_and_sour(i, col_name='name')
                 #if 'type' in i.keys():
                 result['type'].append(drink_type)#i['type'])
                 #else: dd['type'].append(None)
         if volume_or_number is not None:
             volume_with_comma=str(volume_or_number).replace('.', ',')
             text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
+            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
+            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
             test=clean_wine_name(text) #remove_l(text)
             #text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
         # else:
         items['type']=items['type'].replace(self.type_dict)
+        print('-----*-----Unwrap brand cats step 1-----*-----')
         unwrap_b_match=unwrap_brands(products)
         items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
         products["brand"] = products["brand"].replace(unwrap_b_match)
+        print('-----*-----Unwrap brand cats step 2-----*-----')
         unwrap_b_match=unwrap_brands(products)
         items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
         products["brand"] = products["brand"].replace(unwrap_b_match)
         print('-----*-----Adding service categories-----*-----')
         merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
+        merge_types(items, products, type_merge_dict=self.type_dict)
         merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
+        merge_types(products, products, type_merge_dict=self.type_dict)
         print('-----*-----Name trimming-----*-----')
         items['gb']=gb
         items['sour']=sour
         items['sour']=items['sour'].replace(self.sour_dict)
         products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
         products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
         products['gb']=gb

preprocess/utils/common/utils.py CHANGED Viewed

@@ -64,6 +64,7 @@ def merge_wine_type(items, colors=None, color_merge_dict=None):
     result=[]
     for row in tqdm(items.iterrows()):
         try:
             if row[1]['type_wine'] is not None:
                 color=find_full_word(row[1]['type_wine'], colors)
                 if color is not None:
@@ -88,12 +89,22 @@ def merge_wine_type(items, colors=None, color_merge_dict=None):
     items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
-def merge_types(items, products):
     alco_types=[i.strip().lower() for i in products['type'].unique()]
     alco_types.append('ликёр')
     result=[]
     for row in tqdm(items.iterrows()):
         try:
             type_in_name=find_full_word(row[1]['name'], alco_types)
             if type_in_name is not None:
                 result.append(type_in_name)
@@ -111,7 +122,8 @@ def merge_types(items, products):
             result.append(None)
     items['new_type']=result
-    items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
 def trim_name(text, words_to_remove):
@@ -125,7 +137,7 @@ def trim_name(text, words_to_remove):
     # Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
     # Используем re.escape, чтобы экранировать спецсимволы в словах.
     pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
-    #print(pattern)
     # Заменяем найденные полные слова на пустую строку.
     new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
@@ -141,8 +153,11 @@ def name_trimmer(df, prcess_text, types_and_others):
     gbs=[]
     sours=[]
     for idx, row in tqdm(df.iterrows()):
         text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
         text=trim_name(text, types_and_others).replace(',','').replace('.','')
         result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
         gbs.append(gb)

     result=[]
     for row in tqdm(items.iterrows()):
         try:
+            #print("merge_wine_type:" + str(row))
             if row[1]['type_wine'] is not None:
                 color=find_full_word(row[1]['type_wine'], colors)
                 if color is not None:
     items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
+def merge_types(items, products, type_merge_dict={}, sub_alco_types=["Бренди", "Шампань", "Шампанское"]):
     alco_types=[i.strip().lower() for i in products['type'].unique()]
     alco_types.append('ликёр')
     result=[]
     for row in tqdm(items.iterrows()):
         try:
+            # Parameter 'sub_alco_types' specifies specific alcohol types that usually specified
+            # in product / item name along with "parent" type and in this case this subtype should have priority
+            # For example, "Вино Шампано Ле Брён де Нёвиль", or "Бренди де Херес"
+            if sub_alco_types:
+                type_in_name=find_full_word(row[1]['name'], sub_alco_types)
+                if type_in_name is not None:
+                    result.append(type_in_name)
+                    continue
             type_in_name=find_full_word(row[1]['name'], alco_types)
             if type_in_name is not None:
                 result.append(type_in_name)
             result.append(None)
     items['new_type']=result
+    #items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
+    items['new_type'] = items['new_type'].replace(type_merge_dict)
 def trim_name(text, words_to_remove):
     # Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
     # Используем re.escape, чтобы экранировать спецсимволы в словах.
     pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
+    #print("Pattern: " + pattern)
     # Заменяем найденные полные слова на пустую строку.
     new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
     gbs=[]
     sours=[]
     for idx, row in tqdm(df.iterrows()):
+        #print("Name1: " + str(row['name']))
         text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
+        #print("Name2: " + text)
         text=trim_name(text, types_and_others).replace(',','').replace('.','')
+        #print("Name3: " + text)
         result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
         gbs.append(gb)

preprocess/utils/items/attrs.py CHANGED Viewed

@@ -6,7 +6,7 @@ def check_spark(row, col_name='name', types=['Игристое', 'игр']):
         return None
-def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное']):
     if col_name in row.keys():
         for t in types:
             if t.lower() in row[col_name].lower():

         return None
+def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное', 'крас.', 'бел.']):
     if col_name in row.keys():
         for t in types:
             if t.lower() in row[col_name].lower():

processor/matching.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 from tqdm import tqdm
 from transliterate import translit, detect_language
@@ -6,6 +7,7 @@ import pandas as pd
 from rapidfuzz import fuzz, process
 import numpy as np
 from math import isnan
 def normalize_name(name):
@@ -20,6 +22,41 @@ def normalize_name(name):
         pass
     return name.lower()
 def prepare_groups_with_ids(items_df):
     """
     Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
@@ -31,11 +68,14 @@ def prepare_groups_with_ids(items_df):
     :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
-    items_df['norm_name'] = items_df['name'].apply(normalize_name)
     grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
         lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
     ).to_dict()
     return grouped
 def prepare_groups_by_alternative_keys(items_df):
@@ -47,13 +87,23 @@ def prepare_groups_by_alternative_keys(items_df):
     :return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
-    items_df['norm_name'] = items_df['name'].apply(normalize_name)
-    grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
         lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
     ).to_dict()
     return grouped
 def order_by_best_year(matched_items, year):
     best_matched_items = []
@@ -61,30 +111,37 @@ def order_by_best_year(matched_items, year):
     other_matched_items = []
     max_year = 0
-    if year:
-        year = int(str(year))
     for mi in matched_items:
         # Если в оригинале указан год, то ищем точное совпадение, иначе сортируем по году в обратном порядке
         try:
-            if year and (not np.isnan(year)) and (int(year) != 0) and (not np.isnan(mi['year'])) and (mi['year'] == year):
-                best_matched_items.append(mi)
-            elif mi['year'] and (not np.isnan(mi['year'])) and int(mi['year']) != 0:
-                if int(mi['year']) > max_year:
                     max_year_matched_items = [mi]
-                    max_year = int(mi['year'])
-                elif int(mi['year']) > max_year:
                     max_year_matched_items.append(mi)
                 else:
-                    other_matched_items.append(mi)
             else:
-                other_matched_items.append(mi)
         except Exception as ex:
-            print("Error processing best year for item " + str(mi["item_id"]) + " " + str(ex))
-    best_matched_items.extend(max_year_matched_items)
-    best_matched_items.extend(other_matched_items)
-    return best_matched_items
 def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
@@ -121,14 +178,19 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
         product_sour = product['sour']
         key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
         items_data = items_groups.get(key, [])
         if items_data:
             # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
             items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
         else:
             items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
-        norm_product_name = normalize_name(product_name)
         matches = process.extract(
             norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
         )
@@ -168,6 +230,7 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
         # Второй проход: для продуктов без совпадений ищем по альтернативным группам
         for idx, product in tqdm(no_match_products):
             product_brand = product['brand']
             product_type_wine = product['new_type_wine']
             product_type = product['new_type']
@@ -175,19 +238,37 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
             product_name = product['name']
             product_sour = product['sour']
-            alt_key = (product_type_wine, product_type, product_volume, product_sour)
             type_items = groups_by_alternative_keys.get(alt_key, [])
             # Фильтруем, исключая итемы с исходным брендом
             filtered_items = [item for item in type_items if item[1] != product_brand]
             if filtered_items:
                 alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
             else:
                 alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
-            norm_product_name = normalize_name(product_name)
             alt_matches = process.extract(
-                norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
             )
             alt_matched_items = [
                 {
                     'item_id': alt_ids[idx_candidate],

 import json
+from constants.constants import *
 from tqdm import tqdm
 from transliterate import translit, detect_language
 from rapidfuzz import fuzz, process
 import numpy as np
 from math import isnan
+from preprocess.utils.common.utils import *
 def normalize_name(name):
         pass
     return name.lower()
def normalize_name_ex(name):
    """
    Normalize a name and fold known alternative spellings into canonical ones.

    The name is first passed through ``normalize_name``. Then, for every entry
    of ``NORMALIZED_NAMES_ALTERNATIVES_DICT``, the entry's value is handed to
    ``find_full_word`` together with the current name; a truthy result is
    replaced in the name by the entry's key.

    :param name: raw product or item name.
    :return: the normalized name with the substitutions applied.
    """
    normalized = normalize_name(name)
    for canonical, alternatives in NORMALIZED_NAMES_ALTERNATIVES_DICT.items():
        # Presumably find_full_word returns the alternative that occurs in the
        # name as a whole word (or a falsy value) - verify against its definition.
        found = find_full_word(normalized, alternatives)
        if not found:
            continue
        # NOTE(review): str.replace substitutes every occurrence of the found
        # text, including occurrences inside longer words.
        normalized = normalized.replace(found, canonical)
    return normalized
def compare_names(name1, name2, scorer=fuzz.ratio, score_cutoff=50):
    """
    Score how well the words of ``name1`` are matched by the words of ``name2``.

    Both names are split on single spaces and every word pair is scored with
    ``scorer``. Pair scores at or above ``score_cutoff`` are summed and the
    sum is divided by ``100 * (number of words in name1)``.

    The result is not capped: one word of ``name1`` may accumulate scores from
    several words of ``name2``, so values above 1.0 are possible. The function
    also prints a trace of every comparison to stdout.

    :param name1: reference name; its word count normalizes the score.
    :param name2: candidate name.
    :param scorer: callable ``(str, str) -> number`` used for each word pair.
    :param score_cutoff: minimum pair score that contributes to the total.
    :return: the normalized accumulated score.
    """
    print("Scoring: " + name1 + " vs " + name2)
    left_words = name1.split(" ")
    right_words = name2.split(" ")
    total = 0
    for left in left_words:
        for right in right_words:
            pair_score = scorer(left, right)
            print("\t " + left + " - " + right + " ; " + str(pair_score))
            if pair_score >= score_cutoff:
                total = total + pair_score
    # split(" ") always yields at least one element, so the divisor is never zero.
    result = total / (100*len(left_words))
    print("Score result: " + str(result))
    return result
def compare_name_with_list(name, names_list, scorer=fuzz.ratio, score_cutoff=50):
    """
    Score ``name`` against every entry of ``names_list`` with ``compare_names``.

    :param name: reference name.
    :param names_list: iterable of candidate names.
    :param scorer: word-pair scorer forwarded to ``compare_names``.
    :param score_cutoff: cutoff forwarded to ``compare_names``.
    :return: list of ``(candidate, score, position)`` tuples in input order,
        where ``position`` is the candidate's zero-based index in ``names_list``.
    """
    return [
        (candidate, compare_names(name, candidate, scorer, score_cutoff), position)
        for position, candidate in enumerate(names_list)
    ]
 def prepare_groups_with_ids(items_df):
     """
     Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
     :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
+    items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)
     grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
         lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
     ).to_dict()
+    #print(grouped)
     return grouped
 def prepare_groups_by_alternative_keys(items_df):
     :return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
+    items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)
+    #grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume' ''', 'sour''''']).apply(
+    grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume']).apply(
         lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
     ).to_dict()
     return grouped
def parse_year(year):
    """
    Convert a raw year value to ``int``, or return ``False`` when unusable.

    Accepts anything ``float()`` understands: ints, floats, numpy numeric
    scalars and numeric strings such as ``"2019"`` or ``"2019.0"``. Missing or
    malformed values (``None``, ``""``, NaN, infinity, ``"n/a"``, containers)
    yield ``False`` instead of raising, so callers can simply test the result
    for truthiness.

    :param year: raw year value of any type.
    :return: the year as ``int`` (fractional part truncated), or ``False``.
    """
    try:
        # float() rejects None and non-numeric text (TypeError / ValueError);
        # int() rejects NaN (ValueError) and infinity (OverflowError).
        parsed = int(float(year))
    except (TypeError, ValueError, OverflowError):
        return False
    # A zero year is treated as "no year", as falsy inputs always were.
    return parsed or False
 def order_by_best_year(matched_items, year):
     best_matched_items = []
     other_matched_items = []
     max_year = 0
+    year = parse_year(year)
     for mi in matched_items:
         # Если в оригинале указан год, то ищем точное совпадение, иначе сортируем по году в обратном порядке
         try:
+            if isinstance(mi['year'], (int, float, str)):
+                mi_year = int(mi['year'])
+            else:
+                mi_year = False
+            if year and mi_year and (mi_year == year):
+                best_matched_items.append(mi['item_id'])
+            elif mi_year:
+                if mi_year > max_year:
                     max_year_matched_items = [mi]
+                    max_year = mi_year
+                elif mi_year == max_year:
                     max_year_matched_items.append(mi)
                 else:
+                    other_matched_items.append(mi['item_id'])
             else:
+                other_matched_items.append(mi['item_id'])
         except Exception as ex:
+            print("Error processing best year for item " + str(mi["item_id"]) + " value " + str(mi['year']) + ": " + str(ex))
+    if len(best_matched_items) > 0:
+        for m in matched_items:
+            if not m['item_id'] in best_matched_items:
+                m['score'] = m['score']*0.8
+    return matched_items
 def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
         product_sour = product['sour']
         key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
+        #print("Name: " + product_name)
+        #print("Key: " + str(key))
+        #print("Groups: " + str(items_groups))
         items_data = items_groups.get(key, [])
         if items_data:
             # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
+            #print("Data: " + str(items_data))
             items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
         else:
+            #print("Data: No")
             items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
+        norm_product_name = normalize_name_ex(product_name)
         matches = process.extract(
             norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
         )
         # Второй проход: для продуктов без совпадений ищем по альтернативным группам
         for idx, product in tqdm(no_match_products):
+            #print("Product: " + str(product))
             product_brand = product['brand']
             product_type_wine = product['new_type_wine']
             product_type = product['new_type']
             product_name = product['name']
             product_sour = product['sour']
+            #alt_key = (product_type_wine, product_type, product_volume, product_sour)
+            alt_key = (product_type_wine, product_type, product_volume)
+            #print("AltName: " + str(product))
+            #print("AltKey: " + str(alt_key))
+            #print("AltGroups: " + str(groups_by_alternative_keys))
+            #print("AltGroups Keys: " + str(groups_by_alternative_keys.keys()))
             type_items = groups_by_alternative_keys.get(alt_key, [])
+            #print("AltGroups2: " + str(type_items))
             # Фильтруем, исключая итемы с исходным брендом
             filtered_items = [item for item in type_items if item[1] != product_brand]
             if filtered_items:
+                #print("AltData: " + str(filtered_items))
                 alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
             else:
+                #print("AltData: No")
                 alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
+            norm_product_name = normalize_name_ex(product_name)
+            #print("norm_product_name: " + str(norm_product_name))
+            #print("alt_norm_names: " + str(alt_norm_names))
             alt_matches = process.extract(
+                norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=50
             )
+            #alt_matches = compare_name_with_list(
+            #    norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=70
+            #)
+            #print("alt_matches: " + str(alt_matches))
             alt_matched_items = [
                 {
                     'item_id': alt_ids[idx_candidate],

processor/processor.py CHANGED Viewed

@@ -5,11 +5,13 @@ from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
 class Processor():
     def __init__(self, long_types_list, short_types_list, sour_list,
                  type_wine, gbs, colors_for_trim, grapes, other_words,
-                 sour_merge_dict, type_merge_dict, color_merge_dict):
         self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
                  type_wine, gbs, colors_for_trim, grapes, other_words,
-                 sour_merge_dict, type_merge_dict, color_merge_dict)
     def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
         items, products=self.preprocessor.process(products, items)

 class Processor():
     def __init__(self, long_types_list, short_types_list, sour_list,
                  type_wine, gbs, colors_for_trim, grapes, other_words,
+                 sour_merge_dict, type_merge_dict, color_merge_dict,
+                 country_list, normalized_names_dict):
         self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
                  type_wine, gbs, colors_for_trim, grapes, other_words,
+                 sour_merge_dict, type_merge_dict, color_merge_dict,
+                 country_list, normalized_names_dict)
     def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
         items, products=self.preprocessor.process(products, items)

ui/gradio_ui.py CHANGED Viewed

@@ -87,7 +87,7 @@ class GradioUI():
                 output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
                 output_csv = os.path.join(results_path, output_csv)
-                df.to_csv(output_csv, sep='\t', index=False, quotechar="'", quoting=csv.QUOTE_NONE)
                 return output_csv
         except Exception as ex:
             raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)

                 output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
                 output_csv = os.path.join(results_path, output_csv)
+                df.to_csv(output_csv, sep='\t', index=False, quotechar="'", quoting=csv.QUOTE_NONE, escapechar="@")
                 return output_csv
         except Exception as ex:
             raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)