Spaces:
Sleeping
Sleeping
Upload 12 files
#1
by
Gainward777 - opened
- app.py +27 -37
- constants/constants.py +285 -0
- preprocess/preprocess.py +223 -0
- preprocess/utills/common/brand_matching.py +137 -0
- preprocess/utills/common/extracters.py +66 -0
- preprocess/utills/common/parallel_brand_mutching.py +97 -0
- preprocess/utills/common/top_inserts.py +66 -0
- preprocess/utills/common/utils.py +130 -0
- preprocess/utills/items/attrs.py +40 -0
- processor/matching.py +157 -0
- processor/processor.py +25 -0
- ui/gradio_ui.py +45 -0
app.py
CHANGED
|
@@ -1,37 +1,27 @@
|
|
| 1 |
-
|
| 2 |
-
from
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
|
| 30 |
-
|
| 31 |
-
process_button = gr.Button("Обработать файлы")
|
| 32 |
-
output_file = gr.File(label="Скачать результат (CSV)")
|
| 33 |
-
|
| 34 |
-
# При нажатии кнопки вызывается функция process_files
|
| 35 |
-
process_button.click(fn=process_files, inputs=[file_input1, file_input2, threshold_input], outputs=output_file)
|
| 36 |
-
|
| 37 |
-
demo.launch()
|
|
|
|
| 1 |
+
from processor.processor import Processor
|
| 2 |
+
from constants.constants import *
|
| 3 |
+
from ui.gradio_ui import GradioUI
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Arguments for Processor, in the exact positional order of the call:
# the word lists first, then the three merge dictionaries.
_processor_args = (
    LONG_TYPES_LIST,
    SHORT_TYPES_LIST,
    SOUR,
    WINE_TYPES,
    GBS,
    COLORS_FOR_TRIM,
    GRAPES,
    OTHER_WORDS,
    SOUR_MERGE_DICT,
    TYPES_WINES_DICT,
    COLOR_MERGE_DICT,
)
processor = Processor(*_processor_args)

# Hand the configured processor to the Gradio UI wrapper and run it.
ui = GradioUI(processor)
ui.run_ui()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
constants/constants.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Product-type words (mostly Russian drink categories, plus a few non-drink
# goods at the end). Order and duplicates are kept exactly as they were.
# NOTE(review): "Шомпань" is spelled "Шампань" in TYPES_WINES_DICT - confirm
# which spelling the catalogue data actually uses.
LONG_TYPES_LIST = [
    "Пиво",
    "Вино",  # covers red, white, rosé, sparkling and champagne
    "Водка",
    "Виски",
    "Бурбон",
    "Коньяк",
    "Бренди",
    "Арманьяк",
    "Ром",
    "Джин",
    "Текила",
    "Мескаль",
    "Ликер",
    "Самбука",
    "Сидр",
    "Саке",
    "Абсент",
    "Граппа",
    "Портвейн",
    "Мадера",
    "Шерри",
    "Кальвадос",
    "Писко",
    "Вермута",
    "Вермут",
    "Аперитив",
    "Биттер",
    "Эль",
    "Глинтвейн",
    "Пунш",
    "Медовуха",
    "Ламбик",
    "Крем-ликер",
    "Арак",
    "Чача",
    "Самогон",
    "Кумыс",
    "Сливовица",
    "Шнапс",
    "Настойка",
    "Наливка",
    "Игристое вино",
    "Херес",
    "Пуаре",
    "Пуарэ",
    "Ликер",
    "Ликёр",
    "Спиртной напиток со вкусом",
    "Напиток винный",
    "Винный напиток",
    "Шомпань",
    'Сироп',
    'Конфеты',
    # The comma after 'Шоколад' was missing, so Python concatenated it with
    # the next literal into the single entry 'ШоколадСок'.
    'Шоколад',
    'Сок',
    'Вода',
    'Табачная продукция',
]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Reduced list of product-type words (a subset of LONG_TYPES_LIST).
SHORT_TYPES_LIST = [
    "Вино",
    "Сидр",
    "Водка",
    "Коньяк",
    "Настойка",
    "Ликер",
    "Виски",
    "Джин",
]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# Sweetness markers: English words, full Russian words, then abbreviations.
# The order is unchanged from the original list.
SOUR = [
    # English
    "brut",
    "semi-sweet",
    "sweet",
    # Russian, full words
    "брют",
    "сухое",
    "полусухое",
    "полусладкое",
    "сладкое",
    # Russian, abbreviated
    "п/сух",
    "п/сл",
    "п/с",
    "сл",
    "сух",
]
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# Wine colour / style markers in Russian, Italian-looking and English
# spellings, followed by sparkling markers. Order is unchanged.
WINE_TYPES = [
    # Russian colours and their abbreviations
    "красное",
    "белое",
    "розовое",
    "роз",
    "кр",
    "бел",
    "розе",
    # Latin-script spellings
    "rosso",
    "roso",
    "roseto",
    "rosetto",
    "red",
    "white",
    # sparkling (spelling of the last two kept as in the source data)
    "игристое",
    "игр",
    "шомпанское",
    "шомп",
]
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# Gift-box / packaging phrases (English first, then Russian). Longer phrases
# are listed before the shorter phrases they contain - presumably so that the
# longest phrase wins when the list is scanned in order (confirm against
# find_full_word).
GBS = [
    'cristal decanter in oak gift box',
    'in the carton gift box with 2 glasses',
    'decanter in the carton gift box',
    'in the carton gift box',
    'in the wooden gift box',
    'in gift box in the carton',
    'in gift box in carton',
    'gift box in the carton',
    'gift box in carton',
    'in gift box in the wood',
    'in gift box in wood',
    'gift box in the wood',
    'gift box in wood',
    'gift box with 2 glasses',
    'in gift box',
    'gift box',
    'in carton',
    'in wooden case',
    'in wooden box',
    # A comma was missing here: 'in wood case' and 'in wood box' used to be
    # concatenated into the single entry 'in wood casein wood box'.
    'in wood case',
    'in wood box',
    'in wood',
    'хрустальный декантер в подарочной упаковке из дуба',
    'декантер в подарочной упаковке из картона',
    # Same defect: the next two phrases were fused by a missing comma.
    'в подарочной упаковке из картона с 2 бокалами',
    'в подарочной упаковке из картона',
    'в подарочной упаковке из Дуба',
    'в П У графин и деревянная коробка',
    'в подарочной упаковке',
    'подарочная упаковка',
    'подарочный набор',
    'в деревянной коробке',
    'деревянная коробка',
    'в п/у+2 бокаланов',
    'в п/у из картона',
    'в п/у+бокал',
    'в п/у (дер.коробке)',
    'в п/у солома',
    'в п/у',
    'в п у',
    'п/уп',
    'п/у',
    'в тубе',
    'туба',
    'ПУ',
]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Colour words that Preprocessor.prcess_text strips out of product names.
COLORS_FOR_TRIM = [
    'красное',
    'белое',
    # The comma after 'розовое' was missing, which fused it with 'кр' into
    # the single entry 'розовоекр' and dropped both real markers.
    'розовое',
    'кр',
    'бел',
    'розе',
    'rosso',
    'roso',
    'roseto',
    'rosetto',
    'red',
    'white',
]
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# Grape variety names, mostly in Russian transliteration. Order and the
# duplicated "Рислинг" entry are kept as they were.
GRAPES = [
    "Каберне Совиньон",  # Cabernet Sauvignon
    "Каберне-Совиньон",
    "Каберне",
    "Мерло",  # Merlot
    "Пино Нуар",  # Pinot Noir
    "Шардоне",  # Chardonnay
    "Совиньон Блан",  # Sauvignon Blanc
    "Сира",  # Syrah
    "Гренаш",  # Grenache
    "Рислинг",  # Riesling
    "Мальбек",  # Malbec
    "Темпранильо",  # Tempranillo
    "Зинфандель",  # Zinfandel
    "Санджовезе",  # Sangiovese
    "Каберне Фран",  # Cabernet Franc
    "Вионье",  # Viognier
    "Мурведр",  # Mourvèdre
    "Шенен Блан",  # Chenin Blanc
    # This literal contained two U+FFFD replacement characters instead of
    # the letter "и", so it could never match real text.
    "Пино Гри",  # Pinot Grigio
    "Гевюрцтраминер",  # Gewürztraminer
    "Неббиоло",  # Nebbiolo
    "Барбера",  # Barbera
    "Petit Verdot",  # Petit Verdot (usually left in the original spelling)
    "Карменер",  # Carmenère
    "Таннат",  # Tannat
    "Гамей",  # Gamay
    "Семильон",  # Semillon
    "Мускат",  # Muscat
    "Верментино",  # Vermentino
    "Фиано",  # Fiano
    "Аглианико",  # Aglianico
    "Кариньян",  # Carignan (may also appear as Cariñena)
    "Торронтес",  # Torrontés (especially for Argentine wines)
    "Рислинг",
    "Кефессия",
    "Алиготе",
    "Фурминт",
]
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# Generic descriptive words. Preprocessor concatenates this list with
# LONG_TYPES_LIST and passes the result to name_trimmer.
OTHER_WORDS = [
    'Игристое',
    'Жемчужное',
    'Газированный',
    'Традиционный',
    'Двухслойный',
    'Кофе',
    'Напиток',
    'Спиртной',
    'Горькая',
    'Виноградная',
    'Выдержанная',
    'Шотландский',
    'Купажированный',
    'креп',
    'Ординарный',
    'Выдержанный',
    'Отборное',
    'Десертный',
    'Вкус',
    'Сорт',
]
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# Maps sweetness markers onto one canonical Russian word; a missing value
# (None) is mapped to the label 'unmatched'.
SOUR_MERGE_DICT = {
    # English markers
    "brut": "брют",
    "semi-sweet": "полусладкое",
    "sweet": "сладкое",
    # Russian full form (identity mapping)
    "сухое": "сухое",
    # Russian abbreviations
    "п/сух": "полусухое",
    "п/сл": "полусладкое",
    "п/с": "полусухое",
    "сл": "сладкое",
    "сух": "сухое",
    # no marker found
    None: "unmatched",
}
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
# Maps specific product-type words onto broader categories. Preprocessor
# applies it to the 'type' column of both items and products.
TYPES_WINES_DICT = {
    "Пуарэ": "Слабоалкогольные и энергетические напитки",
    "Пуаре": "Слабоалкогольные и энергетические напитки",
    "Сидр": "Слабоалкогольные и энергетические напитки",
    "Шампань": "Шампанское",
    "Игристое": "Шампанское",
    "Сироп": "Сиропы",
    "Арманьяк": "Коньяк",
    "Бренди": "Коньяк",
    "Ликер": "Ликеры",
    "Граппа": "Водка",
    "Настойка": "Водка",
    "Конфеты": "Сладости",
    "Портвейн": "Вино",
    "Херес": "Вино",
    "Кальвадос": "Коньяк",
    "Винный напиток": "Вино",
    "Игристое вино": "Шампанское",
    "Самогон": "Водка",
}
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# Maps colour / style markers onto one canonical Russian word; a missing
# value (None) is mapped to the label 'unmatched'.
COLOR_MERGE_DICT = {
    "кр": 'красное',
    "red": "красное",
    "бел": "белое",
    "white": "белое",
    "роз": 'розовое',
    "розе": 'розовое',
    "roso": 'розовое',
    # "rosso" is Italian for red (vino rosso); it used to be mapped to
    # 'розовое' (rosé), which put red wines into the wrong group.
    "rosso": 'красное',
    "rose": 'розовое',
    "rosetto": 'розовое',
    "roseto": 'розовое',
    "игр": "игристое",
    "шомп": "шомпанское",
    None: 'unmatched',
}
|
| 285 |
+
|
preprocess/preprocess.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
from preprocess.utils.items.attrs import *
|
| 4 |
+
from preprocess.utils.common.extracters import *
|
| 5 |
+
from preprocess.utils.common.brand_matching import *
|
| 6 |
+
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
+
from preprocess.utils.common.utils import *
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Preprocessor():
    """Bring the ``items`` and ``products`` catalogues to a common shape.

    The instance only stores the word lists and merge dictionaries it is
    given. The actual extraction and matching is done by helpers that are
    star-imported at the top of this module.
    """

    # Column layout shared by the frames built in process_items and
    # process_products (same order as the original result dictionaries).
    _RESULT_COLUMNS = ('id', 'brand', 'name', 'type', 'type_wine',
                       'volume', 'year', 'alco')

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        """Store the vocabularies used by the processing steps.

        ``types_n_others`` is ``long_types_list + other_words``; it is the
        word list handed to ``name_trimmer`` in :meth:`process`.
        """
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words
        self.types_n_others = long_types_list + other_words
        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict

    def process_items(self, df):
        """Flatten the JSON ``attrs`` column of ``df`` into a new DataFrame.

        ``df`` must have ``id`` and ``attrs`` columns. Each ``attrs`` value
        is parsed with ``json.loads`` and must contain ``name``; ``brand``,
        ``volume``, ``year`` and ``type_wine`` are optional. Missing volume
        and year are extracted from the name instead.

        A row that raises during parsing is reported with ``print`` and
        skipped as a whole. Previously the values appended before the error
        stayed in the result, which left columns of different lengths and
        made ``pd.DataFrame`` fail.

        :return: DataFrame with the columns listed in ``_RESULT_COLUMNS``.
        """
        result = {column: [] for column in self._RESULT_COLUMNS}
        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                attrs = json.loads(raw_attrs)
                name = attrs['name']

                # The original read ``self.types``, which is never assigned
                # and raised AttributeError for every row.
                # NOTE(review): long_types_list is assumed to be the intended
                # vocabulary - confirm against get_type's callers.
                drink_type = get_type(attrs, self.long_types_list)
                # Fallbacks, each tried only while no type has been found.
                if drink_type is None:
                    drink_type = check_spark(attrs)
                if drink_type is None:
                    drink_type = check_color_and_sour(attrs)
                if drink_type is None:
                    drink_type = check_spark(attrs, col_name='type_wine')
                if drink_type is None:
                    drink_type = check_color_and_sour(attrs, types=self.sour)

                if 'volume' in attrs:
                    volume = attrs['volume']
                else:
                    volume = extract_volume_or_number(name)

                if 'year' in attrs:
                    year = attrs['year']
                else:
                    year = extract_production_year(name)

                record = {
                    'id': idf,
                    'brand': attrs.get('brand'),
                    'name': name,
                    'type': drink_type,
                    'type_wine': attrs.get('type_wine'),
                    'volume': volume,
                    'year': year,
                    'alco': extract_alcohol_content(name),
                }
            except Exception as ex:
                print(idf, ex)
                continue
            # Append only complete rows so every column keeps the same length.
            for column in self._RESULT_COLUMNS:
                result[column].append(record[column])
        return pd.DataFrame(result)

    def process_products(self, products):
        """Convert the products catalogue to the same layout as the items.

        Reads the columns ``id``, ``brand``, ``category``, ``product_type``,
        ``name_long``, ``name`` and ``name_postfix``. Volume and alcohol are
        extracted from ``name``, the year from ``name_postfix``.

        A row that raises is reported with ``print`` and skipped as a whole,
        so the result columns always have the same length.

        :return: DataFrame with the columns listed in ``_RESULT_COLUMNS``.
        """
        result = {column: [] for column in self._RESULT_COLUMNS}
        for _, row in tqdm(products.iterrows()):
            try:
                record = {
                    'id': row['id'],
                    'brand': row['brand'],
                    'name': row['name_long'],
                    'type': row['product_type'],
                    'type_wine': row['category'],
                    'volume': extract_volume_or_number(row['name']),
                    'year': extract_production_year(str(row['name_postfix'])),
                    'alco': extract_alcohol_content(row['name']),
                }
            except Exception as ex:
                print(ex)
                continue
            for column in self._RESULT_COLUMNS:
                result[column].append(record[column])
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip recognised attributes out of ``text`` one after another.

        Removed in this order: gift-box phrase, alcohol content, volume,
        age statement, production year, colour word, sweetness word. Each
        found value is also returned.

        :return: tuple ``(cleaned_text, alcohol, volume_or_number, years,
                 production_year, gb, color, sour)``; an entry is ``None``
                 when nothing was found for it.
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')

        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            # The extractor normalises to a dot, so remove both spellings.
            alco_w_comma = alcohol.replace('.', ',')
            text = text.replace(str(alcohol), '').replace(str(alco_w_comma), '')

        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_with_comma = str(volume_or_number).replace('.', ',')
            text = text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
            # NOTE(review): the result is bound to ``test``, not ``text``, so
            # it is discarded. This looks like a typo, but clean_wine_name is
            # not visible here, so behaviour is left unchanged - confirm.
            test = clean_wine_name(text)

        years = extract_years(text)
        if years is not None:
            text = text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')

        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')

        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')

        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')

        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour

    def process(self, items, products):
        """Run the whole preprocessing pipeline on copies of both frames.

        :param items: raw items frame (see :meth:`process_items`).
        :param products: raw products frame (see :meth:`process_products`).
        :return: tuple ``(items, products)`` of the processed DataFrames.
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())
        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())

        # Brands are compared as trimmed lower-case strings. str() also turns
        # missing values into text, e.g. None becomes 'none' (undone below).
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        # Adds the 'new_brand' column to items (the helper's default column).
        fill_brands_in_dataframe(products['brand'].unique(), items)

        print('-----*-----Brand matching-----*-----')
        comp_list, _prod_brand_list, items_brand_list = get_same_brands(products, items)
        # Item brands with no exact counterpart among the product brands.
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type'] = items['type'].replace(self.type_dict)

        print('-----*-----Unwrap brend cats step 1-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        # Second pass over the already-replaced product brands.
        print('-----*-----Unwrap brend cats step 2-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Finding brands in names-----*-----')
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items[items['new_brand'].isna()]['name'].values
        p_brands = [i for i in products['brand'].unique() if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list,
                                self.grapes, self.other_words)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        # NOTE(review): products is passed as both arguments here, while the
        # call above passes (items, products) - confirm this is intended.
        merge_types(products, products)

        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour = name_trimmer(items, self.types_n_others)
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb'] = gb
        items['sour'] = sour
        items['sour'] = items['sour'].replace(self.sour_dict)
        products_trimed_names, gb, sour = name_trimmer(products, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb'] = gb
        products['sour'] = sour
        products['sour'] = products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)

        return items, products
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
preprocess/utills/common/brand_matching.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
import re
|
| 3 |
+
from ahocorasick import Automaton
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def contains_full_word(word, text, case_sensitive=True):
    """Tell whether ``word`` occurs in ``text`` as a separate word.

    ``word`` is regex-escaped and wrapped in word-boundary anchors.
    ``case_sensitive`` controls whether letter case must match.
    """
    if case_sensitive:
        flags = 0
    else:
        flags = re.IGNORECASE
    whole_word = r'\b' + re.escape(word) + r'\b'
    found = re.search(whole_word, text, flags)
    return found is not None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def unwrap_brands(products):
    """Map each product brand to a shorter brand it contains as a whole word.

    String brands from ``products['brand']`` are sorted by length and
    compared pairwise, ignoring case. When several shorter brands fit, the
    one visited last (the longest) wins, as before.

    Empty and whitespace-only brands are now skipped. Used as a search
    word they produce a pattern that matches almost any other brand, which
    would map real brands to an empty string.

    :param products: DataFrame with a ``brand`` column.
    :return: dict ``{longer_brand: contained_shorter_brand}``.
    """
    brands = sorted(
        (b for b in products['brand'].unique() if isinstance(b, str) and b.strip()),
        key=len,
    )

    res = {}
    for short in tqdm(brands):
        # Same whole-word, case-insensitive test as contains_full_word, but
        # compiled once per brand and not once per pair.
        whole_word = re.compile(r'\b' + re.escape(short) + r'\b', re.IGNORECASE)
        for candidate in brands:
            if candidate != short and whole_word.search(candidate):
                res[candidate] = short
    return res
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def split_n_match(products, items, th_len=3):
    """Map item brands that contain '/' to a product brand found inside them.

    Only product brands longer than ``th_len`` characters are considered,
    and the match is a case-sensitive whole-word match. When several product
    brands fit the same item brand, the one visited last wins.

    Non-string brands are skipped, as the other helpers in this module do;
    previously they raised TypeError.

    :param products: DataFrame with a ``brand`` column.
    :param items: DataFrame with a ``brand`` column.
    :param th_len: product brands of this length or shorter are ignored.
    :return: dict ``{item_brand_with_slash: product_brand}``.
    """
    conditionally_spited = [
        brand for brand in tqdm(items['brand'].unique())
        if isinstance(brand, str) and '/' in brand
    ]

    result = {}
    for prod_brand in tqdm(products['brand'].unique()):
        # The length test does not depend on the inner loop, so do it once.
        if not isinstance(prod_brand, str) or len(prod_brand) <= th_len:
            continue
        for item_brand in conditionally_spited:
            if contains_full_word(prod_brand, item_brand):
                result[item_brand] = prod_brand
    return result
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def fill_brands_in_dataframe(brands, df, col_name='new_brand', is_brand=True):
    """Write the brand found in each row's ``name`` into ``df[col_name]``.

    The frame is modified in place and nothing is returned.

    :param brands: iterable of brand names; non-string and empty entries
                   are ignored.
    :param df: DataFrame with a ``name`` column (and a ``brand`` column
               when ``is_brand`` is true).
    :param col_name: column that receives the result.
    :param is_brand: when true, rows with no brand found in the name fall
                     back to the row's existing ``brand`` value; otherwise
                     they get ``None``.
    """
    # Aho-Corasick automaton for a fast first pass over all brands at once.
    automaton = Automaton()
    has_brands = False
    for idx, brand in enumerate(brands):
        if isinstance(brand, str) and brand:
            automaton.add_word(brand.lower(), (idx, brand))
            has_brands = True

    # NOTE(review): pyahocorasick appears to refuse searching an automaton
    # that has no keys, so it is only finalised and queried when at least
    # one brand was added - confirm against the installed version.
    if has_brands:
        automaton.make_automaton()

    def find_brand(name):
        """Return the longest brand present in ``name`` as a whole word."""
        if not has_brands or not isinstance(name, str):
            return None
        lowered = name.lower()
        # The automaton finds substrings; keep only whole-word occurrences.
        matched_brands = {
            brand
            for _, (_, brand) in automaton.iter(lowered)
            if re.search(rf'\b{re.escape(brand.lower())}\b', lowered)
        }
        if not matched_brands:
            return None
        # Sort first so ties between equally long brands are resolved the
        # same way on every run (set order is not stable for strings).
        return max(sorted(matched_brands), key=len)

    if is_brand:
        df[col_name] = df.apply(lambda row: find_brand(row['name']) or row['brand'], axis=1)
    else:
        df[col_name] = df.apply(lambda row: find_brand(row['name']), axis=1)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def get_same_brands(products, items):
    """Collect the brands that appear in both catalogues.

    :param products: DataFrame with a ``brand`` column.
    :param items: DataFrame with a ``new_brand`` column.
    :return: tuple ``(comp_list, prod_brand_list, items_brand_list)``:
             the product brands also present among the item brands (in
             product order), then the unique brands of each frame as lists.
    """
    prod_brand_list = list(products['brand'].unique())
    items_brand_list = list(items['new_brand'].unique())

    # Set membership replaces a list scan per product brand.
    known_item_brands = set(items_brand_list)
    comp_list = [brand for brand in tqdm(prod_brand_list) if brand in known_item_brands]

    return comp_list, prod_brand_list, items_brand_list
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def match_brands_improved(items_brands, prods_brands, threshold=85):
    """Fuzzy-match item brands against product brands.

    Each string item brand is split on '/', '(' and ')'. Every part is
    compared with ``prods_brands`` using ``fuzz.ratio``. A candidate is
    accepted when its score reaches ``threshold`` and its length differs
    from the part's length by at most 30 percent. The best-scoring accepted
    candidate over all parts is kept.

    :param items_brands: brands from the items frame; non-strings are skipped.
    :param prods_brands: brands from the products frame.
    :param threshold: minimum similarity score.
    :return: dict ``{item_brand: closest_product_brand}``; item brands with
             no acceptable candidate are left out.
    """
    brand_mapping = {}

    for item_brand in tqdm(items_brands):
        if not isinstance(item_brand, str):
            continue

        parts = [part.strip() for part in re.split(r"[\/\(\)]", item_brand) if part.strip()]
        best_match = None
        best_score = 0

        for part in parts:
            found = process.extractOne(part, prods_brands, scorer=fuzz.ratio)
            # extractOne yields None when there is nothing to choose from
            # (e.g. an empty prods_brands); unpacking it used to raise.
            if found is None:
                continue
            match, score, _ = found
            # Filter by score threshold and by relative length difference.
            if score >= threshold and abs(len(part) - len(match)) / len(part) <= 0.3:
                if score > best_score:
                    best_match = match
                    best_score = score

        if best_match:
            brand_mapping[item_brand] = best_match

    return brand_mapping
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
|
preprocess/utills/common/extracters.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def extract_years(text):
    """Return an age statement such as '50 лет' or '21 years', or None.

    Looks for a one- or two-digit number followed by 'лет' or 'years'
    (case-insensitive). The unit keeps the letter case it has in ``text``,
    and the result always has one space between number and unit.
    """
    found = re.search(r'\b(?<!\d)(\d{1,2})\s*(лет|years)\b', text, re.IGNORECASE)
    if found is None:
        return None
    number, unit = found.group(1), found.group(2)
    return f"{number} {unit}"
|
| 14 |
+
|
| 15 |
+
def extract_production_year(text):
    """Return the first standalone four-digit year (1900-2099) as a string.

    Example: '2019'. Returns None when ``text`` contains no such number.
    """
    found = re.search(r'\b(19\d{2}|20\d{2})\b', text)
    return found.group(1) if found else None
|
| 24 |
+
|
| 25 |
+
def extract_alcohol_content(text):
    """Return the alcohol content found in ``text``, e.g. '40%', or None.

    Matches a one- or two-digit number, optionally with a decimal part,
    followed by '%'. Spaces are removed and a decimal comma becomes a dot,
    so '12,5 %' is returned as '12.5%'.

    The number must not be the tail of a longer number. Previously '100%'
    (as in '100% агавы') was reported as '00%'.
    """
    # (?<!\d)      - not preceded by a digit: skips the '00' of '100%'.
    # (?<!\d[.,])  - not the fractional part of a longer number ('100.5%').
    match = re.search(r'(?<!\d)(?<!\d[.,])(\d{1,2}(?:[.,]\d+)?\s*%)', text)
    if match:
        return match.group(1).replace(' ', '').replace(',', '.')
    return None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def is_volume(value):
    """Return ``value`` as a float if it is a valid volume (<= 10), else None.

    Values that cannot be converted to ``float`` give None. This now
    includes ``None`` and other non-numeric objects, which used to raise
    TypeError because only ValueError was handled.
    """
    try:
        volume = float(value)
    except (TypeError, ValueError):
        return None
    return volume if volume <= 10 else None
|
| 46 |
+
|
| 47 |
+
def extract_volume_or_number(text):
|
| 48 |
+
"""
|
| 49 |
+
Извлекает объем в литрах или число с плавающей точкой из строки.
|
| 50 |
+
Например: '0,75л', '0.5', или '1,5 л'.
|
| 51 |
+
"""
|
| 52 |
+
# Попытка найти объем с буквой 'л' или без пробела перед ней
|
| 53 |
+
match_with_l = re.search(r'(\d+(?:[\.,]\d+)?\s*[лЛ]|(?:\d+(?:[\.,]\d+)?[лЛ]))', text)
|
| 54 |
+
if match_with_l:
|
| 55 |
+
return is_volume(match_with_l.group(1).replace(',', '.').replace('л', '').replace('Л', '').strip())
|
| 56 |
+
|
| 57 |
+
# Если не найдено, ищем просто число с плавающей точкой
|
| 58 |
+
match_number = re.search(r'(?<!№)\b(\d{1,2}(?:[\.,]\d+))\b(?!\s*(№|-er|er|\d{3,}))', text)
|
| 59 |
+
if match_number:
|
| 60 |
+
return is_volume(match_number.group(1).replace(',', '.'))
|
| 61 |
+
|
| 62 |
+
return None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
preprocess/utills/common/parallel_brand_mutching.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from ahocorasick import Automaton
|
| 3 |
+
from rapidfuzz import fuzz, process
|
| 4 |
+
from unidecode import unidecode
|
| 5 |
+
from pqdm.threads import pqdm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def normalize(text):
    """
    Lower-case ``text`` and transliterate it to ASCII/Latin with ``unidecode``.
    """
    lowered = text.lower()
    return unidecode(lowered)
|
| 13 |
+
|
| 14 |
+
def build_regex_for_brands(brands):
    """
    Normalize the brands and build one regular expression for exact lookup.

    :param brands: Iterable of brand names.
    :return: Tuple ``(pattern, norm_to_brand)`` where ``pattern`` is a compiled
             whole-word alternation over the normalized names and
             ``norm_to_brand`` maps each normalized name to the first original
             brand that produced it.
    """
    norm_to_brand = {}
    for brand in brands:
        norm_brand = normalize(brand)
        # An empty alternative would match at every word boundary and make the
        # caller look up '' in norm_to_brand, so such brands are skipped.
        if norm_brand:
            norm_to_brand.setdefault(norm_brand, brand)

    if not norm_to_brand:
        # No usable brands: return a pattern that can never match.
        return re.compile(r'(?!)'), norm_to_brand

    alternatives = '|'.join(re.escape(nb) for nb in norm_to_brand)
    pattern = re.compile(r'\b(?:' + alternatives + r')\b')
    return pattern, norm_to_brand
|
| 26 |
+
|
| 27 |
+
def process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold):
    """
    Look for a brand in a single string.

    1. Try an exact hit with the combined regular expression.
    2. Otherwise split the string on '/', '(' and ')' and fuzzy-match every
       part (plus the whole string) against the normalized brand list.

    :param s: Source string.
    :param regex_pattern: Compiled pattern over normalized brand names.
    :param norm_to_brand: Mapping normalized name -> original brand.
    :param norm_brand_list: Normalized brand names used for fuzzy matching.
    :param index_to_brand: Original brands, aligned by position with ``norm_brand_list``.
    :param threshold: Minimal ``fuzz.ratio`` score for a fuzzy hit.
    :return: Tuple ``(s, brand or None)``.
    """
    exact = regex_pattern.search(normalize(s))
    if exact is not None:
        return s, norm_to_brand[exact.group(0)]

    # Fuzzy stage: individual fragments first, the full string last.
    candidates = [chunk.strip() for chunk in re.split(r"[\/\(\)]", s) if chunk.strip()]
    candidates.append(s)

    top_brand, top_score = None, 0
    for candidate in candidates:
        hit = process.extractOne(
            normalize(candidate), norm_brand_list, scorer=fuzz.ratio, score_cutoff=threshold
        )
        if hit is None:
            continue
        _, score, position = hit
        if score > top_score:
            top_brand, top_score = index_to_brand[position], score
        if top_score == 100:
            # A perfect score cannot be improved on.
            break

    return s, (top_brand if top_brand else None)
|
| 58 |
+
|
| 59 |
+
def check_brands_in_strings_pqdm(strings, brands, threshold=85, n_jobs=8):
    """
    Find brands in strings, tolerating spelling variants and transliteration.

    Each string is first checked with a combined regular expression and, if that
    fails, with fuzzy matching. The strings are dispatched through ``pqdm``
    (imported from ``pqdm.threads``), which also displays progress.

    :param strings: Strings to search in.
    :param brands: Brands to look for.
    :param threshold: Similarity threshold for the fuzzy stage.
    :param n_jobs: Number of pqdm workers.
    :return: Dict ``{string: matched brand}`` containing only strings with a match.
    """
    # Original brands and their normalized forms, aligned by position.
    index_to_brand = list(brands)
    norm_brand_list = [normalize(brand) for brand in index_to_brand]

    # Combined pattern for the exact stage.
    regex_pattern, norm_to_brand = build_regex_for_brands(brands)

    # Bind the shared lookup structures so pqdm only has to pass the string.
    worker = lambda text: process_string(
        text, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold
    )

    results = pqdm(strings, worker, n_jobs=n_jobs)

    return {text: brand for text, brand in results if brand}
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
|
preprocess/utills/common/top_inserts.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from preprocess.utils.common.extracters import *
|
| 2 |
+
from preprocess.utils.common.utils import *
|
| 3 |
+
from preprocess.utils.common.parallel_brand_matching import *
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
import re
|
| 6 |
+
import math
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def top_inserts_matching(other_brands, p_brands, items, th=65):
    """
    Match shortened candidate names against the product brands and write the
    hits into ``items['new_brand']`` (in place).

    Every candidate is shortened to the part before the first '/' with the
    substring 'Шато' removed; hits on a shortened form are mapped back to the
    full candidate name where such a mapping was recorded.

    :param other_brands: Candidate names (values taken from ``items['new_name']``).
    :param p_brands: Known product brands.
    :param items: DataFrame with 'new_name' and 'new_brand' columns; modified in place.
    :param th: Fuzzy-matching threshold passed to ``check_brands_in_strings_pqdm``.
    """
    shortened = []
    shortened_to_full = {}
    for full_name in other_brands:
        pieces = full_name.split('/')
        head = pieces[0].replace('Шато', '')
        shortened.append(head)
        # NOTE(review): names with exactly one '/' produce two pieces and fall
        # into the 'Шато' branch below; `> 2` may have been meant as `> 1` — confirm.
        if len(pieces) > 2:
            shortened_to_full[head] = full_name
        elif 'Шато' in full_name:
            shortened_to_full[full_name.replace('Шато', '')] = full_name

    matched = check_brands_in_strings_pqdm(shortened, p_brands, threshold=th)

    # Translate shortened keys back to full names where possible.
    result = {shortened_to_full.get(key, key): brand for key, brand in matched.items()}

    items.loc[items['new_name'].isin(result.keys()), 'new_brand'] = items['new_name'].map(result)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """
    Derive brand candidates for items that have no brand yet.

    For every item without 'new_brand' the name is cleaned (attributes stripped by
    ``process_text``, then type / grape / auxiliary words removed) and stored in
    'new_name'. Cleaned names that occur more often than a frequency threshold
    are matched against the product brands, and hits are written to 'new_brand'.
    ``items`` is modified in place.

    :param items: DataFrame with at least 'name' and 'new_brand' columns.
    :param p_brands: Known product brands.
    :param process_text: Callable returning a tuple whose first element is the cleaned name.
    :param types: Type words to strip from names.
    :param grape_varieties: Grape variety words to strip from names.
    :param onther_words: Additional words to strip from names.
    """
    result = {}
    for raw_name in tqdm(items[items['new_brand'].isna()]['name'].values):
        # Only the cleaned name is needed here; the other extracted attributes are ignored.
        name = process_text(raw_name)[0]
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        name = name.replace('.', '').replace(',', '').replace('(', '').replace(')', '')
        result[raw_name] = clean_wine_name(name).strip()

    items['new_name'] = None
    items.loc[items['name'].isin(result.keys()), 'new_name'] = items['name'].map(result)

    # Count every cleaned name once instead of re-filtering the frame per name.
    counts = items['new_name'].value_counts()
    res = {}
    for name in items['new_name'].dropna().unique():
        occurrences = int(counts.loc[name])
        if occurrences > 1:
            res[name] = occurrences

    # Frequency threshold derived from the mean and standard deviation of the counts.
    frequencies = np.array(list(res.values()))
    th = math.sqrt(((frequencies.mean() + frequencies.std()) ** 2) // 2)
    other_brands = [name for name, occurrences in res.items() if occurrences > th]

    reess = check_brands_in_strings_pqdm(other_brands, p_brands)

    items.loc[items['new_name'].isin(reess.keys()), 'new_brand'] = items['new_name'].map(reess)

    top_inserts_matching(other_brands, p_brands, items)
|
preprocess/utills/common/utils.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def remove_quotes(text):
    """Return ``text`` with every single and double quote character removed."""
    return text.translate({ord('"'): None, ord("'"): None})
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def remove_l(text):
    """
    Remove the standalone letter 'л' (any case) and tidy up the whitespace.

    :param text: Source string.
    :return: The string without standalone 'л' tokens, with runs of whitespace
             collapsed to one space and the ends stripped.
    """
    standalone_l = re.compile(r'\bл\b', re.IGNORECASE)
    without_unit = standalone_l.sub('', text)
    # Removing a token can leave a double space behind.
    return re.sub(r'\s{2,}', ' ', without_unit).strip()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def clean_wine_name(name):
    """
    Drop a standalone single letter at the very end of ``name``.

    Only a one-letter word (Latin or Cyrillic) preceded by whitespace and followed
    by nothing but optional whitespace is removed, e.g. "токай л" -> "токай".
    """
    # \s+            whitespace before the letter
    # \b[...]\b      exactly one letter forming a word of its own
    # \s*$           optional trailing whitespace up to the end of the string
    trailing_letter = re.compile(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$')
    return trailing_letter.sub('', name)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def find_full_word(text, word_list):
    """
    Return the first entry of ``word_list`` that occurs in ``text`` as a whole word.

    Matching is case-insensitive; entries are tried in list order.

    :param text: String to search in.
    :param word_list: Candidate words.
    :return: The matching entry exactly as given in ``word_list``, or None.
    """
    return next(
        (
            word
            for word in word_list
            if re.search(r'\b' + re.escape(word) + r'\b', text, re.IGNORECASE)
        ),
        None,
    )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """
    Add a 'new_type_wine' column holding the colour word found for each row.

    The colour is looked up as a whole word in 'type_wine' first and, when that
    column is missing a value or contains no colour, in 'name'. ``items`` is
    modified in place.

    :param items: DataFrame with 'type_wine' and 'name' columns.
    :param colors: Colour words to look for; None is treated as an empty list.
    :param color_merge_dict: Optional replacement mapping applied to the new column.
    """
    colors = [] if colors is None else colors
    result = []
    for _, row in tqdm(items.iterrows()):
        try:
            color = None
            for column in ('type_wine', 'name'):
                value = row[column]
                # Missing cells come through as None or NaN; only real strings
                # can be searched, anything else falls through to the next column.
                if isinstance(value, str):
                    color = find_full_word(value, colors)
                if color is not None:
                    break
            result.append(color)
        except Exception as ex:
            # Best effort: one bad row must not abort the whole pass.
            print(ex)
            result.append(None)

    items['new_type_wine'] = result
    if color_merge_dict:
        items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def merge_types(items, products):
    """
    Add a 'new_type' column with the product type resolved for each item.

    The known types are the distinct values of ``products['type']`` (stripped and
    lower-cased) plus 'ликёр'. For each item the type is searched as a whole word
    in 'name', then in 'type'; if 'type' holds a string with no known type in it,
    that raw string is kept. Rows without any type end up as 'unmatched'.
    ``items`` is modified in place.

    :param items: DataFrame with 'name' and 'type' columns.
    :param products: DataFrame with a 'type' column.
    """
    # Skip missing values (NaN) in the product types instead of failing on .strip().
    alco_types = [t.strip().lower() for t in products['type'].unique() if isinstance(t, str)]
    alco_types.append('ликёр')

    result = []
    for _, row in tqdm(items.iterrows()):
        try:
            name = row['name']
            found = find_full_word(name, alco_types) if isinstance(name, str) else None
            if found is None:
                raw_type = row['type']
                if isinstance(raw_type, str):
                    # Fall back to the raw value when it contains no known type.
                    found = find_full_word(raw_type, alco_types)
                    if found is None:
                        found = raw_type
            result.append(found)
        except Exception as ex:
            # Best effort: one bad row must not abort the whole pass.
            print(ex)
            result.append(None)

    items['new_type'] = result
    items['new_type'] = items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def trim_name(text, words_to_remove):
    """
    Remove from ``text`` every whole word that equals an entry of ``words_to_remove``.

    Matching is case-insensitive and respects word boundaries, so substrings of
    longer words are left untouched.

    :param text: Source string.
    :param words_to_remove: Words to delete.
    :return: The string without those words, whitespace collapsed and stripped.
    """
    # Escape each word so that punctuation inside it is matched literally.
    alternatives = '|'.join(map(re.escape, words_to_remove))
    whole_word = re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)

    stripped = whole_word.sub('', text)

    # Deleting words leaves gaps; squeeze them to single spaces.
    return re.sub(r'\s+', ' ', stripped).strip()
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def name_trimmer(df, prcess_text, types_and_others):
    """
    Build cleaned, lower-cased names for every row of ``df``.

    :param df: DataFrame with 'id' and 'name' columns.
    :param prcess_text: Callable returning the 8-tuple
                        (text, alcohol, volume_or_number, years, production_year, gb, color, sour).
    :param types_and_others: Words to strip from the cleaned text.
    :return: Tuple ``(names_by_id, gbs, sours)`` - a dict id -> cleaned name and
             the per-row 'gb' and 'sour' values in row order.
    """
    names_by_id = {}
    gbs = []
    sours = []
    for _, row in tqdm(df.iterrows()):
        (text, _alcohol, _volume, _years,
         _production_year, gb, _color, sour) = prcess_text(str(row['name']))
        trimmed = trim_name(text, types_and_others)
        without_punctuation = trimmed.replace(',', '').replace('.', '')
        names_by_id[row['id']] = without_punctuation.lower().strip()

        gbs.append(gb)
        sours.append(sour)
    return names_by_id, gbs, sours
|
preprocess/utills/items/attrs.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def check_spark(row, col_name='name', types=('Игристое', 'игр')):
    """
    Tell whether the row describes a sparkling wine.

    :param row: Mapping-like row (dict or pandas Series).
    :param col_name: Column whose text is inspected.
    :param types: Markers of sparkling wine, matched case-insensitively as substrings.
    :return: 'Игристое' when a marker is present and the text does not mention
             'Пилигрим' (which contains 'игр' but is not sparkling), otherwise None.
    """
    if col_name not in row.keys():
        return None
    value = row[col_name]
    if not isinstance(value, str):
        # Missing cells (None / NaN) cannot carry a marker.
        return None
    lowered = value.lower()
    # The text is lower-cased, so the exclusion word must be lower-case as well.
    if 'пилигрим' in lowered:
        return None
    if any(t.lower() in lowered for t in types):
        return 'Игристое'
    return None
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное']):
    """
    Return 'Вино' when the given column mentions one of the wine colours.

    :param row: Mapping-like row (dict or pandas Series).
    :param col_name: Column whose text is inspected.
    :param types: Colour words, matched case-insensitively as substrings.
    :return: 'Вино' on a hit, None when the column is absent or holds no colour.
    """
    if col_name not in row.keys():
        return None
    has_colour = any(t.lower() in row[col_name].lower() for t in types)
    return 'Вино' if has_colour else None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def is_type_exist(row, types):
    """
    Return the first entry of ``types`` contained (case-insensitively, as a
    substring) in ``row['type']``, or None when there is none.
    """
    return next((t for t in types if t.lower() in row['type'].lower()), None)
|
| 22 |
+
|
| 23 |
+
def check_type(row, types):
    """
    Return the first entry of ``types`` contained (case-insensitively, as a
    substring) in ``row['name']``, or None when there is none.
    """
    return next((t for t in types if t.lower() in row['name'].lower()), None)
|
| 29 |
+
|
| 30 |
+
def get_type(row, types):
    """
    Resolve the product type of a row.

    When the row has a 'type' column it is consulted first; if the column is
    absent or yields nothing, the type is looked up in 'name'.

    :param row: Mapping-like row (dict or pandas Series).
    :param types: Known type names.
    :return: The matching entry of ``types`` or None.
    """
    if 'type' in row.keys():
        from_type = is_type_exist(row, types)
        if from_type is not None:
            return from_type
    return check_type(row, types)
|
| 40 |
+
|
processor/matching.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
from transliterate import translit, detect_language
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def normalize_name(name):
    """
    Normalize a name for comparison.

    If the language is detected as Russian the name is transliterated to Latin
    script; in every case the result is lower-cased. Any failure during language
    detection or transliteration falls back to plain lower-casing.
    """
    try:
        is_russian = detect_language(name) == 'ru'
        if is_russian:
            return translit(name, 'ru', reversed=True).lower()
    except Exception:
        # Deliberate best effort: fall through to the plain lower-cased name.
        pass
    return name.lower()
|
| 18 |
+
|
| 19 |
+
def prepare_groups_with_ids(items_df):
    """
    Group items by (new_brand, type, volume, new_type_wine, sour).

    A 'norm_name' column is computed once on a copy of the frame so that names do
    not have to be normalized again during matching.

    :param items_df: DataFrame with columns 'new_brand', 'type', 'name', 'id',
                     'volume', 'new_type_wine', 'sour'.
    :return: Dict {(new_brand, type, volume, new_type_wine, sour):
                   [(id, name, norm_name, volume, new_type_wine, sour), ...]}.
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    group_keys = ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    payload_columns = ['id', 'name', 'norm_name', 'volume', 'new_type_wine', 'sour']

    def rows_as_tuples(group):
        # One tuple per row, fields in payload_columns order.
        return list(zip(*(group[column] for column in payload_columns)))

    return frame.groupby(group_keys).apply(rows_as_tuples).to_dict()
|
| 36 |
+
|
| 37 |
+
def prepare_groups_by_alternative_keys(items_df):
    """
    Group items by (new_type_wine, new_type, volume, sour), keeping the id, the
    brand and both the original and the normalized name of every item.

    :param items_df: DataFrame with columns 'new_brand', 'new_type_wine',
                     'new_type', 'volume', 'name', 'id', 'sour'.
    :return: Dict {(new_type_wine, new_type, volume, sour):
                   [(id, new_brand, name, norm_name, volume, new_type_wine, sour), ...]}.
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    group_keys = ['new_type_wine', 'new_type', 'volume', 'sour']
    payload_columns = ['id', 'new_brand', 'name', 'norm_name', 'volume', 'new_type_wine', 'sour']

    def rows_as_tuples(group):
        # One tuple per row, fields in payload_columns order.
        return list(zip(*(group[column] for column in payload_columns)))

    return frame.groupby(group_keys).apply(rows_as_tuples).to_dict()
|
| 52 |
+
|
| 53 |
+
def _score_candidates(product_name, candidates, name_threshold):
    """
    Fuzzy-match ``product_name`` against candidate items.

    :param product_name: Original product name (normalized here before comparison).
    :param candidates: Sequence of (id, name, norm_name, volume, new_type_wine, sour).
    :param name_threshold: Minimal ``fuzz.ratio`` score.
    :return: List of dicts describing the candidates returned by ``process.extract``.
    """
    norm_names = [candidate[2] for candidate in candidates]
    matches = process.extract(
        normalize_name(product_name), norm_names, scorer=fuzz.ratio, score_cutoff=name_threshold
    )
    return [
        {
            'item_id': candidates[position][0],
            'item_name': candidates[position][1],
            'score': score,
            'volume': candidates[position][3],
            'color': candidates[position][4],
            'sour': candidates[position][5],
        }
        for _, score, position in matches
    ]


def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """
    Find matching items for every product, keeping the ids of the items found.

    Two passes are made:
      - first, within the group (brand, type, volume, new_type_wine, sour);
      - second, for products left without a match, within the alternative group
        (new_type_wine, new_type, volume, sour), excluding items of the product's own brand.

    Names are compared by their normalized form; the original name is reported.

    :param products_df: DataFrame with columns 'id', 'brand', 'type', 'name',
                        'volume', 'new_type_wine', 'sour', 'new_type'.
    :param items_groups: Dict produced by ``prepare_groups_with_ids``.
    :param items_df: Items DataFrame with columns 'id', 'new_brand', 'new_type_wine',
                     'new_type', 'volume', 'name', 'sour'.
    :param name_threshold: Similarity threshold for fuzzy matching.
    :return: ``products_df`` merged with the columns 'matched_items' (list of
             matches) and 'alternative' (matches from the second pass).
    """
    results = []
    # (position in `results`, product row) for products with no first-pass match.
    no_match_products = []

    # First pass: same brand/type/volume/colour/sweetness group.
    for _, product in tqdm(products_df.iterrows(), total=len(products_df)):
        key = (product['brand'], product['type'], product['volume'],
               product['new_type_wine'], product['sour'])
        candidates = list(items_groups.get(key, []))
        matched_items = _score_candidates(product['name'], candidates, name_threshold)

        if not matched_items:
            # Remember the list position, not the DataFrame index label: the
            # label is only a valid position for a default RangeIndex.
            no_match_products.append((len(results), product))

        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': []  # filled in by the second pass
        })

    # Alternative grouping by (new_type_wine, new_type, volume, sour).
    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

    # Second pass: other brands within the alternative group.
    for position, product in tqdm(no_match_products):
        product_brand = product['brand']
        alt_key = (product['new_type_wine'], product['new_type'],
                   product['volume'], product['sour'])
        type_items = groups_by_alternative_keys.get(alt_key, [])
        # Drop items of the product's own brand and strip the brand field so the
        # tuples have the layout expected by _score_candidates.
        candidates = [item[:1] + item[2:] for item in type_items if item[1] != product_brand]
        results[position]['alternative'] = _score_candidates(
            product['name'], candidates, name_threshold
        )

    # Explicit columns keep the merge valid when there are no products at all.
    results_df = pd.DataFrame(results, columns=['product_id', 'matched_items', 'alternative'])
    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
    return merged_df
|
processor/processor.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from preprocess.preprocess import Preprocessor
|
| 2 |
+
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Processor():
    """
    Runs preprocessing of the items/products tables followed by name matching.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        # All vocabularies are forwarded unchanged to the preprocessor.
        self.preprocessor = Preprocessor(long_types_list, short_types_list, sour_list,
                                         type_wine, gbs, colors_for_trim, grapes, other_words,
                                         sour_merge_dict, type_merge_dict, color_merge_dict)

    def process(self, items, products, th=65):
        """
        Preprocess both tables and match products against items.

        :param items: Raw items DataFrame.
        :param products: Raw products DataFrame.
        :param th: Name similarity threshold passed to the matcher.
        :return: Tuple ``(matches, items, products)`` where ``matches`` is the
                 matching result without the 'type', 'type_wine', 'year', 'alco'
                 and 'gb' columns, and the other two are the preprocessed frames.
        """
        # `process` is called on the bound preprocessor, so `self` must not be
        # passed explicitly (doing so shifts every argument by one).
        # NOTE(review): assumes Preprocessor.process takes (items, products) - confirm.
        items, products = self.preprocessor.process(items, products)

        print('-----*-----Matching-----*-----')
        items_groups = prepare_groups_with_ids(items)
        res = new_find_matches_with_ids(products, items_groups, items, name_threshold=th)

        return res.drop(['type','type_wine','year','alco','gb'], axis=1), items, products
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
ui/gradio_ui.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import tempfile
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class GradioUI():
    """
    Gradio front end: two file inputs and a threshold slider feed the processor,
    and the result is offered for download.
    """

    def __init__(self, processor):
        self.processor = processor

    def process_files(self, file1, file2, threshold):
        """
        Read both tab-separated files, run the processor and save the result.

        :param file1: Path to the products file.
        :param file2: Path to the items file.
        :param threshold: Similarity threshold from the slider.
        :return: Path to a temporary tab-separated file with the result.
        """
        row_items = pd.read_csv(file2, sep='\t')
        row_products = pd.read_csv(file1, sep='\t', on_bad_lines='skip')

        # NOTE(review): products are passed first here, while Processor.process is
        # declared as process(items, products, th) - confirm the intended order.
        processed = self.processor.process(row_products, row_items, threshold)
        df, items, products = processed

        # Reserve a temporary file name; the file itself is written by pandas.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            output_csv = tmp.name
        df.to_csv(output_csv, sep='\t', index=False)
        return output_csv

    def run_ui(self):
        """Build the Blocks interface and launch it."""
        with gr.Blocks() as demo:
            gr.Markdown("## Обработка CSV файлов")

            with gr.Row():
                # type="filepath" makes Gradio hand over the path of the uploaded file.
                file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])

            threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)

            process_button = gr.Button("Обработать файлы")
            output_file = gr.File(label="Скачать результат (CSV)")

            # The button runs process_files with the three inputs.
            click_inputs = [file_input1, file_input2, threshold_input]
            process_button.click(fn=self.process_files,
                                 inputs=click_inputs,
                                 outputs=output_file)

        demo.launch()
|