Spaces:
Sleeping
Sleeping
Upload 22 files
Browse files- api.py +1 -0
- app.py +4 -1
- constants/constants.py +312 -286
- preprocess/preprocess.py +25 -8
- preprocess/utils/common/utils.py +18 -3
- preprocess/utils/items/attrs.py +1 -1
- processor/matching.py +102 -21
- processor/processor.py +4 -2
- ui/gradio_ui.py +1 -1
api.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import datetime
|
|
|
|
| 1 |
+
import csv
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import datetime
|
app.py
CHANGED
|
@@ -13,7 +13,10 @@ processor=Processor(LONG_TYPES_LIST,
|
|
| 13 |
OTHER_WORDS,
|
| 14 |
SOUR_MERGE_DICT,
|
| 15 |
TYPES_WINES_DICT,
|
| 16 |
-
COLOR_MERGE_DICT
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
searcher=Searcher()
|
| 19 |
|
|
|
|
| 13 |
OTHER_WORDS,
|
| 14 |
SOUR_MERGE_DICT,
|
| 15 |
TYPES_WINES_DICT,
|
| 16 |
+
COLOR_MERGE_DICT,
|
| 17 |
+
COUNTRY_LIST,
|
| 18 |
+
NORMALIZED_NAMES_ALTERNATIVES_DICT
|
| 19 |
+
)
|
| 20 |
|
| 21 |
searcher=Searcher()
|
| 22 |
|
constants/constants.py
CHANGED
|
@@ -1,286 +1,312 @@
|
|
| 1 |
-
LONG_TYPES_LIST = [
|
| 2 |
-
"Пиво",
|
| 3 |
-
"Вино", # объединяет красное, белое, розовое, игристое и шампанское
|
| 4 |
-
"Водка",
|
| 5 |
-
"Виски",
|
| 6 |
-
"Бурбон",
|
| 7 |
-
"Коньяк",
|
| 8 |
-
"Бренди",
|
| 9 |
-
"Арманьяк",
|
| 10 |
-
"Ром",
|
| 11 |
-
"Джин",
|
| 12 |
-
"Текила",
|
| 13 |
-
"Мескаль",
|
| 14 |
-
"Ликер",
|
| 15 |
-
"Самбука",
|
| 16 |
-
"Сидр",
|
| 17 |
-
"Саке",
|
| 18 |
-
"Абсент",
|
| 19 |
-
"Граппа",
|
| 20 |
-
"Портвейн",
|
| 21 |
-
"Мадера",
|
| 22 |
-
"Шерри",
|
| 23 |
-
"Кальвадос",
|
| 24 |
-
"Писко",
|
| 25 |
-
"Вермута",
|
| 26 |
-
"Вермут",
|
| 27 |
-
"Аперитив",
|
| 28 |
-
"Биттер",
|
| 29 |
-
"Эль",
|
| 30 |
-
"Глинтвейн",
|
| 31 |
-
"Пунш",
|
| 32 |
-
"Медовуха",
|
| 33 |
-
"Ламбик",
|
| 34 |
-
"Крем-ликер",
|
| 35 |
-
"Арак",
|
| 36 |
-
"Чача",
|
| 37 |
-
"Самогон",
|
| 38 |
-
"Кумыс",
|
| 39 |
-
"Сливовица",
|
| 40 |
-
"Шнапс",
|
| 41 |
-
"Настойка",
|
| 42 |
-
"Наливка",
|
| 43 |
-
"Игристое вино",
|
| 44 |
-
"Херес",
|
| 45 |
-
"Пуаре",
|
| 46 |
-
"Пуарэ",
|
| 47 |
-
"Ликер",
|
| 48 |
-
"Ликёр",
|
| 49 |
-
"Спиртной напиток со вкусом",
|
| 50 |
-
"Напиток винный",
|
| 51 |
-
"Винный напиток",
|
| 52 |
-
"Шомпань",
|
| 53 |
-
'Сироп',
|
| 54 |
-
'Конфеты',
|
| 55 |
-
'Шоколад'
|
| 56 |
-
'Сок',
|
| 57 |
-
'Вода',
|
| 58 |
-
'Табачная продукция',
|
| 59 |
-
]
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
SHORT_TYPES_LIST=['Вино', "Сидр", "Водка", "Коньяк", "Настойка", "Ликер", "Виски", "Джин"]
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
SOUR = [
|
| 66 |
-
'brut',
|
| 67 |
-
'semi-sweet',
|
| 68 |
-
'sweet',
|
| 69 |
-
'брют',
|
| 70 |
-
'сухое',
|
| 71 |
-
'полусухое',
|
| 72 |
-
'полусладкое',
|
| 73 |
-
'сладкое',
|
| 74 |
-
'п/сух',
|
| 75 |
-
'п/сл',
|
| 76 |
-
'п/с',
|
| 77 |
-
'сл',
|
| 78 |
-
'с
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
'ро
|
| 86 |
-
'
|
| 87 |
-
'
|
| 88 |
-
'
|
| 89 |
-
'р
|
| 90 |
-
'
|
| 91 |
-
'
|
| 92 |
-
'
|
| 93 |
-
'
|
| 94 |
-
'
|
| 95 |
-
'
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
"
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
'
|
| 108 |
-
'in the
|
| 109 |
-
'
|
| 110 |
-
'in
|
| 111 |
-
'
|
| 112 |
-
'gift box in carton',
|
| 113 |
-
'in gift box in
|
| 114 |
-
'
|
| 115 |
-
'gift box in
|
| 116 |
-
'gift box in wood',
|
| 117 |
-
'gift box
|
| 118 |
-
'
|
| 119 |
-
'gift box',
|
| 120 |
-
'
|
| 121 |
-
'in
|
| 122 |
-
'
|
| 123 |
-
'in
|
| 124 |
-
'in
|
| 125 |
-
'in
|
| 126 |
-
'
|
| 127 |
-
'
|
| 128 |
-
'
|
| 129 |
-
'в подарочной упаковке из
|
| 130 |
-
'в подарочной упаковке из
|
| 131 |
-
'в
|
| 132 |
-
'в подарочной упаковке',
|
| 133 |
-
'подарочн
|
| 134 |
-
'
|
| 135 |
-
'в д
|
| 136 |
-
'д
|
| 137 |
-
'
|
| 138 |
-
'в
|
| 139 |
-
'в
|
| 140 |
-
'в п/у
|
| 141 |
-
'в п/у
|
| 142 |
-
'в п/у',
|
| 143 |
-
'в п
|
| 144 |
-
'п/у
|
| 145 |
-
'п/у',
|
| 146 |
-
'в
|
| 147 |
-
'
|
| 148 |
-
'
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
'кр',
|
| 157 |
-
'
|
| 158 |
-
'р
|
| 159 |
-
'
|
| 160 |
-
'
|
| 161 |
-
'
|
| 162 |
-
'
|
| 163 |
-
'
|
| 164 |
-
'
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
"
|
| 175 |
-
"Совиньон
|
| 176 |
-
"
|
| 177 |
-
"
|
| 178 |
-
"
|
| 179 |
-
"
|
| 180 |
-
"
|
| 181 |
-
"
|
| 182 |
-
"
|
| 183 |
-
"
|
| 184 |
-
"
|
| 185 |
-
"
|
| 186 |
-
"
|
| 187 |
-
"
|
| 188 |
-
"
|
| 189 |
-
"
|
| 190 |
-
"
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
-
"
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
-
"
|
| 197 |
-
"
|
| 198 |
-
"
|
| 199 |
-
"
|
| 200 |
-
"
|
| 201 |
-
"
|
| 202 |
-
"
|
| 203 |
-
"
|
| 204 |
-
"Али
|
| 205 |
-
"
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
"
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
"
|
| 216 |
-
"
|
| 217 |
-
"
|
| 218 |
-
"
|
| 219 |
-
"
|
| 220 |
-
"
|
| 221 |
-
"
|
| 222 |
-
"К
|
| 223 |
-
"
|
| 224 |
-
"
|
| 225 |
-
"
|
| 226 |
-
"
|
| 227 |
-
"
|
| 228 |
-
"
|
| 229 |
-
"
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
'
|
| 243 |
-
'
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
'
|
| 250 |
-
'
|
| 251 |
-
'
|
| 252 |
-
'
|
| 253 |
-
'
|
| 254 |
-
'
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
'
|
| 261 |
-
'П
|
| 262 |
-
'
|
| 263 |
-
'
|
| 264 |
-
'
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
"
|
| 278 |
-
"
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Product type names (mostly drink types, plus a few non-drink categories at
# the end). Every entry must be followed by a comma: adjacent string literals
# are silently concatenated by Python, which previously fused "Шоколад" and
# "Сок" into the single bogus entry "ШоколадСок".
LONG_TYPES_LIST = [
    "Пиво",
    "Вино",  # covers red, white, rosé, sparkling and champagne
    "Водка",
    "Виски",
    "Бурбон",
    "Коньяк",
    "Бренди",
    "Арманьяк",
    "Ром",
    "Джин",
    "Текила",
    "Мескаль",
    "Ликер",
    "Самбука",
    "Сидр",
    "Саке",
    "Абсент",
    "Граппа",
    "Портвейн",
    "Мадера",
    "Шерри",
    "Кальвадос",
    "Писко",
    "Вермута",
    "Вермут",
    "Аперитив",
    "Биттер",
    "Эль",
    "Глинтвейн",
    "Пунш",
    "Медовуха",
    "Ламбик",
    "Крем-ликер",
    "Арак",
    "Чача",
    "Самогон",
    "Кумыс",
    "Сливовица",
    "Шнапс",
    "Настойка",
    "Наливка",
    "Игристое вино",
    "Херес",
    "Пуаре",
    "Пуарэ",
    "Ликер",  # duplicate of the entry above; kept so the contents are otherwise unchanged
    "Ликёр",
    "Спиртной напиток со вкусом",
    "Напиток винный",
    "Винный напиток",
    "Шомпань",
    "Сироп",
    "Конфеты",
    "Шоколад",
    "Сок",
    "Вода",
    "Табачная продукция",
]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
SHORT_TYPES_LIST=['Вино', "Сидр", "Водка", "Коньяк", "Настойка", "Ликер", "Виски", "Джин"]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
SOUR = [
|
| 66 |
+
'brut',
|
| 67 |
+
'semi-sweet',
|
| 68 |
+
'sweet',
|
| 69 |
+
'брют',
|
| 70 |
+
'сухое',
|
| 71 |
+
'полусухое',
|
| 72 |
+
'полусладкое',
|
| 73 |
+
'сладкое',
|
| 74 |
+
'п/сух',
|
| 75 |
+
'п/сл',
|
| 76 |
+
'п/с',
|
| 77 |
+
'сл',
|
| 78 |
+
'сл.',
|
| 79 |
+
'сух',
|
| 80 |
+
'сух.'
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
WINE_TYPES = [
|
| 85 |
+
'красное',
|
| 86 |
+
'белое',
|
| 87 |
+
'розовое',
|
| 88 |
+
'роз',
|
| 89 |
+
'кр',
|
| 90 |
+
'крас',
|
| 91 |
+
'бел',
|
| 92 |
+
'розе',
|
| 93 |
+
'rosso',
|
| 94 |
+
'roso',
|
| 95 |
+
'roseto',
|
| 96 |
+
'rosetto',
|
| 97 |
+
'red',
|
| 98 |
+
'white',
|
| 99 |
+
"игристое",
|
| 100 |
+
"игр",
|
| 101 |
+
"шомпанское",
|
| 102 |
+
"шомп",
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# Gift-box / packaging phrases, English first and then Russian.
# Longer phrases are listed before the shorter phrases they contain
# (e.g. "in gift box" before "gift box"); the original order is preserved
# in case consumers depend on first-match semantics.
# Every entry must end with a comma: two missing commas previously fused
# 'in wood case' + 'in wood box' and the two "из картона" phrases into single
# strings that could never match.
GBS = [
    'cristal decanter in oak gift box',
    'in the carton gift box with 2 glasses',
    'decanter in the carton gift box',
    'in the carton gift box',
    'in the wooden gift box',
    'in gift box in the carton',
    'in gift box in carton',
    'gift box in the carton',
    'gift box in carton',
    'in gift box in the wood',
    'in gift box in wood',
    'gift box in the wood',
    'gift box in wood',
    'gift box with 2 glasses',
    'in gift box',
    'gift box',
    'in carton',
    'in wooden case',
    'in wooden box',
    'in wood case',
    'in wood box',
    'in wood',
    'хрустальный декантер в подарочной упаковке из дуба',
    'декантер в подарочной упаковке из картона',
    'в подарочной упаковке из картона с 2 бокалами',
    'в подарочной упаковке из картона',
    'в подарочной упаковке из Дуба',
    'в П У графин и деревянная коробка',
    'в подарочной упаковке',
    'подарочная упаковка',
    'подарочный набор',
    'в деревянной коробке',
    'деревянная коробка',
    'в п/у+2 бокаланов',
    'в п/у из картона',
    'в п/у+бокал',
    'в п/у (дер.коробке)',
    'в п/у солома',
    'в п/у',
    'в п у',
    'п/уп',
    'п/у',
    'в тубе',
    'туба',
    'ПУ',
]
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
COLORS_FOR_TRIM = [
|
| 156 |
+
'красное',
|
| 157 |
+
'крас',
|
| 158 |
+
'кр',
|
| 159 |
+
'белое',
|
| 160 |
+
'бел',
|
| 161 |
+
'розовое',
|
| 162 |
+
'розе',
|
| 163 |
+
'rose',
|
| 164 |
+
'rosso',
|
| 165 |
+
'roso',
|
| 166 |
+
'roseto',
|
| 167 |
+
'rosetto',
|
| 168 |
+
'red',
|
| 169 |
+
'white',
|
| 170 |
+
]
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
GRAPES = [
|
| 174 |
+
"Каберне Совиньон",
|
| 175 |
+
"Каберне-Совиньон",
|
| 176 |
+
"Каберне",# Cabernet Sauvignon
|
| 177 |
+
"Мерло", # Merlot
|
| 178 |
+
"Пино Нуар", # Pinot Noir
|
| 179 |
+
"Шардоне", # Chardonnay
|
| 180 |
+
"Совиньон Блан", # Sauvignon Blanc
|
| 181 |
+
"Сира", # Syrah
|
| 182 |
+
"Гренаш", # Grenache
|
| 183 |
+
"Рислинг", # Riesling
|
| 184 |
+
"Мальбек", # Malbec
|
| 185 |
+
"Темпранильо", # Tempranillo
|
| 186 |
+
"Зинфандель", # Zinfandel
|
| 187 |
+
"Санджовезе", # Sangiovese
|
| 188 |
+
"Каберне Фран", # Cabernet Franc
|
| 189 |
+
"Вионье", # Viognier
|
| 190 |
+
"Мурведр", # Mourvèdre
|
| 191 |
+
"Шенен Блан", # Chenin Blanc
|
| 192 |
+
"Пино Гри", # Pinot Grigio
|
| 193 |
+
"Гевюрцтраминер", # Gewürztraminer
|
| 194 |
+
"Неббиоло", # Nebbiolo
|
| 195 |
+
"Барбера", # Barbera
|
| 196 |
+
"Petit Verdot", # Petit Verdot (обычно оставляют в оригинале)
|
| 197 |
+
"Карменер", # Carmenère
|
| 198 |
+
"Таннат", # Tannat
|
| 199 |
+
"Гамей", # Gamay
|
| 200 |
+
"Семильон", # Semillon
|
| 201 |
+
"Мускат", # Muscat
|
| 202 |
+
"Верментино", # Vermentino
|
| 203 |
+
"Фиано", # Fiano
|
| 204 |
+
"Аглианико", # Aglianico
|
| 205 |
+
"Кариньян", # Carignan (также может встречаться как Cariñena)
|
| 206 |
+
"Торронтес",
|
| 207 |
+
"Рислинг",
|
| 208 |
+
"Кефессия",
|
| 209 |
+
"Алиготе",
|
| 210 |
+
"Фурминт"# Torrontés (особенно для аргентинских вин)
|
| 211 |
+
]
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
OTHER_WORDS=[
|
| 215 |
+
"Шампанское",
|
| 216 |
+
"Шампань",
|
| 217 |
+
"Игристое",
|
| 218 |
+
"Жемчужное",
|
| 219 |
+
"Газированный",
|
| 220 |
+
"Традиционный",
|
| 221 |
+
"Двухслойный",
|
| 222 |
+
"Кофе",
|
| 223 |
+
"Напиток",
|
| 224 |
+
"Спиртной",
|
| 225 |
+
"Горькая",
|
| 226 |
+
"Виноградная",
|
| 227 |
+
"Выдержанная",
|
| 228 |
+
"Шотландский",
|
| 229 |
+
"Купажированный",
|
| 230 |
+
"креп",
|
| 231 |
+
"Ординарный",
|
| 232 |
+
"Выдержанный",
|
| 233 |
+
"Отборное",
|
| 234 |
+
"Десертный",
|
| 235 |
+
"Вкус",
|
| 236 |
+
"Сорт",
|
| 237 |
+
"односолод."
|
| 238 |
+
]
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
SOUR_MERGE_DICT={
|
| 242 |
+
'brut':'брют',
|
| 243 |
+
'semi-sweet':'полусладкое',
|
| 244 |
+
'sweet':'сладкое',
|
| 245 |
+
'сухое':'сухое',
|
| 246 |
+
'п/сух':'полусухое',
|
| 247 |
+
'п/сух.':'полусухое',
|
| 248 |
+
'п/сл':'полусладкое',
|
| 249 |
+
'п/сл.':'полусладкое',
|
| 250 |
+
'п/с':'полусухое',
|
| 251 |
+
'сл':'сладкое',
|
| 252 |
+
'сл.':'сладкое',
|
| 253 |
+
'сух':'сухое',
|
| 254 |
+
'сух.':'сухое',
|
| 255 |
+
None: 'unmatched',
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
TYPES_WINES_DICT={
|
| 260 |
+
'Пуарэ':'Слабоалкогольные и энергетические напитки',
|
| 261 |
+
'Пуаре':'Слабоалкогольные и энергетические напитки',
|
| 262 |
+
'Сидр':'Слабоалкогольные и энергетические напитки',
|
| 263 |
+
'Шампань': 'Шампанское',
|
| 264 |
+
'Игристое': 'Шампанское',
|
| 265 |
+
'Сироп':'Сиропы',
|
| 266 |
+
'Арманьяк':'Коньяк',
|
| 267 |
+
'Бренди':'Коньяк',
|
| 268 |
+
'Ликер':'Ликер',
|
| 269 |
+
'Ликёр': 'Ликер',
|
| 270 |
+
'Граппа':'Водка',
|
| 271 |
+
'Настойка':'Водка',
|
| 272 |
+
'Конфеты':'Сладости',
|
| 273 |
+
'Портвейн':'Вино',
|
| 274 |
+
'Херес':'Вино',
|
| 275 |
+
'Кальвадос':'Коньяк',
|
| 276 |
+
'Винный напиток': "Вино",
|
| 277 |
+
"Игристое вино":'Шампанское',
|
| 278 |
+
"Самогон": "Водка",
|
| 279 |
+
None: 'unmatched'
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
COLOR_MERGE_DICT={
|
| 284 |
+
"кр":'красное',
|
| 285 |
+
"крас":'красное',
|
| 286 |
+
"red":"красное",
|
| 287 |
+
"бел":"белое",
|
| 288 |
+
"white":"белое",
|
| 289 |
+
"роз":'розовое',
|
| 290 |
+
"розе":'розовое',
|
| 291 |
+
"roso":'розовое',
|
| 292 |
+
"rosso":'розовое',
|
| 293 |
+
"rose":'розовое',
|
| 294 |
+
"rosetto":'розовое',
|
| 295 |
+
"roseto":'розовое',
|
| 296 |
+
"игр":"игристое",
|
| 297 |
+
"шомп":"шомпанское",
|
| 298 |
+
None: 'unmatched'
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
COUNTRY_LIST=[
|
| 302 |
+
"Франция",
|
| 303 |
+
"Испания",
|
| 304 |
+
"Италия",
|
| 305 |
+
"Шотландия",
|
| 306 |
+
]
|
| 307 |
+
|
| 308 |
+
NORMALIZED_NAMES_ALTERNATIVES_DICT={
|
| 309 |
+
"M&H" : ["em end ejch"],
|
| 310 |
+
"peats beast" : ["pits bist"],
|
| 311 |
+
"xo": ["ho"]
|
| 312 |
+
}
|
preprocess/preprocess.py
CHANGED
|
@@ -14,7 +14,8 @@ class Preprocessor():
|
|
| 14 |
|
| 15 |
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
-
sour_merge_dict, type_merge_dict, color_merge_dict
|
|
|
|
| 18 |
|
| 19 |
self.long_types_list=long_types_list
|
| 20 |
self.short_types_list=short_types_list
|
|
@@ -24,10 +25,19 @@ class Preprocessor():
|
|
| 24 |
self.colors_ft=colors_for_trim
|
| 25 |
self.grapes=grapes
|
| 26 |
self.other_words=other_words
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
self.sour_dict=sour_merge_dict
|
| 29 |
self.type_dict=type_merge_dict
|
| 30 |
self.color_merge_dict=color_merge_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def process_items(self, df):
|
|
@@ -41,8 +51,10 @@ class Preprocessor():
|
|
| 41 |
if 'brand' in i.keys():
|
| 42 |
result['brand'].append(i['brand'])
|
| 43 |
else: result['brand'].append(None)
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
drink_type=get_type(i, self.long_types_list)
|
| 47 |
if drink_type is None:
|
| 48 |
drink_type=check_spark(i)
|
|
@@ -52,6 +64,8 @@ class Preprocessor():
|
|
| 52 |
drink_type=check_spark(i, col_name='type_wine')
|
| 53 |
if drink_type is None:
|
| 54 |
drink_type=check_color_and_sour(i, types=self.sour)
|
|
|
|
|
|
|
| 55 |
#if 'type' in i.keys():
|
| 56 |
result['type'].append(drink_type)#i['type'])
|
| 57 |
#else: dd['type'].append(None)
|
|
@@ -116,6 +130,8 @@ class Preprocessor():
|
|
| 116 |
if volume_or_number is not None:
|
| 117 |
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 118 |
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
|
|
|
|
|
|
| 119 |
test=clean_wine_name(text) #remove_l(text)
|
| 120 |
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 121 |
# else:
|
|
@@ -175,12 +191,12 @@ class Preprocessor():
|
|
| 175 |
|
| 176 |
items['type']=items['type'].replace(self.type_dict)
|
| 177 |
|
| 178 |
-
print('-----*-----Unwrap
|
| 179 |
unwrap_b_match=unwrap_brands(products)
|
| 180 |
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 181 |
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 182 |
|
| 183 |
-
print('-----*-----Unwrap
|
| 184 |
unwrap_b_match=unwrap_brands(products)
|
| 185 |
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 186 |
products["brand"] = products["brand"].replace(unwrap_b_match)
|
|
@@ -198,9 +214,9 @@ class Preprocessor():
|
|
| 198 |
|
| 199 |
print('-----*-----Adding service categories-----*-----')
|
| 200 |
merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 201 |
-
merge_types(items, products)
|
| 202 |
merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 203 |
-
merge_types(products, products)
|
| 204 |
|
| 205 |
|
| 206 |
print('-----*-----Name trimming-----*-----')
|
|
@@ -210,6 +226,7 @@ class Preprocessor():
|
|
| 210 |
items['gb']=gb
|
| 211 |
items['sour']=sour
|
| 212 |
items['sour']=items['sour'].replace(self.sour_dict)
|
|
|
|
| 213 |
products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
|
| 214 |
products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
|
| 215 |
products['gb']=gb
|
|
|
|
| 14 |
|
| 15 |
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
+
sour_merge_dict, type_merge_dict, color_merge_dict,
|
| 18 |
+
country_list, normalized_names_dict):
|
| 19 |
|
| 20 |
self.long_types_list=long_types_list
|
| 21 |
self.short_types_list=short_types_list
|
|
|
|
| 25 |
self.colors_ft=colors_for_trim
|
| 26 |
self.grapes=grapes
|
| 27 |
self.other_words=other_words
|
| 28 |
+
|
| 29 |
+
self.types_n_others=long_types_list+other_words+sour_list+country_list
|
| 30 |
+
self.types_n_others.remove("Шерри")
|
| 31 |
+
|
| 32 |
self.sour_dict=sour_merge_dict
|
| 33 |
self.type_dict=type_merge_dict
|
| 34 |
self.color_merge_dict=color_merge_dict
|
| 35 |
+
self.country_list = country_list
|
| 36 |
+
self.normalized_names_dict=normalized_names_dict
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def preprocess_name(self, name):
|
| 40 |
+
return name.replace("\n", " ")
|
| 41 |
|
| 42 |
|
| 43 |
def process_items(self, df):
|
|
|
|
| 51 |
if 'brand' in i.keys():
|
| 52 |
result['brand'].append(i['brand'])
|
| 53 |
else: result['brand'].append(None)
|
| 54 |
+
|
| 55 |
+
name = self.preprocess_name(i['name'])
|
| 56 |
+
result['name'].append(name)
|
| 57 |
+
result['fullname'].append(name)
|
| 58 |
drink_type=get_type(i, self.long_types_list)
|
| 59 |
if drink_type is None:
|
| 60 |
drink_type=check_spark(i)
|
|
|
|
| 64 |
drink_type=check_spark(i, col_name='type_wine')
|
| 65 |
if drink_type is None:
|
| 66 |
drink_type=check_color_and_sour(i, types=self.sour)
|
| 67 |
+
if drink_type is None:
|
| 68 |
+
drink_type=check_color_and_sour(i, col_name='name')
|
| 69 |
#if 'type' in i.keys():
|
| 70 |
result['type'].append(drink_type)#i['type'])
|
| 71 |
#else: dd['type'].append(None)
|
|
|
|
| 130 |
if volume_or_number is not None:
|
| 131 |
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 132 |
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 133 |
+
text = re.sub(r'\s+\b[лЛlL].\b', '', text)
|
| 134 |
+
text = re.sub(r'\s+\b[лЛlL]\b', '', text)
|
| 135 |
test=clean_wine_name(text) #remove_l(text)
|
| 136 |
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 137 |
# else:
|
|
|
|
| 191 |
|
| 192 |
items['type']=items['type'].replace(self.type_dict)
|
| 193 |
|
| 194 |
+
print('-----*-----Unwrap brand cats step 1-----*-----')
|
| 195 |
unwrap_b_match=unwrap_brands(products)
|
| 196 |
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 197 |
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 198 |
|
| 199 |
+
print('-----*-----Unwrap brand cats step 2-----*-----')
|
| 200 |
unwrap_b_match=unwrap_brands(products)
|
| 201 |
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 202 |
products["brand"] = products["brand"].replace(unwrap_b_match)
|
|
|
|
| 214 |
|
| 215 |
print('-----*-----Adding service categories-----*-----')
|
| 216 |
merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 217 |
+
merge_types(items, products, type_merge_dict=self.type_dict)
|
| 218 |
merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 219 |
+
merge_types(products, products, type_merge_dict=self.type_dict)
|
| 220 |
|
| 221 |
|
| 222 |
print('-----*-----Name trimming-----*-----')
|
|
|
|
| 226 |
items['gb']=gb
|
| 227 |
items['sour']=sour
|
| 228 |
items['sour']=items['sour'].replace(self.sour_dict)
|
| 229 |
+
|
| 230 |
products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
|
| 231 |
products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
|
| 232 |
products['gb']=gb
|
preprocess/utils/common/utils.py
CHANGED
|
@@ -64,6 +64,7 @@ def merge_wine_type(items, colors=None, color_merge_dict=None):
|
|
| 64 |
result=[]
|
| 65 |
for row in tqdm(items.iterrows()):
|
| 66 |
try:
|
|
|
|
| 67 |
if row[1]['type_wine'] is not None:
|
| 68 |
color=find_full_word(row[1]['type_wine'], colors)
|
| 69 |
if color is not None:
|
|
@@ -88,12 +89,22 @@ def merge_wine_type(items, colors=None, color_merge_dict=None):
|
|
| 88 |
items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
|
| 89 |
|
| 90 |
|
| 91 |
-
def merge_types(items, products):
|
| 92 |
alco_types=[i.strip().lower() for i in products['type'].unique()]
|
| 93 |
alco_types.append('ликёр')
|
| 94 |
result=[]
|
|
|
|
| 95 |
for row in tqdm(items.iterrows()):
|
| 96 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
type_in_name=find_full_word(row[1]['name'], alco_types)
|
| 98 |
if type_in_name is not None:
|
| 99 |
result.append(type_in_name)
|
|
@@ -111,7 +122,8 @@ def merge_types(items, products):
|
|
| 111 |
result.append(None)
|
| 112 |
|
| 113 |
items['new_type']=result
|
| 114 |
-
items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
def trim_name(text, words_to_remove):
|
|
@@ -125,7 +137,7 @@ def trim_name(text, words_to_remove):
|
|
| 125 |
# Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
|
| 126 |
# Используем re.escape, чтобы экранировать спецсимволы в словах.
|
| 127 |
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
|
| 128 |
-
#print(pattern)
|
| 129 |
|
| 130 |
# Заменяем найденные полные слова на пустую строку.
|
| 131 |
new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
|
|
@@ -141,8 +153,11 @@ def name_trimmer(df, prcess_text, types_and_others):
|
|
| 141 |
gbs=[]
|
| 142 |
sours=[]
|
| 143 |
for idx, row in tqdm(df.iterrows()):
|
|
|
|
| 144 |
text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
|
|
|
|
| 145 |
text=trim_name(text, types_and_others).replace(',','').replace('.','')
|
|
|
|
| 146 |
result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
|
| 147 |
|
| 148 |
gbs.append(gb)
|
|
|
|
| 64 |
result=[]
|
| 65 |
for row in tqdm(items.iterrows()):
|
| 66 |
try:
|
| 67 |
+
#print("merge_wine_type:" + str(row))
|
| 68 |
if row[1]['type_wine'] is not None:
|
| 69 |
color=find_full_word(row[1]['type_wine'], colors)
|
| 70 |
if color is not None:
|
|
|
|
| 89 |
items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
|
| 90 |
|
| 91 |
|
| 92 |
+
def merge_types(items, products, type_merge_dict={}, sub_alco_types=["Бренди", "Шампань", "Шампанское"]):
|
| 93 |
alco_types=[i.strip().lower() for i in products['type'].unique()]
|
| 94 |
alco_types.append('ликёр')
|
| 95 |
result=[]
|
| 96 |
+
|
| 97 |
for row in tqdm(items.iterrows()):
|
| 98 |
try:
|
| 99 |
+
# Parameter 'sub_alco_types' lists alcohol subtypes that are usually written
|
| 100 |
+
# in the product / item name together with their "parent" type; in that case the subtype takes priority.
|
| 101 |
+
# For example, "Вино Шампано Ле Брён де Нёвиль", or "Бренди де Херес"
|
| 102 |
+
if sub_alco_types:
|
| 103 |
+
type_in_name=find_full_word(row[1]['name'], sub_alco_types)
|
| 104 |
+
if type_in_name is not None:
|
| 105 |
+
result.append(type_in_name)
|
| 106 |
+
continue
|
| 107 |
+
|
| 108 |
type_in_name=find_full_word(row[1]['name'], alco_types)
|
| 109 |
if type_in_name is not None:
|
| 110 |
result.append(type_in_name)
|
|
|
|
| 122 |
result.append(None)
|
| 123 |
|
| 124 |
items['new_type']=result
|
| 125 |
+
#items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
|
| 126 |
+
items['new_type'] = items['new_type'].replace(type_merge_dict)
|
| 127 |
|
| 128 |
|
| 129 |
def trim_name(text, words_to_remove):
|
|
|
|
| 137 |
# Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
|
| 138 |
# Используем re.escape, чтобы экранировать спецсимволы в словах.
|
| 139 |
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
|
| 140 |
+
#print("Pattern: " + pattern)
|
| 141 |
|
| 142 |
# Заменяем найденные полные слова на пустую строку.
|
| 143 |
new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
|
|
|
|
| 153 |
gbs=[]
|
| 154 |
sours=[]
|
| 155 |
for idx, row in tqdm(df.iterrows()):
|
| 156 |
+
#print("Name1: " + str(row['name']))
|
| 157 |
text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
|
| 158 |
+
#print("Name2: " + text)
|
| 159 |
text=trim_name(text, types_and_others).replace(',','').replace('.','')
|
| 160 |
+
#print("Name3: " + text)
|
| 161 |
result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
|
| 162 |
|
| 163 |
gbs.append(gb)
|
preprocess/utils/items/attrs.py
CHANGED
|
@@ -6,7 +6,7 @@ def check_spark(row, col_name='name', types=['Игристое', 'игр']):
|
|
| 6 |
return None
|
| 7 |
|
| 8 |
|
| 9 |
-
def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное']):
|
| 10 |
if col_name in row.keys():
|
| 11 |
for t in types:
|
| 12 |
if t.lower() in row[col_name].lower():
|
|
|
|
| 6 |
return None
|
| 7 |
|
| 8 |
|
| 9 |
+
def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное', 'крас.', 'бел.']):
|
| 10 |
if col_name in row.keys():
|
| 11 |
for t in types:
|
| 12 |
if t.lower() in row[col_name].lower():
|
processor/matching.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
|
| 3 |
from tqdm import tqdm
|
| 4 |
from transliterate import translit, detect_language
|
|
@@ -6,6 +7,7 @@ import pandas as pd
|
|
| 6 |
from rapidfuzz import fuzz, process
|
| 7 |
import numpy as np
|
| 8 |
from math import isnan
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def normalize_name(name):
|
|
@@ -20,6 +22,41 @@ def normalize_name(name):
|
|
| 20 |
pass
|
| 21 |
return name.lower()
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def prepare_groups_with_ids(items_df):
|
| 24 |
"""
|
| 25 |
Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
|
|
@@ -31,11 +68,14 @@ def prepare_groups_with_ids(items_df):
|
|
| 31 |
:return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
|
| 32 |
"""
|
| 33 |
items_df = items_df.copy()
|
| 34 |
-
items_df['norm_name'] = items_df['name'].apply(
|
| 35 |
|
| 36 |
grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
|
| 37 |
lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
|
| 38 |
).to_dict()
|
|
|
|
|
|
|
|
|
|
| 39 |
return grouped
|
| 40 |
|
| 41 |
def prepare_groups_by_alternative_keys(items_df):
|
|
@@ -47,13 +87,23 @@ def prepare_groups_by_alternative_keys(items_df):
|
|
| 47 |
:return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
|
| 48 |
"""
|
| 49 |
items_df = items_df.copy()
|
| 50 |
-
items_df['norm_name'] = items_df['name'].apply(
|
| 51 |
|
| 52 |
-
grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
|
|
|
|
| 53 |
lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
|
| 54 |
).to_dict()
|
| 55 |
return grouped
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def order_by_best_year(matched_items, year):
|
| 59 |
best_matched_items = []
|
|
@@ -61,30 +111,37 @@ def order_by_best_year(matched_items, year):
|
|
| 61 |
other_matched_items = []
|
| 62 |
max_year = 0
|
| 63 |
|
| 64 |
-
|
| 65 |
-
year = int(str(year))
|
| 66 |
|
| 67 |
for mi in matched_items:
|
| 68 |
# Если в оригинале указан год, то ищем точное совпадение, иначе сортируем по году в обратном порядке
|
| 69 |
try:
|
| 70 |
-
if
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
max_year_matched_items = [mi]
|
| 75 |
-
max_year =
|
| 76 |
-
elif
|
| 77 |
max_year_matched_items.append(mi)
|
| 78 |
else:
|
| 79 |
-
other_matched_items.append(mi)
|
| 80 |
else:
|
| 81 |
-
other_matched_items.append(mi)
|
| 82 |
except Exception as ex:
|
| 83 |
-
print("Error processing best year for item " + str(mi["item_id"]) + " " + str(ex))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
best_matched_items.extend(other_matched_items)
|
| 87 |
-
return best_matched_items
|
| 88 |
|
| 89 |
|
| 90 |
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
|
|
@@ -121,14 +178,19 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
|
|
| 121 |
product_sour = product['sour']
|
| 122 |
|
| 123 |
key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
|
|
|
|
|
|
|
|
|
|
| 124 |
items_data = items_groups.get(key, [])
|
| 125 |
if items_data:
|
| 126 |
# Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
|
|
|
|
| 127 |
items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
|
| 128 |
else:
|
|
|
|
| 129 |
items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
|
| 130 |
|
| 131 |
-
norm_product_name =
|
| 132 |
matches = process.extract(
|
| 133 |
norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
|
| 134 |
)
|
|
@@ -168,6 +230,7 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
|
|
| 168 |
|
| 169 |
# Второй проход: для продуктов без совпадений ищем по альтернативным группам
|
| 170 |
for idx, product in tqdm(no_match_products):
|
|
|
|
| 171 |
product_brand = product['brand']
|
| 172 |
product_type_wine = product['new_type_wine']
|
| 173 |
product_type = product['new_type']
|
|
@@ -175,19 +238,37 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
|
|
| 175 |
product_name = product['name']
|
| 176 |
product_sour = product['sour']
|
| 177 |
|
| 178 |
-
alt_key = (product_type_wine, product_type, product_volume, product_sour)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
type_items = groups_by_alternative_keys.get(alt_key, [])
|
|
|
|
| 180 |
# Фильтруем, исключая итемы с исходным брендом
|
| 181 |
filtered_items = [item for item in type_items if item[1] != product_brand]
|
| 182 |
if filtered_items:
|
|
|
|
| 183 |
alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
|
| 184 |
else:
|
|
|
|
| 185 |
alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
|
| 186 |
|
| 187 |
-
norm_product_name =
|
|
|
|
|
|
|
| 188 |
alt_matches = process.extract(
|
| 189 |
-
norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=
|
| 190 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
alt_matched_items = [
|
| 192 |
{
|
| 193 |
'item_id': alt_ids[idx_candidate],
|
|
|
|
| 1 |
import json
|
| 2 |
+
from constants.constants import *
|
| 3 |
|
| 4 |
from tqdm import tqdm
|
| 5 |
from transliterate import translit, detect_language
|
|
|
|
| 7 |
from rapidfuzz import fuzz, process
|
| 8 |
import numpy as np
|
| 9 |
from math import isnan
|
| 10 |
+
from preprocess.utils.common.utils import *
|
| 11 |
|
| 12 |
|
| 13 |
def normalize_name(name):
|
|
|
|
| 22 |
pass
|
| 23 |
return name.lower()
|
| 24 |
|
| 25 |
+
|
| 26 |
+
def normalize_name_ex(name):
    """Normalize a name and fold known alternative spellings into canonical ones.

    The name is first passed through ``normalize_name``. Then, for every key of
    ``NORMALIZED_NAMES_ALTERNATIVES_DICT``, the alternatives stored under that
    key are searched for in the name with ``find_full_word``; when something is
    found, its occurrences in the name are replaced by the key.

    :param name: Raw product or item name.
    :return: The normalized name with recognized alternatives replaced.
    """
    normalized = normalize_name(name)
    for canonical, alternatives in NORMALIZED_NAMES_ALTERNATIVES_DICT.items():
        # NOTE(review): find_full_word presumably returns the alternative that
        # occurs in the name as a whole word (falsy when none does) - confirm.
        matched = find_full_word(normalized, alternatives)
        if not matched:
            continue
        # str.replace substitutes every occurrence of the matched text,
        # including occurrences embedded in longer words.
        normalized = normalized.replace(matched, canonical)
    return normalized
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def compare_names(name1, name2, scorer=fuzz.ratio, score_cutoff=50):
    """Score how well the words of ``name1`` are matched by the words of ``name2``.

    Every word of ``name1`` is compared with every word of ``name2``. Each pair
    score that reaches ``score_cutoff`` is summed, and the total is divided by
    ``100 * number of words in name1``. A single word may pass the cutoff
    against several words of ``name2``, so the result is not capped at 1.0.

    :param name1: Name whose words drive the comparison and the normalization.
    :param name2: Candidate name to compare against.
    :param scorer: Callable ``scorer(word1, word2)`` returning a similarity
        score; the division by 100 assumes a 0-100 scale such as ``fuzz.ratio``.
    :param score_cutoff: Minimum pair score that contributes to the total.
    :return: Accumulated score divided by ``100 * len(words of name1)``;
        ``0.0`` when ``name1`` contains no words.
    """
    # Function-scope import keeps this block self-contained within the file.
    import logging

    log = logging.getLogger(__name__)
    log.debug("Scoring: %s vs %s", name1, name2)

    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so empty tokens neither get scored nor inflate the denominator.
    words1 = name1.split()
    words2 = name2.split()
    if not words1:
        return 0.0

    total = 0
    for w1 in words1:
        for w2 in words2:
            pair_score = scorer(w1, w2)
            log.debug("\t %s - %s ; %s", w1, w2, pair_score)
            if pair_score >= score_cutoff:
                total += pair_score

    result = total / (100 * len(words1))
    log.debug("Score result: %s", result)
    return result
|
| 50 |
+
|
| 51 |
+
def compare_name_with_list(name, names_list, scorer=fuzz.ratio, score_cutoff=50):
    """Score ``name`` against every entry of ``names_list`` with ``compare_names``.

    No filtering or sorting is applied: one tuple is produced per candidate,
    in the order of ``names_list``.

    :param name: Name to look for.
    :param names_list: Iterable of candidate names.
    :param scorer: Word-level scorer forwarded to ``compare_names``.
    :param score_cutoff: Word-level cutoff forwarded to ``compare_names``.
    :return: List of ``(candidate, score, index)`` tuples, where ``index`` is
        the candidate's position in ``names_list``.
    """
    return [
        (candidate, compare_names(name, candidate, scorer, score_cutoff), position)
        for position, candidate in enumerate(names_list)
    ]
|
| 58 |
+
|
| 59 |
+
|
| 60 |
def prepare_groups_with_ids(items_df):
    """Pre-group items by (new_brand, type, volume, new_type_wine, sour).

    Works on a copy of ``items_df``, adds a ``norm_name`` column computed with
    ``normalize_name_ex`` from ``name``, and collects the rows of every group
    into a list of tuples.

    :param items_df: DataFrame with at least the columns ``id``, ``name``,
        ``fullname``, ``new_brand``, ``type``, ``volume``, ``new_type_wine``,
        ``sour`` and ``year``.
    :return: Dict ``{(new_brand, type, volume, new_type_wine, sour):
        [(id, name, fullname, norm_name, volume, new_type_wine, sour, year), ...]}``.
    """
    key_columns = ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    record_columns = ['id', 'name', 'fullname', 'norm_name', 'volume', 'new_type_wine', 'sour', 'year']

    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name_ex)

    def to_records(group):
        # One tuple per row, fields in record_columns order.
        return list(zip(*(group[column] for column in record_columns)))

    # NOTE(review): groupby presumably drops rows with missing key values
    # (pandas default) - confirm that is intended.
    return frame.groupby(key_columns).apply(to_records).to_dict()
|
| 80 |
|
| 81 |
def prepare_groups_by_alternative_keys(items_df):
    """Pre-group items by (new_type_wine, new_type, volume), ignoring the brand.

    Works on a copy of ``items_df``, adds a ``norm_name`` column computed with
    ``normalize_name_ex`` from ``name``, and collects the rows of every group
    into a list of tuples. ``sour`` is not part of the key; it is only carried
    inside each tuple.

    :param items_df: DataFrame with at least the columns ``id``, ``new_brand``,
        ``name``, ``fullname``, ``new_type``, ``volume``, ``new_type_wine``,
        ``sour`` and ``year``.
    :return: Dict ``{(new_type_wine, new_type, volume):
        [(id, new_brand, name, fullname, norm_name, volume, new_type_wine, sour, year), ...]}``.
    """
    key_columns = ['new_type_wine', 'new_type', 'volume']
    record_columns = ['id', 'new_brand', 'name', 'fullname', 'norm_name', 'volume', 'new_type_wine', 'sour', 'year']

    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name_ex)

    def to_records(group):
        # One tuple per row, fields in record_columns order.
        return list(zip(*(group[column] for column in record_columns)))

    return frame.groupby(key_columns).apply(to_records).to_dict()
|
| 97 |
|
| 98 |
+
def parse_year(year):
    """Convert a year value of unknown type into an ``int``.

    Accepts anything ``float()`` can interpret: ints, floats, numeric strings
    (including decimal forms such as ``"2015.0"`` and surrounding whitespace)
    and numeric scalar types that support float conversion. The fractional
    part, if any, is truncated.

    :param year: Raw year value; may be ``None``, empty, NaN or non-numeric.
    :return: The year as ``int``, or ``False`` when the value is missing,
        NaN/infinite, or cannot be interpreted as a number.
    """
    try:
        if not year:
            return False
        if isinstance(year, str):
            year = year.strip()
        numeric = float(year)
        if isnan(numeric):
            return False
        # int() raises OverflowError for infinities; handled below.
        return int(numeric)
    except (TypeError, ValueError, OverflowError):
        # Unparseable input is reported as "no year" instead of crashing the
        # caller, which invokes this function outside of any try block.
        return False
|
| 107 |
|
| 108 |
def order_by_best_year(matched_items, year, mismatch_penalty=0.8):
    """Penalize matches whose year differs from the requested one.

    Despite the name, the list is not reordered. When at least one item has a
    year equal to ``year``, the ``score`` of every other item is multiplied by
    ``mismatch_penalty``. When ``year`` is missing or no item matches it
    exactly, the scores are left untouched.

    :param matched_items: List of dicts with the keys ``item_id``, ``year``
        and ``score``. The dicts are modified in place.
    :param year: Requested year in any form accepted by ``parse_year``.
    :param mismatch_penalty: Factor applied to the score of items that are not
        exact year matches (default ``0.8``).
    :return: The same ``matched_items`` list object.
    """
    year = parse_year(year)
    if not year:
        # Without a requested year nothing can be an exact match.
        return matched_items

    exact_ids = set()
    for mi in matched_items:
        try:
            mi_year = parse_year(mi['year'])
            if mi_year and mi_year == year:
                exact_ids.add(mi['item_id'])
        except (KeyError, TypeError, ValueError) as ex:
            # Best effort: a malformed item is reported and treated as a
            # non-match instead of aborting the whole batch.
            print("Error processing best year for item " + str(mi.get("item_id")) + " value " + str(mi.get('year')) + ": " + str(ex))

    if exact_ids:
        for mi in matched_items:
            if mi['item_id'] not in exact_ids:
                mi['score'] = mi['score'] * mismatch_penalty

    return matched_items
|
|
|
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
|
|
|
|
| 178 |
product_sour = product['sour']
|
| 179 |
|
| 180 |
key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
|
| 181 |
+
#print("Name: " + product_name)
|
| 182 |
+
#print("Key: " + str(key))
|
| 183 |
+
#print("Groups: " + str(items_groups))
|
| 184 |
items_data = items_groups.get(key, [])
|
| 185 |
if items_data:
|
| 186 |
# Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
|
| 187 |
+
#print("Data: " + str(items_data))
|
| 188 |
items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
|
| 189 |
else:
|
| 190 |
+
#print("Data: No")
|
| 191 |
items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
|
| 192 |
|
| 193 |
+
norm_product_name = normalize_name_ex(product_name)
|
| 194 |
matches = process.extract(
|
| 195 |
norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
|
| 196 |
)
|
|
|
|
| 230 |
|
| 231 |
# Второй проход: для продуктов без совпадений ищем по альтернативным группам
|
| 232 |
for idx, product in tqdm(no_match_products):
|
| 233 |
+
#print("Product: " + str(product))
|
| 234 |
product_brand = product['brand']
|
| 235 |
product_type_wine = product['new_type_wine']
|
| 236 |
product_type = product['new_type']
|
|
|
|
| 238 |
product_name = product['name']
|
| 239 |
product_sour = product['sour']
|
| 240 |
|
| 241 |
+
#alt_key = (product_type_wine, product_type, product_volume, product_sour)
|
| 242 |
+
alt_key = (product_type_wine, product_type, product_volume)
|
| 243 |
+
|
| 244 |
+
#print("AltName: " + str(product))
|
| 245 |
+
#print("AltKey: " + str(alt_key))
|
| 246 |
+
#print("AltGroups: " + str(groups_by_alternative_keys))
|
| 247 |
+
#print("AltGroups Keys: " + str(groups_by_alternative_keys.keys()))
|
| 248 |
type_items = groups_by_alternative_keys.get(alt_key, [])
|
| 249 |
+
#print("AltGroups2: " + str(type_items))
|
| 250 |
# Фильтруем, исключая итемы с исходным брендом
|
| 251 |
filtered_items = [item for item in type_items if item[1] != product_brand]
|
| 252 |
if filtered_items:
|
| 253 |
+
#print("AltData: " + str(filtered_items))
|
| 254 |
alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
|
| 255 |
else:
|
| 256 |
+
#print("AltData: No")
|
| 257 |
alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
|
| 258 |
|
| 259 |
+
norm_product_name = normalize_name_ex(product_name)
|
| 260 |
+
#print("norm_product_name: " + str(norm_product_name))
|
| 261 |
+
#print("alt_norm_names: " + str(alt_norm_names))
|
| 262 |
alt_matches = process.extract(
|
| 263 |
+
norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=50
|
| 264 |
)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
#alt_matches = compare_name_with_list(
|
| 268 |
+
# norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=70
|
| 269 |
+
#)
|
| 270 |
+
|
| 271 |
+
#print("alt_matches: " + str(alt_matches))
|
| 272 |
alt_matched_items = [
|
| 273 |
{
|
| 274 |
'item_id': alt_ids[idx_candidate],
|
processor/processor.py
CHANGED
|
@@ -5,11 +5,13 @@ from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
|
| 5 |
class Processor():
|
| 6 |
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 7 |
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 8 |
-
sour_merge_dict, type_merge_dict, color_merge_dict
|
|
|
|
| 9 |
|
| 10 |
self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
|
| 11 |
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 12 |
-
sour_merge_dict, type_merge_dict, color_merge_dict
|
|
|
|
| 13 |
|
| 14 |
def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
|
| 15 |
items, products=self.preprocessor.process(products, items)
|
|
|
|
| 5 |
class Processor():
|
| 6 |
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 7 |
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 8 |
+
sour_merge_dict, type_merge_dict, color_merge_dict,
|
| 9 |
+
country_list, normalized_names_dict):
|
| 10 |
|
| 11 |
self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
|
| 12 |
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 13 |
+
sour_merge_dict, type_merge_dict, color_merge_dict,
|
| 14 |
+
country_list, normalized_names_dict)
|
| 15 |
|
| 16 |
def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
|
| 17 |
items, products=self.preprocessor.process(products, items)
|
ui/gradio_ui.py
CHANGED
|
@@ -87,7 +87,7 @@ class GradioUI():
|
|
| 87 |
|
| 88 |
output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
|
| 89 |
output_csv = os.path.join(results_path, output_csv)
|
| 90 |
-
df.to_csv(output_csv, sep='\t', index=False, quotechar="'", quoting=csv.QUOTE_NONE)
|
| 91 |
return output_csv
|
| 92 |
except Exception as ex:
|
| 93 |
raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
|
|
|
|
| 87 |
|
| 88 |
output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
|
| 89 |
output_csv = os.path.join(results_path, output_csv)
|
| 90 |
+
df.to_csv(output_csv, sep='\t', index=False, quotechar="'", quoting=csv.QUOTE_NONE, escapechar="@")
|
| 91 |
return output_csv
|
| 92 |
except Exception as ex:
|
| 93 |
raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
|