File size: 31,173 Bytes
606ca5f
c5b2790
fe2d51b
606ca5f
3cc4e3f
 
 
606ca5f
3cc4e3f
d3ca850
3cc4e3f
 
 
2956b24
95c9287
 
 
c5b2790
3cc4e3f
2956b24
3cc4e3f
 
 
 
 
 
 
 
 
 
 
 
 
c5b2790
 
 
 
 
 
 
 
3cc4e3f
 
95c9287
 
 
 
 
 
 
 
 
c5b2790
 
95c9287
 
c5b2790
95c9287
 
 
 
d4bade4
95c9287
 
 
 
 
 
 
 
 
1f7be9a
 
 
95c9287
 
 
 
 
c5b2790
 
95c9287
 
 
 
 
c5b2790
95c9287
 
c5b2790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4bade4
 
 
c5b2790
 
 
d4bade4
 
 
c5b2790
 
 
d4bade4
 
 
c5b2790
 
 
d4bade4
 
 
c5b2790
95c9287
 
 
 
c5b2790
95c9287
c5b2790
 
 
 
 
 
95c9287
c5b2790
 
95c9287
 
 
c5b2790
 
95c9287
c5b2790
 
95c9287
c5b2790
95c9287
c5b2790
95c9287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5b2790
 
95c9287
 
 
 
 
 
 
 
c5b2790
 
95c9287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5b2790
 
 
95c9287
 
c5b2790
 
95c9287
c5b2790
95c9287
 
c5b2790
95c9287
 
c5b2790
 
 
 
95c9287
c5b2790
95c9287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b19d8a
 
 
 
c5b2790
5b19d8a
c5b2790
 
 
2956b24
c5b2790
2956b24
 
c5b2790
 
5b19d8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95c9287
c5b2790
 
 
 
 
 
 
 
 
 
 
 
 
5b19d8a
c5b2790
 
5b19d8a
c5b2790
 
5b19d8a
c5b2790
5b19d8a
c5b2790
5b19d8a
2956b24
95c9287
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2956b24
 
 
5b19d8a
 
 
 
 
 
95c9287
5b19d8a
 
 
 
 
2956b24
3cc4e3f
 
c5b2790
 
 
 
5b19d8a
 
95c9287
5b19d8a
d4bade4
 
 
5b19d8a
 
 
c5b2790
95c9287
d4bade4
 
5b19d8a
 
d4bade4
 
 
 
eedd5dc
c5b2790
 
 
5b19d8a
 
 
 
 
 
 
c5b2790
 
 
 
 
5b19d8a
c5b2790
 
d3ca850
5b19d8a
c5b2790
95c9287
5b19d8a
95c9287
d4bade4
95c9287
 
 
 
c5b2790
 
d4bade4
 
 
5b19d8a
 
95c9287
 
 
5b19d8a
 
 
 
95c9287
 
 
5b19d8a
95c9287
 
5b19d8a
95c9287
 
5b19d8a
95c9287
 
 
5b19d8a
 
95c9287
 
5b19d8a
c5b2790
 
 
 
95c9287
 
 
c5b2790
5b19d8a
95c9287
 
 
 
5b19d8a
 
 
95c9287
 
c5b2790
95c9287
 
 
5b19d8a
c5b2790
95c9287
 
 
5b19d8a
 
95c9287
 
 
 
 
 
5b19d8a
 
95c9287
 
606ca5f
 
95c9287
 
2956b24
 
95c9287
 
 
2956b24
c5b2790
95c9287
c5b2790
 
 
 
 
 
95c9287
2956b24
c5b2790
95c9287
c5b2790
95c9287
 
2956b24
 
d4bade4
 
 
 
2956b24
d4bade4
 
 
 
 
 
95c9287
d4bade4
 
2956b24
d4bade4
 
95c9287
d4bade4
 
 
2956b24
d4bade4
 
 
 
 
95c9287
d4bade4
 
 
3cc4e3f
d4bade4
 
 
 
95c9287
d4bade4
 
606ca5f
d4bade4
 
95c9287
d4bade4
 
2956b24
 
95c9287
 
 
c5b2790
95c9287
c5b2790
606ca5f
d4bade4
 
 
 
 
 
95c9287
c5b2790
 
 
 
2956b24
 
c5b2790
 
 
 
 
2956b24
95c9287
 
 
 
2956b24
95c9287
 
 
 
 
 
 
 
3cc4e3f
2956b24
3cc4e3f
95c9287
 
d4bade4
 
 
 
 
3cc4e3f
 
 
 
 
 
 
 
95c9287
2956b24
 
 
 
 
 
3cc4e3f
95c9287
2956b24
95c9287
2956b24
 
 
 
 
 
 
 
 
 
 
3cc4e3f
 
 
 
 
 
 
2956b24
 
3cc4e3f
95c9287
2956b24
95c9287
 
 
 
2956b24
95c9287
 
 
2956b24
3cc4e3f
 
 
 
 
 
 
 
 
95c9287
 
3cc4e3f
95c9287
3cc4e3f
95c9287
 
3cc4e3f
 
 
 
95c9287
 
3cc4e3f
 
 
 
 
95c9287
606ca5f
 
 
2956b24
606ca5f
 
d3ca850
 
606ca5f
 
 
 
 
 
 
 
 
 
 
c5b2790
 
606ca5f
c5b2790
 
606ca5f
 
 
95c9287
3cc4e3f
c5b2790
3cc4e3f
606ca5f
 
2956b24
3cc4e3f
5b19d8a
 
 
95c9287
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
import os.path
from preprocess.utils.common.utils import normalize_name
from math import isnan

from preprocess.utils.items.attrs import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.top_inserts import *
from preprocess.utils.products.products import *
import pandas as pd
from processor.matching import prepare_groups_with_ids_ex

class Preprocessor():

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, grapes, other_words,
                 #sour_merge_dict,
                 type_merge_dict, color_merge_dict,
                 country_list):
        """Store the dictionaries used while parsing products and items.

        :param long_types_list: full drink-type names; a lower-cased copy is kept.
        :param short_types_list: short drink-type names, stored as is.
        :param sour_list: sweetness ("sour") words.
        :param type_wine: wine colour words.
        :param gbs: gift-box words.
        :param grapes: grape variety names.
        :param other_words: other service words.
        :param type_merge_dict: mapping used to merge/replace drink types.
        :param color_merge_dict: mapping used to merge wine colours.
        :param country_list: country names.
        """
        self.long_types_list = [element.lower() for element in long_types_list]
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.grapes = grapes
        self.other_words = other_words

        # Words passed to trim_name(); concatenation builds a new list, so the
        # caller's lists are not modified by the removal below.
        self.types_n_others = long_types_list + other_words + sour_list + country_list
        # "Шерри" is excluded from the trim list (presumably because it also
        # occurs inside product names - TODO confirm). Guarded so that input
        # lists without this word no longer raise ValueError.
        if "Шерри" in self.types_n_others:
            self.types_n_others.remove("Шерри")

        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict
        self.country_list = country_list

        # Extend the brand -> type mapping with normalized brand keys.
        # NOTE(review): this rebinds the name in this module only; code in
        # other modules holding the original dict object will not see the
        # added keys - confirm this is intended.
        global TYPES_FROM_BRAND_DICT
        updated = {}
        for k, v in TYPES_FROM_BRAND_DICT.items():
            updated[k] = v
            updated[normalize_name(k)] = v
        TYPES_FROM_BRAND_DICT = updated



    def write_log(self, logfn, s):
        """Echo message ``s`` to stdout and append it, timestamped, to file ``logfn``."""
        message = s + "\n"
        print(message)

        stamp = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]: ')
        with open(logfn, 'a') as log_file:
            log_file.write(stamp + message)



    def process_products(self, products):
        """Parse raw catalogue rows into a DataFrame of normalized product attributes.

        Each row is read through the columns ``id``, ``brand``, ``name``,
        ``name_translit``, ``product_type``, ``category``, ``type_prefix``,
        ``volume`` and ``name_postfix``. Rows whose ``product_type`` is NaN are
        skipped with a message. A row that raises while being parsed is
        reported and skipped as a whole.

        A row's values are collected in a local record and appended to the
        result columns only after the whole row has been parsed, so a failing
        row can never leave the columns with different lengths (which would
        make the final ``pd.DataFrame`` construction fail).

        :param products: DataFrame with the raw products catalogue.
        :return: DataFrame with one line per successfully parsed product;
                 ``index`` numbers the kept products consecutively from 0.
        """
        columns = ['index', 'id', 'orig_brand', 'brand', 'brand_unwrap',
                   'orig_name', 'name', 'name_wo_brand', 'name_with_brand',
                   'orig_name_2', 'name_2',
                   'orig_type', 'type', 'type_l1', 'type_l0',
                   'orig_type_wine', "type_wine", 'sour',
                   "volume", "gb", "year", 'alco', 'other']
        result = {column: [] for column in columns}

        index = 0
        for idx, row in tqdm(products.iterrows()):
            try:
                if isinstance(row['product_type'], (int, float)) and isnan(row['product_type']):
                    print("Product type is not specified or incorrect for product id=[" + str(row['id']) + "]. Product is ignored")
                    continue

                record = {
                    'index': index,
                    'id': row['id'],
                    'orig_brand': row['brand'],
                    'orig_name': row['name'],
                    'orig_name_2': row['name_translit'],
                    'orig_type': row['product_type'],
                    'orig_type_wine': row['category'],
                }

                brand = preprocess_product_brand(row['brand'])
                name = preprocess_product_name(row['name'])
                name_translit = preprocess_product_name(row['name_translit'])

                # First of all check if it is sparkling wine; the type word is
                # also stripped from the name (second call, remove=True).
                drink_type, _ = extract_spark(row['product_type'], False)
                _, name = extract_spark(name, True)

                if not drink_type:
                    drink_type, _ = extract_type(row['product_type'], False)
                    _, name = extract_type(name, True)

                # Unknown type: fall back to the raw product type.
                if not drink_type:
                    drink_type = row['product_type'].lower()

                type_wine = None
                sour_wine = ''
                if isinstance(row['type_prefix'], str) and row['type_prefix']:
                    type_wine, sour_wine, _ = extract_color_and_sour(row['type_prefix'], remove=False)
                    if drink_type is None and (type_wine or sour_wine):
                        drink_type = 'вино'

                volume = is_volume(row['volume'])
                year, _ = extract_production_year(str(row['name_postfix']))
                gb, _ = extract_gb(row['name_postfix'], False)
                # Alcohol content is read before the attributes are stripped from the name.
                alco, _ = extract_alcohol_content(name)

                name, _, volume_n, _, year_n, _, color_n, sour_wine_n, other_n = extract_attributes_from_name(name)
                name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')
                name = normalize_and_clean_name(name)

                name_translit, _, _, _, _, _, _, _, _ = extract_attributes_from_name(name_translit)
                name_translit = trim_name(name_translit, self.types_n_others).replace(',', ' ').replace('.', ' ')
                name_translit = normalize_and_clean_name(name_translit)

                # Explicit columns win; values parsed from the name only fill gaps.
                if not year:
                    year = year_n

                if not type_wine:
                    type_wine = color_n

                if not sour_wine:
                    sour_wine = sour_wine_n

                if not volume:
                    volume = volume_n
                elif volume_n and (volume_n != volume):
                    print("Product volume conflict detected for product id=[" + str(row['id']) + "]: " + str(volume) + " vs " + str(volume_n))

                if not type_wine:
                    type_wine = ''

                if not sour_wine:
                    sour_wine = ''

                record.update({
                    'brand': brand,
                    'brand_unwrap': '',
                    'name': name,
                    'name_2': name_translit,
                    'name_wo_brand': '',
                    'name_with_brand': '',
                    'type': drink_type.lower(),
                    'type_wine': type_wine.lower(),
                    'type_l1': '',
                    'type_l0': '',
                    'sour': sour_wine,
                    'volume': volume,
                    'year': year,
                    'gb': gb,
                    'alco': alco,
                    'other': other_n,
                })

                # Commit the fully built row; nothing above has touched `result`.
                for column in columns:
                    result[column].append(record[column])

                index += 1
            except Exception as ex:
                # NOTE: idx is the DataFrame index label of the row, not the product id.
                print("Error processing product id=" + str(idx) + ": " + str(ex))
        return pd.DataFrame(result)


    def process_products_full(self, products_data):
        """Run the whole products-catalogue pipeline and store its results in ``products_data``.

        Reads the CSV file at ``products_data['path']``, parses it with
        ``process_products``, derives the brand / name / type helper columns,
        builds the grouping structures with ``prepare_groups_with_ids_ex`` and
        finally calls ``save_products_data`` and ``remove_old_products``.
        Progress and errors are written via ``write_log`` to ``update_log.txt``
        inside ``products_data['dir']``.

        :param products_data: mapping with at least the keys ``'dir'`` and
                              ``'path'``; it is filled in place.
        :return: the same ``products_data`` object on success, or ``None`` if
                 any step raised an exception (the error is logged).
        """
        logfn = os.path.join(products_data['dir'], "update_log.txt")
        try:
            self.write_log(logfn, "Products processing started")

            prods_file = products_data['path']
            products_delimiter = get_delimiter(prods_file)
            # row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')
            products = pd.read_csv(prods_file, sep=products_delimiter)

            # 1) Parse the raw rows; from here on `products` is the parsed frame
            self.write_log(logfn, '------*-----Prepare products catalogue-----*-----')
            products = self.process_products(products.copy())

            # Distinct drink types found in the catalogue (taken before types are merged below)
            products_data["dict_types"] = products['type'].unique().tolist()

            # 2)
            #products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

            # 3)
            #products_data["brand_3"] = products['brand'].unique()

            self.write_log(logfn, '------*-----Unwrapping brands-----*-----')
            products["brand_unwrap"] = products["brand"]
            # 4) Brand unwrapping is disabled: the mapping is left empty
            ##products_data["unwrap_brands_1"] = unwrap_brands(products)
            products_data["unwrap_brands_1"] = {}

            # 5) With an empty mapping this replace leaves the brands unchanged
            products["brand_unwrap"] = products["brand"].replace(products_data["unwrap_brands_1"])

            # 6)
            #products_data["unwrap_brand_2"] = unwrap_brands(products)

            # 7) Second unwrapping pass, disabled as well
            ##products_data["unwrap_brands_2"] = unwrap_brands(products, products['brand_unwrap'].unique())
            products_data["unwrap_brands_2"] = {}

            # 8) Keep the unwrapped brand only where it differs from the brand, otherwise store ''
            products["brand_unwrap"] = products["brand_unwrap"].replace(products_data["unwrap_brands_2"])
            products["brand_unwrap"] = products.apply(lambda row: row["brand_unwrap"] if row["brand_unwrap"] != row["brand"] else '', axis=1)

            # 9) The return values of both helpers are ignored here
            # NOTE(review): presumably they add the 'new_type_wine' / 'new_type' columns
            # used for grouping below - confirm against their implementation
            self.write_log(logfn, '-----*-----Adding service categories-----*-----')
            merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
            merge_types(products, products, type_merge_dict=self.type_dict)

            # Now we can normalize and clean brands and names (only after trimming)
            products['brand'] = products['brand'].apply(normalize_and_clean_brand)
            products['norm_name'] = products['name']

            # 11) Map drink types through the type merge dictionary
            self.write_log(logfn, '-----*-----Replacing product types-----*-----')
            products['type']=products['type'].replace(self.type_dict)

            products['new_brand']=products['brand']
            #products["name_with_brand"] = products["name"]

            # Two name variants per product: with the brand removed and with the brand inserted
            products["name_wo_brand"] = products.apply(lambda row: remove_brand_from_name(row['name'], row['brand']), axis=1)
            products["name_with_brand"] = products.apply(lambda row: insert_brand_in_name(row['name'], row['brand']), axis=1)
            #products["name_wo_brand_len"] = products['name_wo_brand'].apply(lambda x: len(x))



            #products_data["dict_groups_brand_type_vol_typewine"] = prepare_groups_with_ids_ex(products, ["new_brand", 'type', 'volume', 'new_type_wine'])
            products_data["groups_brand_type_vol"] = prepare_groups_with_ids_ex(products, ["new_brand", 'type', 'volume'], "name_wo_brand")

            # Change it from type_wine to type
            # type_l1 is derived from type, type_l0 from type_l1, via the two level dictionaries
            products['type_l1'] = products['type'].replace(TYPES_LEVEL_1_DICT)
            products['type_l0'] = products['type_l1'].replace(TYPES_LEVEL_0_DICT)

            products_data["groups_brand_typel1_vol"] = prepare_groups_with_ids_ex(products, ['new_brand', 'type_l1', 'volume'], "name_wo_brand")
            products_data["groups_brand_typel0_vol"] = prepare_groups_with_ids_ex(products, ['new_brand', 'type_l0', 'volume'], "name_wo_brand")

            products_data["groups_typewine_type_vol"] = prepare_groups_with_ids_ex(products, ['new_type_wine', 'new_type', 'volume'], "name_with_brand")

            products_data["groups_typel0"] = prepare_groups_with_ids_ex(products, ['type_l0'], "name_with_brand")

            #products_data["dict_groups_typel1_vol"] = prepare_groups_with_ids_ex(products, ['type_l1','volume'])
            #products_data["dict_groups_typel0_vol"] = prepare_groups_with_ids_ex(products, ['type_l0','volume'])
            #products_data["dict_groups_vol"] = prepare_groups_with_ids_ex(products, ['volume'])

            products_data["df_products"] = products
            save_products_data(products_data)

            remove_old_products(products_data)

            self.write_log(logfn, "Products processing finished")
        except Exception as ex:
            # Any failure is logged and reported to the caller as None
            self.write_log(logfn, "An error occurred: " + str(ex))
            return None

        return products_data


    def preprocess_item_brand(self, brand):
        """Split a raw brand value into ``(brand, alternative_brand)``.

        A non-string value is returned as ``(str(brand), '')``. For a string,
        the text before the first ``/`` is the brand and the text between the
        first and the second ``/`` is the alternative brand; both are stripped.
        Without a ``/`` the alternative brand is ``''``.
        """
        if isinstance(brand, str):
            head, slash, tail = brand.partition('/')
            if slash:
                # Anything after a second slash is ignored.
                second = tail.partition('/')[0]
                return head.strip(), second.strip()
            return brand.strip(), ''

        return str(brand), ''



    def detect_language_simple_2(self, name, reverse=False):
        """Guess the language of ``name`` by counting Cyrillic and Latin letters.

        :param name: text to inspect.
        :param reverse: accepted for backward compatibility only; letter counts
                        do not depend on character order, so it has no effect.
        :return: ``'xx'`` when fewer than two letters of each alphabet are
                 present, ``'ru'`` when Cyrillic letters outnumber Latin ones,
                 otherwise ``'en'`` (ties included).
        """
        ru_count = 0
        en_count = 0

        for ch in name:
            # U+0410..U+044F covers А-Я and а-я; Ё/ё live outside that range
            # and have to be checked separately.
            if 'А' <= ch <= 'я' or ch in 'Ёё':
                ru_count += 1
            elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
                en_count += 1

        if ru_count < 2 and en_count < 2:
            return 'xx'

        if ru_count > en_count:
            return 'ru'

        return 'en'


    def check_alternative_name(self, name, check_len = True, simple_lang_check=True):
        startpos = 0
        while True:
            pos = name.find("/", startpos)
            if pos == -1:
                return name, ''

            parts = [name[:pos], name[pos+1:]]
            startpos = pos + 1

            if check_len:
                if float(min(len(parts[0]), len(parts[1]))) / max(len(parts[0]), len(parts[1])) < 0.5:
                    continue

                if len(parts[1]) < 3:
                    return name, ''

            lang1 = self.detect_language_simple_2(parts[0], True)
            lang2 = self.detect_language_simple_2(parts[1])
            if (lang1 == 'ru' and lang2=='en') or (lang1 == 'en' and lang2=='ru'):
                return parts[0], parts[1]

        return name, ''


    def merge_multiline_name(self, name_parts):
        """Regroup the lines of a multi-line name by language.

        The language of the first line (as reported by ``detect_language``) is
        the reference: lines in that language are joined to the first line with
        single spaces, all other lines form the second name, each one prefixed
        with a space (so a non-empty second name starts with a space).

        :param name_parts: non-empty sequence of name lines.
        :return: ``(name, name_2)``.
        """
        primary = [name_parts[0]]
        secondary = []

        base_lang = detect_language(primary[0])
        for part in name_parts[1:]:
            bucket = primary if detect_language(part) == base_lang else secondary
            bucket.append(part)

        return " ".join(primary), "".join(" " + part for part in secondary)


    def process_multiline_name(self, name, check_len = True, simple_lane_check=True):
        """Split a name whose lines are separated by the ``" ##### "`` marker.

        :param name: item name, possibly containing the line marker.
        :param check_len: unused.
        :param simple_lane_check: unused.
        :return: ``(name, '')`` for an empty or single-line name, the two lines
                 for a two-line name, and the result of
                 ``merge_multiline_name`` for three or more lines.
        """
        marker = " ##### "
        if not name or name.find(marker) < 0:
            return name, ''

        segments = name.split(marker)
        # Complex multiline names, e.g.
        # "Луи Мемори До\nВыдержка: от 30 до 50 лет\nLouis Memory Deau\nAgeing: from 30 to 50 years",
        # are regrouped by language.
        if len(segments) > 2:
            return self.merge_multiline_name(segments)

        first_line, second_line = segments[0], segments[1]
        return first_line, second_line



    def process_items(self, df):
        """Parse feed items into a DataFrame of normalized attributes.

        ``df`` must provide the columns ``id`` and ``attrs``; ``attrs`` holds a
        JSON object as text. The text is lower-cased and its line breaks are
        replaced with the ``" ##### "`` marker before it is decoded. Items
        whose ``attrs`` is empty or not a string are skipped silently. An item
        that raises while being parsed is reported and skipped as a whole.

        An item's values are collected in a local record and appended to the
        result columns only after the whole item has been parsed, so a failing
        item can never leave the columns with different lengths (which would
        make the final ``pd.DataFrame`` construction fail).

        Attributes given explicitly (``type``, ``type_wine``, ``category``,
        ``color``, ``sugar``, ``volume``, ``year``) take precedence; values
        extracted from the name only fill the gaps.

        :param df: DataFrame with the raw items.
        :return: DataFrame with one line per successfully parsed item.
        """
        columns = ['id', 'orig_brand', 'brand', 'brand_short', 'brand_2', 'brand_2_short', 'alt_brands',
                   'orig_name', 'name', 'name_wo_brand', 'name_with_brand',
                   'name_2', 'name_2_wo_brand', 'name_2_with_brand',
                   'names_wo_alt_brands', 'names_with_alt_brands', 'names_2_wo_alt_brands', 'names_2_with_alt_brands',
                   'type', 'new_type', 'type_n',
                   "type_wine", "new_type_wine", "type_wine_n",
                   "sour", "volume", 'gb', "year", 'aging', 'alco']
        result = {column: [] for column in columns}

        for idf, raw_attrs in tqdm(zip(df['id'].values, df['attrs'].values)):

            try:
                if not isinstance(raw_attrs, str) or not raw_attrs:
                    continue

                attrs = json.loads(raw_attrs.lower().replace("\\n", " ##### ").replace("\n", " ##### "))

                if 'brand' in attrs:
                    orig_brand = attrs['brand']
                    brand, brand_2 = self.preprocess_item_brand(orig_brand)

                    brand = normalize_and_clean_brand(brand)
                    brand_2 = normalize_and_clean_brand(brand_2)
                else:
                    orig_brand = None
                    brand = brand_2 = None

                orig_name = attrs['name']
                name = orig_name

                # First of all remove from the name specific brands that cause collisions while parsing and trimming
                name, specific_brand, specific_name = replace_specific_brand_and_name(name)
                if specific_brand:
                    if brand and (brand != specific_brand):
                        print("Conflict between brand and specific brand for item id=[" + str(idf) + "]")
                    else:
                        brand = specific_brand = normalize_and_clean_brand(specific_brand)
                        brand_2 = None

                if specific_name:
                    specific_name = normalize_and_clean_name(specific_name)

                # Names spanning several lines are split here, because the line break is a universal separator.
                # Names whose variants are separated with "/" are handled later by check_alternative_name,
                # after all attributes are extracted.
                name, name_2 = self.process_multiline_name(name)

                type_wine = None
                sour_wine = None

                # First of all check if it is sparkling wine
                drink_type, name = extract_spark(name, False)

                if not drink_type and ('type_wine' in attrs):
                    drink_type, _ = extract_spark(attrs['type_wine'], False)

                # Next check any other known type, attribute by attribute
                if not drink_type and ('type' in attrs):
                    drink_type, _ = extract_type(attrs['type'], False)

                if not drink_type and ('type_wine' in attrs):
                    drink_type, _ = extract_type(attrs['type_wine'], False)

                if not drink_type and ('category' in attrs):
                    drink_type, _ = extract_type(attrs['category'], False)

                # Special case for brands like 'jaegermeister', which is sometimes the only thing specified,
                # so the drink type is detected from the brand alone when possible
                if not drink_type and brand:
                    drink_type = extract_type_by_brand_name(brand)

                if 'type_wine' in attrs:
                    type_wine, sour_wine, _ = extract_color_and_sour(attrs['type_wine'], remove=False)
                    if drink_type is None and (type_wine or sour_wine):
                        drink_type = 'вино'

                # Try to extract type_wine and sour from the "color" attribute if it exists
                if 'color' in attrs:
                    if not type_wine:
                        type_wine, _ = extract_color(attrs['color'])
                        if type_wine and drink_type is None:
                            drink_type = 'вино'

                    if not sour_wine:
                        sour_wine, _ = extract_sour(attrs['color'])
                        if sour_wine and drink_type is None:
                            drink_type = 'вино'

                # Try to extract sour from the "sugar" attribute if it exists
                if 'sugar' in attrs:
                    if sour_wine is None:
                        sour_wine, _ = extract_sour(attrs['sugar'])
                        if sour_wine and drink_type is None:
                            drink_type = 'вино'

                volume = attrs.get('volume')
                year = attrs.get('year')

                drink_type_n, name = extract_type(name, True)

                name, alcohol_n, volume_n, aging, year_n, gb, color_n, sour_wine_n, _ = extract_attributes_from_name(name)
                name = trim_name(name, self.types_n_others).replace(',', ' ').replace('.', ' ')

                # If the alternative name is not specified, it is time to look for it
                # (after all attributes that could break the logic are removed, but before
                # normalization in order to keep the language difference)
                if not name_2:
                    name, name_2 = self.check_alternative_name(name)

                name = normalize_and_clean_name(name)

                if name_2:
                    name_2, _, _, _, _, _, _, _, _ = extract_attributes_from_name(name_2)
                    name_2 = trim_name(name_2, self.types_n_others).replace(',', ' ').replace('.', ' ')
                    name_2 = normalize_and_clean_name(name_2)

                if specific_brand or specific_name:
                    name = restore_specific_brand_and_name(name, specific_brand, specific_name)

                # Values extracted from the name only fill in what the attributes did not provide
                if not drink_type:
                    drink_type = drink_type_n

                # No explicit alcohol attribute is read, so the value always comes from the name
                alcohol = alcohol_n

                if not volume:
                    volume = volume_n

                if not year:
                    year = year_n

                if not type_wine:
                    type_wine = color_n

                if not sour_wine:
                    sour_wine = sour_wine_n

                record = {
                    'id': idf,
                    'orig_brand': orig_brand,
                    'brand': brand,
                    'brand_short': '',
                    'brand_2': brand_2,
                    'brand_2_short': '',
                    'alt_brands': [],
                    'orig_name': orig_name,
                    'name': name,
                    'name_wo_brand': '',
                    'name_with_brand': '',
                    'name_2': name_2,
                    'name_2_wo_brand': '',
                    'name_2_with_brand': '',
                    'names_wo_alt_brands': [],
                    'names_with_alt_brands': [],
                    'names_2_wo_alt_brands': [],
                    'names_2_with_alt_brands': [],
                    'type': drink_type,
                    'new_type': '',
                    'type_n': '',
                    'type_wine': type_wine,
                    'new_type_wine': '',
                    'type_wine_n': '',
                    'sour': sour_wine,
                    'volume': volume,
                    'gb': gb,
                    'year': year,
                    'aging': aging,
                    'alco': alcohol,
                }

                # Commit the fully built item; nothing above has touched `result`.
                for column in columns:
                    result[column].append(record[column])

            except Exception as ex:
                print("Error occurred while processing item id=" + str(idf), ex)

        return pd.DataFrame(result)


    def prcess_text(self, text):
        """Pull the recognised attributes out of ``text`` one after another.

        Every extraction helper is called with ``True`` as its second argument
        and hands back the remaining text together with the value it found.
        Each step works on the text left over by the previous step, so the
        order of the calls below is significant.

        Args:
            text: String to analyse.

        Returns:
            Tuple ``(text, alcohol, volume_or_number, years, production_year,
            gb, color, sour)``: the text remaining after all steps, followed
            by the values returned by the individual helpers.
        """
        # Whatever find_full_word locates among self.gbs is blanked out first.
        gb_match = find_full_word(text, self.gbs)
        if gb_match is not None:
            text = text.replace(str(gb_match), ' ')

        alco_value, text = extract_alcohol_content(text, True)

        aging_years, text = extract_years(text, True)
        if aging_years is not None:
            # extract_years found something: also blank the aging keywords.
            for keyword in ('выдержка', 'aging', 'ageing'):
                text = text.replace(keyword, ' ')

        vintage, text = extract_production_year(text, True)

        volume, text = extract_volume_or_number(text, True)

        wine_color, sourness, text = extract_color_and_sour(text, True)

        return (
            text,
            alco_value,
            volume,
            aging_years,
            vintage,
            gb_match,
            wine_color,
            sourness,
        )


    def process_new(self, products_data, items):
        """Prepare the items catalogue and align its brands and types with the products.

        Args:
            products_data: Mapping that provides ``"df_products"`` (products
                table with a ``brand`` column) and ``"dict_types"``. When
                ``"df_products"`` is missing, the input is first run through
                ``self.process_products_full``.
            items: Items table. A copy is passed to ``self.process_items``;
                all further steps operate on the result of that call.

        Returns:
            Tuple ``(items, products)``: the processed items table and the
            products table taken from ``products_data["df_products"]``.
        """
        if "df_products" not in products_data:
            products_data = self.process_products_full(products_data)

        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())

        products = products_data["df_products"]
        products_brands = products['brand'].unique()

        items['type'] = items['type'].replace(self.type_dict)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products, type_merge_dict=self.type_dict, product_types=products_data["dict_types"])

        # Normalise the raw brand: stringified, trimmed, lower-cased.
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products_brands, items)
        fill_brands_in_dataframe_2(products_brands, items)

        print('-----*-----Brand matching-----*-----')
        # NOTE(review): a single call is made here on the assumption that
        # get_same_brands only computes its result and has no side effects.
        comp_list, _prod_brand_list, items_brand_list = get_same_brands(products, items)
        # Item brands without a counterpart in comp_list get a second matching pass.
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products_brands))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        print('-----*-----Finding brands in names-----*-----')
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items['name'].values
        # Only brands longer than three characters are searched for in names.
        p_brands = [i for i in products_brands if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        # Overwrite new_brand only for rows whose name has an entry in the mapping.
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, self.grapes, self.other_words)

        items['brand'] = items['brand'].replace('none', None)

        items['new_type'] = items['new_type'].replace(self.type_dict)

        # Two coarser type levels: l1 is mapped from type, l0 from l1.
        items['type_l1'] = items['type'].replace(TYPES_LEVEL_1_DICT)
        items['type_l0'] = items['type_l1'].replace(TYPES_LEVEL_0_DICT)

        return items, products