File size: 2,428 Bytes
f556b0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from preprocess.utils.common.extracters import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.parallel_brand_matching import *
from tqdm import tqdm
import re
import math
import numpy as np



def top_inserts_matching(other_brands, p_brands, items, th=65):
    """Match simplified candidate names against known brands and write hits to ``items``.

    Each name in ``other_brands`` is reduced to a search key: the text before
    the first ``'/'`` with every occurrence of ``'Шато'`` removed. The keys are
    passed to ``check_brands_in_strings_pqdm`` together with ``p_brands`` and
    ``threshold=th``. Every match is then mapped back to the *original*
    name(s) that produced the key, and rows of ``items`` whose ``new_name``
    equals such an original name get the matched value in ``new_brand``.

    Compared with the previous implementation, the key -> original mapping is
    built with the same key that is actually searched, so names containing
    exactly one ``'/'`` are no longer dropped, and several original names that
    collapse to the same key all receive the match instead of only one.

    Args:
        other_brands: Iterable of candidate name strings (values that also
            occur in ``items['new_name']``).
        p_brands: Known brands, forwarded unchanged to
            ``check_brands_in_strings_pqdm``.
        items: pandas-style table with ``new_name`` and ``new_brand`` columns;
            modified in place.
        th: Threshold forwarded to ``check_brands_in_strings_pqdm``.

    Returns:
        None. ``items['new_brand']`` is updated in place.
    """
    search_keys = []
    originals_by_key = {}
    for original in other_brands:
        key = original.split('/')[0].replace('Шато', '')
        search_keys.append(key)
        originals_by_key.setdefault(key, []).append(original)

    # NOTE(review): the helper is assumed to return a mapping
    # {search key: matched brand}; that is how its result is consumed below.
    matched = check_brands_in_strings_pqdm(search_keys, p_brands, threshold=th)

    brand_by_name = {}
    for key, brand in matched.items():
        # Fall back to the key itself if the helper returns a key we did not
        # send, which mirrors the original "else" branch.
        for original in originals_by_key.get(key, (key,)):
            brand_by_name[original] = brand

    hit_rows = items['new_name'].isin(list(brand_by_name))
    items.loc[hit_rows, 'new_brand'] = items['new_name'].map(brand_by_name)


def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """Build a cleaned ``new_name`` for unbranded rows and assign brands to frequent names.

    All work is done in place on ``items``; nothing is returned.

    1. For each distinct ``name`` among rows whose ``new_brand`` is missing,
       call ``process_text`` and keep the first element of its result. Then
       remove the entries of ``types``, ``grape_varieties`` and
       ``onther_words`` with ``trim_name``, delete the characters ``. , ( )``
       and finish with ``clean_wine_name(...).strip()``.
    2. Create the ``new_name`` column (``None`` by default) and fill it for
       every row whose ``name`` was processed.
    3. Count the occurrences of every non-missing ``new_name`` and keep the
       names occurring more than once.
    4. Compute the threshold ``sqrt(((mean + std) ** 2) // 2)`` over those
       counts and keep the names whose count is above it.
    5. Match those names against ``p_brands`` with
       ``check_brands_in_strings_pqdm`` (helper's default threshold), write
       the matches to ``new_brand``, then run ``top_inserts_matching`` on the
       same names.

    If no cleaned name is repeated, or none exceeds the threshold, the
    function stops after step 2/4 instead of computing statistics on an empty
    array (which yields NaN and NumPy warnings) and calling the matchers with
    empty input.

    Args:
        items: pandas-style table with ``name`` and ``new_brand`` columns;
            gains/overwrites ``new_name`` and has ``new_brand`` updated.
        p_brands: Known brands, forwarded to the matching helpers.
        process_text: Callable taking a raw name and returning a sequence
            whose first element is the extracted name text.
        types: Passed to ``trim_name`` as the first list of words to remove.
        grape_varieties: Passed to ``trim_name`` as the second list.
        onther_words: Passed to ``trim_name`` as the third list.

    Returns:
        None.
    """
    from collections import Counter

    strip_punctuation = str.maketrans('', '', '.,()')

    # Distinct raw names only: the result is keyed by raw name, so processing
    # duplicates again would just overwrite the same entry.
    raw_names = items.loc[items['new_brand'].isna(), 'name'].unique()

    cleaned_by_raw = {}
    for raw_name in tqdm(raw_names):
        name, *_ = process_text(raw_name)
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        name = name.translate(strip_punctuation)
        cleaned_by_raw[raw_name] = clean_wine_name(name).strip()

    items['new_name'] = None
    processed_rows = items['name'].isin(list(cleaned_by_raw))
    items.loc[processed_rows, 'new_name'] = items['name'].map(cleaned_by_raw)

    # One pass over the column; Counter keeps first-appearance order, the same
    # order the previous per-name scan produced.
    occurrences = Counter(items['new_name'].dropna())
    repeated = {name: count for name, count in occurrences.items() if count > 1}
    if not repeated:
        return

    counts = np.array(list(repeated.values()))
    th = math.sqrt(((counts.mean() + counts.std()) ** 2) // 2)
    other_brands = [name for name, count in repeated.items() if count > th]
    if not other_brands:
        return

    matched = check_brands_in_strings_pqdm(other_brands, p_brands)
    matched_rows = items['new_name'].isin(list(matched))
    items.loc[matched_rows, 'new_brand'] = items['new_name'].map(matched)

    top_inserts_matching(other_brands, p_brands, items)