Spaces:
Sleeping
Sleeping
File size: 2,428 Bytes
f556b0c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from preprocess.utils.common.extracters import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.parallel_brand_matching import *
from tqdm import tqdm
import re
import math
import numpy as np
def top_inserts_matching(other_brands, p_brands, items, th=65):
    """Match simplified candidate names against ``p_brands`` and tag ``items``.

    Every entry of ``other_brands`` is reduced to a lookup key: the text
    before the first ``'/'`` with all occurrences of ``'Шато'`` removed.
    The keys are handed to ``check_brands_in_strings_pqdm`` together with
    ``p_brands`` and ``threshold=th``. Each match is then written to
    ``items['new_brand']`` for the rows whose ``items['new_name']`` equals
    an original (unsimplified) entry that produced the matched key.

    Args:
        other_brands: Iterable of candidate name strings.
        p_brands: Passed through unchanged to ``check_brands_in_strings_pqdm``.
        items: DataFrame with a ``'new_name'`` column; ``'new_brand'`` is
            updated in place.
        th: Forwarded as the ``threshold`` keyword of the matcher.

    Returns:
        None. ``items`` is modified in place.
    """
    def _lookup_key(name):
        # Single source of truth for the simplification, so the keys sent to
        # the matcher and the keys of the reverse map can never disagree.
        return name.split('/')[0].replace('Шато', '')

    names = list(other_brands)
    keys = [_lookup_key(name) for name in names]

    # Reverse map: lookup key -> every original name that collapses to it.
    # Keeping all originals (not just the last one) means names such as
    # 'A' and 'A/B' both receive the brand matched for key 'A'.
    originals = {}
    for name, key in zip(names, keys):
        originals.setdefault(key, []).append(name)

    # NOTE(review): the result is used as a mapping whose keys are compared
    # with the lookup keys above - presumably {key: matched brand}; confirm
    # against check_brands_in_strings_pqdm.
    matches = check_brands_in_strings_pqdm(keys, p_brands, threshold=th)

    result = {}
    for key, brand in matches.items():
        # Fall back to the key itself if the matcher returns a key that was
        # not derived from ``other_brands``.
        for name in originals.get(key, [key]):
            result[name] = brand

    items.loc[items['new_name'].isin(result.keys()), 'new_brand'] = items['new_name'].map(result)
def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """Derive cleaned names for rows without a brand and try to brand them.

    Steps, all applied to ``items`` in place:

    1. For every distinct ``items['name']`` among rows whose ``'new_brand'``
       is missing, take the first value returned by ``process_text``, pass
       it through ``trim_name`` with ``types``, ``grape_varieties`` and
       ``onther_words`` in turn, drop the characters ``. , ( )``, and run
       ``clean_wine_name`` on it. The stripped result is stored in a new
       ``'new_name'`` column (``None`` for rows that were not processed).
    2. Count how many rows share each ``'new_name'``; keep names that occur
       more than once and whose count exceeds a threshold of
       ``sqrt(((mean + std) ** 2) // 2)`` over those counts.
    3. Match the kept names against ``p_brands`` with
       ``check_brands_in_strings_pqdm`` and write the matches to
       ``'new_brand'``, then run ``top_inserts_matching`` on the same names.

    Args:
        items: DataFrame with ``'name'`` and ``'new_brand'`` columns.
        p_brands: Passed through to the brand matchers.
        process_text: Callable taking a raw name; only the first element of
            its return value is used here. It is called once per distinct
            name, so it is assumed to be deterministic.
        types: Second argument for the first ``trim_name`` pass.
        grape_varieties: Second argument for the second ``trim_name`` pass.
        onther_words: Second argument for the third ``trim_name`` pass.

    Returns:
        None. ``items`` is modified in place.
    """
    unbranded_names = items[items['new_brand'].isna()]['name']

    # One C-level pass instead of four chained str.replace calls.
    punctuation = str.maketrans('', '', '.,()')

    result = {}
    for raw_name in tqdm(unbranded_names.unique()):
        name, *_ = process_text(raw_name)
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        name = name.translate(punctuation)
        result[raw_name] = clean_wine_name(name).strip()

    items['new_name'] = None
    items.loc[items['name'].isin(result.keys()), 'new_name'] = items['name'].map(result)

    new_names = items['new_name']
    unique_names = list(new_names[~new_names.isna()].unique())

    # Count all names in a single pass rather than filtering the whole frame
    # once per unique name; iterate unique_names to keep first-seen order.
    occurrences = new_names.value_counts().to_dict()
    res = {name: occurrences[name] for name in unique_names if occurrences[name] > 1}
    if not res:
        # No repeated names: the mean/std of an empty array would be NaN
        # and there is nothing to match.
        return

    counts = np.array(list(res.values()))
    th = math.sqrt(((counts.mean() + counts.std()) ** 2) // 2)
    other_brands = [name for name, count in res.items() if count > th]
    if not other_brands:
        return

    reess = check_brands_in_strings_pqdm(other_brands, p_brands)
    items.loc[items['new_name'].isin(reess.keys()), 'new_brand'] = items['new_name'].map(reess)
    top_inserts_matching(other_brands, p_brands, items)