# Gainward777's picture
# Upload 6 files
# f556b0c verified
from preprocess.utils.common.extracters import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.parallel_brand_matching import *
from tqdm import tqdm
import re
import math
import numpy as np
def top_inserts_matching(other_brands, p_brands, items, th=65):
    """Match simplified forms of frequent names against the known brands.

    Every entry of ``other_brands`` is reduced to a lookup key by keeping only
    the text before the first ``/`` and removing the substring ``'Шато'``.
    The keys are handed to ``check_brands_in_strings_pqdm`` together with
    ``p_brands``. Each match is then written into ``items['new_brand']`` for
    the rows whose ``items['new_name']`` equals the *original* entry the key
    was derived from.

    Args:
        other_brands: Iterable of name strings (values found in
            ``items['new_name']``).
        p_brands: Brand collection, forwarded unchanged to
            ``check_brands_in_strings_pqdm``.
        items: DataFrame-like object with a ``new_name`` column; its
            ``new_brand`` column is updated in place.
        th: Forwarded to the matcher as its ``threshold`` argument.

    Returns:
        None. ``items`` is modified in place.
    """
    originals = list(other_brands)
    if not originals:
        # Nothing to match; avoid invoking the matcher on empty input.
        return

    # One lookup key per original entry. Deriving the key in a single place
    # guarantees that the string sent to the matcher and the string used to
    # map a match back to its entry are always the same, including entries
    # with exactly one '/' and several entries collapsing to the same key.
    keys = [entry.split('/')[0].replace('Шато', '') for entry in originals]

    # NOTE(review): the matcher is assumed to return a mapping keyed by the
    # submitted strings -- confirm against check_brands_in_strings_pqdm.
    matches = check_brands_in_strings_pqdm(keys, p_brands, threshold=th)

    # Re-key the matches by the original entries so they can be joined to
    # items['new_name'].
    result = {
        entry: matches[key]
        for entry, key in zip(originals, keys)
        if key in matches
    }

    matched_rows = items['new_name'].isin(result.keys())
    items.loc[matched_rows, 'new_brand'] = (
        items.loc[matched_rows, 'new_name'].map(result)
    )
def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """Build cleaned names for unbranded rows and infer brands from frequent ones.

    Steps performed on ``items`` (modified in place):

    1. For every distinct, non-missing ``name`` among the rows whose
       ``new_brand`` is missing, take the first value returned by
       ``process_text``, pass it through ``trim_name`` with ``types``,
       ``grape_varieties`` and ``onther_words`` (in that order), remove the
       characters ``. , ( )``, then apply ``clean_wine_name`` and strip
       surrounding whitespace.
    2. Reset ``items['new_name']`` to ``None`` and fill it with the cleaned
       name for every row whose ``name`` was processed in step 1.
    3. Count how often each cleaned name occurs. Names that occur more than
       once and whose count exceeds the frequency threshold become brand
       candidates.
    4. Match the candidates against ``p_brands`` with
       ``check_brands_in_strings_pqdm`` and then ``top_inserts_matching``,
       writing the matches to ``items['new_brand']``.

    Args:
        items: DataFrame-like object with ``name`` and ``new_brand`` columns.
        p_brands: Brand collection forwarded to the matchers.
        process_text: Callable taking a raw name and returning a sequence
            whose first element is the name text to clean.
        types: Forwarded to ``trim_name`` (first pass).
        grape_varieties: Forwarded to ``trim_name`` (second pass).
        onther_words: Forwarded to ``trim_name`` (third pass).

    Returns:
        None. ``items`` is modified in place.
    """
    # Single-pass removal of the punctuation the original stripped one
    # character at a time.
    punctuation = str.maketrans('', '', '.,()')

    # Each distinct raw name is cleaned once; rows with a missing name are
    # skipped and keep new_name = None.
    unbranded_names = items.loc[items['new_brand'].isna(), 'name']
    cleaned = {}
    for raw_name in tqdm(unbranded_names.dropna().unique()):
        # Only the first element of the parsed result is used here.
        name, *_ = process_text(raw_name)
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        cleaned[raw_name] = clean_wine_name(name.translate(punctuation)).strip()

    items['new_name'] = None
    has_cleaned = items['name'].isin(cleaned.keys())
    items.loc[has_cleaned, 'new_name'] = items.loc[has_cleaned, 'name'].map(cleaned)

    # Frequency of every cleaned name, computed in one pass. Iterating over
    # unique() keeps the candidates in order of first appearance.
    counts = items['new_name'].value_counts().to_dict()
    repeated = {
        name: counts[name]
        for name in items['new_name'].dropna().unique()
        if counts[name] > 1
    }
    if not repeated:
        # No cleaned name occurs more than once: there are no candidates and
        # the threshold below would be undefined (mean of an empty array).
        return

    # Threshold: square root of the floor of (mean + std)^2 / 2.
    frequencies = np.array(list(repeated.values()))
    th = math.sqrt(((frequencies.mean() + frequencies.std()) ** 2) // 2)
    other_brands = [name for name, count in repeated.items() if count > th]
    if not other_brands:
        return

    # Direct match of the candidates first ...
    direct_matches = check_brands_in_strings_pqdm(other_brands, p_brands)
    matched_rows = items['new_name'].isin(direct_matches.keys())
    items.loc[matched_rows, 'new_brand'] = (
        items.loc[matched_rows, 'new_name'].map(direct_matches)
    )

    # ... then the match on simplified candidate names, which may overwrite
    # brands assigned by the direct match.
    top_inserts_matching(other_brands, p_brands, items)