# NOTE: the wildcard imports stay first and in their original order so that the
# explicit imports below still take precedence over any same-named symbol the
# wildcard modules may export.
from preprocess.utils.common.extracters import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.parallel_brand_matching import *
from tqdm import tqdm
import re
import math
import numpy as np

# Marker word ("Chateau") that is stripped from a name before brand matching.
_CHATEAU = 'Шато'

# Translation table deleting the punctuation removed from a trimmed name.
_PUNCTUATION_TABLE = str.maketrans('', '', '.,()')


def _brand_key(name):
    """Return the matching key for *name*.

    The key is the text before the first '/', with every occurrence of the
    'Шато' marker removed.
    """
    return name.split('/')[0].replace(_CHATEAU, '')


def _frequency_threshold(counts):
    """Return the occurrence threshold derived from *counts*.

    Computes ``sqrt(((mean + std) ** 2) // 2)``. The floor division is kept
    exactly as in the original formula.
    NOTE(review): ``//`` may have been intended as ``/`` - confirm before
    changing, since it shifts the threshold.
    """
    values = np.array(list(counts))
    return math.sqrt(((values.mean() + values.std()) ** 2) // 2)


def top_inserts_matching(other_brands, p_brands, items, th=65):
    """Match normalised candidate names against *p_brands* and fill ``new_brand``.

    Each name in *other_brands* is reduced to a key (text before the first
    '/', 'Шато' removed). The keys are matched with
    ``check_brands_in_strings_pqdm`` and every match is written to
    ``items['new_brand']`` for the rows whose ``new_name`` equals one of the
    original names that produced that key.

    Previously a name containing exactly one '/' was matched under its
    truncated key but never mapped back to the full name, so its rows were
    never updated; when several names shared a key only the last one was
    updated. Both cases are handled here by grouping names per key.

    Args:
        other_brands: iterable of candidate name strings.
        p_brands: brand collection forwarded to ``check_brands_in_strings_pqdm``.
        items: DataFrame with ``new_name`` and ``new_brand`` columns; modified
            in place.
        th: value forwarded as ``threshold`` to ``check_brands_in_strings_pqdm``.

    Returns:
        None.
    """
    keys = [_brand_key(name) for name in other_brands]

    # key -> every original name that normalises to it (insertion order kept).
    names_by_key = {}
    for key, name in zip(keys, other_brands):
        names_by_key.setdefault(key, []).append(name)

    # Used as a mapping below (iterated with .items()); presumably
    # {matched string: brand} - verify against the helper's definition.
    matches = check_brands_in_strings_pqdm(keys, p_brands, threshold=th)

    result = {}
    for key, brand in matches.items():
        # Fall back to the key itself if the helper returned a string that
        # was not one of the submitted keys.
        for name in names_by_key.get(key, (key,)):
            result[name] = brand

    matched_rows = items['new_name'].isin(list(result))
    items.loc[matched_rows, 'new_brand'] = items['new_name'].map(result)


def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """Derive ``new_name`` for unbranded rows and infer brands from frequent names.

    Steps:
      1. For every row whose ``new_brand`` is missing, run ``process_text`` on
         ``name``, trim it with ``trim_name`` against *types*,
         *grape_varieties* and *onther_words*, delete ``. , ( )`` and pass the
         result through ``clean_wine_name``; store it in ``new_name``.
         Example of a raw name: 'Вино Токай Фурминт п/сл. бел.0.75л'.
      2. Count how often each ``new_name`` occurs and keep those occurring more
         than once and above the threshold from ``_frequency_threshold``.
      3. Match the kept names against *p_brands* and write the matches to
         ``new_brand``, then run ``top_inserts_matching`` on the same names.

    Args:
        items: DataFrame with ``name`` and ``new_brand`` columns; modified in
            place (``new_name`` is (re)created, ``new_brand`` is filled).
        p_brands: brand collection forwarded to ``check_brands_in_strings_pqdm``.
        process_text: callable taking a raw name and returning an iterable
            whose first element is the extracted name text; the remaining
            elements are ignored here.
        types: vocabulary forwarded to ``trim_name``.
        grape_varieties: vocabulary forwarded to ``trim_name``.
        onther_words: vocabulary forwarded to ``trim_name``.

    Returns:
        None.
    """
    unbranded_names = items.loc[items['new_brand'].isna(), 'name'].values

    cleaned = {}
    for raw_name in tqdm(unbranded_names):
        name, *_ = process_text(raw_name)
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        name = name.translate(_PUNCTUATION_TABLE)
        cleaned[raw_name] = clean_wine_name(name).strip()

    items['new_name'] = None
    processed_rows = items['name'].isin(list(cleaned))
    items.loc[processed_rows, 'new_name'] = items['name'].map(cleaned)

    # One pass over the column instead of one DataFrame filter per unique
    # name; iterating unique() keeps the first-appearance order.
    occurrences = items['new_name'].value_counts().to_dict()
    repeated = {
        name: occurrences[name]
        for name in items['new_name'].dropna().unique()
        if occurrences[name] > 1
    }
    if not repeated:
        # No name occurs more than once: there is nothing to match, and the
        # threshold would be NaN (mean/std of an empty array).
        return

    threshold = _frequency_threshold(repeated.values())
    other_brands = [name for name, count in repeated.items() if count > threshold]

    matches = check_brands_in_strings_pqdm(other_brands, p_brands)
    matched_rows = items['new_name'].isin(list(matches))
    items.loc[matched_rows, 'new_brand'] = items['new_name'].map(matches)

    top_inserts_matching(other_brands, p_brands, items)