Spaces:
Sleeping
Sleeping
| from preprocess.utils.common.extracters import * | |
| from preprocess.utils.common.utils import * | |
| from preprocess.utils.common.parallel_brand_matching import * | |
| from tqdm import tqdm | |
| import re | |
| import math | |
| import numpy as np | |
def top_inserts_matching(other_brands, p_brands, items, th=65):
    """Match simplified candidate names to brands and fill ``items['new_brand']``.

    Every candidate is simplified before matching: only the text before the
    first ``'/'`` is kept and every occurrence of ``'Шато'`` is removed. The
    simplified names are sent to ``check_brands_in_strings_pqdm``; each match
    it reports is then written to the rows of ``items`` whose ``'new_name'``
    equals one of the *original* candidates that produced that simplified name.

    Args:
        other_brands: Iterable of candidate name strings; they are compared
            against ``items['new_name']`` when results are written back.
        p_brands: Forwarded unchanged to ``check_brands_in_strings_pqdm``.
        items: DataFrame with ``'new_name'`` and ``'new_brand'`` columns.
            Modified in place.
        th: Forwarded as ``threshold`` to ``check_brands_in_strings_pqdm``.

    Returns:
        None. The only effect is the in-place update of ``items``.
    """

    def simplify(candidate):
        # Single source of truth for the key used both when querying the
        # matcher and when mapping its answers back to the original names.
        return candidate.split('/')[0].replace('Шато', '')

    keys = []
    originals_by_key = {}
    for candidate in other_brands:
        key = simplify(candidate)
        keys.append(key)
        # Several originals may collapse to the same key (e.g. 'A/B' and
        # 'A/C'); keep all of them so none loses its match.
        originals_by_key.setdefault(key, []).append(candidate)

    # NOTE(review): the return value is used as a mapping from a queried name
    # to whatever should be stored in 'new_brand' -- confirm against the helper.
    matches = check_brands_in_strings_pqdm(keys, p_brands, threshold=th)

    result = {}
    for key, brand in matches.items():
        # Fall back to the key itself if the matcher reports a name we did
        # not derive from any candidate.
        for original in originals_by_key.get(key, (key,)):
            result[original] = brand

    # The right-hand side is aligned on the index, so only the rows selected
    # by the mask receive a value.
    matched_rows = items['new_name'].isin(list(result))
    items.loc[matched_rows, 'new_brand'] = items['new_name'].map(result)
def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
    """Derive cleaned names for rows without a brand and try to assign brands.

    Steps:
      1. For every distinct ``'name'`` among the rows where ``'new_brand'`` is
         missing, run ``process_text``, trim the resulting name with
         ``trim_name`` against ``types``, ``grape_varieties`` and
         ``onther_words`` (in that order), drop the characters ``. , ( )`` and
         pass the result through ``clean_wine_name``.
      2. Store the cleaned value in a fresh ``'new_name'`` column.
      3. Count how often each cleaned name occurs; names occurring more than
         once and more often than a threshold derived from the mean and
         standard deviation of those counts become brand candidates.
      4. Match the candidates against ``p_brands`` with
         ``check_brands_in_strings_pqdm`` and then again, in simplified form,
         with ``top_inserts_matching``; matches are written to ``'new_brand'``.

    Args:
        items: DataFrame with ``'name'`` and ``'new_brand'`` columns. Modified
            in place; the ``'new_name'`` column is (re)created.
        p_brands: Forwarded unchanged to the brand-matching helpers.
        process_text: Callable taking a raw name and returning an iterable
            whose first element is the name to keep processing (an example
            raw name from the original code: 'Вино Токай Фурминт п/сл. бел.0.75л').
        types: Second argument for the first ``trim_name`` call.
        grape_varieties: Second argument for the second ``trim_name`` call.
        onther_words: Second argument for the third ``trim_name`` call.

    Returns:
        None. The only effect is the in-place update of ``items``.
    """
    from collections import Counter

    # One C-level pass instead of four chained str.replace calls.
    drop_punctuation = str.maketrans('', '', '.,()')

    unbranded_names = items.loc[items['new_brand'].isna(), 'name']
    cleaned = {}
    # Each distinct raw name is processed once; this assumes process_text is
    # deterministic (the result is keyed by the raw name anyway).
    for raw in tqdm(unbranded_names.unique()):
        name, *_ = process_text(raw)  # only the first returned field is used
        for vocabulary in (types, grape_varieties, onther_words):
            name = trim_name(name, vocabulary)
        cleaned[raw] = clean_wine_name(name.translate(drop_punctuation)).strip()

    items['new_name'] = None
    has_cleaned_name = items['name'].isin(list(cleaned))
    items.loc[has_cleaned_name, 'new_name'] = items['name'].map(cleaned)

    # Single pass over the column; Counter keeps first-appearance order, which
    # matches the order the per-name rescans used to produce.
    occurrences = Counter(items['new_name'].dropna())
    repeated = {name: count for name, count in occurrences.items() if count > 1}
    if not repeated:
        # No candidates: the threshold below would be NaN and nothing could
        # be matched, so stop before starting the matching helpers.
        return

    counts = np.array(list(repeated.values()))
    # Heuristic cut-off kept exactly as before: sqrt(floor((mean + std)^2 / 2)).
    th = math.sqrt(((counts.mean() + counts.std()) ** 2) // 2)
    other_brands = [name for name, count in repeated.items() if count > th]

    # NOTE(review): the return value is used as a mapping from candidate name
    # to the value stored in 'new_brand' -- confirm against the helper.
    direct_matches = check_brands_in_strings_pqdm(other_brands, p_brands)
    directly_matched = items['new_name'].isin(list(direct_matches.keys()))
    items.loc[directly_matched, 'new_brand'] = items['new_name'].map(direct_matches)

    # Second pass on simplified candidates; it may overwrite values written
    # by the first pass for the same rows.
    top_inserts_matching(other_brands, p_brands, items)