Spaces:

Gainward777
/

Product_Matching

Sleeping

App Files Files Community

Product_Matching / preprocess /utils /common /top_inserts.py

Gainward777

Upload 6 files

f556b0c verified 11 months ago

raw

history blame contribute delete

2.43 kB

	from preprocess.utils.common.extracters import *
	from preprocess.utils.common.utils import *
	from preprocess.utils.common.parallel_brand_matching import *
	from tqdm import tqdm
	import re
	import math
	import numpy as np



	def top_inserts_matching(other_brands, p_brands, items, th=65):
	    """Match simplified candidate names against known brands and store the hits.

	    Every entry of ``other_brands`` is reduced to a lookup key: the text
	    before the first ``'/'`` with every occurrence of ``'Шато'`` removed.
	    The keys are handed to ``check_brands_in_strings_pqdm``; each match is
	    then written to ``items['new_brand']`` on the rows whose
	    ``items['new_name']`` equals the *original* (unreduced) name.

	    The reverse mapping (key -> original names) is built with exactly the
	    same reduction that produces the submitted keys, and keeps *all*
	    originals sharing a key.  Previously names with a single ``'/'`` were
	    never mapped back to their original spelling, and originals that
	    collapsed to the same key overwrote each other.

	    Args:
	        other_brands: Iterable of name strings to look up.
	        p_brands: Forwarded unchanged to ``check_brands_in_strings_pqdm``.
	        items: DataFrame with ``'new_name'`` and ``'new_brand'`` columns;
	            modified in place.
	        th: Forwarded to ``check_brands_in_strings_pqdm`` as ``threshold``.

	    Returns:
	        None.  The result is the in-place update of ``items['new_brand']``.
	    """
	    lookup_keys = []
	    originals_by_key = {}
	    for original in other_brands:
	        # NOTE: removing 'Шато' may leave surrounding whitespace in the key;
	        # it is kept as-is so the matcher receives the same strings as before.
	        key = original.split('/')[0].replace('Шато', '')
	        lookup_keys.append(key)
	        originals_by_key.setdefault(key, []).append(original)

	    # Presumably returns a mapping {submitted key: matched brand} -- the code
	    # below only relies on ``.keys()`` and item access; confirm against helper.
	    matches = check_brands_in_strings_pqdm(lookup_keys, p_brands, threshold=th)

	    result = {}
	    for key in matches.keys():
	        brand = matches[key]
	        # Fall back to the key itself if the matcher returns something that
	        # was not derived from ``other_brands``.
	        for original in originals_by_key.get(key, (key,)):
	            result[original] = brand

	    if not result:
	        # Nothing matched: leave ``items`` untouched.
	        return

	    matched_rows = items['new_name'].isin(list(result))
	    items.loc[matched_rows, 'new_brand'] = items['new_name'].map(result)


	def process_unbrended_names(items, p_brands, process_text, types, grape_varieties, onther_words):
	    """Derive cleaned names for rows without a brand and try to match brands.

	    Steps, all applied to ``items`` in place:

	    1. For every distinct ``items['name']`` whose ``'new_brand'`` is missing,
	       take the first element returned by ``process_text``, pass it through
	       ``trim_name`` with ``types``, ``grape_varieties`` and ``onther_words``
	       (in that order), drop the characters ``. , ( )``, and finish with
	       ``clean_wine_name(...).strip()``.
	    2. Store the outcome in a fresh ``items['new_name']`` column (``None``
	       for rows whose name was not processed).
	    3. Count how often each cleaned name occurs, keep those occurring more
	       than once, and select the ones whose count exceeds the threshold
	       ``sqrt(((mean + std) ** 2) // 2)`` computed over those counts.
	    4. Look the selected names up with ``check_brands_in_strings_pqdm`` and
	       write matches to ``items['new_brand']``; then run
	       ``top_inserts_matching`` on the same selection.

	    Args:
	        items: DataFrame with ``'name'`` and ``'new_brand'`` columns; gains a
	            ``'new_name'`` column and has ``'new_brand'`` updated in place.
	        p_brands: Forwarded unchanged to the brand-matching helpers.
	        process_text: Callable applied to each raw name; only the first
	            element of its return value is used.
	        types: Forwarded to ``trim_name``.
	        grape_varieties: Forwarded to ``trim_name``.
	        onther_words: Forwarded to ``trim_name``.

	    Returns:
	        None.
	    """
	    # One C-level pass instead of four chained ``str.replace`` calls.
	    drop_punctuation = str.maketrans('', '', '.,()')

	    unbranded_names = items.loc[items['new_brand'].isna(), 'name']
	    result = {}
	    # Iterate distinct names only: repeated raw names would yield the same
	    # dictionary entry, so re-processing them is wasted work.
	    for raw_name in tqdm(unbranded_names.unique()):
	        name, *_ = process_text(raw_name)
	        for vocabulary in (types, grape_varieties, onther_words):
	            name = trim_name(name, vocabulary)
	        result[raw_name] = clean_wine_name(name.translate(drop_punctuation)).strip()

	    items['new_name'] = None
	    items.loc[items['name'].isin(list(result)), 'new_name'] = items['name'].map(result)

	    # Single-pass frequency count (replaces one full DataFrame scan per
	    # unique name); iterate ``unique()`` to keep first-appearance order.
	    cleaned = items['new_name'].dropna()
	    occurrences = cleaned.value_counts().to_dict()
	    res = {
	        new_name: occurrences[new_name]
	        for new_name in cleaned.unique()
	        if occurrences[new_name] > 1
	    }

	    if not res:
	        # No repeated names: the threshold would be NaN and there is nothing
	        # to look up, so skip the matching helpers entirely.
	        return

	    counts = np.array(list(res.values()))
	    # Heuristic cut-off, roughly (mean + std) / sqrt(2); the floor division
	    # on the squared sum is kept exactly as originally written.
	    th = math.sqrt(((counts.mean() + counts.std()) ** 2) // 2)
	    other_brands = [new_name for new_name, count in res.items() if count > th]
	    if not other_brands:
	        return

	    # Presumably a mapping {name: matched brand}; only ``.keys()`` and
	    # ``Series.map`` compatibility are relied on here -- confirm with helper.
	    reess = check_brands_in_strings_pqdm(other_brands, p_brands)

	    items.loc[items['new_name'].isin(reess.keys()), 'new_brand'] = items['new_name'].map(reess)

	    top_inserts_matching(other_brands, p_brands, items)