classification / traning_zone /standardisation /dictionnaire.py

COULIBALY BOURAHIMA

first commit

f1f2665 over 2 years ago

3.69 kB

	import pandas as pd
	from dotenv import load_dotenv
	import os
	# Load variables from a .env file (if one is found) into the process
	# environment, then read the project identifier from that environment.
	# `project_id` is None when PROJECT_ID is not set.
	load_dotenv()
	project_id = os.environ.get('PROJECT_ID')


	# Read the tab-separated abbreviation table. The path is relative, so it is
	# resolved against the current working directory at import time.
	# NOTE(review): pandas' default NA parsing applies here, so cells such as
	# "NA" or "null" would be read as NaN -- confirm none appear in the file.
	data = pd.read_csv(
	    "dictionnaire.tsv",
	    sep="\t",
	    encoding='utf-8',
	)
	# Map each ABBREVIATIONS value to its CORRESPONDANCES value; when an
	# abbreviation appears more than once, the last row wins.
	dictionnaire = (
	    data
	    .set_index('ABBREVIATIONS')
	    .loc[:, 'CORRESPONDANCES']
	    .to_dict()
	)

	#dictionnaire = {"rg": "rouge","rges" : "rouge","rge": "rouge", "rse": "rose" ,"rs" : "rose", "bl": "blanc", "bdx": "Bordeaux",
	# "vdt": "vin de table", 'vdp': "vin de pays","blc": "blanc", "bib": "bag in box", "citr": "citron", "co": "coco", "gourm" : "gourmand",
	# "patis": "patisserie", "p'tits" : "petit", "p'tit": "petit","p tit": "petit", "pt": "pepite", "rev": "revil","succ": "sucettes",
	# "succet": "sucettes", "chocohouse": "choco house", "sach": "sachet", "choc": "choco", "tab" : "tablette", "hte" : "haute",
	# "spagh" : "spaghetti", "scht": "sachet", "nr": "noir", "caf": "cafe","barr": "barre", "pces": "pieces","pc": "pieces", "acidu": "acidule","blnc": "blanc",
	# "frui" : "fruit", "gourman" : "gourmand","bte" : "boîte", "bt" : "boîte", "ptit": "petit", "corb": "corbeil","ptits": "petit", "pti": "petit", "nois": "noisette",
	# "poul": "poulain", "barq" : "barquette", "barqu" : "barquette", 'fizz': 'fizzy', "st": "saint", "mich": "michel", "cal" : "calendrier", "calend" : "calendrier",
	# "calendr" : "calendrier", "caram" : "caramel", "cava" : "cavalier", "har" : "haribo", 'choc' : "chocolat", "choco" :"chocolat", 'lt' : "lait", "choc'n" :"chocolat noir",
	# "choc n" :"chocolat noir", "degust" : "degustation", "degus" : "degustation", "bis" : "biscuit", "coffr" : "coffret", "coff" : "coffret", "conf" : "confiserie",
	# "confis" : "confiserie", "croco" : "crocodile", "dble" : "double", "dess" : "dessert", "doyp" : "doypack", "harib" : "harib" , "et" : "etui", "exc" : "excellence",
	# "excel" : "excellence", "frit" : "friture","fritu" : "friture","fritur" : "friture", "gd" : "grand", "gr" : "grand", "grd" : "grand", "grchoc" : "grand chocolat", "lat" : "lait", 'ass' : "assorti", "assoti" :"assorti",
	# "noug" : "nougatine", "nougat" : "nougatine", "scht" : "sachet", "sct" : "secret", "cho" : "chocolat" , "bisc" : "biscuit", "am" : "amande", "liq" : "liqueur", "tabl" : "tablette","asst":"assorti",
	# "tab" : "tablette", "bil" : "bille", "vali" : "valisette", "cda" : "chevaliers d argouges", "tub": "tubo", "gril" :"grille", "amandesgrilles" : "amandes grilles", "ball" : "ballotin",
	# "piecestubo" : "pieces tubo"
	# }

	#Brand = pd.read_gbq("""
	# SELECT DISTINCT
	# BEM_BRAND_DESC AS brand
	# FROM `c4-gdw-prd.products_referential.d_bem_product_barcode` Pdcts
	# WHERE Pdcts.BEM_BRAND_DESC NOT IN ("UNKNOWN")
	# AND Pdcts.BEM_BRAND_KEY<>'-1'
	# AND Pdcts.BEM_SECTOR_KEY="1"
	# AND Pdcts.COUNTRY_KEY IN ('FRA','BEL')
	#
	# UNION DISTINCT
	#
	# SELECT DISTINCT
	# BRAND_DESC AS brand
	# FROM `c4-gdw-prd.products_referential.d_bem_product_barcode` Pdcts
	# WHERE Pdcts.BRAND_DESC NOT IN ("UNKNOWN")
	# AND Pdcts.BRAND_KEY<>'-1'
	# AND Pdcts.BEM_SECTOR_KEY="1"
	# AND Pdcts.COUNTRY_KEY IN ('FRA','BEL')""",
	# project_id=project_id).brand.apply(lambda x: x.lower())

	#liste_stopword = np.append(Brand.values.tolist(), ['oz', 'kg', 'g', 'lb', 'mg', 'l', 'cl', 'ml', 'tsp', 'tbsp', 'cm', 'x', 'cte', 'h', 'nux'])

	# Static list of tokens, presumably filtered out as stop words by callers
	# (verify against the code that imports this module).
	liste_stopword = [
	    # Placeholder label.
	    "unknown",
	    # Measurement-unit abbreviations.
	    'oz', 'kg', 'g', 'lb', 'mg',
	    'l', 'cl', 'ml',
	    'tsp', 'tbsp',
	    'cm',
	    # Remaining short tokens; their purpose is not documented here.
	    'x', 'cte', 'h',
	]