# Module-level lookup tables: an abbreviation -> expansion mapping read from
# ``dictionnaire.tsv`` and a fixed list of stop-word tokens.
import os

import pandas as pd
from dotenv import load_dotenv

# Must run before the environment lookup below so that values coming from a
# local .env file (python-dotenv) are visible in os.environ.
load_dotenv()

# Within the visible file this is only referenced by the disabled BigQuery
# query further down; it is None when PROJECT_ID is not set.
project_id = os.environ.get("PROJECT_ID")

# Tab-separated file, resolved relative to the current working directory.
# It is expected to provide the columns ABBREVIATIONS and CORRESPONDANCES.
data = pd.read_csv(
    "dictionnaire.tsv",
    sep="\t",
    encoding="utf-8",
)

# Mapping ABBREVIATIONS value -> CORRESPONDANCES value, one entry per row.
dictionnaire = (
    data.set_index("ABBREVIATIONS")["CORRESPONDANCES"]
    .to_dict()
)

# --- Disabled legacy code ---------------------------------------------------
# Earlier hard-coded version of the mapping, kept for reference.
# NOTE(review): presumably superseded by dictionnaire.tsv -- confirm the TSV
# carries the same entries before deleting this.
#
# dictionnaire = {"rg": "rouge","rges" : "rouge","rge": "rouge", "rse": "rose" ,"rs" : "rose", "bl": "blanc", "bdx": "Bordeaux",
#     "vdt": "vin de table", 'vdp': "vin de pays","blc": "blanc", "bib": "bag in box", "citr": "citron", "co": "coco", "gourm" : "gourmand",
#     "patis": "patisserie", "p'tits" : "petit", "p'tit": "petit","p tit": "petit", "pt": "pepite", "rev": "revil","succ": "sucettes",
#     "succet": "sucettes", "chocohouse": "choco house", "sach": "sachet", "choc": "choco", "tab" : "tablette", "hte" : "haute",
#     "spagh" : "spaghetti", "scht": "sachet", "nr": "noir", "caf": "cafe","barr": "barre", "pces": "pieces","pc": "pieces", "acidu": "acidule","blnc": "blanc",
#     "frui" : "fruit", "gourman" : "gourmand","bte" : "boîte", "bt" : "boîte", "ptit": "petit", "corb": "corbeil","ptits": "petit", "pti": "petit", "nois": "noisette",
#     "poul": "poulain", "barq" : "barquette", "barqu" : "barquette", 'fizz': 'fizzy', "st": "saint", "mich": "michel", "cal" : "calendrier", "calend" : "calendrier",
#     "calendr" : "calendrier", "caram" : "caramel", "cava" : "cavalier", "har" : "haribo", 'choc' : "chocolat", "choco" :"chocolat", 'lt' : "lait", "choc'n" :"chocolat noir",
#     "choc n" :"chocolat noir", "degust" : "degustation", "degus" : "degustation", "bis" : "biscuit", "coffr" : "coffret", "coff" : "coffret", "conf" : "confiserie",
#     "confis" : "confiserie", "croco" : "crocodile", "dble" : "double", "dess" : "dessert", "doyp" : "doypack", "harib" : "harib" , "et" : "etui", "exc" : "excellence",
#     "excel" : "excellence", "frit" : "friture","fritu" : "friture","fritur" : "friture", "gd" : "grand", "gr" : "grand", "grd" : "grand", "grchoc" : "grand chocolat", "lat" : "lait", 'ass' : "assorti", "assoti" :"assorti",
#     "noug" : "nougatine", "nougat" : "nougatine", "scht" : "sachet", "sct" : "secret", "cho" : "chocolat" , "bisc" : "biscuit", "am" : "amande", "liq" : "liqueur", "tabl" : "tablette","asst":"assorti",
#     "tab" : "tablette", "bil" : "bille", "vali" : "valisette", "cda" : "chevaliers d argouges", "tub": "tubo", "gril" :"grille", "amandesgrilles" : "amandes grilles", "ball" : "ballotin",
#     "piecestubo" : "pieces tubo"
#     }
#
# Earlier version of the stop-word list that also appended lower-cased brand
# names fetched from BigQuery. Re-enabling it would additionally require
# numpy (``np``), which is not imported in the visible part of this file.
#
# Brand = pd.read_gbq("""
#     SELECT DISTINCT
#       BEM_BRAND_DESC AS brand
#     FROM `c4-gdw-prd.products_referential.d_bem_product_barcode` Pdcts
#     WHERE Pdcts.BEM_BRAND_DESC NOT IN ("UNKNOWN")
#     AND Pdcts.BEM_BRAND_KEY<>'-1'
#     AND Pdcts.BEM_SECTOR_KEY="1"
#     AND Pdcts.COUNTRY_KEY IN ('FRA','BEL')
#
#     UNION DISTINCT
#
#     SELECT DISTINCT
#       BRAND_DESC AS brand
#     FROM `c4-gdw-prd.products_referential.d_bem_product_barcode` Pdcts
#     WHERE Pdcts.BRAND_DESC NOT IN ("UNKNOWN")
#     AND Pdcts.BRAND_KEY<>'-1'
#     AND Pdcts.BEM_SECTOR_KEY="1"
#     AND Pdcts.COUNTRY_KEY IN ('FRA','BEL')""",
#     project_id=project_id).brand.apply(lambda x: x.lower())
# liste_stopword = np.append(Brand.values.tolist(), ['oz', 'kg', 'g', 'lb', 'mg', 'l', 'cl', 'ml', 'tsp', 'tbsp', 'cm', 'x', 'cte', 'h', 'nux'])
# -----------------------------------------------------------------------------

# Static stop-word tokens: the "unknown" placeholder followed by short tokens
# that are mostly measurement-unit abbreviations.
liste_stopword = [
    "unknown",
    "oz",
    "kg",
    "g",
    "lb",
    "mg",
    "l",
    "cl",
    "ml",
    "tsp",
    "tbsp",
    "cm",
    "x",
    "cte",
    "h",
]