"""Text cleaning helpers for product descriptions.

The module offers single-string helpers (de-duplication, normalisation,
dictionary-based standardisation, stop-word removal, stemming) and three
pipelines built from them: ``clearning`` and ``clearning_modele`` operate on
DataFrames, ``clearning_pred`` operates on a single string.
"""
import functools
import re
import string

# NOTE: the original import order is kept on purpose. With wildcard imports
# the order decides which binding wins when two modules export the same name,
# so the explicit nltk imports stay *after* the wildcard ones.
# The names ``dictionnaire``, ``liste_stopword`` and ``data_import`` used
# below are not defined in this file; presumably they come from these
# wildcard imports -- TODO confirm and replace with explicit imports.
from traning_zone.standardisation.dictionnaire import *
from traning_zone.data_import.data_importation import *
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer

# Import-time side effect kept from the original module.
nltk.download('stopwords')

# Any single character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
# Any run of characters outside the allowed alphabet (ASCII letters, space,
# comma and a fixed list of accented letters).
_DISALLOWED_RUN_RE = re.compile('[^A-Za-z ,éêèîôœàâ]+')

# Columns removed from the frame before the final de-duplication.
_KEY_COLUMNS = ["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"]


def supprimer_mot_double(expression: str) -> str:
    """Remove repeated words from *expression*, keeping first occurrences.

    The text is split on any whitespace and re-joined with single spaces.
    The comparison is case-sensitive.
    """
    # dict.fromkeys keeps insertion order and de-duplicates in linear time.
    return " ".join(dict.fromkeys(expression.split()))


def data_cleaning(strings: str) -> str:
    """Lower-case *strings* and replace unwanted characters with spaces.

    Steps:
      1. lower-case and strip surrounding whitespace;
      2. replace every punctuation character (anything that is neither a
         word character nor whitespace, which includes ``'`` and ``/``)
         with one space;
      3. replace every run of characters outside ``A-Za-z``, space, comma
         and ``éêèîôœàâ`` with one space.

    Because stripping happens first, the result may still start or end with
    spaces and may contain consecutive spaces.

    NOTE(review): step 3 also replaces digits, underscores and accented
    letters that are not in the allow-list (for example ``ç``, ``ù``, ``û``)
    with a space. Kept as-is so existing outputs do not change; confirm
    whether this is intended for French text.
    """
    lowered = strings.lower().strip()
    without_punctuation = _PUNCTUATION_RE.sub(' ', lowered)
    return _DISALLOWED_RUN_RE.sub(' ', without_punctuation)


def standardization(strings: str) -> str:
    """Replace each space-separated token found in ``dictionnaire``.

    Tokens that are keys of ``dictionnaire`` are replaced by the mapped
    value; all other tokens (including empty ones produced by consecutive
    spaces) are left untouched.
    """
    return ' '.join(dictionnaire.get(mot, mot) for mot in strings.split(' '))


@functools.lru_cache(maxsize=1)
def _stop_words() -> frozenset:
    """Return the combined English, French and project stop words.

    Built lazily on first use and cached, so the NLTK corpus is not re-read
    for every string that is cleaned.
    """
    extra = [str(item) for item in liste_stopword]
    return frozenset(
        stopwords.words('english') + stopwords.words('french') + extra
    )


def remove_stop_words(strings: str) -> str:
    """Blank out English, French and project-specific stop words.

    The text is split on single spaces and each token whose lower-cased
    form is a stop word is replaced by an empty string before re-joining
    with spaces.

    NOTE(review): the number of tokens is preserved, so every removed word
    leaves an extra space behind (``"the cat"`` becomes ``" cat"``). This
    matches the previous behaviour.
    """
    stops = _stop_words()
    return ' '.join(
        '' if mot.lower() in stops else mot for mot in strings.split(' ')
    )


en_stemmer = PorterStemmer()
fr_stemmer = FrenchStemmer()


def stem_sentence(sentence: str, stemmer) -> str:
    """Stem every space-separated word of *sentence* with *stemmer*.

    *stemmer* is any object exposing a ``stem(word)`` method.
    """
    return ' '.join(stemmer.stem(word) for word in sentence.split(' '))


def english_stemmer(strings: str) -> str:
    """Apply the module-level Porter stemmer to each space-separated word."""
    return stem_sentence(strings, en_stemmer)


def french_stemmer(strings: str) -> str:
    """Apply the module-level French stemmer to *strings*.

    The text is first split on commas; each segment is stemmed word by word
    and the segments are re-joined with spaces, so commas in the input are
    replaced by spaces.
    """
    return ' '.join(
        stem_sentence(segment, fr_stemmer) for segment in strings.split(',')
    )


def _finalize(df, *, stem: bool):
    """Shared tail of the DataFrame pipelines.

    Works on a copy of *df*: removes stop words from ``DESCRIPTION``, drops
    the key columns, drops duplicate rows and, when *stem* is true, applies
    the English then the French stemmer to ``DESCRIPTION``.
    """
    final_df = df.copy()
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(remove_stop_words)
    final_df.drop(labels=_KEY_COLUMNS, axis=1, inplace=True)
    final_df.drop_duplicates(inplace=True)
    if stem:
        final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(english_stemmer)
        final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(french_stemmer)
    return final_df


def clearning(*args):
    """Import the data and return a cleaned copy without stemming.

    All positional arguments are forwarded to ``data_import``, whose result
    is used as a DataFrame (presumably a pandas DataFrame -- confirm against
    ``data_import``). The frame must contain the columns ``DESCRIPTION``,
    ``BEM_CLASS_DESC_FR``, ``BARCODE``, ``COUNTRY_KEY`` and
    ``BEM_CLASS_KEY``.

    Returns a new frame in which descriptions are de-duplicated word-wise,
    normalised, standardised and stripped of stop words, the three key
    columns are removed and duplicate rows are dropped.
    """
    df = data_import(*args)
    print(df.shape)
    df.drop_duplicates(inplace=True)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(supprimer_mot_double)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(data_cleaning)
    df['BEM_CLASS_DESC_FR'] = df['BEM_CLASS_DESC_FR'].apply(data_cleaning)
    df.DESCRIPTION = df.DESCRIPTION.apply(standardization)
    # Stemming is not applied in this pipeline (it was disabled in the
    # original code); only clearning_modele and clearning_pred stem.
    final_df = _finalize(df, stem=False)
    print("Traitement done")
    return final_df


def clearning_modele(df):
    """Clean *df* for modelling and return a stemmed copy.

    *df* must contain the columns ``DESCRIPTION``, ``Regroupement_de_Class``,
    ``BARCODE``, ``COUNTRY_KEY`` and ``BEM_CLASS_KEY``.

    Side effect: *df* itself is modified in place (duplicate rows dropped,
    ``DESCRIPTION`` and ``Regroupement_de_Class`` normalised). Stop-word
    removal, key-column removal and stemming are only applied to the
    returned copy.
    """
    print(df.shape)
    df.drop_duplicates(inplace=True)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(data_cleaning)
    df['Regroupement_de_Class'] = df['Regroupement_de_Class'].apply(data_cleaning)
    df.DESCRIPTION = df.DESCRIPTION.apply(standardization)
    final_df = _finalize(df, stem=True)
    print("Traitement done")
    return final_df


def clearning_pred(X: str) -> str:
    """Run the full single-string pipeline on *X*.

    Applies, in order: ``data_cleaning``, ``standardization``,
    ``remove_stop_words``, ``english_stemmer`` and ``french_stemmer``.
    """
    for step in (
        data_cleaning,
        standardization,
        remove_stop_words,
        english_stemmer,
        french_stemmer,
    ):
        X = step(X)
    return X