|
|
import re |
|
|
import string |
|
|
from traning_zone.standardisation.dictionnaire import * |
|
|
from traning_zone.data_import.data_importation import * |
|
|
|
|
|
import nltk |
|
|
from nltk.corpus import stopwords |
|
|
from nltk.stem import PorterStemmer |
|
|
from nltk.stem.snowball import FrenchStemmer |
|
|
from nltk.corpus import stopwords |
|
|
nltk.download('stopwords') |
|
|
|
|
|
def supprimer_mot_double(expression):
    """Drop duplicate words from *expression*, keeping first-occurrence order.

    Words are whitespace-separated; the result is re-joined with single
    spaces (so any run of whitespace in the input collapses to one space).
    """
    # dict preserves insertion order (Python 3.7+), so fromkeys() gives an
    # ordered, de-duplicated sequence of words in one pass.
    return " ".join(dict.fromkeys(expression.split()))
|
|
|
|
|
|
|
|
def data_cleaning(strings):
    """Normalize a raw text value for downstream tokenisation.

    Steps: lowercase + strip, turn apostrophes and slashes into spaces,
    replace remaining punctuation with spaces, then blank out every
    character outside ASCII letters, space, comma and a small set of
    accented French letters.
    """
    texte = strings.lower().strip()
    # Apostrophes and slashes often glue words together ("l'eau", "a/b").
    for separateur in ("'", "/"):
        texte = texte.replace(separateur, " ")
    # Any non-word, non-space character becomes a space.
    texte = re.sub(r'[^\w\s]', ' ', texte)
    # Finally keep only letters (plus a few accented ones), spaces and commas;
    # this also wipes digits, which re.sub above left in place.
    return re.sub('[^A-Za-z ,éêèîôœàâ]+', ' ', texte)
|
|
|
|
|
|
|
|
def standardization(strings):
    """Replace each space-separated token by its entry in ``dictionnaire``.

    Tokens absent from the mapping pass through unchanged; tokens are
    re-joined with single spaces. ``dictionnaire`` comes from the
    star-import of ``traning_zone.standardisation.dictionnaire``.
    """
    # dict.get(token, token) replaces the original index loop with
    # `in dictionnaire.keys()` membership test — same behavior, one lookup
    # per token instead of two, and no index bookkeeping.
    return ' '.join(dictionnaire.get(mot, mot) for mot in strings.split(' '))
|
|
|
|
|
|
|
|
def remove_stop_words(strings):
    """Blank out English and French stopwords from a space-separated string.

    A token whose lowercase form is an NLTK English or French stopword, or
    appears in the project-level ``liste_stopword``, is replaced by the
    empty string; all other tokens are kept. Tokens are re-joined with
    single spaces, so removed words leave an empty slot (consecutive
    spaces) exactly as the original two-pass implementation did.
    """
    # Original code filtered against the English set, re-split, then
    # filtered against the French set. Since tokens from split(' ') contain
    # no spaces, that is equivalent to a single pass against the union of
    # both sets — built once per call.
    extra = [str(item) for item in liste_stopword]
    stops = set(stopwords.words('english')) | set(stopwords.words('french')) | set(extra)

    return ' '.join(
        mot if mot.lower() not in stops else ''
        for mot in strings.split(' ')
    )
|
|
|
|
|
|
|
|
# Module-level stemmer instances, created once at import time and shared
# by english_stemmer() / french_stemmer() below (NLTK stemmers are cheap
# to construct but there is no need to rebuild them per call).
en_stemmer = PorterStemmer()


fr_stemmer = FrenchStemmer()
|
|
|
|
|
|
|
|
def stem_sentence(sentence, stemmer):
    """Stem every space-separated word of *sentence* with *stemmer*.

    *stemmer* is any object exposing a ``stem(word) -> str`` method
    (e.g. an NLTK PorterStemmer or FrenchStemmer). Words are re-joined
    with single spaces.
    """
    return ' '.join(stemmer.stem(mot) for mot in sentence.split(' '))
|
|
|
|
|
|
|
|
def english_stemmer(strings):
    """Apply the module-level Porter stemmer to every word of *strings*.

    The input is split on single spaces and each piece is run through
    stem_sentence() with ``en_stemmer``; pieces are re-joined with spaces.
    """
    morceaux = strings.split(' ')
    racines = [stem_sentence(morceau, en_stemmer) for morceau in morceaux]
    return ' '.join(racines)
|
|
|
|
|
|
|
|
def french_stemmer(strings):
    """Apply the module-level French (Snowball) stemmer to *strings*.

    NOTE(review): unlike english_stemmer(), the input is split on commas
    and re-joined with spaces, so any comma separators are replaced by
    spaces in the output — presumably intentional for comma-separated
    descriptions, but worth confirming against callers.
    """
    segments = strings.split(',')
    racines = [stem_sentence(segment, fr_stemmer) for segment in segments]
    return ' '.join(racines)
|
|
|
|
|
|
|
|
def clearning(*args):
    """Load a dataframe via data_import() and run the cleaning pipeline.

    Steps: drop duplicate rows, de-duplicate words and normalise text in
    DESCRIPTION, normalise BEM_CLASS_DESC_FR, standardise DESCRIPTION
    vocabulary, remove stopwords, drop the key columns and duplicates.
    Returns the cleaned copy; *args are forwarded unchanged to
    data_import().
    """
    df = data_import(*args)
    print(df.shape)
    df.drop_duplicates(inplace=True)

    # Text normalisation on the raw frame.
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(supprimer_mot_double)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(data_cleaning)
    df['BEM_CLASS_DESC_FR'] = df['BEM_CLASS_DESC_FR'].apply(data_cleaning)

    # Vocabulary standardisation (dictionnaire-based replacement).
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(standardization)

    # Work on a copy so the normalised-but-unstopped frame stays intact.
    final_df = df.copy()
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(remove_stop_words)

    # Key columns are no longer needed once text is cleaned; dropping them
    # can expose new duplicates, hence the second drop_duplicates.
    final_df.drop(labels=["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"], axis=1, inplace=True)
    final_df.drop_duplicates(inplace=True)

    print("Traitement done")
    return final_df
|
|
|
|
|
|
|
|
def clearning_modele(df):
    """Clean an already-loaded dataframe for model training.

    Same pipeline as clearning() but targets Regroupement_de_Class instead
    of BEM_CLASS_DESC_FR and additionally stems DESCRIPTION (English then
    French). NOTE(review): this mutates the caller's *df* in place
    (drop_duplicates / column assignments) before copying — confirm callers
    expect that. It also drops BARCODE/COUNTRY_KEY/BEM_CLASS_KEY, which
    pandas will raise KeyError on if those columns are absent.
    """
    print(df.shape)
    df.drop_duplicates(inplace=True)

    # Text normalisation.
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(data_cleaning)
    df['Regroupement_de_Class'] = df['Regroupement_de_Class'].apply(data_cleaning)

    # Vocabulary standardisation.
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(standardization)

    final_df = df.copy()
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(remove_stop_words)

    final_df.drop(labels=["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"], axis=1, inplace=True)
    final_df.drop_duplicates(inplace=True)

    # Stem English first, then French, matching clearning_pred().
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(english_stemmer)
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(french_stemmer)

    print("Traitement done")
    return final_df
|
|
|
|
|
|
|
|
def clearning_pred(X):
    """Run the full cleaning pipeline on a single raw string for prediction.

    Applies, in order: data_cleaning, standardization, remove_stop_words,
    english_stemmer, french_stemmer — the same sequence used on training
    data in clearning_modele().
    """
    pipeline = (
        data_cleaning,
        standardization,
        remove_stop_words,
        english_stemmer,
        french_stemmer,
    )
    for etape in pipeline:
        X = etape(X)
    return X
|
|
|