File size: 4,508 Bytes

f1f2665
 
 
 
 
 
 
 
 
 
d88cfeb
f1f2665

import re
import string
from traning_zone.standardisation.dictionnaire import *
from traning_zone.data_import.data_importation import *

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

def supprimer_mot_double(expression: str) -> str:
    """Remove repeated words from an expression, keeping first occurrences.

    Words are delimited by any run of whitespace. The comparison is exact
    (case-sensitive), the original order of first occurrences is kept, and
    the surviving words are joined back with single spaces.

    Args:
        expression: Text whose duplicated words should be removed.

    Returns:
        The expression with every repeated word dropped.
    """
    # A dict keeps insertion order, so dict.fromkeys gives an ordered
    # de-duplication in linear time instead of scanning a list per word.
    return " ".join(dict.fromkeys(expression.split()))


def data_cleaning(strings):
    """Normalize raw text down to lowercase letters and spaces.

    The text is lowercased and stripped; apostrophes, slashes and every other
    character that is neither a word character nor whitespace become a space.
    Finally each run of characters outside the allowed set (ASCII letters,
    space, comma and the accented letters listed in the last pattern) is
    replaced by one space, which also removes digits, underscores and line
    breaks. Consecutive spaces are not collapsed.
    """
    cleaned = strings.lower().strip()
    for separator in ('\'', '/'):
        cleaned = cleaned.replace(separator, ' ')

    without_punctuation = re.sub(r'[^\w\s]', ' ', cleaned)
    return re.sub('[^A-Za-z ,éêèîôœàâ]+', ' ', without_punctuation)


def standardization(strings: str) -> str:
    """Replace each word that has an entry in ``dictionnaire`` by its value.

    The text is split on single spaces; a word that is a key of the
    module-level ``dictionnaire`` mapping is swapped for the mapped value and
    every other word (including empty pieces produced by consecutive spaces)
    is kept unchanged. The pieces are joined back with single spaces.

    Args:
        strings: Space-separated text to standardize.

    Returns:
        The text with known words replaced.
    """
    # One lookup per word: fall back to the word itself when it has no entry.
    return ' '.join(dictionnaire.get(mot, mot) for mot in strings.split(' '))


# Combined stop-word set, built lazily on first use so the NLTK word lists
# are not reloaded for every processed string.
_STOP_WORDS_CACHE = None


def _stop_word_set():
    """Return the union of English, French and project stop words (cached)."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(
            stopwords.words('english')
            + stopwords.words('french')
            + [str(item) for item in liste_stopword]
        )
    return _STOP_WORDS_CACHE


def remove_stop_words(strings):
    """Blank out English, French and project-specific stop words.

    The text is split on single spaces and every word whose lowercase form is
    in the NLTK English or French stop-word lists, or in ``liste_stopword``,
    is replaced by an empty string. Positions are kept, so a removed word
    leaves two adjacent spaces (or a leading/trailing space) in the result.

    Args:
        strings: Space-separated text to filter.

    Returns:
        The text with stop words removed and all other words unchanged.
    """
    stop_words = _stop_word_set()
    # A single pass against the combined set gives the same result as
    # filtering on the English set and then on the French set.
    return ' '.join(
        '' if word.lower() in stop_words else word
        for word in strings.split(' ')
    )


# Stemmer instances created once at import time: the Porter stemmer is used
# by english_stemmer and the Snowball French stemmer by french_stemmer.
en_stemmer = PorterStemmer()
fr_stemmer = FrenchStemmer()


def stem_sentence(sentence, stemmer):
    """Stem every space-separated piece of a sentence.

    ``sentence`` is split on single spaces, each piece is passed to
    ``stemmer.stem`` and the results are joined back with single spaces.
    """
    return ' '.join(map(stemmer.stem, sentence.split(' ')))


def english_stemmer(strings):
    """Stem each space-separated word of the text with ``en_stemmer``.

    The text is split on single spaces, every piece goes through
    ``en_stemmer.stem`` and the stems are joined back with single spaces.
    """
    return ' '.join(en_stemmer.stem(word) for word in strings.split(' '))


def french_stemmer(strings):
    """Stem the text with ``fr_stemmer``, one comma-separated chunk at a time.

    The text is split on commas, each chunk is stemmed word by word through
    ``stem_sentence`` and the chunks are joined with a single space, so the
    commas themselves do not appear in the result.
    """
    return ' '.join(
        stem_sentence(chunk, fr_stemmer) for chunk in strings.split(',')
    )


def clearning(*args):
    """Import the raw data and return a cleaned, de-duplicated copy.

    All positional arguments are forwarded to ``data_import``; its result is
    presumably a pandas DataFrame (confirm against ``data_import``) with the
    columns used below. Duplicate rows are dropped, repeated words are
    removed from DESCRIPTION, DESCRIPTION and BEM_CLASS_DESC_FR are
    normalized with ``data_cleaning`` and DESCRIPTION is standardized. Those
    steps modify the imported frame in place.

    The returned frame is a copy on which stop words are removed from
    DESCRIPTION, the BARCODE, COUNTRY_KEY and BEM_CLASS_KEY columns are
    dropped and duplicates are removed once more. No stemming is applied
    here, unlike in ``clearning_modele``.
    """
    df = data_import(*args)
    print(df.shape)
    df.drop_duplicates(inplace=True)

    for step in (supprimer_mot_double, data_cleaning):
        df['DESCRIPTION'] = df['DESCRIPTION'].apply(step)
    df['BEM_CLASS_DESC_FR'] = df['BEM_CLASS_DESC_FR'].apply(data_cleaning)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(standardization)

    cleaned = df.copy()
    cleaned['DESCRIPTION'] = cleaned['DESCRIPTION'].apply(remove_stop_words)

    # Rows that differed only by the dropped key columns collapse here.
    cleaned.drop(
        columns=["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"],
        inplace=True,
    )
    cleaned.drop_duplicates(inplace=True)

    print("Traitement done")
    return cleaned


def clearning_modele(df):
    """Clean a frame for modelling and return a stemmed, de-duplicated copy.

    ``df`` is modified in place: duplicate rows are dropped, DESCRIPTION and
    Regroupement_de_Class are normalized with ``data_cleaning`` and
    DESCRIPTION is standardized.

    The returned frame is a copy on which stop words are removed from
    DESCRIPTION, the BARCODE, COUNTRY_KEY and BEM_CLASS_KEY columns are
    dropped, duplicates are removed again and DESCRIPTION is stemmed with the
    English and then the French stemmer.
    """
    print(df.shape)
    df.drop_duplicates(inplace=True)

    for column in ('DESCRIPTION', 'Regroupement_de_Class'):
        df[column] = df[column].apply(data_cleaning)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(standardization)

    prepared = df.copy()
    prepared['DESCRIPTION'] = prepared['DESCRIPTION'].apply(remove_stop_words)

    # Rows that differed only by the dropped key columns collapse here.
    prepared.drop(
        columns=["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"],
        inplace=True,
    )
    prepared.drop_duplicates(inplace=True)

    # Stemming runs after de-duplication, English first and French second.
    for stem in (english_stemmer, french_stemmer):
        prepared['DESCRIPTION'] = prepared['DESCRIPTION'].apply(stem)

    print("Traitement done")
    return prepared


def clearning_pred(X):
    """Run a single text through the full cleaning pipeline.

    The steps are applied in order: normalization, standardization,
    stop-word removal, English stemming and French stemming. The result of
    the last step is returned.
    """
    pipeline = (
        data_cleaning,
        standardization,
        remove_stop_words,
        english_stemmer,
        french_stemmer,
    )
    for step in pipeline:
        X = step(X)
    return X