|
|
import re |
|
|
import string |
|
|
from traning_zone.standardisation.dictionnaire import * |
|
|
from traning_zone.data_import.data_importation import * |
|
|
|
|
|
import nltk |
|
|
from nltk.corpus import stopwords |
|
|
from nltk.stem import PorterStemmer |
|
|
from nltk.stem.snowball import FrenchStemmer |
|
|
from nltk.corpus import stopwords |
|
|
nltk.download('stopwords') |
|
|
|
|
|
def supprimer_mot_double(expression):
    """Drop duplicate words from *expression*, keeping first-occurrence order.

    Words are whitespace-separated; the result is re-joined with single
    spaces (so any run of whitespace in the input collapses to one space).
    """
    # dict preserves insertion order (Python 3.7+), so fromkeys() gives an
    # ordered, de-duplicated sequence of words in one pass.
    return " ".join(dict.fromkeys(expression.split()))
|
|
|
|
|
|
|
|
def data_cleaning(strings):
    """Normalize a raw text value for downstream tokenisation.

    Steps: lowercase + strip, turn apostrophes and slashes into spaces,
    replace remaining punctuation with spaces, then blank out every
    character outside ASCII letters, space, comma and a small set of
    accented French letters.
    """
    texte = strings.lower().strip()
    # Apostrophes and slashes often glue words together ("l'eau", "a/b").
    for separateur in ("'", "/"):
        texte = texte.replace(separateur, " ")
    # Any non-word, non-space character becomes a space.
    texte = re.sub(r'[^\w\s]', ' ', texte)
    # Finally keep only letters (plus a few accented ones), spaces and commas;
    # this also wipes digits, which re.sub above left in place.
    return re.sub('[^A-Za-z ,éêèîôœàâ]+', ' ', texte)
|
|
|
|
|
|
|
|
def standardization(strings):
    """Replace each space-separated token by its entry in ``dictionnaire``.

    Tokens absent from the mapping pass through unchanged; tokens are
    re-joined with single spaces. ``dictionnaire`` comes from the
    star-import of ``traning_zone.standardisation.dictionnaire``.
    """
    # dict.get(token, token) replaces the original index loop with
    # `in dictionnaire.keys()` membership test — same behavior, one lookup
    # per token instead of two, and no index bookkeeping.
    return ' '.join(dictionnaire.get(mot, mot) for mot in strings.split(' '))
|
|
|
|
|
|
|
|
def remove_stop_words(strings):
    """Blank out English and French stopwords from a space-separated string.

    A token whose lowercase form is an NLTK English or French stopword, or
    appears in the project-level ``liste_stopword``, is replaced by the
    empty string; all other tokens are kept. Tokens are re-joined with
    single spaces, so removed words leave an empty slot (consecutive
    spaces) exactly as the original two-pass implementation did.
    """
    # Original code filtered against the English set, re-split, then
    # filtered against the French set. Since tokens from split(' ') contain
    # no spaces, that is equivalent to a single pass against the union of
    # both sets — built once per call.
    extra = [str(item) for item in liste_stopword]
    stops = set(stopwords.words('english')) | set(stopwords.words('french')) | set(extra)

    return ' '.join(
        mot if mot.lower() not in stops else ''
        for mot in strings.split(' ')
    )
|
|
|
|
|
|
|
|
# Module-level stemmer instances, created once at import time and shared
# by english_stemmer() / french_stemmer() below (NLTK stemmers are cheap
# to construct but there is no need to rebuild them per call).
en_stemmer = PorterStemmer()


fr_stemmer = FrenchStemmer()
|
|
|
|
|
|
|
|
def stem_sentence(sentence, stemmer):
    """Stem every space-separated word of *sentence* with *stemmer*.

    *stemmer* is any object exposing a ``stem(word) -> str`` method
    (e.g. an NLTK PorterStemmer or FrenchStemmer). Words are re-joined
    with single spaces.
    """
    return ' '.join(stemmer.stem(mot) for mot in sentence.split(' '))
|
|
|
|
|
|
|
|
def english_stemmer(strings):
    """Apply the module-level Porter stemmer to every word of *strings*.

    The input is split on single spaces and each piece is run through
    stem_sentence() with ``en_stemmer``; pieces are re-joined with spaces.
    """
    morceaux = strings.split(' ')
    racines = [stem_sentence(morceau, en_stemmer) for morceau in morceaux]
    return ' '.join(racines)
|
|
|
|
|
|
|
|
def french_stemmer(strings):
    """Apply the module-level French (Snowball) stemmer to *strings*.

    NOTE(review): unlike english_stemmer(), the input is split on commas
    and re-joined with spaces, so any comma separators are replaced by
    spaces in the output — presumably intentional for comma-separated
    descriptions, but worth confirming against callers.
    """
    segments = strings.split(',')
    racines = [stem_sentence(segment, fr_stemmer) for segment in segments]
    return ' '.join(racines)
|
|
|
|
|
|
|
|
def clearning(*args):
    """Load a dataframe via data_import() and run the cleaning pipeline.

    Steps: drop duplicate rows, de-duplicate words and normalise text in
    DESCRIPTION, normalise BEM_CLASS_DESC_FR, standardise DESCRIPTION
    vocabulary, remove stopwords, drop the key columns and duplicates.
    Returns the cleaned copy; *args are forwarded unchanged to
    data_import().
    """
    df = data_import(*args)
    print(df.shape)
    df.drop_duplicates(inplace=True)

    # Text normalisation on the raw frame.
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(supprimer_mot_double)
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(data_cleaning)
    df['BEM_CLASS_DESC_FR'] = df['BEM_CLASS_DESC_FR'].apply(data_cleaning)

    # Vocabulary standardisation (dictionnaire-based replacement).
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(standardization)

    # Work on a copy so the normalised-but-unstopped frame stays intact.
    final_df = df.copy()
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(remove_stop_words)

    # Key columns are no longer needed once text is cleaned; dropping them
    # can expose new duplicates, hence the second drop_duplicates.
    final_df.drop(labels=["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"], axis=1, inplace=True)
    final_df.drop_duplicates(inplace=True)

    print("Traitement done")
    return final_df
|
|
|
|
|
|
|
|
def clearning_modele(df):
    """Clean an already-loaded dataframe for model training.

    Same pipeline as clearning() but targets Regroupement_de_Class instead
    of BEM_CLASS_DESC_FR and additionally stems DESCRIPTION (English then
    French). NOTE(review): this mutates the caller's *df* in place
    (drop_duplicates / column assignments) before copying — confirm callers
    expect that. It also drops BARCODE/COUNTRY_KEY/BEM_CLASS_KEY, which
    pandas will raise KeyError on if those columns are absent.
    """
    print(df.shape)
    df.drop_duplicates(inplace=True)

    # Text normalisation.
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(data_cleaning)
    df['Regroupement_de_Class'] = df['Regroupement_de_Class'].apply(data_cleaning)

    # Vocabulary standardisation.
    df['DESCRIPTION'] = df['DESCRIPTION'].apply(standardization)

    final_df = df.copy()
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(remove_stop_words)

    final_df.drop(labels=["BARCODE", "COUNTRY_KEY", "BEM_CLASS_KEY"], axis=1, inplace=True)
    final_df.drop_duplicates(inplace=True)

    # Stem English first, then French, matching clearning_pred().
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(english_stemmer)
    final_df['DESCRIPTION'] = final_df['DESCRIPTION'].apply(french_stemmer)

    print("Traitement done")
    return final_df
|
|
|
|
|
|
|
|
def clearning_pred(X):
    """Run the full cleaning pipeline on a single raw string for prediction.

    Applies, in order: data_cleaning, standardization, remove_stop_words,
    english_stemmer, french_stemmer — the same sequence used on training
    data in clearning_modele().
    """
    pipeline = (
        data_cleaning,
        standardization,
        remove_stop_words,
        english_stemmer,
        french_stemmer,
    )
    for etape in pipeline:
        X = etape(X)
    return X
|
|
|