Spaces:

Demosthene-OR
/

rakuten

Paused

App Files Files Community

rakuten / src /features /build_features.py

Demosthene-OR

Configure LFS for images and update code

eb5ec73 3 months ago

raw

history blame contribute delete

7.31 kB

	import pandas as pd
	import nltk
	from bs4 import BeautifulSoup
	import re
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from nltk.stem import WordNetLemmatizer
	import pickle
	import math
	import json
	import os
	import sys


	class DataImporter:
	    """Load the product CSV files and split them into train/validation/test sets.

	    Nothing is read from disk until ``load_data`` is called.
	    """

	    def __init__(self, x_train_path="data/preprocessed/X_train_update.csv", y_train_path="data/preprocessed/Y_train_CVw08PX.csv", model_path="models"):
	        """Store the feature CSV path, the target CSV path and the mapper directory."""
	        self.x_train_path = x_train_path
	        self.y_train_path = y_train_path
	        self.model_path = model_path

	    def load_data(self):
	        """Read features and targets and return them as one DataFrame.

	        ``designation`` and ``description`` are merged into ``description``
	        (separated by one space, NaN treated as empty string), and
	        ``prdtypecode`` is replaced by a class index.

	        The code -> index mapping is persisted in ``<model_path>/mapper.json``
	        as ``{"<index>": "<code>"}``. It is created from the order of first
	        appearance in the target file when missing, and reused otherwise.

	        Returns:
	            DataFrame with the remaining feature columns plus ``prdtypecode``.
	        """
	        data = pd.read_csv(self.x_train_path)
	        # Replace NaN with empty strings so the concatenation below never yields NaN.
	        data["designation"] = data["designation"].fillna("")
	        data["description"] = data["description"].fillna("")
	        data["description"] = data["designation"] + " " + data["description"]
	        data = data.drop(["Unnamed: 0", "designation"], axis=1)

	        target = pd.read_csv(self.y_train_path)
	        target = target.drop(["Unnamed: 0"], axis=1)

	        mapper_path = f"{self.model_path}/mapper.json"
	        if not os.path.exists(mapper_path):
	            modalite_mapping = {
	                modalite: i for i, modalite in enumerate(target["prdtypecode"].unique())
	            }
	            # The mapper directory may not exist yet on a fresh checkout.
	            os.makedirs(os.path.dirname(mapper_path) or ".", exist_ok=True)
	            with open(mapper_path, "w") as fichier_json:
	                # Stored inverted (index -> code), both as strings.
	                json_mapper = {str(v): str(k) for k, v in modalite_mapping.items()}
	                json.dump(json_mapper, fichier_json)
	        else:
	            with open(mapper_path, "r") as json_file:
	                stored_mapper = json.load(json_file)
	            # Invert back to code -> index with integer keys and values.
	            modalite_mapping = {int(v): int(k) for k, v in stored_mapper.items()}

	        target["prdtypecode"] = target["prdtypecode"].replace(modalite_mapping)

	        return pd.concat([data, target], axis=1)

	    def split_train_test(self, df, samples_per_class=0, random_state=42, with_test=False):
	        """Split ``df`` per class into train, validation and (optionally) test sets.

	        Args:
	            df: DataFrame containing a ``prdtypecode`` column.
	            samples_per_class: If 0, train on 80% of every class. Otherwise
	                draw exactly this many training rows per class (balanced set).
	            random_state: Seed used for every sampling and shuffling step.
	            with_test: When False no test set is built: ``X_test`` is an empty
	                DataFrame and ``y_test`` an empty list.

	        Returns:
	            ``(X_train, X_val, X_test, y_train, y_val, y_test)``. The ``X_*``
	            frames have ``prdtypecode`` removed (except the empty ``X_test``
	            returned when ``with_test`` is False).

	        Raises:
	            ValueError: From pandas when a class holds fewer rows than requested.
	        """
	        print("len(df) = ",len(df))
	        grouped_data = df.groupby("prdtypecode")
	        num_samples_per_class_test = 50

	        # The sizes below are only used when training on the whole of df
	        # (samples_per_class == 0).
	        class_size = grouped_data.size()
	        # The test set is capped at 50 rows * number of classes.
	        test_cap = len(class_size) * num_samples_per_class_test
	        if len(df) == 0 or len(df) // 10 < test_cap:
	            # 10% of the data already fits under the cap.
	            # NOTE(review): the original used 1.0 here, which made val_size
	            # negative and crashed the sampling below; 0.1 matches the
	            # len(df)//10 comparison -- confirm the intended fraction.
	            test_reduc = 0.1
	        else:
	            test_reduc = test_cap / len(df)

	        # Sizes are keyed by class label so both loops below stay aligned
	        # without a hand-maintained positional counter.
	        train_size = {}
	        val_size = {}
	        for label, n in class_size.items():
	            n = int(n)
	            n_train = int(n * 0.8)
	            n_test = math.ceil(test_reduc * n) if with_test else 0
	            train_size[label] = n_train
	            val_size[label] = n - n_train - n_test

	        X_train_samples = []
	        X_rest_samples = []
	        for label, group in grouped_data:
	            n_train = samples_per_class if samples_per_class > 0 else train_size[label]
	            samples = group.sample(n=n_train, random_state=random_state)
	            X_train_samples.append(samples)
	            # Whatever is not used for training feeds validation/test.
	            X_rest_samples.append(group.drop(samples.index))

	        X_train = pd.concat(X_train_samples)
	        X_rest = pd.concat(X_rest_samples)

	        # Shuffle so rows are no longer ordered by class.
	        X_train = X_train.sample(frac=1, random_state=random_state).reset_index(drop=True)
	        X_rest = X_rest.sample(frac=1, random_state=random_state).reset_index(drop=True)

	        y_train = X_train["prdtypecode"]
	        X_train = X_train.drop(["prdtypecode"], axis=1)

	        if samples_per_class > 0:
	            val_samples_per_class = max(int(samples_per_class / 12), 3)
	        else:
	            val_samples_per_class = 0

	        grouped_rest = X_rest.groupby("prdtypecode")
	        # Default result when with_test is False: empty frame, same columns.
	        X_test = X_rest.drop(X_rest.index)
	        y_test = []
	        X_val_samples = []
	        X_test_samples = []

	        for label, group in grouped_rest:
	            n_val = val_samples_per_class if val_samples_per_class > 0 else val_size[label]
	            samples = group.sample(n=n_val, random_state=random_state)
	            X_val_samples.append(samples)
	            if with_test:
	                remaining_samples = group.drop(samples.index)
	                if val_samples_per_class > 0:
	                    # Balanced mode: cap the test rows taken from each class.
	                    remaining_samples = remaining_samples[:num_samples_per_class_test]
	                X_test_samples.append(remaining_samples)

	        X_val = pd.concat(X_val_samples)
	        X_val = X_val.sample(frac=1, random_state=random_state).reset_index(drop=True)
	        y_val = X_val["prdtypecode"]
	        X_val = X_val.drop(["prdtypecode"], axis=1)

	        if with_test:
	            X_test = pd.concat(X_test_samples)
	            X_test = X_test.sample(frac=1, random_state=random_state).reset_index(drop=True)
	            y_test = X_test["prdtypecode"]
	            X_test = X_test.drop(["prdtypecode"], axis=1)

	        print('============================')
	        print("Dataset size : ", len(X_train)+len(X_val)+len(X_test))
	        print("Train size : ", len(X_train))
	        print("Val size : ", len(X_val))
	        print("Test size : ", len(X_test))
	        print('============================')
	        return X_train, X_val, X_test, y_train, y_val, y_test


	class ImagePreprocessor:
	    """Derive the on-disk image path of every product row."""

	    def __init__(self, filepath="data/preprocessed/image_train"):
	        """Remember the directory that contains the image files."""
	        self.filepath = filepath

	    def preprocess_images_in_df(self, df):
	        """Add an ``image_path`` column to ``df`` in place.

	        The path is built from the ``imageid`` and ``productid`` columns as
	        ``<filepath>/image_<imageid>_product_<productid>.jpg``. Nothing is
	        returned; ``df`` itself is modified.
	        """
	        image_ids = df["imageid"].astype(str)
	        product_ids = df["productid"].astype(str)
	        prefix = f"{self.filepath}/image_"
	        df["image_path"] = prefix + image_ids + "_product_" + product_ids + ".jpg"


	class TextPreprocessor:
	    """Clean text columns: strip HTML, keep ASCII letters, drop stop words, lemmatize."""

	    # Compiled once instead of on every call. Matches anything that is not
	    # an ASCII letter.
	    # NOTE(review): this also removes accented letters (e.g. "é"), which
	    # splits French words; left unchanged because already-trained models
	    # presumably depend on this exact output -- confirm before widening.
	    _NON_LETTERS = re.compile(r"[^a-zA-Z]")

	    def __init__(self):
	        """Download the required NLTK resources and build the lemmatizer and stop-word set."""
	        nltk.download("punkt")
	        # Recent NLTK releases load the tokenizer model from "punkt_tab"
	        # instead of "punkt"; fetching both keeps word_tokenize working
	        # across versions.
	        nltk.download("punkt_tab")
	        nltk.download("stopwords")
	        nltk.download("wordnet")
	        # NOTE(review): WordNetLemmatizer relies on the English WordNet while
	        # the stop words are French -- confirm this mix is intended.
	        self.lemmatizer = WordNetLemmatizer()
	        self.stop_words = set(
	            stopwords.words("french")
	        )  # Another language can be chosen if needed

	    def preprocess_text(self, text):
	        """Return a cleaned, space-joined version of ``text`` limited to 50 words.

	        Missing values (``None`` or float NaN) yield an empty string. Other
	        non-string scalars are converted with ``str`` before cleaning.
	        """
	        if text is None or (isinstance(text, float) and math.isnan(text)):
	            return ""
	        if not isinstance(text, (str, bytes)):
	            # e.g. a numeric cell; the HTML parser needs markup text.
	            text = str(text)

	        # Remove HTML tags
	        text = BeautifulSoup(text, "html.parser").get_text()

	        # Replace non-alphabetic characters with spaces
	        text = self._NON_LETTERS.sub(" ", text)

	        # Tokenization
	        words = word_tokenize(text.lower())

	        # Stop-word removal and lemmatization
	        filtered_words = [
	            self.lemmatizer.lemmatize(word)
	            for word in words
	            if word not in self.stop_words
	        ]

	        return " ".join(filtered_words[:50])

	    def preprocess_text_in_df(self, df, columns):
	        """Apply ``preprocess_text`` to each listed column of ``df``, in place."""
	        for column in columns:
	            df[column] = df[column].apply(self.preprocess_text)