rakuten / src /features /build_features.py
Demosthene-OR's picture
Configure LFS for images and update code
eb5ec73
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pickle
import math
import json
import os
import sys
class DataImporter:
    """Load the training CSV files and split them into train/val/test sets.

    The class reads a features CSV and a labels CSV, merges them into a single
    DataFrame whose ``prdtypecode`` column holds contiguous class indices, and
    persists the index <-> original code mapping in ``<model_path>/mapper.json``.
    """

    def __init__(
        self,
        x_train_path="data/preprocessed/X_train_update.csv",
        y_train_path="data/preprocessed/Y_train_CVw08PX.csv",
        model_path="models",
    ):
        """Store the locations of the features CSV, labels CSV and model directory."""
        self.x_train_path = x_train_path
        self.y_train_path = y_train_path
        self.model_path = model_path

    def load_data(self):
        """Read both CSV files and return them merged into one DataFrame.

        ``designation`` and ``description`` are concatenated (space separated)
        into ``description``; ``designation`` and the ``Unnamed: 0`` columns are
        dropped. ``prdtypecode`` values are replaced by class indices. The
        mapping is read from ``mapper.json`` when that file exists; otherwise it
        is built from the order of first appearance of each code and written to
        ``mapper.json`` as ``{"<index>": "<original code>"}``.

        Returns:
            DataFrame with the feature columns followed by ``prdtypecode``.
        """
        data = pd.read_csv(self.x_train_path)
        # Replace NaN with empty strings so the concatenation never yields NaN.
        data["designation"] = data["designation"].fillna("")
        data["description"] = data["description"].fillna("")
        data["description"] = data["designation"] + " " + data["description"]
        data = data.drop(["Unnamed: 0", "designation"], axis=1)

        target = pd.read_csv(self.y_train_path)
        target = target.drop(["Unnamed: 0"], axis=1)

        mapper_path = f"{self.model_path}/mapper.json"
        if not os.path.exists(mapper_path):
            modalite_mapping = {
                modalite: i
                for i, modalite in enumerate(target["prdtypecode"].unique())
            }
            # On a first run the model directory may not exist yet; without
            # this, opening the mapper file for writing fails.
            if self.model_path:
                os.makedirs(self.model_path, exist_ok=True)
            # The file stores index -> code (both as strings).
            json_mapper = {str(v): str(k) for k, v in modalite_mapping.items()}
            with open(mapper_path, "w", encoding="utf-8") as fichier_json:
                json.dump(json_mapper, fichier_json)
        else:
            with open(mapper_path, "r", encoding="utf-8") as json_file:
                stored_mapper = json.load(json_file)
            # Invert the stored index -> code mapping back to code -> index.
            modalite_mapping = {int(v): int(k) for k, v in stored_mapper.items()}

        target["prdtypecode"] = target["prdtypecode"].replace(modalite_mapping)
        return pd.concat([data, target], axis=1)

    def split_train_test(self, df, samples_per_class=0, random_state=42, with_test=False):
        """Split ``df`` per class into train, validation and (optionally) test sets.

        Two modes are supported:

        * ``samples_per_class == 0``: use the whole frame. Each class gives
          ``int(0.8 * n)`` rows to train. When ``with_test`` is true, each class
          gives a share to test such that the whole test set is about 10% of the
          frame, capped at roughly 50 rows per class overall; the rest is
          validation.
        * ``samples_per_class > 0``: balanced training set with exactly
          ``samples_per_class`` rows per class, ``max(samples_per_class // 12, 3)``
          validation rows per class and, when ``with_test`` is true, at most 50
          test rows per class taken from what is left.

        Args:
            df: DataFrame containing a ``prdtypecode`` column.
            samples_per_class: rows per class for the balanced mode, 0 for all.
            random_state: seed used for every sampling and shuffling step.
            with_test: whether to build a test set.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``. When
            ``with_test`` is false, ``X_test`` is an empty DataFrame that keeps
            the columns of ``df`` and ``y_test`` is an empty list.

        Raises:
            ValueError: if ``df`` is empty, or if a class has fewer rows than
                requested (raised by ``DataFrame.sample``).
        """
        print("len(df) = ", len(df))
        if len(df) == 0:
            raise ValueError("Cannot split an empty DataFrame")

        grouped_data = df.groupby("prdtypecode")
        num_samples_per_class_test = 50

        # Per-class sizes, only used when training on the whole frame
        # (samples_per_class == 0).
        class_size = grouped_data.size().tolist()
        train_size = [int(n * 0.8) for n in class_size]

        # The test set is 10% of the data, capped at 50 rows * number of classes.
        # (The fraction for small frames used to be 1.0, which made the
        # validation sizes negative and the sampling below fail.)
        max_test_rows = len(class_size) * num_samples_per_class_test
        if len(df) // 10 < max_test_rows:
            test_fraction = 0.1
        else:
            test_fraction = max_test_rows / len(df)
        test_size = [
            math.ceil(test_fraction * n) if with_test else 0 for n in class_size
        ]
        # Never negative: test_fraction <= 0.1 while at least 20% of each class
        # is left once the training rows are removed.
        val_size = [
            n - n_train - n_test
            for n, n_train, n_test in zip(class_size, train_size, test_size)
        ]

        # --- Training set -------------------------------------------------
        train_parts = []
        leftover_parts = []
        for i, (_, group) in enumerate(grouped_data):
            n_train = samples_per_class if samples_per_class > 0 else train_size[i]
            samples = group.sample(n=n_train, random_state=random_state)
            train_parts.append(samples)
            leftover_parts.append(group.drop(samples.index))

        X_train = pd.concat(train_parts)
        X_train = X_train.sample(frac=1, random_state=random_state).reset_index(drop=True)
        leftover = pd.concat(leftover_parts)
        leftover = leftover.sample(frac=1, random_state=random_state).reset_index(drop=True)

        y_train = X_train["prdtypecode"]
        X_train = X_train.drop(["prdtypecode"], axis=1)

        # --- Validation and test sets --------------------------------------
        if samples_per_class > 0:
            val_samples_per_class = max(int(samples_per_class / 12), 3)
        else:
            val_samples_per_class = 0

        # Defaults returned when no test set is requested.
        X_test = leftover.drop(leftover.index)
        y_test = []

        val_parts = []
        test_parts = []
        # groupby iterates classes in the same sorted order as above, so index
        # i still addresses the matching entry of val_size.
        for i, (_, group) in enumerate(leftover.groupby("prdtypecode")):
            n_val = val_samples_per_class if val_samples_per_class > 0 else val_size[i]
            samples = group.sample(n=n_val, random_state=random_state)
            val_parts.append(samples)
            if not with_test:
                continue
            remaining_samples = group.drop(samples.index)
            if val_samples_per_class > 0:
                # Balanced mode: cap the test rows taken from each class.
                remaining_samples = remaining_samples[:num_samples_per_class_test]
            test_parts.append(remaining_samples)

        X_val = pd.concat(val_parts)
        X_val = X_val.sample(frac=1, random_state=random_state).reset_index(drop=True)
        y_val = X_val["prdtypecode"]
        X_val = X_val.drop(["prdtypecode"], axis=1)

        if with_test:
            X_test = pd.concat(test_parts)
            X_test = X_test.sample(frac=1, random_state=random_state).reset_index(drop=True)
            y_test = X_test["prdtypecode"]
            X_test = X_test.drop(["prdtypecode"], axis=1)

        print('============================')
        print("Dataset size : ", len(X_train) + len(X_val) + len(X_test))
        print("Train size : ", len(X_train))
        print("Val size : ", len(X_val))
        print("Test size : ", len(X_test))
        print('============================')

        return X_train, X_val, X_test, y_train, y_val, y_test
class ImagePreprocessor:
    """Derive image file paths from the ``imageid`` / ``productid`` columns."""

    def __init__(self, filepath="data/preprocessed/image_train"):
        """Remember the directory that holds the image files."""
        self.filepath = filepath

    def preprocess_images_in_df(self, df):
        """Add an ``image_path`` column to ``df`` in place.

        Each path has the form
        ``<filepath>/image_<imageid>_product_<productid>.jpg``.
        Nothing is returned; the frame passed in is modified.
        """
        prefix = f"{self.filepath}/image_"
        image_ids = df["imageid"].astype(str)
        product_ids = df["productid"].astype(str)
        df["image_path"] = prefix + image_ids + "_product_" + product_ids + ".jpg"
class TextPreprocessor:
    """Clean free-text columns for modelling.

    Each text is stripped of HTML, reduced to ASCII letters, lower-cased,
    tokenized, filtered against the NLTK French stopword list, lemmatized with
    ``WordNetLemmatizer`` and truncated to ``max_words`` tokens.
    """

    # Compiled once instead of on every call to preprocess_text.
    # NOTE(review): only a-z/A-Z are kept, so accented letters are replaced by
    # spaces as well. This looks unintended for French text, but changing it
    # would change the produced features -- confirm before touching.
    _NON_ALPHA = re.compile(r"[^a-zA-Z]")

    # Class-level default so instances created without __init__ keep working.
    max_words = 50

    def __init__(self, max_words=50):
        """Download the NLTK resources and build the lemmatizer/stopword set.

        Args:
            max_words: maximum number of tokens kept per text (``None`` keeps
                all of them). Defaults to 50.
        """
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("wordnet")
        self.lemmatizer = WordNetLemmatizer()
        # Another language can be chosen here if needed.
        # NOTE(review): the stopwords are French while WordNetLemmatizer is
        # presumably English-oriented -- verify this pairing is intended.
        self.stop_words = set(stopwords.words("french"))
        self.max_words = max_words

    def preprocess_text(self, text):
        """Return the cleaned, space-joined tokens of ``text``.

        Missing values (``None`` or float NaN) yield an empty string.
        """
        # None is common in object columns and would make BeautifulSoup fail.
        if text is None or (isinstance(text, float) and math.isnan(text)):
            return ""
        # Remove HTML tags.
        text = BeautifulSoup(text, "html.parser").get_text()
        # Replace every non-alphabetic character by a space.
        text = self._NON_ALPHA.sub(" ", text)
        # Tokenization.
        words = word_tokenize(text.lower())
        # Stopword removal and lemmatization.
        filtered_words = [
            self.lemmatizer.lemmatize(word)
            for word in words
            if word not in self.stop_words
        ]
        return " ".join(filtered_words[: self.max_words])

    def preprocess_text_in_df(self, df, columns):
        """Apply :meth:`preprocess_text` to each listed column of ``df`` in place."""
        for column in columns:
            df[column] = df[column].apply(self.preprocess_text)