Spaces:
Sleeping
Sleeping
File size: 8,002 Bytes
a61aa1b d6a3367 fa8b2f8 2c49a88 d6a3367 a61aa1b 2c49a88 a61aa1b 2c49a88 a61aa1b 2c49a88 a61aa1b 2c49a88 a61aa1b 2c49a88 a61aa1b 2c49a88 a61aa1b 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 2c49a88 d6a3367 fa8b2f8 2c49a88 fa8b2f8 39149ca 2c49a88 39149ca 2c49a88 785c15d 2c49a88 ed59366 2c49a88 ed59366 2c49a88 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 |
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Callable
from App.utils.standadisation import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
@st.cache_data
def convert_df(df):
    """Serialize ``df`` to CSV text and return it as UTF-8 encoded bytes.

    The function is wrapped in ``st.cache_data``.
    """
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
@st.cache_data
def supprime_country(df):
    """Drop the country column from ``df`` in place and return ``df``.

    The column may be spelled ``Country``, ``COUNTRY_KEY`` or ``COUNTRY``.
    The spellings are tried in that order and only the first one that can be
    dropped is removed; when none is present ``df`` is returned unchanged.

    Args:
        df (pd.DataFrame): Frame to strip the country column from.

    Returns:
        pd.DataFrame: ``df`` after the drop attempt.
    """
    for column in ("Country", "COUNTRY_KEY", "COUNTRY"):
        try:
            df.drop([column], axis=1, inplace=True)
        except KeyError:
            # This spelling is absent; move on to the next candidate.
            continue
        # Stop after the first successful drop: only one column is removed.
        break
    return df
def merge_and_update_classification(
    main_df, update_df, product_id_col, classification_col
):
    """
    Merge two DataFrames and update the classification based on the update_df.

    Only rows where the classification has changed are retained. A row whose
    update value is missing keeps its original classification and is therefore
    treated as unchanged.

    Columns present in both frames (other than ``product_id_col``) appear in
    the result with ``_main`` / ``_update`` suffixes. ``main_df`` and
    ``update_df`` are not modified.

    Args:
        main_df (pd.DataFrame): The main DataFrame containing original data.
        update_df (pd.DataFrame): DataFrame containing updated classifications.
        product_id_col (str): Name of the column used as the product identifier.
        classification_col (str): Name of the classification column to be updated.

    Returns:
        pd.DataFrame: A DataFrame containing only the rows where classification was updated.
    """
    main_col = f"{classification_col}_main"
    update_col = f"{classification_col}_update"
    original_classification_col = f"original_{classification_col}"

    # Restrict the main frame to the products that appear in the update frame.
    update_product_ids = update_df[product_id_col].unique()
    in_update = main_df[product_id_col].isin(update_product_ids)
    # Copy the slice so the column added below never writes into main_df.
    filtered_main_df = main_df[in_update].copy()

    # Preserve the original classification under a dedicated column name.
    filtered_main_df[original_classification_col] = filtered_main_df[
        classification_col
    ]

    merged_df = pd.merge(
        filtered_main_df,
        update_df,
        on=[product_id_col],
        how="inner",
        suffixes=("_main", "_update"),
    )

    # Final classification: the update value, or the original one when the
    # update is missing.
    merged_df[classification_col] = merged_df[update_col].fillna(
        merged_df[main_col]
    )

    # Compare against the final classification rather than the raw update
    # column, so a missing update does not count as a change; two missing
    # values are also considered equal.
    previous = merged_df[main_col]
    current = merged_df[classification_col]
    unchanged = previous.eq(current) | (previous.isna() & current.isna())

    return merged_df.loc[~unchanged].copy()
def data_cleaning_func(strings):
    """Lower-case ``strings`` and blank out everything that is not a kept letter.

    After lower-casing and trimming, apostrophes, slashes and every other
    punctuation character are each replaced by a space. Then each run of
    characters other than ASCII letters, spaces, commas and the accented
    characters é ê è î ô œ à â is replaced by a single space.
    """
    cleaned = strings.lower().strip()
    # Apostrophes and slashes act as word separators.
    for separator in ("'", "/"):
        cleaned = cleaned.replace(separator, " ")
    without_punctuation = re.sub(r"[^\w\s]", " ", cleaned)
    return re.sub("[^A-Za-z ,éêèîôœàâ]+", " ", without_punctuation)
def standardization_func(strings):
    """Replace each space-separated token that is a key of ``dictionnaire``.

    Tokens without an entry are kept as they are; the tokens are re-joined
    with single spaces.
    """
    standardized = [
        dictionnaire[token] if token in dictionnaire.keys() else token
        for token in strings.split(" ")
    ]
    return " ".join(standardized)
def remove_stop_words_func(strings):
    """Blank out English, French and project-specific stop words.

    The text is split on single spaces. Every token whose lower-cased form is
    a stop word is replaced by an empty string, so the separating spaces
    remain. The English pass runs first, then the French pass; both include
    the entries of ``liste_stopword``.
    """
    custom_stops = [str(item) for item in liste_stopword]
    stop_sets = (
        set(stopwords.words("english") + custom_stops),
        set(stopwords.words("french") + custom_stops),
    )
    for stop_set in stop_sets:
        kept_tokens = []
        for token in strings.split(" "):
            # A removed token leaves an empty string behind.
            kept_tokens.append("" if token.lower() in stop_set else token)
        strings = " ".join(kept_tokens)
    return strings
# Module-level stemmer instances used by english_stemmer and french_stemmer.
en_stemmer = PorterStemmer()
fr_stemmer = FrenchStemmer()
def stem_sentence(sentence, stemmer):
    """Stem every space-separated token of ``sentence`` with ``stemmer``.

    ``stemmer`` must expose a ``stem(word)`` method. The stemmed tokens are
    re-joined with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in sentence.split(" "))
def english_stemmer(strings):
    """Stem each space-separated token of ``strings`` with ``en_stemmer``.

    The stemmed tokens are re-joined with single spaces.
    """
    stemmed_tokens = []
    for token in strings.split(" "):
        stemmed_tokens.append(en_stemmer.stem(token))
    return " ".join(stemmed_tokens)
def french_stemmer(strings):
    """Stem ``strings`` with ``fr_stemmer``, treating commas as separators.

    The text is split on commas. Within each chunk the space-separated words
    are stemmed and re-joined with spaces, and the chunks are then joined
    with single spaces (the commas are not restored).
    """
    chunks = strings.split(",")
    stemmed_chunks = (
        " ".join(fr_stemmer.stem(word) for word in chunk.split(" "))
        for chunk in chunks
    )
    return " ".join(stemmed_chunks)
def cosine_similarity_func(expr1, expr2):
    """Return the cosine similarity of the word-count vectors of two texts.

    Both texts are vectorized together with a default ``CountVectorizer`` and
    the similarity of the two resulting rows is returned.

    Args:
        expr1 (str): First text.
        expr2 (str): Second text.

    Returns:
        float: The similarity score, or ``0.0`` when the vectorizer cannot
        build a vocabulary from the two texts.
    """
    vectorizer = CountVectorizer()
    try:
        vectors = vectorizer.fit_transform([expr1, expr2])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when neither
        # text yields a token, e.g. two empty strings after cleaning. There is
        # nothing to compare, so report no similarity instead of aborting the
        # caller's row-wise computation.
        return 0.0
    similarity = cosine_similarity(vectors[0], vectors[1])
    return similarity[0][0]
def add_text_similarity(
    df: pd.DataFrame,
    data_cleaning_func: Callable = data_cleaning_func,
    remove_stop_words_func: Callable = remove_stop_words_func,
    standardization_func: Callable = standardization_func,
    cosine_similarity_func: Callable = cosine_similarity_func,
) -> pd.DataFrame:
    """
    Add text similarity measures to the DataFrame based on item descriptions.

    Reads the ``ITEM_DESC_main`` and ``ITEM_DESC_update`` columns and writes
    three columns to ``df`` in place: ``ITEM_DESC_before_clean``,
    ``ITEM_DESC_after_clean`` and ``Cosine_Similarity``.

    Args:
        df (pd.DataFrame): Input DataFrame containing item descriptions.
        data_cleaning_func (Callable): Function to clean the text data.
        remove_stop_words_func (Callable): Function to remove stop words.
        standardization_func (Callable): Function to standardize text.
        cosine_similarity_func (Callable): Function to calculate cosine similarity.

    Returns:
        pd.DataFrame: DataFrame with added text similarity measures.
    """
    before_col = "ITEM_DESC_before_clean"
    after_col = "ITEM_DESC_after_clean"
    text_cols = [before_col, after_col]

    # Clean item descriptions
    df[before_col] = df["ITEM_DESC_main"].apply(data_cleaning_func)
    df[after_col] = df["ITEM_DESC_update"].apply(data_cleaning_func)

    # Remove stop words (French and English)
    for language in ["french", "english"]:
        stop_words = set(stopwords.words(language))

        def drop_stop_words(text, stop_words=stop_words):
            # The default argument binds this language's stop-word set.
            return " ".join(
                word for word in text.split() if word.lower() not in stop_words
            )

        for col in text_cols:
            df[col] = df[col].apply(drop_stop_words)

    # Apply custom stop words removal
    for col in text_cols:
        df[col] = df[col].apply(remove_stop_words_func)

    # Standardize text
    for col in text_cols:
        df[col] = df[col].apply(standardization_func)

    # Calculate cosine similarity. The column is built from the two text
    # columns directly because a row-wise DataFrame.apply on an empty frame
    # returns a DataFrame, which cannot be assigned to a single column.
    df["Cosine_Similarity"] = [
        cosine_similarity_func(after_text, before_text)
        for after_text, before_text in zip(df[after_col], df[before_col])
    ]
    return df
def display_data_with_download_button(
    df,
    title="Data without decision-making"
) -> None:
    """Show ``df`` in an editable grid with a CSV download button.

    When ``df`` is empty only a notice is written. Otherwise an ``Evaluation``
    column set to ``True`` is written to ``df`` in place, the frame is shown
    in ``st.data_editor`` and the edited result is offered as ``<title>.csv``.

    Args:
        df (pd.DataFrame): Data to display; modified in place.
        title (str): Sub-header text, download file stem and widget key.
    """
    if df.empty:
        st.write("No result for the above criterion ")
        return

    st.subheader(title)
    df.loc[:, "Evaluation"] = True
    edited_df = st.data_editor(df)

    button_kwargs = {
        "label": "Download data as CSV",
        "data": convert_df(edited_df),
        "file_name": f"{title}.csv",
        "mime": "text/csv",
    }
    try:
        st.download_button(key=title, **button_kwargs)
    except Exception:
        # Presumably the key is already taken by another widget on the page;
        # retry once with an alternate key. Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate.
        st.download_button(key=title + "1", **button_kwargs)
|