"""Streamlit helpers: dataframe cleaning, text standardisation and
cosine-similarity scoring of product descriptions."""

import re
from typing import Callable

import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this star import is expected to provide `dictionnaire`
# (term -> standard term mapping) and `liste_stopword` (extra stop words)
# used below — confirm against App/utils/standadisation.
from App.utils.standadisation import *


@st.cache_data
def convert_df(df):
    """Serialise *df* to UTF-8 encoded CSV bytes for a download button."""
    return df.to_csv().encode("utf-8")


@st.cache_data
def supprime_country(df):
    """Drop the first country column found in *df* (in place).

    Column-name variants are tried in order: "Country", "COUNTRY_KEY",
    "COUNTRY". If none is present the frame is returned untouched.
    """
    # Explicit membership test replaces the original nested bare `except:`
    # chain, which would also have swallowed unrelated errors. Only the
    # first matching variant is dropped, as before.
    for candidate in ("Country", "COUNTRY_KEY", "COUNTRY"):
        if candidate in df.columns:
            df.drop(columns=[candidate], inplace=True)
            break
    return df


def merge_and_update_classification(
    main_df, update_df, product_id_col, classification_col
):
    """Merge two DataFrames and update the classification from *update_df*.

    Only rows where the classification has changed are retained.

    Args:
        main_df (pd.DataFrame): The main DataFrame containing original data.
        update_df (pd.DataFrame): DataFrame containing updated classifications.
        product_id_col (str): Name of the column used as the product identifier.
        classification_col (str): Name of the classification column to be updated.

    Returns:
        pd.DataFrame: Only the rows whose classification was updated.
    """
    # Restrict the main frame to products present in the update frame.
    update_product_ids = update_df[product_id_col].unique()
    # .copy() prevents a SettingWithCopyWarning (and a potentially lost
    # write) on the column assignment below.
    filtered_main_df = main_df[
        main_df[product_id_col].isin(update_product_ids)
    ].copy()

    # Preserve the original classification before it gets overwritten.
    original_classification_col = f"original_{classification_col}"
    filtered_main_df[original_classification_col] = filtered_main_df[
        classification_col
    ]

    merged_df = pd.merge(
        filtered_main_df,
        update_df,
        on=[product_id_col],
        how="inner",
        suffixes=("_main", "_update"),
        indicator=True,
    )

    # Take the updated value, falling back to the original where it is NaN.
    merged_df[classification_col] = merged_df[f"{classification_col}_update"].fillna(
        merged_df[original_classification_col]
    )

    # Keep only the rows where the classification actually changed.
    updated_df = merged_df[
        merged_df[f"{classification_col}_main"]
        != merged_df[f"{classification_col}_update"]
    ]

    # Drop the merge bookkeeping column before returning.
    return updated_df.drop(columns=["_merge"])


def data_cleaning_func(strings):
    """Lower-case *strings* and strip punctuation/special characters.

    Apostrophes and slashes become spaces, remaining punctuation is
    spaced out, then anything that is not an ASCII letter, space, comma
    or a common French accented letter is replaced by a space.
    """
    strings = strings.lower().strip()
    strings = strings.replace("'", " ").replace("/", " ")
    strings = re.sub(r"[^\w\s]", " ", strings)
    return re.sub("[^A-Za-z ,éêèîôœàâ]+", " ", strings)


def standardization_func(strings):
    """Replace each word of *strings* by its `dictionnaire` mapping, if any."""
    return " ".join(dictionnaire.get(word, word) for word in strings.split(" "))


def remove_stop_words_func(strings):
    """Remove English and French stop words (NLTK lists + `liste_stopword`).

    A removed word leaves an empty token behind (so runs of spaces appear
    where stop words were), matching the original implementation exactly.
    """
    extra = [str(item) for item in liste_stopword]
    en_stops = set(stopwords.words("english") + extra)
    fr_stops = set(stopwords.words("french") + extra)
    # Two sequential passes (English then French), as in the original.
    for stops in (en_stops, fr_stops):
        strings = " ".join(
            "" if word.lower() in stops else word for word in strings.split(" ")
        )
    return strings


en_stemmer = PorterStemmer()
fr_stemmer = FrenchStemmer()


def stem_sentence(sentence, stemmer):
    """Stem each space-separated word of *sentence* with *stemmer*."""
    return " ".join(stemmer.stem(word) for word in sentence.split(" "))


def english_stemmer(strings):
    """Apply the Porter stemmer to every space-separated token of *strings*."""
    return " ".join(
        stem_sentence(token, en_stemmer) for token in strings.split(" ")
    )


def french_stemmer(strings):
    """Apply the French Snowball stemmer to every comma-separated segment.

    NOTE(review): this splits on "," while `english_stemmer` splits on " ".
    It looks deliberate (comma-separated ingredient lists) — confirm.
    """
    return " ".join(
        stem_sentence(segment, fr_stemmer) for segment in strings.split(",")
    )


def cosine_similarity_func(expr1, expr2):
    """Return the cosine similarity between the two expressions' token counts.

    Returns 0.0 when no vocabulary can be built (e.g. both strings are
    empty after cleaning), instead of letting CountVectorizer raise.
    """
    vectorizer = CountVectorizer()
    try:
        vectors = vectorizer.fit_transform([expr1, expr2])
    except ValueError:
        # "empty vocabulary" — nothing to compare.
        return 0.0
    return cosine_similarity(vectors[0], vectors[1])[0][0]


def add_text_similarity(
    df: pd.DataFrame,
    data_cleaning_func: Callable = data_cleaning_func,
    remove_stop_words_func: Callable = remove_stop_words_func,
    standardization_func: Callable = standardization_func,
    cosine_similarity_func: Callable = cosine_similarity_func,
) -> pd.DataFrame:
    """Add text similarity measures to *df* based on item descriptions.

    Expects the columns "ITEM_DESC_main" and "ITEM_DESC_update"; adds
    "ITEM_DESC_before_clean", "ITEM_DESC_after_clean" and
    "Cosine_Similarity".

    Args:
        df (pd.DataFrame): Input DataFrame containing item descriptions.
        data_cleaning_func (Callable): Function to clean the text data.
        remove_stop_words_func (Callable): Function to remove custom stop words.
        standardization_func (Callable): Function to standardize text.
        cosine_similarity_func (Callable): Function to calculate cosine similarity.

    Returns:
        pd.DataFrame: DataFrame with added text similarity measures
        (columns are added to *df* in place).
    """
    cleaned_cols = ["ITEM_DESC_before_clean", "ITEM_DESC_after_clean"]

    # Clean both description variants.
    df["ITEM_DESC_before_clean"] = df["ITEM_DESC_main"].apply(data_cleaning_func)
    df["ITEM_DESC_after_clean"] = df["ITEM_DESC_update"].apply(data_cleaning_func)

    # Remove NLTK stop words (French and English).
    for language in ("french", "english"):
        stop_words = set(stopwords.words(language))
        for col in cleaned_cols:
            df[col] = df[col].apply(
                lambda x: " ".join(
                    word for word in x.split() if word.lower() not in stop_words
                )
            )

    # Custom stop-word removal, then text standardisation.
    for col in cleaned_cols:
        df[col] = df[col].apply(remove_stop_words_func)
    for col in cleaned_cols:
        df[col] = df[col].apply(standardization_func)

    # Row-wise cosine similarity between the two cleaned descriptions.
    df["Cosine_Similarity"] = df.apply(
        lambda row: cosine_similarity_func(
            row["ITEM_DESC_after_clean"], row["ITEM_DESC_before_clean"]
        ),
        axis=1,
    )
    return df


def display_data_with_download_button(
    df, title="Data without decision-making"
) -> None:
    """Show *df* in an editable table with a CSV download button.

    A boolean "Evaluation" column (default True) is added so the reviewer
    can tick/untick rows before downloading the edited table.
    """
    if df.empty:
        st.write("No result for the above criterion ")
        return

    st.subheader(title)
    df.loc[:, "Evaluation"] = True
    edited_df = st.data_editor(df)
    csv_data = convert_df(edited_df)
    try:
        st.download_button(
            label="Download data as CSV",
            data=csv_data,
            file_name=f"{title}.csv",
            mime="text/csv",
            key=title,
        )
    except Exception:
        # Duplicate widget key (same title rendered twice) — retry with a
        # suffixed key instead of a bare `except:` swallowing everything.
        st.download_button(
            label="Download data as CSV",
            data=csv_data,
            file_name=f"{title}.csv",
            mime="text/csv",
            key=title + "1",
        )