Spaces:

bourahima
/

Carrefourrefbem

Sleeping

File size: 8,002 Bytes

import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Callable
from App.utils.standadisation import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords


@st.cache_data
def convert_df(df):
    """Serialize ``df`` to CSV text and return it encoded as UTF-8 bytes.

    Args:
    df (pd.DataFrame): Frame to export; ``to_csv`` is called with its defaults.

    Returns:
    bytes: The CSV content encoded as UTF-8.
    """
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")


@st.cache_data
def supprime_country(df):
    """Drop the country column from ``df`` in place and return the frame.

    The candidate names ``"Country"``, ``"COUNTRY_KEY"`` and ``"COUNTRY"`` are
    checked in that order and only the first one present is dropped. When none
    of them is present the frame is returned unchanged.

    Args:
    df (pd.DataFrame): Frame that may contain a country column.

    Returns:
    pd.DataFrame: ``df`` without the first matching country column.
    """
    # An explicit membership test replaces the nested bare ``except`` blocks,
    # so errors unrelated to a missing column are no longer silently swallowed.
    for column in ("Country", "COUNTRY_KEY", "COUNTRY"):
        if column in df.columns:
            df.drop(columns=[column], inplace=True)
            break
    return df


def merge_and_update_classification(
    main_df, update_df, product_id_col, classification_col
):
    """
    Merge two DataFrames and update the classification based on the update_df.
    Only rows where the classification has changed are retained.

    Columns present in both frames (other than the product identifier) appear
    in the result with the suffixes ``_main`` and ``_update``. The resolved
    classification is stored under ``classification_col`` and the value from
    ``main_df`` under ``original_<classification_col>``.

    Args:
    main_df (pd.DataFrame): The main DataFrame containing original data.
    update_df (pd.DataFrame): DataFrame containing updated classifications.
    product_id_col (str): Name of the column used as the product identifier.
    classification_col (str): Name of the classification column to be updated.

    Returns:
    pd.DataFrame: A DataFrame containing only the rows where classification was updated.
    """
    # Keep only the products that appear in the update frame. The explicit
    # copy lets us add a column without writing into a slice of ``main_df``.
    update_product_ids = update_df[product_id_col].unique()
    filtered_main_df = main_df[main_df[product_id_col].isin(update_product_ids)].copy()

    # Preserve the original classification under a name that cannot collide
    # with the suffixed merge columns.
    original_classification_col = f"original_{classification_col}"
    filtered_main_df[original_classification_col] = filtered_main_df[classification_col]

    # An inner join only yields matched rows, so no merge indicator is needed.
    merged_df = pd.merge(
        filtered_main_df,
        update_df,
        on=[product_id_col],
        how="inner",
        suffixes=("_main", "_update"),
    )

    # Update classification, keeping original if update is NaN
    merged_df[classification_col] = merged_df[f"{classification_col}_update"].fillna(
        merged_df[original_classification_col]
    )

    # Compare the *resolved* classification with the original one. A missing
    # update falls back to the original value and therefore is not a change;
    # two missing values are also treated as equal (NaN != NaN otherwise).
    new_values = merged_df[classification_col]
    old_values = merged_df[original_classification_col]
    changed = new_values.ne(old_values) & ~(new_values.isna() & old_values.isna())

    # Return an independent frame so callers can add columns to it safely.
    return merged_df[changed].copy()


def data_cleaning_func(strings):
    """Normalize a raw description into lowercase letters and spaces.

    The text is lowercased and stripped, every punctuation character is turned
    into a single space, and each run of remaining characters outside ASCII
    letters, space, comma and the accented letters ``éêèîôœàâ`` is collapsed
    into one space. The result is not re-trimmed, so it may contain leading,
    trailing or repeated spaces.

    Args:
    strings (str): Raw text to clean.

    Returns:
    str: The normalized text.
    """
    # NOTE(review): the whitelist below omits letters such as ç, ù, û, ï, ë,
    # which are therefore replaced by spaces — confirm this is intended.
    lowered = strings.lower().strip()
    for separator in ("'", "/"):
        lowered = lowered.replace(separator, " ")
    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
    return re.sub("[^A-Za-z ,éêèîôœàâ]+", " ", without_punctuation)


def standardization_func(strings):
    """Replace known tokens of ``strings`` by their standard form.

    The text is split on single spaces; every token that is a key of the
    module-level ``dictionnaire`` mapping is replaced by the mapped value and
    the tokens are joined back with single spaces.

    Args:
    strings (str): Text whose tokens should be standardized.

    Returns:
    str: The text with mapped tokens substituted.
    """
    return " ".join(
        dictionnaire[token] if token in dictionnaire else token
        for token in strings.split(" ")
    )


def remove_stop_words_func(strings):
    """Blank out English, French and project-specific stop words.

    The text is split on single spaces. Every token whose lowercase form is in
    NLTK's English or French stop-word list, or in ``liste_stopword``, is
    replaced by an empty string, and the tokens are joined back with single
    spaces. A removed word therefore leaves its surrounding spaces behind.

    Args:
    strings (str): Text to filter.

    Returns:
    str: The text with stop words replaced by empty tokens.
    """
    custom_stops = [str(item) for item in liste_stopword]
    english_stops = set(stopwords.words("english") + custom_stops)
    french_stops = set(stopwords.words("french") + custom_stops)

    # Filtering against the union in one pass gives the same result as
    # filtering against each language in turn.
    all_stops = english_stops | french_stops
    return " ".join(
        "" if token.lower() in all_stops else token
        for token in strings.split(" ")
    )


# Module-level stemmer instances, created once at import time and used by
# english_stemmer (Porter) and french_stemmer (Snowball French).
en_stemmer = PorterStemmer()
fr_stemmer = FrenchStemmer()


def stem_sentence(sentence, stemmer):
    """Stem every space-separated word of ``sentence``.

    Args:
    sentence (str): Text to stem; it is split on single spaces.
    stemmer: Object exposing a ``stem(word)`` method.

    Returns:
    str: The stemmed words joined back with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in sentence.split(" "))


def english_stemmer(strings):
    """Stem ``strings`` with the module-level English (Porter) stemmer.

    The text is split on single spaces, each piece is passed through
    ``stem_sentence`` with ``en_stemmer``, and the results are joined with
    single spaces.

    Args:
    strings (str): Text to stem.

    Returns:
    str: The stemmed text.
    """
    return " ".join(
        stem_sentence(piece, en_stemmer) for piece in strings.split(" ")
    )


def french_stemmer(strings):
    """Stem ``strings`` with the module-level French (Snowball) stemmer.

    The text is split on commas, each piece is passed through
    ``stem_sentence`` with ``fr_stemmer``, and the results are joined with
    single spaces (so the commas do not appear in the output).

    Args:
    strings (str): Comma-separated text to stem.

    Returns:
    str: The stemmed text.
    """
    # NOTE(review): english_stemmer splits on spaces while this splits on
    # commas — confirm the difference is intentional.
    return " ".join(
        stem_sentence(piece, fr_stemmer) for piece in strings.split(",")
    )


def cosine_similarity_func(expr1, expr2):
    """Return the cosine similarity of two texts using bag-of-words counts.

    Both texts are vectorized together with a default ``CountVectorizer`` and
    the cosine similarity between the two count vectors is returned.

    Args:
    expr1 (str): First text.
    expr2 (str): Second text.

    Returns:
    float: Similarity score; ``0.0`` when the texts cannot be vectorized
    (for example when neither contains a usable token).
    """
    vectorizer = CountVectorizer()
    try:
        vectors = vectorizer.fit_transform([expr1, expr2])
    except ValueError:
        # fit_transform raises ValueError when the vocabulary is empty, e.g.
        # both texts are blank after cleaning. Treat that as "no similarity"
        # instead of aborting the whole DataFrame computation.
        return 0.0

    similarity = cosine_similarity(vectors[0], vectors[1])
    return similarity[0][0]


def add_text_similarity(
    df: pd.DataFrame,
    data_cleaning_func: Callable = data_cleaning_func,
    remove_stop_words_func: Callable = remove_stop_words_func,
    standardization_func: Callable = standardization_func,
    cosine_similarity_func: Callable = cosine_similarity_func,
) -> pd.DataFrame:
    """
    Add text similarity measures to the DataFrame based on item descriptions.

    Reads the ``ITEM_DESC_main`` and ``ITEM_DESC_update`` columns and writes
    three columns into ``df`` itself: ``ITEM_DESC_before_clean`` (processed
    main description), ``ITEM_DESC_after_clean`` (processed update
    description) and ``Cosine_Similarity``.

    Args:
    df (pd.DataFrame): Input DataFrame containing item descriptions.
    data_cleaning_func (Callable): Function to clean the text data.
    remove_stop_words_func (Callable): Function to remove stop words.
    standardization_func (Callable): Function to standardize text.
    cosine_similarity_func (Callable): Function to calculate cosine similarity.

    Returns:
    pd.DataFrame: DataFrame with added text similarity measures.
    """
    text_cols = ["ITEM_DESC_before_clean", "ITEM_DESC_after_clean"]

    # Clean item descriptions
    df["ITEM_DESC_before_clean"] = df["ITEM_DESC_main"].apply(data_cleaning_func)
    df["ITEM_DESC_after_clean"] = df["ITEM_DESC_update"].apply(data_cleaning_func)

    # Remove NLTK stop words (French and English). The two sets are merged
    # once so each column is filtered in a single pass; the result is the
    # same as filtering language by language.
    stop_words = set(stopwords.words("french")) | set(stopwords.words("english"))

    def _drop_stop_words(text):
        return " ".join(
            word for word in text.split() if word.lower() not in stop_words
        )

    for col in text_cols:
        df[col] = df[col].apply(_drop_stop_words)

    # Apply custom stop words removal
    for col in text_cols:
        df[col] = df[col].apply(remove_stop_words_func)

    # Standardize text
    for col in text_cols:
        df[col] = df[col].apply(standardization_func)

    # Calculate cosine similarity row by row. Zipping the two columns (rather
    # than DataFrame.apply(axis=1)) also works for an empty frame, where
    # apply can hand back a DataFrame that cannot be assigned to one column.
    scores = [
        cosine_similarity_func(after, before)
        for after, before in zip(
            df["ITEM_DESC_after_clean"], df["ITEM_DESC_before_clean"]
        )
    ]
    # With no rows there is nothing to infer a dtype from; force float so the
    # column stays numeric.
    df["Cosine_Similarity"] = pd.Series(
        scores, index=df.index, dtype=None if scores else float
    )

    return df


def display_data_with_download_button(
    df,
    title="Data without decision-making"
) -> None:
    """Show ``df`` in an editable table with a CSV download button.

    When ``df`` is empty only a short message is written. Otherwise an
    ``Evaluation`` column set to ``True`` is written into ``df``, the frame is
    shown in ``st.data_editor`` and the edited result is offered as a CSV
    download named ``<title>.csv``.

    Args:
    df (pd.DataFrame): Data to display; modified in place when not empty.
    title (str): Sub-header text, download file stem and widget key.
    """
    if df.empty:
        st.write("No result for the above criterion ")
        return

    st.subheader(title)
    df.loc[:, "Evaluation"] = True
    edited_df = st.data_editor(df)

    # Arguments shared by the first attempt and the fallback below.
    button_kwargs = {
        "label": "Download data as CSV",
        "data": convert_df(edited_df),
        "file_name": f"{title}.csv",
        "mime": "text/csv",
    }
    try:
        st.download_button(key=title, **button_kwargs)
    except Exception:
        # Presumably the key collided with a widget already on the page;
        # retry once with a different key. ``Exception`` (not a bare except)
        # lets KeyboardInterrupt/SystemExit and similar propagate.
        st.download_button(key=title + "1", **button_kwargs)