import numpy as np
import pandas as pd
import streamlit as st
import re
from data_crawler.Shopee_crawl import ShopeeCrawler
from data_crawler.Tiki_Crawl import *
import plotly.express as px
import plotly.graph_objects as go
from utils.data_preprocessing import cleaning, cleaning_for_phobert
from supervised_model.phobert import InferencePhobert
import io
from graphs import *
# --- Page & model configuration --------------------------------------------
st.set_page_config(layout="wide")

# HuggingFace hub id of the fine-tuned PhoBERT review classifier.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Map each selectable tokenizer to the path/name InferencePhobert expects.
_TOKENIZER_PATHS = {
    "VnCoreNLP": "./vncorenlp/VnCoreNLP-1.1.1.jar",
    "underthesea": "underthesea",
}
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
TOKENIZE_PATH = _TOKENIZER_PATHS[tokenizer_option]

# In-memory buffer later used for the Excel download payload.
buffer = io.BytesIO()
@st.cache_resource
def categorize_comments(classification_df):
    """Classify every comment in *classification_df* with the PhoBERT model.

    Returns the raw per-comment label matrix produced by ``model.predict``.

    NOTE(review): ``TOKENIZE_PATH`` is read from module scope, so it is not
    part of the Streamlit cache key — switching tokenizers in the sidebar may
    serve stale cached results until the cache is cleared; confirm.
    """
    classifier = InferencePhobert(tokenize_model = TOKENIZE_PATH, classification_model = MODEL_PATH)
    cleaned = classifier.preprocess(classification_df["comment"])
    batches = classifier.generate_dataset(cleaned, batch_size = 64)
    return classifier.predict(batches)
@st.cache_resource
def categorize_sentence(text):
    """Classify a single review *text* and return the model's prediction.

    NOTE(review): as with ``categorize_comments``, ``TOKENIZE_PATH`` comes
    from module scope and is not included in the cache key.
    """
    classifier = InferencePhobert(tokenize_model = TOKENIZE_PATH, classification_model = MODEL_PATH)
    return classifier.predict_sentence(text)
def download_button_data(df):
    """Serialize *df* into an in-memory Excel workbook.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export (written to worksheet 'Sheet1').

    Returns
    -------
    io.BytesIO
        Buffer holding the .xlsx payload, rewound to position 0 so callers
        that ``read()`` it (e.g. st.download_button) get the full content.
    """
    buffer = io.BytesIO()
    # The context manager saves and closes the writer on exit; the explicit
    # writer.close() the original called inside the `with` block was a
    # redundant double-close and has been removed.
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    buffer.seek(0)  # rewind so the buffer is consumable from the start
    return buffer
# ---------------------------------------------------------------------------
# Sidebar: choose a data source (Excel upload or Tiki product link) and load
# the review dataframe. `df` stays None until a source has produced data.
# (The original wrapped this in a dead `if df is None:` guard — df was
# unconditionally None at that point — and instantiated an unused
# ShopeeCrawler(); both removed.)
# ---------------------------------------------------------------------------
df = None
input_option = st.sidebar.radio("$$ \\bold{Input \: option: \:} $$", ["Upload data", "Product Link"])
if input_option == "Upload data":
    uploaded_file = st.sidebar.file_uploader("$\\textsf{\Large Upload your data here}$", type = [".xlsx"])
    if uploaded_file:
        @st.cache_data
        def get_data_from_file(uploaded_file):
            """Read the uploaded workbook and drop rows without a comment."""
            df = pd.read_excel(uploaded_file)
            df = df.dropna(subset = "comment").reset_index()
            return df
        df = get_data_from_file(uploaded_file)
else:
    link = st.sidebar.text_input("$\\textsf{\Large Enter Product Link (Tiki only)}$")
    if link:
        n_pages = st.sidebar.slider(label = "Number of pages to crawl", min_value=1, max_value=30)
        # BUG FIX: the original cached function took no arguments and read
        # `link`/`n_pages` from the enclosing scope, so st.cache_data could
        # not distinguish inputs and served stale data whenever the link or
        # page count changed. Passing them as parameters puts them in the
        # cache key.
        @st.cache_data
        def get_data_from_link(link, n_pages):
            """Crawl Tiki reviews for `link` and return them as a DataFrame."""
            data = crawl_tiki(link, pages = int(n_pages))
            return pd.DataFrame(data)
        df = get_data_from_link(link, n_pages).copy()
        df = df.dropna(subset = "comment").reset_index()
        buffer = download_button_data(df)
        st.sidebar.download_button(
            label="Download Excel worksheets",
            data=buffer,
            file_name="raw_data.xlsx"
        )
reset = st.sidebar.button("Reset")
if reset:
    # Clearing both caches plus the rerun triggered by the button press
    # effectively resets the app to its initial state.
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()
# Main view: render the dashboard when data is loaded, otherwise offer a
# single-review demo of the classifier. Statement order matters here —
# Streamlit renders widgets in execution order.
if df is not None:
    # Work on a copy so the raw `df` (used for the download button) is untouched.
    classification_df = df.copy()
    classification_df = classification_df[classification_df["comment"] != ""]
    # Force str before cleaning: read_excel/crawl output may contain non-string cells.
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(lambda x: cleaning_for_phobert(x))
    # Multi-label topics predicted by the PhoBERT model, one column each.
    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    result = categorize_comments(classification_df)
    classification_df[labels] = result
    # Drop reviews classified only as "Other" — they carry no actionable topic.
    classification_df = classification_df[classification_df["Other"] != 1]
    # Epoch seconds -> datetime, shifted +7h — presumably Vietnam local time
    # (UTC+7); confirm that `time` is a UTC epoch in the crawled data.
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)
    # First column row: only the middle column is used, to host the two
    # frequency/metric selectors above the charts.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap = "small")
        group_freq = subcol_2.selectbox("Frequency: ", options = ["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options = ["Count Reviews", "Average Rating"])
    # Second column row holds the actual dashboard: KPIs | time series | tornado.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)
    with col_2:
        # freq = first letter of the selection ("D"/"W"/"M"/"Y"), the pandas
        # resample alias expected by time_series_comments.
        time_series_comments(classification_df, freq = group_freq[0], metric = metric)
        st.write("---")
    with col_3:
        tornado_chart(classification_df)
    print_reviews(classification_df)
else:
    # No data yet: let the user try the classifier on a single review.
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        sample_text = cleaning_for_phobert(sample_text)
        result = categorize_sentence(sample_text)
        sentence_topic_plot(result)