"""Streamlit app: upload or crawl Vietnamese e-commerce reviews (Tiki),
classify them by topic with a fine-tuned PhoBERT model, and render a
small analytics dashboard (KPIs, time series, topic tornado chart)."""

import numpy as np
import pandas as pd
import streamlit as st
import re
import io

from data_crawler.Shopee_crawl import ShopeeCrawler
from data_crawler.Tiki_Crawl import *
import plotly.express as px
import plotly.graph_objects as go
from utils.data_preprocessing import cleaning, cleaning_for_phobert
from supervised_model.phobert import InferencePhobert
from graphs import *

st.set_page_config(layout="wide")

# Hugging Face hub id of the fine-tuned PhoBERT classification model.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Word-segmentation backend for PhoBERT: the underthesea package or the
# VnCoreNLP jar shipped with the project.
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
if tokenizer_option == "VnCoreNLP":
    TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
elif tokenizer_option == "underthesea":
    TOKENIZE_PATH = "underthesea"

buffer = io.BytesIO()


@st.cache_resource
def categorize_comments(classification_df):
    """Batch topic classification over the ``comment`` column.

    Returns whatever ``InferencePhobert.predict`` produces (one row per
    comment, one column per topic label — assumed 0/1 indicators; the
    caller assigns it to six label columns).
    """
    model = InferencePhobert(tokenize_model=TOKENIZE_PATH, classification_model=MODEL_PATH)
    preprocessed = model.preprocess(classification_df["comment"])
    dataset = model.generate_dataset(preprocessed, batch_size=64)
    return model.predict(dataset)


@st.cache_resource
def categorize_sentence(text):
    """Classify a single review string; returns the model's per-label result."""
    model = InferencePhobert(tokenize_model=TOKENIZE_PATH, classification_model=MODEL_PATH)
    return model.predict_sentence(text)


def download_button_data(df):
    """Serialize *df* into an in-memory .xlsx workbook and return the buffer."""
    # Local buffer; do not shadow/clobber the module-level `buffer`.
    out = io.BytesIO()
    with pd.ExcelWriter(out, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
        # The context manager closes the writer and flushes the workbook
        # into the buffer — an explicit writer.close() here was redundant.
    return out


df = None
input_option = st.sidebar.radio("$$ \\bold{Input \: option: \:} $$", ["Upload data", "Product Link"])

if df is None:
    if input_option == "Upload data":
        uploaded_file = st.sidebar.file_uploader("$\\textsf{\Large Upload your data here}$", type=[".xlsx"])
        if uploaded_file:

            @st.cache_data
            def get_data_from_file(uploaded_file):
                """Read the uploaded workbook, dropping rows without a comment."""
                df = pd.read_excel(uploaded_file)
                return df.dropna(subset="comment").reset_index()

            df = get_data_from_file(uploaded_file)
    else:
        # NOTE(review): an unused ShopeeCrawler() instantiation was removed
        # here — only the Tiki crawler is actually used below.
        link = st.sidebar.text_input("$\\textsf{\Large Enter Product Link (Tiki only)}$")
        if link:
            n_pages = st.sidebar.slider(label="Number of pages to crawl", min_value=1, max_value=30)

            @st.cache_data
            def get_data_from_link(link, n_pages):
                """Crawl *n_pages* of Tiki reviews for *link*.

                link/n_pages are explicit parameters (not closed-over) so
                st.cache_data keys the cache on them and re-crawls when
                the user changes either input.
                """
                data = crawl_tiki(link, pages=int(n_pages))
                return pd.DataFrame(data)

            data = get_data_from_link(link, n_pages)
            df = data.copy().dropna(subset="comment").reset_index()
            buffer = download_button_data(df)
            st.sidebar.download_button(
                label="Download Excel worksheets",
                data=buffer,
                file_name="raw_data.xlsx",
            )

# Reset clears both Streamlit caches; the rerun starts from a clean slate.
reset = st.sidebar.button("Reset")
if reset:
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()

if df is not None:
    # --- preprocessing + classification ---
    classification_df = df.copy()
    classification_df = classification_df[classification_df["comment"] != ""]
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(cleaning_for_phobert)

    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    result = categorize_comments(classification_df)
    classification_df[labels] = result
    # Drop reviews flagged as "Other" — they carry no actionable topic.
    classification_df = classification_df[classification_df["Other"] != 1]
    # Epoch seconds -> UTC+7 (Vietnam local time).
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)

    # --- dashboard controls ---
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap="small")
        group_freq = subcol_2.selectbox("Frequency: ", options=["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options=["Count Reviews", "Average Rating"])

    # --- dashboard panels ---
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)
    with col_2:
        # group_freq[0] maps the label to a pandas offset alias: D/W/M/Y.
        time_series_comments(classification_df, freq=group_freq[0], metric=metric)
        st.write("---")
    with col_3:
        tornado_chart(classification_df)
    print_reviews(classification_df)
else:
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        sample_text = cleaning_for_phobert(sample_text)
        result = categorize_sentence(sample_text)
        sentence_topic_plot(result)