# NTDuy — Update app.py (commit 4cb3cf3, verified)
# (The three lines above were Hugging Face page chrome captured into the
# source; kept here as a comment so the file remains valid Python.)
import numpy as np
import pandas as pd
import streamlit as st
import re
from data_crawler.Shopee_crawl import ShopeeCrawler
from data_crawler.Tiki_Crawl import *
import plotly.express as px
import plotly.graph_objects as go
from utils.data_preprocessing import cleaning, cleaning_for_phobert
from supervised_model.phobert import InferencePhobert
import io
from graphs import *
st.set_page_config(layout="wide")

# Hugging Face model id of the fine-tuned PhoBERT review classifier.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Map each sidebar tokenizer choice to the resource path the model
# wrapper expects (a jar file for VnCoreNLP, a library name otherwise).
_TOKENIZER_PATHS = {
    "VnCoreNLP": "./vncorenlp/VnCoreNLP-1.1.1.jar",
    "underthesea": "underthesea",
}
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
TOKENIZE_PATH = _TOKENIZER_PATHS[tokenizer_option]

# Scratch in-memory buffer for Excel downloads (rebound before each use).
buffer = io.BytesIO()
@st.cache_resource
def categorize_comments(classification_df, tokenize_path=None):
    """Run the PhoBERT topic classifier over a dataframe of reviews.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Must contain a cleaned ``comment`` column.
    tokenize_path : str, optional
        Tokenizer resource for ``InferencePhobert``. When given, it becomes
        part of Streamlit's cache key, so changing the sidebar tokenizer
        correctly invalidates cached predictions.

    Returns
    -------
    The per-label prediction matrix from ``InferencePhobert.predict``.
    """
    if tokenize_path is None:
        # NOTE(review): falling back to the module global keeps old call
        # sites working, but the global is NOT part of the cache key —
        # switching tokenizers can then serve stale results. Callers
        # should pass tokenize_path explicitly.
        tokenize_path = TOKENIZE_PATH
    model = InferencePhobert(tokenize_model=tokenize_path, classification_model=MODEL_PATH)
    preprocessed = model.preprocess(classification_df["comment"])
    dataset = model.generate_dataset(preprocessed, batch_size=64)
    return model.predict(dataset)
@st.cache_resource
def categorize_sentence(text, tokenize_path=None):
    """Classify a single (already preprocessed) review sentence.

    Parameters
    ----------
    text : str
        A review string, expected to be cleaned by ``cleaning_for_phobert``.
    tokenize_path : str, optional
        Tokenizer resource for ``InferencePhobert``. Passing it explicitly
        puts it in Streamlit's cache key so a tokenizer change invalidates
        cached results.

    Returns
    -------
    The prediction produced by ``InferencePhobert.predict_sentence``.
    """
    if tokenize_path is None:
        # NOTE(review): reading the global keeps backward compatibility,
        # but it is not hashed into the cache key — see categorize_comments.
        tokenize_path = TOKENIZE_PATH
    model = InferencePhobert(tokenize_model=tokenize_path, classification_model=MODEL_PATH)
    return model.predict_sentence(text)
def download_button_data(df):
    """Serialize *df* to an in-memory Excel workbook for st.download_button.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export (single sheet named ``Sheet1``).

    Returns
    -------
    io.BytesIO
        Buffer holding the .xlsx bytes, rewound to position 0.
    """
    excel_buffer = io.BytesIO()
    # The with-block finalizes and closes the workbook on exit; the explicit
    # writer.close() the original had inside the block was redundant and can
    # error on recent pandas versions (double close).
    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    excel_buffer.seek(0)  # rewind so consumers read from the first byte
    return excel_buffer
df = None
input_option = st.sidebar.radio("$$ \\bold{Input \: option: \:} $$", ["Upload data", "Product Link"])

if df is None:
    if input_option == "Upload data":
        uploaded_file = st.sidebar.file_uploader("$\\textsf{\Large Upload your data here}$", type=[".xlsx"])
        if uploaded_file:
            @st.cache_data
            def get_data_from_file(uploaded_file):
                """Read the uploaded workbook; drop rows with no comment."""
                df = pd.read_excel(uploaded_file)
                df = df.dropna(subset="comment").reset_index()
                return df

            df = get_data_from_file(uploaded_file)
    else:
        # NOTE(review): instantiated but never used below — the crawl goes
        # through crawl_tiki. Kept in case the constructor has side effects;
        # candidate for removal after confirming.
        crawler = ShopeeCrawler()
        link = st.sidebar.text_input("$\\textsf{\Large Enter Product Link (Tiki only)}$")
        if link:
            n_pages = st.sidebar.slider(label="Number of pages to crawl", min_value=1, max_value=30)

            @st.cache_data
            def get_data_from_link(link, n_pages):
                """Crawl the Tiki product page and return raw reviews.

                link/n_pages are parameters (not closed-over globals) so they
                are hashed into the cache key: changing the product link or
                page count re-crawls instead of serving stale cached data —
                the original zero-argument version never invalidated.
                """
                data = crawl_tiki(link, pages=int(n_pages))
                return pd.DataFrame(data)

            data = get_data_from_link(link, n_pages)
            df = data.copy()
            df = df.dropna(subset="comment").reset_index()
            # Offer the freshly crawled data as a raw Excel download.
            buffer = download_button_data(df)
            st.sidebar.download_button(
                label="Download Excel worksheets",
                data=buffer,
                file_name="raw_data.xlsx"
            )

# Reset clears the loaded dataframe and both Streamlit caches so a new
# upload/crawl starts from scratch.
reset = st.sidebar.button("Reset")
if reset:
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()
if df is not None:
    # --- topic classification over the loaded reviews --------------------
    classification_df = df.copy()
    classification_df = classification_df[classification_df["comment"] != ""]
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(cleaning_for_phobert)
    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    classification_df[labels] = categorize_comments(classification_df)
    # Keep only reviews that matched a concrete topic, then shift the epoch
    # timestamps by 7 hours (Vietnam time, UTC+7).
    classification_df = classification_df[classification_df["Other"] != 1]
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)

    # --- dashboard controls (rendered in the wide middle column) ---------
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap="small")
        group_freq = subcol_2.selectbox("Frequency: ", options=["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options=["Count Reviews", "Average Rating"])

    # --- dashboard body ---------------------------------------------------
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)
    with col_2:
        # group_freq[0] turns Day/Week/Month/Year into the single-letter
        # frequency code D/W/M/Y.
        time_series_comments(classification_df, freq=group_freq[0], metric=metric)
        st.write("---")
    with col_3:
        tornado_chart(classification_df)
    # NOTE(review): placement reconstructed — original indentation was lost;
    # assumed full-width below the columns, not inside col_3. Confirm.
    print_reviews(classification_df)
else:
    # No data yet: let the user try the classifier on a single review.
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        sample_text = cleaning_for_phobert(sample_text)
        sentence_topic_plot(categorize_sentence(sample_text))