import numpy as np
import pandas as pd
import streamlit as st
import re
from data_crawler.Shopee_crawl import ShopeeCrawler
from data_crawler.Tiki_Crawl import *
import plotly.express as px
import plotly.graph_objects as go
from utils.data_preprocessing import cleaning, cleaning_for_phobert
from supervised_model.phobert import InferencePhobert
import io
from graphs import *
# --- Page & model configuration --------------------------------------------
st.set_page_config(layout="wide")

# HuggingFace hub id of the fine-tuned PhoBERT review classifier.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Map each selectable tokenizer to the path/name InferencePhobert expects.
_TOKENIZER_PATHS = {
    "VnCoreNLP": "./vncorenlp/VnCoreNLP-1.1.1.jar",
    "underthesea": "underthesea",
}
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
TOKENIZE_PATH = _TOKENIZER_PATHS[tokenizer_option]

# In-memory buffer later used for the Excel download payload.
buffer = io.BytesIO()
@st.cache_resource
def categorize_comments(classification_df):
    """Classify every comment in *classification_df* with the PhoBERT model.

    Returns the raw per-comment label matrix produced by ``model.predict``.

    NOTE(review): ``TOKENIZE_PATH`` is read from module scope, so it is not
    part of the Streamlit cache key — switching tokenizers in the sidebar may
    serve stale cached results until the cache is cleared; confirm.
    """
    classifier = InferencePhobert(tokenize_model = TOKENIZE_PATH, classification_model = MODEL_PATH)
    cleaned = classifier.preprocess(classification_df["comment"])
    batches = classifier.generate_dataset(cleaned, batch_size = 64)
    return classifier.predict(batches)
@st.cache_resource
def categorize_sentence(text):
    """Classify a single review *text* and return the model's prediction.

    NOTE(review): as with ``categorize_comments``, ``TOKENIZE_PATH`` comes
    from module scope and is not included in the cache key.
    """
    classifier = InferencePhobert(tokenize_model = TOKENIZE_PATH, classification_model = MODEL_PATH)
    return classifier.predict_sentence(text)
def download_button_data(df):
    """Serialize *df* into an in-memory Excel workbook.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export (written to worksheet 'Sheet1').

    Returns
    -------
    io.BytesIO
        Buffer holding the .xlsx payload, rewound to position 0 so callers
        that ``read()`` it (e.g. st.download_button) get the full content.
    """
    buffer = io.BytesIO()
    # The context manager saves and closes the writer on exit; the explicit
    # writer.close() the original called inside the `with` block was a
    # redundant double-close and has been removed.
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    buffer.seek(0)  # rewind so the buffer is consumable from the start
    return buffer
# ---------------------------------------------------------------------------
# Sidebar: choose a data source (Excel upload or Tiki product link) and load
# the review dataframe. `df` stays None until a source has produced data.
# (The original wrapped this in a dead `if df is None:` guard — df was
# unconditionally None at that point — and instantiated an unused
# ShopeeCrawler(); both removed.)
# ---------------------------------------------------------------------------
df = None
input_option = st.sidebar.radio("$$ \\bold{Input \: option: \:} $$", ["Upload data", "Product Link"])
if input_option == "Upload data":
    uploaded_file = st.sidebar.file_uploader("$\\textsf{\Large Upload your data here}$", type = [".xlsx"])
    if uploaded_file:
        @st.cache_data
        def get_data_from_file(uploaded_file):
            """Read the uploaded workbook and drop rows without a comment."""
            df = pd.read_excel(uploaded_file)
            df = df.dropna(subset = "comment").reset_index()
            return df
        df = get_data_from_file(uploaded_file)
else:
    link = st.sidebar.text_input("$\\textsf{\Large Enter Product Link (Tiki only)}$")
    if link:
        n_pages = st.sidebar.slider(label = "Number of pages to crawl", min_value=1, max_value=30)
        # BUG FIX: the original cached function took no arguments and read
        # `link`/`n_pages` from the enclosing scope, so st.cache_data could
        # not distinguish inputs and served stale data whenever the link or
        # page count changed. Passing them as parameters puts them in the
        # cache key.
        @st.cache_data
        def get_data_from_link(link, n_pages):
            """Crawl Tiki reviews for `link` and return them as a DataFrame."""
            data = crawl_tiki(link, pages = int(n_pages))
            return pd.DataFrame(data)
        df = get_data_from_link(link, n_pages).copy()
        df = df.dropna(subset = "comment").reset_index()
        buffer = download_button_data(df)
        st.sidebar.download_button(
            label="Download Excel worksheets",
            data=buffer,
            file_name="raw_data.xlsx"
        )
reset = st.sidebar.button("Reset")
if reset:
    # Clearing both caches plus the rerun triggered by the button press
    # effectively resets the app to its initial state.
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()
# Main view: render the dashboard when data is loaded, otherwise offer a
# single-review demo of the classifier. Statement order matters here —
# Streamlit renders widgets in execution order.
if df is not None:
    # Work on a copy so the raw `df` (used for the download button) is untouched.
    classification_df = df.copy()
    classification_df = classification_df[classification_df["comment"] != ""]
    # Force str before cleaning: read_excel/crawl output may contain non-string cells.
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(lambda x: cleaning_for_phobert(x))
    # Multi-label topics predicted by the PhoBERT model, one column each.
    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    result = categorize_comments(classification_df)
    classification_df[labels] = result
    # Drop reviews classified only as "Other" — they carry no actionable topic.
    classification_df = classification_df[classification_df["Other"] != 1]
    # Epoch seconds -> datetime, shifted +7h — presumably Vietnam local time
    # (UTC+7); confirm that `time` is a UTC epoch in the crawled data.
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)
    # First column row: only the middle column is used, to host the two
    # frequency/metric selectors above the charts.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap = "small")
        group_freq = subcol_2.selectbox("Frequency: ", options = ["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options = ["Count Reviews", "Average Rating"])
    # Second column row holds the actual dashboard: KPIs | time series | tornado.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)
    with col_2:
        # freq = first letter of the selection ("D"/"W"/"M"/"Y"), the pandas
        # resample alias expected by time_series_comments.
        time_series_comments(classification_df, freq = group_freq[0], metric = metric)
        st.write("---")
    with col_3:
        tornado_chart(classification_df)
    print_reviews(classification_df)
else:
    # No data yet: let the user try the classifier on a single review.
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        sample_text = cleaning_for_phobert(sample_text)
        result = categorize_sentence(sample_text)
        sentence_topic_plot(result)