| | import numpy as np |
| | import pandas as pd |
| | import streamlit as st |
| | import re |
| | from data_crawler.Shopee_crawl import ShopeeCrawler |
| | from data_crawler.Tiki_Crawl import * |
| | import plotly.express as px |
| | import plotly.graph_objects as go |
| | from utils.data_preprocessing import cleaning, cleaning_for_phobert |
| | from supervised_model.phobert import InferencePhobert |
| | import io |
| | from graphs import * |
# --- Page and model configuration -------------------------------------------
st.set_page_config(layout="wide")

# HuggingFace checkpoint used by every inference helper below.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Map the sidebar choice onto the tokenizer resource InferencePhobert expects:
# either the VnCoreNLP jar on disk or the name of the underthesea backend.
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
_TOKENIZER_PATHS = {
    "VnCoreNLP": "./vncorenlp/VnCoreNLP-1.1.1.jar",
    "underthesea": "underthesea",
}
TOKENIZE_PATH = _TOKENIZER_PATHS[tokenizer_option]

# Scratch buffer; download_button_data() builds its own fresh buffer per export.
buffer = io.BytesIO()
| |
|
| |
|
@st.cache_resource
def categorize_comments(classification_df):
    """Run the PhoBERT multi-label classifier over every comment in the frame.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Must contain a ``comment`` column already cleaned with
        ``cleaning_for_phobert``.

    Returns
    -------
    Per-comment label predictions as produced by ``InferencePhobert.predict``
    (one row per comment, one column per label).
    """
    classifier = InferencePhobert(tokenize_model=TOKENIZE_PATH, classification_model=MODEL_PATH)
    tokenized = classifier.preprocess(classification_df["comment"])
    batches = classifier.generate_dataset(tokenized, batch_size=64)
    return classifier.predict(batches)
| |
|
@st.cache_resource
def categorize_sentence(text):
    """Classify a single (already cleaned) review string.

    Parameters
    ----------
    text : str
        One customer review, preprocessed with ``cleaning_for_phobert``.

    Returns
    -------
    The label prediction produced by ``InferencePhobert.predict_sentence``.
    """
    classifier = InferencePhobert(tokenize_model=TOKENIZE_PATH, classification_model=MODEL_PATH)
    return classifier.predict_sentence(text)
| |
|
def download_button_data(df):
    """Serialize *df* to an in-memory Excel workbook for ``st.download_button``.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to export.

    Returns
    -------
    io.BytesIO
        Buffer holding the ``.xlsx`` bytes, rewound to position 0 so the
        download streams from the start of the file.
    """
    buffer = io.BytesIO()
    # The context manager closes the writer and flushes the workbook into the
    # buffer on exit; the original's explicit writer.close() inside the `with`
    # block was a redundant double close and has been removed.
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    buffer.seek(0)
    return buffer
| |
|
| |
|
| |
|
| |
|
| |
|
# ---------------------------------------------------------------------------
# Sidebar: data acquisition — upload an .xlsx file or crawl a Tiki product
# page. Either path leaves a DataFrame with a non-null "comment" column in
# `df`, or leaves `df` as None.
# ---------------------------------------------------------------------------
df = None
input_option = st.sidebar.radio("$$ \\bold{Input \\: option: \\:} $$", ["Upload data", "Product Link"])
if df is None:
    if input_option == "Upload data":
        uploaded_file = st.sidebar.file_uploader("$\\textsf{\\Large Upload your data here}$", type=[".xlsx"])
        if uploaded_file:
            @st.cache_data
            def get_data_from_file(uploaded_file):
                # Drop rows without a comment; reset_index keeps row positions stable.
                df = pd.read_excel(uploaded_file)
                df = df.dropna(subset="comment").reset_index()
                return df
            df = get_data_from_file(uploaded_file)
    else:
        # NOTE(review): this crawler instance is never used below — crawl_tiki()
        # does all the work. Kept in case construction has side effects;
        # confirm and remove.
        crawler = ShopeeCrawler()
        link = st.sidebar.text_input("$\\textsf{\\Large Enter Product Link (Tiki only)}$")

        if link:
            n_pages = st.sidebar.slider(label="Number of pages to crawl", min_value=1, max_value=30)

            # BUG FIX: the original cached function took no arguments and
            # closed over `link` and `n_pages`, so st.cache_data never
            # invalidated when the user changed either value — stale results
            # were served for new links. Passing them as parameters makes
            # them part of the cache key.
            @st.cache_data
            def get_data_from_link(link, n_pages):
                data = crawl_tiki(link, pages=int(n_pages))
                return pd.DataFrame(data)

            data = get_data_from_link(link, n_pages)
            df = data.copy()
            df = df.dropna(subset="comment").reset_index()
            buffer = download_button_data(df)
            st.sidebar.download_button(
                label="Download Excel worksheets",
                data=buffer,
                file_name="raw_data.xlsx",
            )

# Reset clears both the loaded frame and every Streamlit cache so the next
# rerun starts from a clean slate.
reset = st.sidebar.button("Reset")
if reset:
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()
| |
|
# ---------------------------------------------------------------------------
# Main panel: analytics dashboard when data is loaded; otherwise a single-
# review demo of the classifier.
# ---------------------------------------------------------------------------
if df is not None:
    classification_df = df.copy()
    # Keep only non-empty comments and normalise them for PhoBERT.
    classification_df = classification_df[classification_df["comment"] != ""]
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(cleaning_for_phobert)

    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    classification_df[labels] = categorize_comments(classification_df)
    # Discard reviews the model flagged as off-topic ("Other").
    classification_df = classification_df[classification_df["Other"] != 1]
    # NOTE(review): assumes `time` holds epoch seconds; the +7h shift targets
    # Indochina time (UTC+7) — confirm this holds for uploaded files too.
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)

    # Row 1: chart controls, centred above the time-series column.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap="small")
        group_freq = subcol_2.selectbox("Frequency: ", options=["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options=["Count Reviews", "Average Rating"])

    # Row 2: KPIs | time series | topic breakdown.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)
    with col_2:
        # group_freq[0] is the first letter of the choice ("D"/"W"/"M"/"Y"),
        # used as the resampling frequency code.
        time_series_comments(classification_df, freq=group_freq[0], metric=metric)
        st.write("---")
    with col_3:
        tornado_chart(classification_df)

    print_reviews(classification_df)
else:
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        cleaned = cleaning_for_phobert(sample_text)
        sentence_topic_plot(categorize_sentence(cleaned))