File size: 4,886 Bytes
f2b9d12
 
 
 
 
6ce99db
f2b9d12
 
 
 
 
 
 
 
4cb3cf3
6ce99db
 
f2b9d12
6ce99db
f2b9d12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a951839
f2b9d12
 
 
 
 
 
 
 
 
 
 
 
6ce99db
f2b9d12
 
cd41ac8
f2b9d12
 
6ce99db
f2b9d12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f80ecaa
f2b9d12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb03702
f2b9d12
a951839
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import numpy as np
import pandas as pd
import streamlit as st
import re
from data_crawler.Shopee_crawl import ShopeeCrawler
from data_crawler.Tiki_Crawl import *
import plotly.express as px
import plotly.graph_objects as go
from utils.data_preprocessing import cleaning, cleaning_for_phobert 
from supervised_model.phobert import InferencePhobert
import io
from graphs import *
# Use the full browser width for the dashboard layout.
st.set_page_config(layout="wide")

# Hugging Face model id of the fine-tuned PhoBERT review classifier.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Let the user pick the Vietnamese word segmenter used before classification.
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
if tokenizer_option == "underthesea":
    TOKENIZE_PATH = "underthesea"
elif tokenizer_option == "VnCoreNLP":
    # VnCoreNLP is driven through a local jar file.
    TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"

# Scratch buffer; replaced by a fresh one whenever an Excel download is built.
buffer = io.BytesIO()


@st.cache_resource
def categorize_comments(classification_df, tokenize_path=TOKENIZE_PATH, model_path=MODEL_PATH):
    """Predict aspect-topic labels for every comment in ``classification_df``.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Must contain a cleaned ``comment`` column (see ``cleaning_for_phobert``).
    tokenize_path : str
        Word-segmenter spec ("underthesea" or a VnCoreNLP jar path). Exposed as
        a parameter so it participates in the Streamlit cache key: previously it
        was read from the ``TOKENIZE_PATH`` global, so switching the sidebar
        tokenizer did NOT invalidate cached predictions.
    model_path : str
        Hugging Face id of the classification model.

    Returns
    -------
    The per-comment label matrix produced by ``InferencePhobert.predict``.
    """
    model = InferencePhobert(tokenize_model=tokenize_path, classification_model=model_path)
    preprocessed = model.preprocess(classification_df["comment"])
    # Batching keeps memory bounded when classifying long review lists.
    dataset = model.generate_dataset(preprocessed, batch_size=64)
    return model.predict(dataset)

@st.cache_resource
def categorize_sentence(text, tokenize_path=TOKENIZE_PATH, model_path=MODEL_PATH):
    """Predict aspect-topic labels for a single (already cleaned) review string.

    ``tokenize_path`` / ``model_path`` default to the module-level settings but
    are parameters so they enter the Streamlit cache key: as globals (the
    previous version), switching the sidebar tokenizer silently returned the
    stale cached prediction for the same text.
    """
    model = InferencePhobert(tokenize_model=tokenize_path, classification_model=model_path)
    return model.predict_sentence(text)

def download_button_data(df):
    """Serialize ``df`` into a single-sheet xlsx workbook held in memory.

    Returns an ``io.BytesIO`` rewound to position 0 so file-like consumers
    (e.g. ``st.download_button``) read the full workbook rather than starting
    at EOF where the writer left the cursor.
    """
    buffer = io.BytesIO()
    # The context manager finalizes and closes the workbook on exit; the
    # explicit writer.close() the old code had inside the block was redundant.
    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    buffer.seek(0)
    return buffer





# Data acquisition: either an uploaded .xlsx file or a crawl of a Tiki page.
df = None 
input_option = st.sidebar.radio("$$ \\bold{Input \: option: \:} $$", ["Upload data", "Product Link"])
if df is None:
    if input_option == "Upload data":
        uploaded_file = st.sidebar.file_uploader("$\\textsf{\Large Upload your data here}$", type = [".xlsx"])
        if uploaded_file:
            @st.cache_data
            def get_data_from_file(uploaded_file):
                """Read the uploaded workbook and drop rows without a comment."""
                df = pd.read_excel(uploaded_file)
                df = df.dropna(subset = "comment").reset_index()
                return df
            df = get_data_from_file(uploaded_file)
    else:
        # NOTE(review): instantiated but never used below -- crawling goes
        # through crawl_tiki. Kept in case construction has side effects;
        # confirm and remove.
        crawler = ShopeeCrawler()
        link = st.sidebar.text_input("$\\textsf{\Large Enter Product Link (Tiki only)}$")

        if link:
            n_pages = st.sidebar.slider(label = "Number of pages to crawl", min_value=1, max_value=30)
            # link and n_pages are explicit parameters so they become part of
            # the st.cache_data key. In the previous version they were closure
            # variables, so entering a new link or page count silently returned
            # the stale cached crawl of the first request.
            @st.cache_data
            def get_data_from_link(link, n_pages):
                """Crawl ``n_pages`` of reviews for ``link`` into a DataFrame."""
                data = crawl_tiki(link, pages = int(n_pages))
                return pd.DataFrame(data)
            df = get_data_from_link(link, n_pages).copy()
            df = df.dropna(subset = "comment").reset_index()
            # Offer the raw crawl as an Excel download in the sidebar.
            buffer = download_button_data(df)
            st.sidebar.download_button(
                label="Download Excel worksheets",
                data=buffer,
                file_name="raw_data.xlsx"
            )

# Sidebar reset: forget the loaded dataframe and flush both Streamlit caches
# so the next interaction starts from a clean slate.
if st.sidebar.button("Reset"):
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()

if df is not None:
    # A dataframe is loaded: classify the reviews and render the dashboard.
    classification_df = df.copy()
    # Keep only non-empty comments, coerce to str, and clean for PhoBERT input.
    classification_df = classification_df[classification_df["comment"] != ""]
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(lambda x: cleaning_for_phobert(x))
    
    # One indicator column per predicted topic; "Other" flags off-topic reviews.
    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    result = categorize_comments(classification_df)
    classification_df[labels] = result
    # Discard reviews flagged as "Other" before plotting.
    classification_df = classification_df[classification_df["Other"] != 1]
    # NOTE(review): assumes the "time" column holds epoch seconds; the +7h
    # shift converts to UTC+7 (Vietnam) -- confirm this also holds for
    # user-uploaded workbooks.
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)

    # Controls row: frequency/metric selectors live in the centre column.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap = "small")
        group_freq = subcol_2.selectbox("Frequency: ", options = ["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options = ["Count Reviews", "Average Rating"])
    # Charts row: KPI cards | time series | tornado chart.
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)

    with col_2: 
        # group_freq[0] passes just the first letter ("D"/"W"/"M"/"Y") as the
        # frequency code expected by time_series_comments.
        time_series_comments(classification_df, freq = group_freq[0], metric = metric)
        st.write("---")


    with col_3:        
        tornado_chart(classification_df)
    
    print_reviews(classification_df)
            
else: 
    # No data yet: let the user try the model on a single typed-in review.
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        sample_text = cleaning_for_phobert(sample_text)
        result = categorize_sentence(sample_text)
        sentence_topic_plot(result)