# NTDuy — Update app.py (commit 4cb3cf3, verified)
# (The three lines above were Hugging Face page chrome captured into the
# source; kept here as a comment so the file remains valid Python.)
import numpy as np
import pandas as pd
import streamlit as st
import re
from data_crawler.Shopee_crawl import ShopeeCrawler
from data_crawler.Tiki_Crawl import *
import plotly.express as px
import plotly.graph_objects as go
from utils.data_preprocessing import cleaning, cleaning_for_phobert
from supervised_model.phobert import InferencePhobert
import io
from graphs import *
st.set_page_config(layout="wide")

# Hugging Face model id of the fine-tuned PhoBERT review classifier.
MODEL_PATH = "NTDuy/Phobert-base-v2-shopee"

# Map each sidebar tokenizer choice to the resource path the model
# wrapper expects (a jar file for VnCoreNLP, a library name otherwise).
_TOKENIZER_PATHS = {
    "VnCoreNLP": "./vncorenlp/VnCoreNLP-1.1.1.jar",
    "underthesea": "underthesea",
}
tokenizer_option = st.sidebar.selectbox("Select tokenizer", ["underthesea", "VnCoreNLP"])
TOKENIZE_PATH = _TOKENIZER_PATHS[tokenizer_option]

# Scratch in-memory buffer for Excel downloads (rebound before each use).
buffer = io.BytesIO()
@st.cache_resource
def categorize_comments(classification_df, tokenize_path=None):
    """Run the PhoBERT topic classifier over a dataframe of reviews.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Must contain a cleaned ``comment`` column.
    tokenize_path : str, optional
        Tokenizer resource for ``InferencePhobert``. When given, it becomes
        part of Streamlit's cache key, so changing the sidebar tokenizer
        correctly invalidates cached predictions.

    Returns
    -------
    The per-label prediction matrix from ``InferencePhobert.predict``.
    """
    if tokenize_path is None:
        # NOTE(review): falling back to the module global keeps old call
        # sites working, but the global is NOT part of the cache key —
        # switching tokenizers can then serve stale results. Callers
        # should pass tokenize_path explicitly.
        tokenize_path = TOKENIZE_PATH
    model = InferencePhobert(tokenize_model=tokenize_path, classification_model=MODEL_PATH)
    preprocessed = model.preprocess(classification_df["comment"])
    dataset = model.generate_dataset(preprocessed, batch_size=64)
    return model.predict(dataset)
@st.cache_resource
def categorize_sentence(text, tokenize_path=None):
    """Classify a single (already preprocessed) review sentence.

    Parameters
    ----------
    text : str
        A review string, expected to be cleaned by ``cleaning_for_phobert``.
    tokenize_path : str, optional
        Tokenizer resource for ``InferencePhobert``. Passing it explicitly
        puts it in Streamlit's cache key so a tokenizer change invalidates
        cached results.

    Returns
    -------
    The prediction produced by ``InferencePhobert.predict_sentence``.
    """
    if tokenize_path is None:
        # NOTE(review): reading the global keeps backward compatibility,
        # but it is not hashed into the cache key — see categorize_comments.
        tokenize_path = TOKENIZE_PATH
    model = InferencePhobert(tokenize_model=tokenize_path, classification_model=MODEL_PATH)
    return model.predict_sentence(text)
def download_button_data(df):
    """Serialize *df* to an in-memory Excel workbook for st.download_button.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export (single sheet named ``Sheet1``).

    Returns
    -------
    io.BytesIO
        Buffer holding the .xlsx bytes, rewound to position 0.
    """
    excel_buffer = io.BytesIO()
    # The with-block finalizes and closes the workbook on exit; the explicit
    # writer.close() the original had inside the block was redundant and can
    # error on recent pandas versions (double close).
    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    excel_buffer.seek(0)  # rewind so consumers read from the first byte
    return excel_buffer
df = None
input_option = st.sidebar.radio("$$ \\bold{Input \: option: \:} $$", ["Upload data", "Product Link"])

if df is None:
    if input_option == "Upload data":
        uploaded_file = st.sidebar.file_uploader("$\\textsf{\Large Upload your data here}$", type=[".xlsx"])
        if uploaded_file:
            @st.cache_data
            def get_data_from_file(uploaded_file):
                """Read the uploaded workbook; drop rows with no comment."""
                df = pd.read_excel(uploaded_file)
                df = df.dropna(subset="comment").reset_index()
                return df

            df = get_data_from_file(uploaded_file)
    else:
        # NOTE(review): instantiated but never used below — the crawl goes
        # through crawl_tiki. Kept in case the constructor has side effects;
        # candidate for removal after confirming.
        crawler = ShopeeCrawler()
        link = st.sidebar.text_input("$\\textsf{\Large Enter Product Link (Tiki only)}$")
        if link:
            n_pages = st.sidebar.slider(label="Number of pages to crawl", min_value=1, max_value=30)

            @st.cache_data
            def get_data_from_link(link, n_pages):
                """Crawl the Tiki product page and return raw reviews.

                link/n_pages are parameters (not closed-over globals) so they
                are hashed into the cache key: changing the product link or
                page count re-crawls instead of serving stale cached data —
                the original zero-argument version never invalidated.
                """
                data = crawl_tiki(link, pages=int(n_pages))
                return pd.DataFrame(data)

            data = get_data_from_link(link, n_pages)
            df = data.copy()
            df = df.dropna(subset="comment").reset_index()
            # Offer the freshly crawled data as a raw Excel download.
            buffer = download_button_data(df)
            st.sidebar.download_button(
                label="Download Excel worksheets",
                data=buffer,
                file_name="raw_data.xlsx"
            )

# Reset clears the loaded dataframe and both Streamlit caches so a new
# upload/crawl starts from scratch.
reset = st.sidebar.button("Reset")
if reset:
    df = None
    st.cache_data.clear()
    st.cache_resource.clear()
if df is not None:
    # --- topic classification over the loaded reviews --------------------
    classification_df = df.copy()
    classification_df = classification_df[classification_df["comment"] != ""]
    classification_df["comment"] = classification_df["comment"].astype(str)
    classification_df["comment"] = classification_df["comment"].apply(cleaning_for_phobert)
    labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
    classification_df[labels] = categorize_comments(classification_df)
    # Keep only reviews that matched a concrete topic, then shift the epoch
    # timestamps by 7 hours (Vietnam time, UTC+7).
    classification_df = classification_df[classification_df["Other"] != 1]
    classification_df["time"] = pd.to_datetime(classification_df["time"], unit='s') + pd.Timedelta(hours=7)

    # --- dashboard controls (rendered in the wide middle column) ---------
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="large")
    with col_2:
        subcol_1, subcol_2 = st.columns([1, 1], gap="small")
        group_freq = subcol_2.selectbox("Frequency: ", options=["Day", "Week", "Month", "Year"])
        metric = subcol_1.selectbox("Metric: ", options=["Count Reviews", "Average Rating"])

    # --- dashboard body ---------------------------------------------------
    col_1, col_2, col_3 = st.columns([1, 2, 1], gap="small")
    with col_1:
        kpi_total_reviews(classification_df)
        kpi_average_rating(classification_df)
    with col_2:
        # group_freq[0] turns Day/Week/Month/Year into the single-letter
        # frequency code D/W/M/Y.
        time_series_comments(classification_df, freq=group_freq[0], metric=metric)
        st.write("---")
    with col_3:
        tornado_chart(classification_df)
    # NOTE(review): placement reconstructed — original indentation was lost;
    # assumed full-width below the columns, not inside col_3. Confirm.
    print_reviews(classification_df)
else:
    # No data yet: let the user try the classifier on a single review.
    st.header("Select data on sidebar to get started or enter a sample review to test the model")
    sample_text = st.text_input("Insert a customer review")
    if sample_text:
        sample_text = cleaning_for_phobert(sample_text)
        sentence_topic_plot(categorize_sentence(sample_text))